# spacetrackのテスト その4

大量の軌道要素ファイルをまとめて取り扱いやすくするテスト。

軌道要素データは既に download_gp_date_json.py で download/YYYY/ 以下にダウンロード済みであるものとする。

In [1]:
import pandas as pd
import os
import glob
import cProfile
import h5py
import sqlite3

In [2]:
# データの出力先 (十分な空き容量のあるストレージを指定する)
outputpath = '/work/'

# 既にダウンロードしxzで圧縮されたデータを用いる (download_gp_date_json.py でダウンロード)
allfiles = sorted(glob.glob('download/1980/*.json.xz'))
print(len(allfiles))

366


In [3]:
pd.set_option('display.max_columns', 50)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 80)

In [4]:
# 各columnの型
# https://www.space-track.org/basicspacedata/modeldef/class/gp/format/html も参照のこと
# decimal型のcolumnはfloat64として取り扱うことにする
# 型として str を指定すると、元データが null のとき、"None" という文字列になってしまうので、object を指定する
# 実際の元データでnullとなっているものがあるのは COUNTRY_CODE, DECAY_DATE, LAUNCH_DATE, OBJECT_ID, RCS_SIZE, SITE
dtype = {'CCSDS_OMM_VERS': object,  'COMMENT': object,  'CREATION_DATE': 'datetime64[ns]',  'ORIGINATOR': object, 
       'OBJECT_NAME': object,  'OBJECT_ID': object,  'CENTER_NAME': object,  'REF_FRAME': object, 
       'TIME_SYSTEM': object,  'MEAN_ELEMENT_THEORY': object,  'EPOCH': 'datetime64[ns]',  'MEAN_MOTION': 'float64', 
       'ECCENTRICITY': 'float64',  'INCLINATION': 'float64',  'RA_OF_ASC_NODE': 'float64', 
       'ARG_OF_PERICENTER': 'float64',  'MEAN_ANOMALY': 'float64',  'EPHEMERIS_TYPE': 'int8', 
       'CLASSIFICATION_TYPE': object,  'NORAD_CAT_ID': 'uint32',  'ELEMENT_SET_NO': 'uint16', 
       'REV_AT_EPOCH': 'uint32',  'BSTAR': 'float64',  'MEAN_MOTION_DOT': 'float64',  'MEAN_MOTION_DDOT': 'float64', 
       'SEMIMAJOR_AXIS': 'float64',  'PERIOD': 'float64',  'APOAPSIS': 'float64',  'PERIAPSIS': 'float64',  'OBJECT_TYPE': object, 
       'RCS_SIZE': object,  'COUNTRY_CODE': object,  'LAUNCH_DATE': 'datetime64[ns]',  'SITE': object,  'DECAY_DATE': 'datetime64[ns]', 
       'FILE': 'uint64',  'GP_ID': 'uint32',  'TLE_LINE0': object,  'TLE_LINE1': object,  'TLE_LINE2': object}

# 以下のcolumnは日時として解釈する (元データが空欄の場合は NaT になる)
convert_dates = ['EPOCH', 'CREATION_DATE', 'LAUNCH_DATE', 'DECAY_DATE']

In [5]:
# column名
columns = list(dtype.keys())

# DBに出力するcolumn
#columns_out = ['CREATION_DATE', 'EPOCH', 'OBJECT_ID', 'MEAN_MOTION', 'ECCENTRICITY', 'INCLINATION', 'RA_OF_ASC_NODE',
#    'ARG_OF_PERICENTER', 'MEAN_ANOMALY', 'NORAD_CAT_ID', 'REV_AT_EPOCH', 'BSTAR', 'SEMIMAJOR_AXIS',
#    'PERIOD', 'APOAPSIS', 'PERIAPSIS', 'GP_ID', 'TLE_LINE0', 'TLE_LINE1', 'TLE_LINE2']

# indexに用いるcolumn
columns_index = ['CREATION_DATE', 'EPOCH', 'OBJECT_ID', 'MEAN_MOTION', 'ECCENTRICITY', 'INCLINATION', 'RA_OF_ASC_NODE',
    'ARG_OF_PERICENTER', 'MEAN_ANOMALY', 'NORAD_CAT_ID', 'REV_AT_EPOCH', 'BSTAR', 'SEMIMAJOR_AXIS',
    'PERIOD', 'APOAPSIS', 'PERIAPSIS', 'GP_ID']

In [6]:
# nullデータを含むテストデータ
# int型のcolumnにnullを含めると、DataFrameの列がobjectになり、to_hdfがエラーとなるので、適当な値を入れておく
# gp APIが返すデータには、int型のcolumnにnullが含まれた実績はない(?)
json_null = '[{"CCSDS_OMM_VERS":null,"COMMENT":null,"CREATION_DATE":null,"ORIGINATOR":null,"OBJECT_NAME":null,"OBJECT_ID":null,"CENTER_NAME":null,"REF_FRAME":null,"TIME_SYSTEM":null,"MEAN_ELEMENT_THEORY":null,"EPOCH":null,"MEAN_MOTION":null,"ECCENTRICITY":null,"INCLINATION":null,"RA_OF_ASC_NODE":null,"ARG_OF_PERICENTER":null,"MEAN_ANOMALY":null,"EPHEMERIS_TYPE":"0","CLASSIFICATION_TYPE":null,"NORAD_CAT_ID":"0","ELEMENT_SET_NO":"0","REV_AT_EPOCH":"0","BSTAR":null,"MEAN_MOTION_DOT":null,"MEAN_MOTION_DDOT":null,"SEMIMAJOR_AXIS":null,"PERIOD":null,"APOAPSIS":null,"PERIAPSIS":null,"OBJECT_TYPE":null,"RCS_SIZE":null,"COUNTRY_CODE":null,"LAUNCH_DATE":null,"SITE":null,"DECAY_DATE":null,"FILE":"0","GP_ID":"4294967295","TLE_LINE0":null,"TLE_LINE1":null,"TLE_LINE2":null}]'
df_null = pd.read_json(json_null, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records')

In [7]:
# テスト保存先ファイル名を定義
file_json = outputpath + 'test.json'
file_json2 = outputpath + 'test2.json'
file_pickle = outputpath + 'test.pickle'
file_pickle2 = outputpath + 'test.pickle.gz'
file_parquet = outputpath + 'test.parquet'
file_parquet2 = outputpath + 'test2.parquet'
file_hdf = outputpath + 'test.hdf5'
file_hdf2 = outputpath + 'test2.hdf5'
file_sqlite = outputpath + 'test.sqlite3'

## ファイル1個を読む

In [8]:
print(allfiles[0])

download/1980/19800101.json.xz


In [9]:
# ファイル1個を読む速度 (型を自動判定)
def readtest1(file):
    df_tmp = pd.read_json(file, convert_dates = convert_dates, precise_float = True, orient = 'records')
    return df_tmp

%timeit -r 5 df = readtest1(allfiles[0])

62.4 ms ± 194 µs per loop (mean ± std. dev. of 5 runs, 10 loops each)


In [10]:
# ファイル1個を読む速度 (型を指定)
def readtest1(file):
    df_tmp = pd.read_json(file, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records')
    return df_tmp

%timeit -r 5 df = readtest1(allfiles[0])

55.5 ms ± 115 µs per loop (mean ± std. dev. of 5 runs, 10 loops each)


## DataFrame に大量のデータを追加する速度を比較
ファイル1個ごとにappendしていくと遅いので、最小回数で作成する。

In [11]:
# 以降で使うテスト用データ
df_test = pd.read_json(allfiles[0], convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records')

In [12]:
# appendでつなげていく (O(n^2)なのでデータ数が増えると使い物にならない)
def appendtest(df_test, n):
    df = pd.DataFrame()
    for i in range(0, n):
        df = df.append(df_test[columns], ignore_index=True)
    return df
        
%timeit -r 5 df = appendtest(df_test, 10)
%timeit -r 5 df = appendtest(df_test, 100)

90 ms ± 130 µs per loop (mean ± std. dev. of 5 runs, 10 loops each)
5.41 s ± 12 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [13]:
# 全てのデータを1回のconcatで連結する
def concattest(df_test, n):
    df_list = []
    for i in range(0, n):
        df_list.append(df_test[columns])
    df = pd.concat(df_list)
    return df

%timeit -r 5 df = concattest(df_test, 10)
%timeit -r 5 df = concattest(df_test, 100)
%timeit -r 5 df = concattest(df_test, 1000)

25.4 ms ± 13.3 µs per loop (mean ± std. dev. of 5 runs, 10 loops each)
256 ms ± 686 µs per loop (mean ± std. dev. of 5 runs, 1 loop each)
3 s ± 179 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [14]:
# 列ごとにlistを作ってから、DataFrameに変換する (listを値とするdictを作成している)
# from_dict に時間がかかっている
def listtest(df_test, n):
    d = {column: [] for column in columns}
    for i in range(0, n):
        for column in columns:
            d[column].extend(df_test[column].values.tolist())
            #d[column] += df_test[column].values.tolist()
    df = pd.DataFrame.from_dict(d)
    #df =  pd.DataFrame(data = d, columns = columns)  # 少し遅い
    return df

%timeit -r 5 df = listtest(df_test, 10)
%timeit -r 5 df = listtest(df_test, 100)

68.3 ms ± 1.01 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)
756 ms ± 239 µs per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [15]:
cProfile.run('appendtest(df_test, 100)')

         1620729 function calls (1616839 primitive calls) in 5.739 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      396    0.001    0.000    0.004    0.000 <__array_function__ internals>:2(argsort)
      396    0.000    0.000    0.003    0.000 <__array_function__ internals>:2(atleast_2d)
      200    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(can_cast)
     2574    0.003    0.000    1.264    0.000 <__array_function__ internals>:2(concatenate)
      398    0.000    0.000    0.002    0.000 <__array_function__ internals>:2(min_scalar_type)
      396    0.001    0.000    0.645    0.002 <__array_function__ internals>:2(vstack)
    10134    0.008    0.000    0.012    0.000 <frozen importlib._bootstrap>:1017(_handle_fromlist)
        1    0.314    0.314    5.733    5.733 <ipython-input-12-9eea22fa9f20>:2(appendtest)
        1    0.006    0.006    5.739    5.739 <string>:1(<module>)
     5685    0.002   

In [16]:
cProfile.run('concattest(df_test, 100)')

         727720 function calls (726117 primitive calls) in 0.473 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(can_cast)
       19    0.000    0.000    0.015    0.001 <__array_function__ internals>:2(concatenate)
      200    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(min_scalar_type)
     4118    0.003    0.000    0.004    0.000 <frozen importlib._bootstrap>:1017(_handle_fromlist)
        1    0.000    0.000    0.450    0.450 <ipython-input-13-83d7805a51aa>:2(concattest)
        1    0.023    0.023    0.472    0.472 <string>:1(<module>)
     3999    0.001    0.000    0.005    0.000 _asarray.py:16(asarray)
      400    0.001    0.000    0.002    0.000 _asarray.py:223(require)
      400    0.000    0.000    0.000    0.000 _asarray.py:300(<setcomp>)
      200    0.000    0.000    0.000    0.000 _asarray.py:88(asanyarray)
  

In [17]:
cProfile.run('listtest(df_test, 100)')

         76799 function calls (76753 primitive calls) in 0.772 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       40    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(copyto)
       44    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1017(_handle_fromlist)
        1    0.011    0.011    0.719    0.719 <ipython-input-14-dd9b0dc6fdcb>:3(listtest)
        1    0.000    0.000    0.000    0.000 <ipython-input-14-dd9b0dc6fdcb>:4(<dictcomp>)
        1    0.053    0.053    0.772    0.772 <string>:1(<module>)
       42    0.000    0.000    0.000    0.000 _asarray.py:16(asarray)
        3    0.000    0.000    0.000    0.000 _asarray.py:223(require)
        3    0.000    0.000    0.000    0.000 _asarray.py:300(<setcomp>)
       18    0.000    0.000    0.000    0.000 _dtype.py:319(_name_includes_bit_suffix)
       18    0.000    0.000    0.000    0.000 _dtype.py:333(_name_get)
       18    0.000 

## 1年分のデータを読んでみる
実行には相当な時間がかかる。

In [18]:
# 単純にファイルを読む速度
def readtest2(files):
    for file in files:
        df_tmp = pd.read_json(file, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records')
        
%timeit -r 3 df = readtest2(allfiles)

23.5 s ± 187 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [19]:
# 全てのデータを1回のconcatで連結する
def concattest2(files):
    df_list = []
    for file in files:
        df_list.append(pd.read_json(file, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records'))
    df = pd.concat(df_list)
    return df

%timeit -r 3 df = concattest2(allfiles)

28.2 s ± 412 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


## 1年分のデータをまとめて保存する

JSON、CSV、Pickleは、一部のデータのみが必要な場合でもファイル全体を読む必要があるので、巨大データの保存には向いていない。
ParquetとPickleは読み書き共に速いので一時作業保存用に便利。解析には必要なデータのみを取り出せるSQLite3が便利。

HDFはNoneを含むint型の列の取り扱いについて要検討。gp APIは問題ない(はず)が、satcat等の他のAPIにはそのようなデータが存在する。

In [20]:
%%time
# テスト用のデータを準備する (欠損値のふるまいを調べるため、nullを含むダミーデータを末尾に追加しておく)
df_list = []
for file in allfiles:
    df_list.append(pd.read_json(file, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records'))
df_list.append(df_null)
df = pd.concat(df_list)

CPU times: user 27.8 s, sys: 425 ms, total: 28.3 s
Wall time: 28.3 s


In [21]:
print(len(df))

624983


In [22]:
# nullデータはfloat はnan、datetime64[ns]はNaT、それ以外はNoneとなる (read_jsonのdtypeにstr を指定すると "None" という文字列になってしまう)
print(df.tail(2).values)

[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [None None NaT None None None None None None None NaT nan nan nan nan
  nan nan 0 None 0 0 0 nan nan nan nan nan nan nan None None None NaT
  None NaT 0 4294967295 None None None]]


In [23]:
print(df.dtypes)

CCSDS_OMM_VERS                 object
COMMENT                        object
CREATION_DATE          datetime64[ns]
ORIGINATOR                     object
OBJECT_NAME                    object
OBJECT_ID                      object
CENTER_NAME                    object
REF_FRAME                      object
TIME_SYSTEM                    object
MEAN_ELEMENT_THEORY            object
EPOCH                  datetime64[ns]
MEAN_MOTION                   float64
ECCENTRICITY                  float64
INCLINATION                   float64
RA_OF_ASC_NODE                float64
ARG_OF_PERICENTER             float64
MEAN_ANOMALY                  float64
EPHEMERIS_TYPE                   int8
CLASSIFICATION_TYPE            object
NORAD_CAT_ID                   uint32
ELEMENT_SET_NO                 uint16
REV_AT_EPOCH                   uint32
BSTAR                         float64
MEAN_MOTION_DOT               float64
MEAN_MOTION_DDOT              float64
SEMIMAJOR_AXIS                float64
PERIOD      

In [24]:
print(df.tail(2).applymap(type))

          CCSDS_OMM_VERS             COMMENT  \
1839       <class 'str'>       <class 'str'>   
0     <class 'NoneType'>  <class 'NoneType'>   

                                           CREATION_DATE          ORIGINATOR  \
1839  <class 'pandas._libs.tslibs.timestamps.Timestamp'>       <class 'str'>   
0          <class 'pandas._libs.tslibs.nattype.NaTType'>  <class 'NoneType'>   

             OBJECT_NAME           OBJECT_ID         CENTER_NAME  \
1839       <class 'str'>       <class 'str'>       <class 'str'>   
0     <class 'NoneType'>  <class 'NoneType'>  <class 'NoneType'>   

               REF_FRAME         TIME_SYSTEM MEAN_ELEMENT_THEORY  \
1839       <class 'str'>       <class 'str'>       <class 'str'>   
0     <class 'NoneType'>  <class 'NoneType'>  <class 'NoneType'>   

                                                   EPOCH      MEAN_MOTION  \
1839  <class 'pandas._libs.tslibs.timestamps.Timestamp'>  <class 'float'>   
0          <class 'pandas._libs.tslibs.nattype.NaT

In [25]:
%%time
# 重複データ
dup = df.duplicated()
print(dup.sum())
#print(df[dup].index.values)

0
CPU times: user 4.35 s, sys: 166 ms, total: 4.52 s
Wall time: 4.51 s


In [26]:
%%time
# 重複データ(GP_IDのみで判定) ← ひとまずこちらで十分
dup2 = df.duplicated(subset = ['GP_ID'])
print(dup2.sum())
#print(df[dup].index.values)

0
CPU times: user 38.5 ms, sys: 11.9 ms, total: 50.4 ms
Wall time: 48.2 ms


In [27]:
%%time
df.drop_duplicates(subset = ['GP_ID'], ignore_index = True, inplace=True)
print(len(df))

624983
CPU times: user 1.57 s, sys: 125 ms, total: 1.69 s
Wall time: 1.69 s


In [28]:
%%time
# json で保存 (行指向 records)
# date_format を指定しないと、datetime64はシリアル値(64bit整数)として記録される
# date_unit で時刻の精度を指定する
df.to_json(file_json, orient = 'records', date_format='iso', date_unit='us')
print(os.path.getsize(file_json))

695286796
CPU times: user 4.83 s, sys: 1.23 s, total: 6.06 s
Wall time: 6.11 s


In [29]:
%%time
# json で保存 (列指向 columns)
# date_format を指定しないと、datetime64はシリアル値(64bit整数)として記録される
# date_unit で時刻の精度を指定する
df.to_json(file_json2, orient = 'columns', date_format='iso', date_unit='us')
print(os.path.getsize(file_json2))

569596326
CPU times: user 5.18 s, sys: 1.12 s, total: 6.3 s
Wall time: 6.32 s


In [30]:
%%time
# pickle で保存 (無圧縮)
df.to_pickle(file_pickle)
print(os.path.getsize(file_pickle))

291863412
CPU times: user 3.91 s, sys: 522 ms, total: 4.43 s
Wall time: 4.45 s


In [31]:
%%time
# pickle で保存 (gzip圧縮)
df.to_pickle(file_pickle2)
print(os.path.getsize(file_pickle2))

80091362
CPU times: user 28 s, sys: 381 ms, total: 28.4 s
Wall time: 28.4 s


In [32]:
%%time
# parquet で保存 (default: snappy圧縮)
df.to_parquet(file_parquet)
print(os.path.getsize(file_parquet))

114114066
CPU times: user 3.55 s, sys: 442 ms, total: 3.99 s
Wall time: 3.79 s


In [33]:
%%time
# parquet で保存 (zstd圧縮)
df.to_parquet(file_parquet2, compression='zstd')
print(os.path.getsize(file_parquet2))

89789995
CPU times: user 3.59 s, sys: 355 ms, total: 3.95 s
Wall time: 3.69 s


In [34]:
%%time
# HDFで保存 (pandas.DataFrame.to_hdf を使用)
# あとで検索に使えるよう、主要なcolumnにindexをつけておく
# format = 'fixed' とすると、レコード数が多いときにエラーが発生する
# object型のcolumnにintが含まれているとエラーが発生する
df.to_hdf(file_hdf, 'test', mode = 'w', format = 'table', data_columns = columns_index)
print(os.path.getsize(file_hdf))

852055616
CPU times: user 23.6 s, sys: 4.67 s, total: 28.2 s
Wall time: 28.5 s


In [35]:
%%time
# HDFで保存 (pandas.HDFStore を使用) → to_hdf と同じモノが生成される
# あとで検索に使えるよう、主要なcolumnにindexをつけておく
# object型のcolumnにintが含まれているとエラーが発生する
store = pd.HDFStore(file_hdf2)
store.append('test', df, data_columns = columns_index)
store.close()
print(os.path.getsize(file_hdf2))

1708069520
CPU times: user 23.8 s, sys: 6.83 s, total: 30.7 s
Wall time: 30.7 s


In [36]:
%%time
# SQLite3で保存
# 型は、real, integer, text, timestamp に集約される
# indexのつけ方については要検討
with sqlite3.connect(file_sqlite) as conn:
    #c = conn.cursor()
    df.to_sql('elset', conn, if_exists='replace', index=None)
    conn.execute('CREATE UNIQUE INDEX index_elset_gp_id ON elset (GP_ID)')
    conn.execute('CREATE INDEX index_elset_epoch ON elset (EPOCH)')
    conn.execute('CREATE INDEX index_elset_norad_cat_id ON elset (NORAD_CAT_ID)')
    conn.commit()
print(os.path.getsize(file_sqlite))

358428672
CPU times: user 15.3 s, sys: 1.54 s, total: 16.8 s
Wall time: 16.9 s


## 保存したデータを読み込む

In [37]:
%%time
# JSONを読み込む (行指向 records)
# orient はつけなくてもほとんどの場合自動判別してくれる
df_tmp = pd.read_json(file_json, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records')
print(len(df_tmp))
print(df_tmp.tail(2).values)

624983
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [None None NaT None None None None None None None NaT nan nan nan nan
  nan nan 0 None 0 0 0 nan nan nan nan nan nan nan None None None NaT
  None NaT 0 4294967295 None None None]]
CPU times: user 17.7 s, sys: 2.91 s, total: 20.7 s
Wall time: 20.6 s


In [38]:
%%time
# JSONを読み込む (列指向 columns)
# orient はつけなくてもほとんどの場合自動判別してくれる
df_tmp = pd.read_json(file_json2, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'columns')
print(len(df_tmp))
print(df_tmp.tail(2).values)

624983
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [None None NaT None None None None None None None NaT nan nan nan nan
  nan nan 0 None 0 0 0 nan nan nan nan nan nan nan None None None NaT
  None NaT 0 4294967295 None None None]]
CPU times: user 26.6 s, sys: 1.59 s, total: 28.2 s
Wall time: 28.2 s


In [39]:
%%time
# pickleを読み込む (無圧縮)
df_tmp = pd.read_pickle(file_pickle)
print(len(df_tmp))
print(df_tmp.tail(2).values)

624983
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [None None NaT None None None None None None None NaT nan nan nan nan
  nan nan 0 None 0 0 0 nan nan nan nan nan nan nan None None None NaT
  None NaT 0 4294967295 None None None]]
CPU times: user 1.4 s, sys: 139 ms, total: 1.54 s
Wall time: 1.54 s


In [40]:
%%time
# pickleを読み込む (gzip圧縮)
df_tmp = pd.read_pickle(file_pickle2)
print(len(df_tmp))
print(df_tmp.tail(2).values)

624983
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [None None NaT None None None None None None None NaT nan nan nan nan
  nan nan 0 None 0 0 0 nan nan nan nan nan nan nan None None None NaT
  None NaT 0 4294967295 None None None]]
CPU times: user 2.48 s, sys: 85 ms, total: 2.56 s
Wall time: 2.56 s


In [41]:
%%time
# parquetを読み込む (default: snappy圧縮)
df_tmp = pd.read_parquet(file_parquet)
print(len(df_tmp))
print(df_tmp.tail(2).values)

624983
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [None None NaT None None None None None None None NaT nan nan nan nan
  nan nan 0 None 0 0 0 nan nan nan nan nan nan nan None None None NaT
  None NaT 0 4294967295 None None None]]
CPU times: user 2.37 s, sys: 1.01 s, total: 3.38 s
Wall time: 1.69 s


In [42]:
%%time
# parquetを読み込む (zstd圧縮)
df_tmp = pd.read_parquet(file_parquet2)
print(len(df_tmp))
print(df_tmp.tail(2).values)

624983
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [None None NaT None None None None None None None NaT nan nan nan nan
  nan nan 0 None 0 0 0 nan nan nan nan nan nan nan None None None NaT
  None NaT 0 4294967295 None None None]]
CPU times: user 2.74 s, sys: 920 ms, total: 3.66 s
Wall time: 1.23 s


In [43]:
%%time
# HDFを読み込む
# 文字列型のcolumnの欠損値がnanになっていることに注意
df_tmp = pd.read_hdf(file_hdf, 'test')
print(len(df_tmp))
print(df_tmp.tail(2).values)

624983
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [nan nan NaT nan nan nan nan nan nan nan NaT nan nan nan nan nan nan 0
  nan 0 0 0 nan nan nan nan nan nan nan nan nan nan NaT nan NaT 0
  4294967295 nan nan nan]]
CPU times: user 9.18 s, sys: 1.57 s, total: 10.8 s
Wall time: 10.7 s


In [44]:
%%time
# HDFを読み込む  (pandas.HDFStore を使用)
# 文字列型のcolumnの欠損値がnanになっていることに注意
store = pd.HDFStore(file_hdf2)
df_tmp = store.get('test')
store.close()
print(len(df_tmp))
print(df_tmp.tail(2).values)

1249966
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [nan nan NaT nan nan nan nan nan nan nan NaT nan nan nan nan nan nan 0
  nan 0 0 0 nan nan nan nan nan nan nan nan nan nan NaT nan NaT 0
  4294967295 nan nan nan]]
CPU times: user 18.7 s, sys: 2.95 s, total: 21.6 s
Wall time: 21.6 s


In [45]:
%%time
# SQLite3から読み込む
# 型は、real => float64, integer => int64, text => object(str), timestamp => datetime64[ns] となる
with sqlite3.connect(file_sqlite) as conn:
    df_tmp = pd.read_sql_query("SELECT * FROM elset", conn, parse_dates = convert_dates)
print(len(df_tmp))
print(df_tmp.tail(2).values)

624983
[['2.0' 'GENERATED VIA SPACE-TRACK.ORG API'
  Timestamp('2004-08-16 23:12:35') '18 SPCS' 'DELTA 1 DEB' '1977-065GB'
  'EARTH' 'TEME' 'UTC' 'SGP4' Timestamp('1980-12-31 15:41:36.007871')
  12.00906572 0.0852058 30.9733 123.512 61.6999 306.6869 0 'U' 19638 999
  13637 0.014414 3.472e-05 0.0 8054.946 119.909 2363.139 990.482 'DEBRIS'
  'SMALL' 'US' Timestamp('1977-07-14 00:00:00') 'AFETR' NaT 34278
  11691699 '0 DELTA 1 DEB'
  '1 19638U 77065 GB 80366.65388898  .00003472 +00000-0 +14414-1 0  9991'
  '2 19638 030.9733 123.5120 0852058 061.6999 306.6869 12.00906572136376']
 [None None NaT None None None None None None None NaT nan nan nan nan
  nan nan 0 None 0 0 0 nan nan nan nan nan nan nan None None None NaT
  None NaT 0 4294967295 None None None]]
CPU times: user 13.2 s, sys: 1.24 s, total: 14.4 s
Wall time: 14.4 s


## 一部のデータのみを読んでみる

In [46]:
%%time
# parquetを読み込む (default: snappy圧縮)
df_tmp = pd.read_parquet(file_parquet, columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(3)

624983
CPU times: user 506 ms, sys: 187 ms, total: 694 ms
Wall time: 630 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,1980-01-01 23:49:31.538495,556.253,11
1,1980-01-01 09:22:09.017759,555.13,12
2,1980-01-01 23:33:02.931552,512.009,20


In [47]:
%%time
# parquetを読み込む (zstd圧縮)
# 圧縮されているにも関わらず速い。素晴らしい
df_tmp = pd.read_parquet(file_parquet2,  columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(3)

624983
CPU times: user 89.4 ms, sys: 56 ms, total: 145 ms
Wall time: 66.9 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,1980-01-01 23:49:31.538495,556.253,11
1,1980-01-01 09:22:09.017759,555.13,12
2,1980-01-01 23:33:02.931552,512.009,20


In [48]:
%%time
# HDFから一部の列のみを読む
# 全体を読むのと同じだけの時間がかかる
df_tmp = pd.read_hdf(file_hdf, 'test', columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(3)

624983
CPU times: user 9.04 s, sys: 1.52 s, total: 10.6 s
Wall time: 10.5 s


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,1980-01-01 23:49:31.538495,556.253,11
1,1980-01-01 09:22:09.017759,555.13,12
2,1980-01-01 23:33:02.931552,512.009,20


In [49]:
%%time
# HDFから一部の行のみを読む
df_tmp = pd.read_hdf(file_hdf, 'test', start = 10000, stop = 19999, columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(3)

9999
CPU times: user 177 ms, sys: 8.05 ms, total: 185 ms
Wall time: 181 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
10000,1980-01-10 07:22:27.352991,1000.781,5691
10001,1980-01-10 14:23:13.991136,778.811,5693
10002,1980-01-10 04:11:43.504511,998.643,5694


In [50]:
%%time
# HDFから条件にマッチした行のみを読む
# indexがついている列のみ条件を指定するのに利用できる (to_hdf の data_columns オプション)
df_tmp = pd.read_hdf(file_hdf, 'test', where = 'PERIAPSIS<200', columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(5)

6855
CPU times: user 261 ms, sys: 168 ms, total: 429 ms
Wall time: 424 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
604,1980-01-01 01:40:17.353055,178.36,4760
605,1980-01-01 12:05:40.756416,178.448,4760
746,1980-01-01 12:08:52.446912,168.13,5407
1506,1980-01-01 00:55:45.002783,196.483,12908
1507,1980-01-01 00:55:49.652831,195.572,12908


In [51]:
%%time
# pandas.DataFrame.to_hdf で保存したHDFの構造
def hdfprint(name, obj):
    if isinstance(obj, h5py.Dataset):
        print(name, '\t', obj)

with h5py.File(file_hdf, 'r') as f:
    f.visititems(hdfprint)

test/_i_table/APOAPSIS/abounds 	 <HDF5 dataset "abounds": shape (432,), type "<f8">
test/_i_table/APOAPSIS/bounds 	 <HDF5 dataset "bounds": shape (2, 215), type "<f8">
test/_i_table/APOAPSIS/indices 	 <HDF5 dataset "indices": shape (2, 221184), type "<u4">
test/_i_table/APOAPSIS/indicesLR 	 <HDF5 dataset "indicesLR": shape (221184,), type "<u4">
test/_i_table/APOAPSIS/mbounds 	 <HDF5 dataset "mbounds": shape (432,), type "<f8">
test/_i_table/APOAPSIS/mranges 	 <HDF5 dataset "mranges": shape (2,), type "<f8">
test/_i_table/APOAPSIS/ranges 	 <HDF5 dataset "ranges": shape (2, 2), type "<f8">
test/_i_table/APOAPSIS/sorted 	 <HDF5 dataset "sorted": shape (2, 221184), type "<f8">
test/_i_table/APOAPSIS/sortedLR 	 <HDF5 dataset "sortedLR": shape (221401,), type "<f8">
test/_i_table/APOAPSIS/zbounds 	 <HDF5 dataset "zbounds": shape (432,), type "<f8">
test/_i_table/ARG_OF_PERICENTER/abounds 	 <HDF5 dataset "abounds": shape (432,), type "<f8">
test/_i_table/ARG_OF_PERICENTER/bounds 	 <HDF5 data

In [52]:
%%time
# 単純に読むだけなら速い
with h5py.File(file_hdf, 'r') as f:
    data = f['test/table'][()]
    print(type(data))
    print(len(data))

<class 'numpy.ndarray'>
624983
CPU times: user 866 ms, sys: 375 ms, total: 1.24 s
Wall time: 1.23 s


In [53]:
%%time
# 1行目を読んでみる
# 型ごとにまとめられたリストになっているのでそのままでは使えない
with h5py.File(file_hdf, 'r') as f:
    data = f['test/table'][0]
    print(type(data))
    print(len(data))
    print(data)

<class 'numpy.void'>
24
(0, [b'2.0', b'EARTH', b'U', b'GENERATED VIA SPACE-TRACK.ORG API', b'US', b'SGP4', b'VANGUARD 2', b'PAYLOAD', b'18 SPCS', b'MEDIUM', b'TEME', b'AFETR', b'UTC', b'0 VANGUARD 2', b'1 00011U 59001  A 80001.99272614  .00001182 +00000-0 +68517-3 0  9998', b'2 00011 032.8934 225.7732 1604011 342.2735 012.7600 11.56641400878142'], [-9223372036854775808,  -343094400000000000], [999], [0], [34111], [0.000e+00, 1.182e-05], 1092613538000000000, 315618571538495000, b'1959-001A', 11.566414, 0.1604011, 32.8934, 225.7732, 342.2735, 12.76, 11, 87814, 0.00068517, 8259.167, 124.498, 3205.812, 556.253, 11069028)
CPU times: user 3.94 ms, sys: 3 ms, total: 6.94 ms
Wall time: 6.39 ms


In [54]:
%%time
# SQLite3から読み込む
# 型は、real => float64, integer => int64, text => object(str), timestamp => datetime64[ns] となる
with sqlite3.connect(file_sqlite) as conn:
    df_tmp = pd.read_sql_query('''SELECT EPOCH, PERIAPSIS, NORAD_CAT_ID 
        FROM elset WHERE NORAD_CAT_ID BETWEEN 80000 AND 89000''', conn, parse_dates = convert_dates)
print(len(df_tmp))
df_tmp.head(3)

0
CPU times: user 3.09 ms, sys: 803 µs, total: 3.89 ms
Wall time: 3.08 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID


In [55]:
%%time
# SQLite3から読み込む (indexがついていないと遅い例)
# 型は、real => float64, integer => int64, text => object(str), timestamp => datetime64[ns] となる
with sqlite3.connect(file_sqlite) as conn:
    df_tmp = pd.read_sql_query('''SELECT EPOCH, PERIAPSIS, NORAD_CAT_ID 
                               FROM elset WHERE PERIAPSIS < 200''', conn, parse_dates = convert_dates)
print(len(df_tmp))
df_tmp.head(3)

6855
CPU times: user 132 ms, sys: 137 ms, total: 269 ms
Wall time: 267 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,1980-01-01 01:40:17.353055,178.36,4760
1,1980-01-01 12:05:40.756416,178.448,4760
2,1980-01-01 12:08:52.446912,168.13,5407
