# spacetrackのテスト その4

大量の軌道要素ファイルをまとめて取り扱いやすくするテスト。

軌道要素データは既に download_gp_date_json.py で download/YYYY/ 以下にダウンロード済みであるものとする。

In [1]:
import pandas as pd
import os
import glob
import cProfile
import h5py
import sqlite3

In [2]:
# Output location for generated data (point this at storage with plenty of free space)
outputpath = '/work/'

# Use the data that has already been downloaded and xz-compressed (fetched by download_gp_date_json.py)
allfiles = glob.glob('download/2019/*.json.xz')
allfiles.sort()  # lexicographically sorted list of paths
print(len(allfiles))

365


In [3]:
# Raise the pandas display limits (number of columns, number of rows, column width)
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', 100),
    ('display.max_colwidth', 80),
):
    pd.set_option(option_name, option_value)

In [4]:
# 各columnの型
# https://www.space-track.org/basicspacedata/modeldef/class/gp/format/html も参照のこと
# decimal型のcolumnはfloat64として取り扱うことにする
dtype = {'CCSDS_OMM_VERS': str,  'COMMENT': str,  'CREATION_DATE': 'datetime64[ns]',  'ORIGINATOR': str, 
       'OBJECT_NAME': str,  'OBJECT_ID': str,  'CENTER_NAME': str,  'REF_FRAME': str, 
       'TIME_SYSTEM': str,  'MEAN_ELEMENT_THEORY': str,  'EPOCH': 'datetime64[ns]',  'MEAN_MOTION': 'float64', 
       'ECCENTRICITY': 'float64',  'INCLINATION': 'float64',  'RA_OF_ASC_NODE': 'float64', 
       'ARG_OF_PERICENTER': 'float64',  'MEAN_ANOMALY': 'float64',  'EPHEMERIS_TYPE': 'int8', 
       'CLASSIFICATION_TYPE': str,  'NORAD_CAT_ID': 'uint32',  'ELEMENT_SET_NO': 'uint16', 
       'REV_AT_EPOCH': 'uint32',  'BSTAR': 'float64',  'MEAN_MOTION_DOT': 'float64',  'MEAN_MOTION_DDOT': 'float64', 
       'SEMIMAJOR_AXIS': 'float64',  'PERIOD': 'float64',  'APOAPSIS': 'float64',  'PERIAPSIS': 'float64',  'OBJECT_TYPE': str, 
       'RCS_SIZE': str,  'COUNTRY_CODE': str,  'LAUNCH_DATE': 'datetime64[ns]',  'SITE': str,  'DECAY_DATE': 'datetime64[ns]', 
       'FILE': 'uint64',  'GP_ID': 'uint32',  'TLE_LINE0': str,  'TLE_LINE1': str,  'TLE_LINE2': str}

# 以下のcolumnは日時として解釈する (元データが空欄の場合は NaT になる)
convert_dates = ['EPOCH', 'CREATION_DATE', 'LAUNCH_DATE', 'DECAY_DATE']

In [5]:
# Column names, in the order the keys appear in dtype
columns = list(dtype)

# Columns to write to the DB (currently disabled)
#columns_out = ['CREATION_DATE', 'EPOCH', 'OBJECT_ID', 'MEAN_MOTION', 'ECCENTRICITY', 'INCLINATION', 'RA_OF_ASC_NODE',
#    'ARG_OF_PERICENTER', 'MEAN_ANOMALY', 'NORAD_CAT_ID', 'REV_AT_EPOCH', 'BSTAR', 'SEMIMAJOR_AXIS',
#    'PERIOD', 'APOAPSIS', 'PERIAPSIS', 'GP_ID', 'TLE_LINE0', 'TLE_LINE1', 'TLE_LINE2']

# Columns used for indexing
columns_index = [
    'CREATION_DATE',
    'EPOCH',
    'OBJECT_ID',
    'MEAN_MOTION',
    'ECCENTRICITY',
    'INCLINATION',
    'RA_OF_ASC_NODE',
    'ARG_OF_PERICENTER',
    'MEAN_ANOMALY',
    'NORAD_CAT_ID',
    'REV_AT_EPOCH',
    'BSTAR',
    'SEMIMAJOR_AXIS',
    'PERIOD',
    'APOAPSIS',
    'PERIAPSIS',
    'GP_ID',
]

In [6]:
# Destination file names for the save/load tests.
# os.path.join is used instead of string concatenation so that outputpath works
# with or without a trailing path separator.
file_json = os.path.join(outputpath, 'test.json')
file_json2 = os.path.join(outputpath, 'test2.json')
file_pickle = os.path.join(outputpath, 'test.pickle')
file_pickle2 = os.path.join(outputpath, 'test.pickle.gz')
file_parquet = os.path.join(outputpath, 'test.parquet')
file_parquet2 = os.path.join(outputpath, 'test2.parquet')
file_hdf = os.path.join(outputpath, 'test.hdf5')
file_hdf2 = os.path.join(outputpath, 'test2.hdf5')
file_sqlite = os.path.join(outputpath, 'test.sqlite3')

## ファイル1個を読む

In [7]:
print(allfiles[0])

download/2019/20190101.json.xz


In [8]:
# Time reading a single file (column types detected automatically)
def readtest1(file):
    """Load one record-oriented JSON file and return it as a DataFrame.

    No ``dtype`` is given, so pandas infers the column types; the columns
    listed in the module-level ``convert_dates`` are parsed as dates.
    """
    return pd.read_json(
        file,
        orient = 'records',
        precise_float = True,
        convert_dates = convert_dates,
    )

%timeit -r 5 df = readtest1(allfiles[0])

1.24 s ± 9.54 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [9]:
# Time reading a single file (column types given explicitly)
def readtest1(file):
    """Load one record-oriented JSON file and return it as a DataFrame.

    Column types come from the module-level ``dtype`` mapping; the columns
    listed in the module-level ``convert_dates`` are parsed as dates.
    """
    return pd.read_json(
        file,
        orient = 'records',
        precise_float = True,
        dtype = dtype,
        convert_dates = convert_dates,
    )

%timeit -r 5 df = readtest1(allfiles[0])

1.29 s ± 19.6 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


## DataFrame に大量のデータを追加する速度を比較
ファイル1個ごとにappendしていくと遅いので、DataFrameの連結回数が最小になる方法で作成する。

In [10]:
# Test data used by the cells below (the first file of the list)
df_test = pd.read_json(
    allfiles[0],
    orient = 'records',
    precise_float = True,
    dtype = dtype,
    convert_dates = convert_dates,
)

In [11]:
# Grow the frame one piece at a time (O(n^2), so unusable once the amount of data grows)
def appendtest(df_test, n):
    """Build a DataFrame by adding ``df_test[columns]`` to it ``n`` times, one at a time.

    This is the deliberately slow baseline of the comparison: each iteration
    concatenates the frame accumulated so far with one more piece.

    ``DataFrame.append`` was deprecated in pandas 1.4 and removed in pandas 2.0,
    so the per-iteration append is written with ``pd.concat``, which is
    available in both old and new pandas.

    Args:
        df_test: Source DataFrame; must contain every column named in the
            module-level ``columns`` list.
        n: Number of copies to add.

    Returns:
        A DataFrame holding ``n`` copies of ``df_test[columns]`` with a fresh
        0..N-1 index, or an empty DataFrame when ``n`` is 0.
    """
    df = None
    for _ in range(n):
        # The first piece is concatenated on its own so that no empty frame
        # takes part in the concat.
        pieces = [df_test[columns]] if df is None else [df, df_test[columns]]
        df = pd.concat(pieces, ignore_index=True)
    return pd.DataFrame() if df is None else df
        
%timeit -r 5 df = appendtest(df_test, 10)
%timeit -r 5 df = appendtest(df_test, 100)

3.04 s ± 1.99 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
4min 5s ± 175 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [12]:
# Join all of the data with a single concat call
def concattest(df_test, n):
    """Return ``n`` copies of ``df_test[columns]`` joined by one ``pd.concat`` call.

    ``columns`` is the module-level column list. The pieces are concatenated
    with pandas' default settings.
    """
    pieces = [df_test[columns] for _ in range(n)]
    return pd.concat(pieces)

%timeit -r 5 df = concattest(df_test, 10)
%timeit -r 5 df = concattest(df_test, 100)
%timeit -r 5 df = concattest(df_test, 1000)

513 ms ± 383 µs per loop (mean ± std. dev. of 5 runs, 1 loop each)
4.98 s ± 29.8 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
48.3 s ± 2.04 s per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [13]:
# Build one list per column first, then convert to a DataFrame (a dict whose values are lists)
# from_dict is where the time goes
def listtest(df_test, n):
    """Return ``n`` copies of ``df_test`` assembled through per-column Python lists.

    For every name in the module-level ``columns`` list, the column values are
    converted to a list and appended ``n`` times; the resulting dict of lists
    is then turned into a DataFrame with ``pd.DataFrame.from_dict``.
    """
    # NOTE(review): `.values.tolist()` on datetime64[ns] columns presumably yields
    # integer nanoseconds rather than timestamps -- confirm before using the
    # result for anything other than timing.
    per_column = {name: [] for name in columns}
    for _ in range(n):
        for name in columns:
            values = df_test[name].values.tolist()
            per_column[name].extend(values)
    # pd.DataFrame(data = ..., columns = columns) was noted as slightly slower than from_dict
    return pd.DataFrame.from_dict(per_column)

%timeit -r 5 df = listtest(df_test, 10)
%timeit -r 5 df = listtest(df_test, 100)

1.78 s ± 1.61 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
18.8 s ± 6.54 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [14]:
cProfile.run('appendtest(df_test, 100)')

         1620729 function calls (1616839 primitive calls) in 195.678 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      396    0.001    0.000    0.008    0.000 <__array_function__ internals>:2(argsort)
      396    0.001    0.000    0.004    0.000 <__array_function__ internals>:2(atleast_2d)
      200    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(can_cast)
     2574    0.011    0.000   47.524    0.018 <__array_function__ internals>:2(concatenate)
      398    0.001    0.000    0.003    0.000 <__array_function__ internals>:2(min_scalar_type)
      396    0.001    0.000   24.074    0.061 <__array_function__ internals>:2(vstack)
    10134    0.013    0.000    0.018    0.000 <frozen importlib._bootstrap>:1017(_handle_fromlist)
        1   16.433   16.433  195.387  195.387 <ipython-input-11-9eea22fa9f20>:2(appendtest)
        1    0.291    0.291  195.678  195.678 <string>:1(<module>)
     5685    0.003 

In [15]:
cProfile.run('concattest(df_test, 100)')

         727720 function calls (726117 primitive calls) in 4.511 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      200    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(can_cast)
       19    0.000    0.000    0.566    0.030 <__array_function__ internals>:2(concatenate)
      200    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(min_scalar_type)
     4118    0.003    0.000    0.004    0.000 <frozen importlib._bootstrap>:1017(_handle_fromlist)
        1    0.001    0.001    3.677    3.677 <ipython-input-12-83d7805a51aa>:2(concattest)
        1    0.834    0.834    4.511    4.511 <string>:1(<module>)
     3999    0.001    0.000    0.006    0.000 _asarray.py:16(asarray)
      400    0.002    0.000    0.003    0.000 _asarray.py:223(require)
      400    0.000    0.000    0.001    0.000 _asarray.py:300(<setcomp>)
      200    0.000    0.000    0.001    0.000 _asarray.py:88(asanyarray)
  

In [16]:
cProfile.run('listtest(df_test, 100)')

         76842 function calls (76796 primitive calls) in 19.138 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       40    0.000    0.000    0.014    0.000 <__array_function__ internals>:2(copyto)
       44    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:1017(_handle_fromlist)
        1    0.240    0.240   17.798   17.798 <ipython-input-13-dd9b0dc6fdcb>:3(listtest)
        1    0.000    0.000    0.000    0.000 <ipython-input-13-dd9b0dc6fdcb>:4(<dictcomp>)
        1    1.340    1.340   19.138   19.138 <string>:1(<module>)
       42    0.000    0.000    0.000    0.000 _asarray.py:16(asarray)
        3    0.000    0.000    0.000    0.000 _asarray.py:223(require)
        3    0.000    0.000    0.000    0.000 _asarray.py:300(<setcomp>)
       19    0.000    0.000    0.000    0.000 _dtype.py:319(_name_includes_bit_suffix)
       19    0.000    0.000    0.000    0.000 _dtype.py:333(_name_get)
       19    0.000

## 1年分のデータを読んでみる
実行には相当な時間がかかる。

In [17]:
# Raw speed of just reading the files
def readtest2(files):
    """Parse every file in ``files`` with ``pd.read_json`` and discard the result.

    Uses the module-level ``dtype`` and ``convert_dates`` settings.
    Nothing is returned.
    """
    for path in files:
        pd.read_json(
            path,
            orient = 'records',
            precise_float = True,
            dtype = dtype,
            convert_dates = convert_dates,
        )
        
%timeit -r 3 df = readtest2(allfiles)

6min 31s ± 109 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [18]:
# Join all of the data with a single concat call
def concattest2(files):
    """Read every file in ``files`` and return them joined by one ``pd.concat`` call.

    Each file is parsed with ``pd.read_json`` using the module-level ``dtype``
    and ``convert_dates`` settings.
    """
    frames = [
        pd.read_json(
            path,
            orient = 'records',
            precise_float = True,
            dtype = dtype,
            convert_dates = convert_dates,
        )
        for path in files
    ]
    return pd.concat(frames)

%timeit -r 3 df = concattest2(allfiles)

7min 33s ± 1.23 s per loop (mean ± std. dev. of 3 runs, 1 loop each)


## 1年分のデータをまとめて保存する

JSON、CSV、Pickleは、一部のデータのみが必要な場合でもファイル全体を読む必要があるので、巨大データの保存には向いていない。
ParquetとPickleは読み書き共に速いので一時作業保存用に便利 (Parquetは必要な列のみを読み込むこともできる)。解析には、条件に合う必要なデータのみを取り出せるSQLite3が便利。

日時, None, NaN, NaT のマッピングについては要検討

In [19]:
%%time
# Prepare the data used by the tests below: read every file, then join them with one concat
df_list = [
    pd.read_json(file, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records')
    for file in allfiles
]
df = pd.concat(df_list)

CPU times: user 7min 26s, sys: 5.45 s, total: 7min 31s
Wall time: 7min 30s


In [20]:
print(len(df))

7890677


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7890677 entries, 0 to 21128
Data columns (total 40 columns):
 #   Column               Dtype         
---  ------               -----         
 0   CCSDS_OMM_VERS       object        
 1   COMMENT              object        
 2   CREATION_DATE        datetime64[ns]
 3   ORIGINATOR           object        
 4   OBJECT_NAME          object        
 5   OBJECT_ID            object        
 6   CENTER_NAME          object        
 7   REF_FRAME            object        
 8   TIME_SYSTEM          object        
 9   MEAN_ELEMENT_THEORY  object        
 10  EPOCH                datetime64[ns]
 11  MEAN_MOTION          float64       
 12  ECCENTRICITY         float64       
 13  INCLINATION          float64       
 14  RA_OF_ASC_NODE       float64       
 15  ARG_OF_PERICENTER    float64       
 16  MEAN_ANOMALY         float64       
 17  EPHEMERIS_TYPE       int8          
 18  CLASSIFICATION_TYPE  object        
 19  NORAD_CAT_ID         ui

In [22]:
%%time
# Duplicated rows: duplicated() is called without a subset, then the number of flagged rows is printed
dup = df.duplicated()
print(dup.sum())
#print(df[dup].index.values)

15
CPU times: user 1min 16s, sys: 4.65 s, total: 1min 21s
Wall time: 1min 21s


In [23]:
%%time
# Duplicated rows (judged by GP_ID only) <- this is sufficient for now
dup2 = df.duplicated(subset = ['GP_ID'])
print(dup2.sum())
#print(df[dup].index.values)

15
CPU times: user 1.36 s, sys: 40.1 ms, total: 1.4 s
Wall time: 1.39 s


In [24]:
%%time
# Drop the rows whose GP_ID is duplicated (modifies df in place and renumbers the index),
# then print the number of remaining rows
df.drop_duplicates(subset = ['GP_ID'], ignore_index = True, inplace=True)
print(len(df))

7890662
CPU times: user 23.1 s, sys: 1.5 s, total: 24.6 s
Wall time: 24.5 s


In [25]:
%%time
# Save as JSON (row-oriented, 'records')
# If date_format is not given, datetime64 values are written as serial numbers (64-bit integers)
# date_unit specifies the time precision
df.to_json(file_json, orient = 'records', date_format='iso', date_unit='us')
print(os.path.getsize(file_json))

8703332545
CPU times: user 1min 5s, sys: 12.3 s, total: 1min 17s
Wall time: 1min 16s


In [26]:
%%time
# Save as JSON (column-oriented, 'columns')
# If date_format is not given, datetime64 values are written as serial numbers (64-bit integers)
# date_unit specifies the time precision
df.to_json(file_json2, orient = 'columns', date_format='iso', date_unit='us')
print(os.path.getsize(file_json2))

7443726829
CPU times: user 1min 6s, sys: 10.5 s, total: 1min 16s
Wall time: 1min 16s


In [27]:
%%time
# Save as pickle (uncompressed), then print the resulting file size in bytes
df.to_pickle(file_pickle)
print(os.path.getsize(file_pickle))

3662733520
CPU times: user 56.9 s, sys: 5.59 s, total: 1min 2s
Wall time: 1min 1s


In [28]:
%%time
# Save as pickle (gzip-compressed; no compression argument is passed, the target name ends in .gz),
# then print the resulting file size in bytes
df.to_pickle(file_pickle2)
print(os.path.getsize(file_pickle2))

940723316
CPU times: user 6min 20s, sys: 4.47 s, total: 6min 24s
Wall time: 6min 23s


In [29]:
%%time
# Save as parquet (default: snappy compression), then print the resulting file size in bytes
df.to_parquet(file_parquet)
print(os.path.getsize(file_parquet))

1366590723
CPU times: user 37.6 s, sys: 3.66 s, total: 41.3 s
Wall time: 39.1 s


In [30]:
%%time
# Save as parquet (zstd compression), then print the resulting file size in bytes
df.to_parquet(file_parquet2, compression='zstd')
print(os.path.getsize(file_parquet2))

1055515150
CPU times: user 40.6 s, sys: 4.29 s, total: 44.9 s
Wall time: 41.9 s


In [31]:
%%time
# Save as HDF (using pandas.DataFrame.to_hdf)
# The main columns are indexed (data_columns) so that they can be used for searching later
# With format = 'fixed', an error occurs when the number of records is large
# key is passed by keyword: newer pandas deprecates positional arguments after the path
df.to_hdf(file_hdf, key = 'test', mode = 'w', format = 'table', data_columns = columns_index)
print(os.path.getsize(file_hdf))

10573301207
CPU times: user 5min 23s, sys: 48.3 s, total: 6min 11s
Wall time: 6min 10s


In [32]:
%%time
# Save as HDF (using pandas.HDFStore) -> produces the same file as to_hdf
# The main columns are indexed (data_columns) so that they can be used for searching later
# The store is opened with mode = 'w' so that re-running the cell recreates the file
# instead of appending the same rows to an existing one, and inside a context manager
# so that it is closed even if append() raises.
with pd.HDFStore(file_hdf2, mode = 'w') as store:
    store.append('test', df, data_columns = columns_index)
print(os.path.getsize(file_hdf2))

10573301207
CPU times: user 5min 24s, sys: 47 s, total: 6min 11s
Wall time: 6min 11s


In [33]:
%%time
# Save to SQLite3
# Types are reduced to real, integer, text and timestamp
# How the indexes should be defined still needs consideration
# Using a sqlite3 connection as a context manager only commits or rolls back the
# transaction; it does not close the connection, so it is closed explicitly here.
conn = sqlite3.connect(file_sqlite)
try:
    with conn:  # commits on success, rolls back on error
        df.to_sql('elset', conn, if_exists='replace', index=False)
        conn.execute('CREATE UNIQUE INDEX index_elset_gp_id ON elset (GP_ID)')
        conn.execute('CREATE INDEX index_elset_epoch ON elset (EPOCH)')
        conn.execute('CREATE INDEX index_elset_norad_cat_id ON elset (NORAD_CAT_ID)')
finally:
    conn.close()
print(os.path.getsize(file_sqlite))

4560252928
CPU times: user 4min 15s, sys: 32.5 s, total: 4min 47s
Wall time: 4min 47s


## 保存したデータを読み込む

In [34]:
%%time
# Load the JSON file (row-oriented, 'records')
# orient can usually be left out; it is detected automatically in most cases
df_tmp = pd.read_json(file_json, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'records')
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 4min, sys: 34.8 s, total: 4min 34s
Wall time: 4min 34s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


In [35]:
%%time
# Load the JSON file (column-oriented, 'columns')
# orient can usually be left out; it is detected automatically in most cases
df_tmp = pd.read_json(file_json2, convert_dates = convert_dates, dtype = dtype, precise_float = True, orient = 'columns')
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 7min 36s, sys: 23 s, total: 7min 59s
Wall time: 7min 57s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


In [36]:
%%time
# Load the pickle file (uncompressed), print the row count and show the first row
df_tmp = pd.read_pickle(file_pickle)
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 14.5 s, sys: 1.15 s, total: 15.7 s
Wall time: 15.6 s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


In [37]:
%%time
# Load the pickle file (gzip-compressed), print the row count and show the first row
df_tmp = pd.read_pickle(file_pickle2)
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 26.9 s, sys: 1.23 s, total: 28.1 s
Wall time: 28.1 s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


In [38]:
%%time
# Load the parquet file (default: snappy compression), print the row count and show the first row
df_tmp = pd.read_parquet(file_parquet)
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 24.1 s, sys: 8.86 s, total: 33 s
Wall time: 12.9 s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


In [39]:
%%time
# Load the parquet file (zstd compression), print the row count and show the first row
df_tmp = pd.read_parquet(file_parquet2)
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 26.4 s, sys: 11.5 s, total: 37.9 s
Wall time: 13.4 s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


In [40]:
%%time
# Load the whole 'test' table from the HDF file, print the row count and show the first row
df_tmp = pd.read_hdf(file_hdf, 'test')
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 2min 8s, sys: 15.1 s, total: 2min 23s
Wall time: 2min 22s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


In [41]:
%%time
# Load the HDF file (using pandas.HDFStore)
# The store is opened read-only, inside a context manager, so that it is closed
# even if get() raises.
with pd.HDFStore(file_hdf2, mode = 'r') as store:
    df_tmp = store.get('test')
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 2min 2s, sys: 16.3 s, total: 2min 18s
Wall time: 2min 18s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


In [42]:
%%time
# Load from SQLite3
# Types become: real => float64, integer => int64, text => object(str), timestamp => datetime64[ns]
# Using a sqlite3 connection as a context manager does not close it, so the
# connection is closed explicitly once the query has been read.
conn = sqlite3.connect(file_sqlite)
try:
    df_tmp = pd.read_sql_query("SELECT * FROM elset", conn, parse_dates = convert_dates)
finally:
    conn.close()
print(len(df_tmp))
df_tmp.head(1)

7890662
CPU times: user 2min 43s, sys: 17.4 s, total: 3min 1s
Wall time: 3min 1s


Unnamed: 0,CCSDS_OMM_VERS,COMMENT,CREATION_DATE,ORIGINATOR,OBJECT_NAME,OBJECT_ID,CENTER_NAME,REF_FRAME,TIME_SYSTEM,MEAN_ELEMENT_THEORY,EPOCH,MEAN_MOTION,ECCENTRICITY,INCLINATION,RA_OF_ASC_NODE,ARG_OF_PERICENTER,MEAN_ANOMALY,EPHEMERIS_TYPE,CLASSIFICATION_TYPE,NORAD_CAT_ID,ELEMENT_SET_NO,REV_AT_EPOCH,BSTAR,MEAN_MOTION_DOT,MEAN_MOTION_DDOT,SEMIMAJOR_AXIS,PERIOD,APOAPSIS,PERIAPSIS,OBJECT_TYPE,RCS_SIZE,COUNTRY_CODE,LAUNCH_DATE,SITE,DECAY_DATE,FILE,GP_ID,TLE_LINE0,TLE_LINE1,TLE_LINE2
0,2.0,GENERATED VIA SPACE-TRACK.ORG API,2019-01-01 22:34:06,18 SPCS,VANGUARD 1,1958-002B,EARTH,TEME,UTC,SGP4,2019-01-01 12:03:23.466816,10.847953,0.184661,34.2477,69.8173,114.3456,266.0586,0,U,5,999,14737,-0.000178,-2e-06,0.0,8619.921,132.743,3833.553,650.019,PAYLOAD,MEDIUM,US,1958-03-17,AFETR,NaT,2387660,125692873,0 VANGUARD 1,1 5U 58002B 19001.50235494 -.00000155 +00000-0 -17793-3 0 9996,2 5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376


## 一部のデータのみを読んでみる

In [43]:
%%time
# Load only three columns from the parquet file (default: snappy compression)
df_tmp = pd.read_parquet(file_parquet, columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(3)

7890662
CPU times: user 723 ms, sys: 575 ms, total: 1.3 s
Wall time: 641 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,2019-01-01 12:03:23.466816,650.019,5
1,2019-01-01 03:37:20.673408,553.119,11
2,2019-01-01 13:43:43.277952,553.125,11


In [44]:
%%time
# Load only three columns from the parquet file (zstd compression)
# Fast even though the file is compressed. Excellent
df_tmp = pd.read_parquet(file_parquet2,  columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(3)

7890662
CPU times: user 838 ms, sys: 364 ms, total: 1.2 s
Wall time: 570 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,2019-01-01 12:03:23.466816,650.019,5
1,2019-01-01 03:37:20.673408,553.119,11
2,2019-01-01 13:43:43.277952,553.125,11


In [45]:
%%time
# Read only some of the columns from the HDF file
# This takes as long as reading the whole table
df_tmp = pd.read_hdf(file_hdf, 'test', columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(3)

7890662
CPU times: user 2min 4s, sys: 18.2 s, total: 2min 22s
Wall time: 2min 22s


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,2019-01-01 12:03:23.466816,650.019,5
1,2019-01-01 03:37:20.673408,553.119,11
2,2019-01-01 13:43:43.277952,553.125,11


In [46]:
%%time
# Read only some of the rows from the HDF file (row range selected with start / stop)
df_tmp = pd.read_hdf(file_hdf, 'test', start = 10000, stop = 19999, columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(3)

9999
CPU times: user 167 ms, sys: 6.02 ms, total: 173 ms
Wall time: 169 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
10000,2019-01-01 08:25:06.324096,1081.751,21542
10001,2019-01-01 21:36:07.733664,1081.751,21542
10002,2019-01-01 23:40:27.191712,832.929,21543


In [47]:
%%time
# Read only the rows matching a condition from the HDF file
# Only indexed columns can be used in the condition (the data_columns option of to_hdf)
df_tmp = pd.read_hdf(file_hdf, 'test', where = 'PERIAPSIS<200', columns = ['EPOCH', 'PERIAPSIS', 'NORAD_CAT_ID'])
print(len(df_tmp))
df_tmp.head(5)

43286
CPU times: user 1.88 s, sys: 1.78 s, total: 3.66 s
Wall time: 3.68 s


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
1617,2019-01-01 03:30:44.744544,169.959,4486
1618,2019-01-01 13:04:31.215072,169.932,4486
1619,2019-01-01 14:59:14.902656,169.936,4486
3308,2019-01-01 13:36:29.787552,194.652,7865
3309,2019-01-01 18:01:54.419520,183.236,7865


In [48]:
%%time
# Structure of the HDF file saved with pandas.DataFrame.to_hdf
def hdfprint(name, obj):
    """Callback for ``visititems``: print the name and the object of each HDF5 dataset.

    Objects that are not ``h5py.Dataset`` instances are ignored. The function
    always returns ``None``.
    """
    if not isinstance(obj, h5py.Dataset):
        return
    print(name, '\t', obj)

with h5py.File(file_hdf, 'r') as f:
    f.visititems(hdfprint)

test/_i_table/APOAPSIS/abounds 	 <HDF5 dataset "abounds": shape (3712,), type "<f8">
test/_i_table/APOAPSIS/bounds 	 <HDF5 dataset "bounds": shape (16, 231), type "<f8">
test/_i_table/APOAPSIS/indices 	 <HDF5 dataset "indices": shape (16, 475136), type "<u4">
test/_i_table/APOAPSIS/indicesLR 	 <HDF5 dataset "indicesLR": shape (475136,), type "<u4">
test/_i_table/APOAPSIS/mbounds 	 <HDF5 dataset "mbounds": shape (3712,), type "<f8">
test/_i_table/APOAPSIS/mranges 	 <HDF5 dataset "mranges": shape (16,), type "<f8">
test/_i_table/APOAPSIS/ranges 	 <HDF5 dataset "ranges": shape (16, 2), type "<f8">
test/_i_table/APOAPSIS/sorted 	 <HDF5 dataset "sorted": shape (16, 475136), type "<f8">
test/_i_table/APOAPSIS/sortedLR 	 <HDF5 dataset "sortedLR": shape (475369,), type "<f8">
test/_i_table/APOAPSIS/zbounds 	 <HDF5 dataset "zbounds": shape (3712,), type "<f8">
test/_i_table/ARG_OF_PERICENTER/abounds 	 <HDF5 dataset "abounds": shape (3712,), type "<f8">
test/_i_table/ARG_OF_PERICENTER/bounds 	 <

In [49]:
%%time
# Simply reading the data is fast
# The 'test/table' dataset is read in full with h5py; its type and length are printed
with h5py.File(file_hdf, 'r') as f:
    data = f['test/table'][()]
    print(type(data))
    print(len(data))

<class 'numpy.ndarray'>
7890662
CPU times: user 12.9 s, sys: 7.79 s, total: 20.7 s
Wall time: 20.4 s


In [50]:
%%time
# Try reading the first row
# It comes back as a list grouped by type, so it cannot be used as it is
with h5py.File(file_hdf, 'r') as f:
    data = f['test/table'][0]
    print(type(data))
    print(len(data))
    print(data)

<class 'numpy.void'>
24
(0, [b'2.0', b'EARTH', b'U', b'GENERATED VIA SPACE-TRACK.ORG API', b'US', b'SGP4', b'VANGUARD 1', b'PAYLOAD', b'18 SPCS', b'MEDIUM', b'TEME', b'AFETR', b'UTC', b'0 VANGUARD 1', b'1     5U 58002B   19001.50235494 -.00000155 +00000-0 -17793-3 0  9996', b'2     5 034.2477 069.8173 1846614 114.3456 266.0586 10.84795302147376'], [-9223372036854775808,  -372211200000000000], [999], [0], [2387660], [ 0.00e+00, -1.55e-06], 1546382046000000000, 1546344203466816000, b'1958-002B', 10.84795302, 0.1846614, 34.2477, 69.8173, 114.3456, 266.0586, 5, 14737, -0.00017793, 8619.921, 132.743, 3833.553, 650.019, 125692873)
CPU times: user 4.39 ms, sys: 870 ms, total: 874 ms
Wall time: 868 ms


In [51]:
%%time
# Load from SQLite3 (the condition uses NORAD_CAT_ID)
# Types become: real => float64, integer => int64, text => object(str), timestamp => datetime64[ns]
# Using a sqlite3 connection as a context manager does not close it, so the
# connection is closed explicitly once the query has been read.
conn = sqlite3.connect(file_sqlite)
try:
    df_tmp = pd.read_sql_query('''SELECT EPOCH, PERIAPSIS, NORAD_CAT_ID 
        FROM elset WHERE NORAD_CAT_ID BETWEEN 80000 AND 89000''', conn, parse_dates = convert_dates)
finally:
    conn.close()
print(len(df_tmp))
df_tmp.head(3)

85401
CPU times: user 256 ms, sys: 62.1 ms, total: 318 ms
Wall time: 318 ms


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,2019-01-16 20:14:05.993952,314.565,81014
1,2019-01-23 12:40:22.884960,313.78,81014
2,2019-01-28 19:57:04.265568,313.846,81014


In [52]:
%%time
# SQLite3から読み込む (indexがついていないと遅い例)
# 型は、real => float64, integer => int64, text => object(str), timestamp => datetime64[ns] となる
with sqlite3.connect(file_sqlite) as conn:
    df_tmp = pd.read_sql_query('''SELECT EPOCH, PERIAPSIS, NORAD_CAT_ID 
                               FROM elset WHERE PERIAPSIS < 200''', conn, parse_dates = convert_dates)
print(len(df_tmp))
df_tmp.head(3)

43286
CPU times: user 4.53 s, sys: 3.94 s, total: 8.48 s
Wall time: 8.47 s


Unnamed: 0,EPOCH,PERIAPSIS,NORAD_CAT_ID
0,2019-01-01 03:30:44.744544,169.959,4486
1,2019-01-01 13:04:31.215072,169.932,4486
2,2019-01-01 14:59:14.902656,169.936,4486
