In [1]:
import pandas as pd
import datetime
import numpy as np

# Local, relative locations of the input pickle and of the output directory.
input_file_path = '../../../siads591 data/filtered_raw/gabbard.pkl.gz'
out_path = '../../../siads591 data/processed_data/gabbard/'

# Resampling preset (keep exactly one preset active):
#   freq            - spacing of the data points after reindexing
#   interpolate_lim - interpolation limit, counted in `freq` steps per direction
#                     (e.g. a 1D frequency with a limit of 30 = 30 days one way,
#                     so effectively at most 60 days between points)
#   out_file_name   - name of the output file
freq = '1D'
interpolate_lim = 45
out_file_name = 'gabbard_1D.pkl'
# freq, interpolate_lim, out_file_name = '12H', 90, 'gabbard_12H.pkl'
# freq, interpolate_lim, out_file_name = '5D', 12, 'gabbard_5D.pkl'

In [2]:
# Load the gzip-compressed pickle that holds the raw data.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
raw_df = pd.read_pickle(input_file_path, compression="gzip")

In [3]:
# Preview the first rows of the raw data.
display(raw_df.head(4))
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
raw_df.info()

Unnamed: 0_level_0,NORAD_CAT_ID,PERIOD,APOAPSIS,PERIAPSIS
EPOCH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-04-27 14:18:48.216960,18549,112.038002,1863.027954,777.705017
2004-04-27 15:59:40.727904,18727,113.903999,1474.209961,1336.97998
2004-04-27 19:45:13.686048,19027,103.922997,1006.145996,881.767029
2004-04-27 15:43:11.393472,19128,104.991997,1147.142944,841.101013


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 87875444 entries, 2004-04-27 14:18:48.216960 to 2021-01-25 23:08:08.404224
Data columns (total 4 columns):
 #   Column        Dtype  
---  ------        -----  
 0   NORAD_CAT_ID  uint32 
 1   PERIOD        float32
 2   APOAPSIS      float32
 3   PERIAPSIS     float32
dtypes: float32(3), uint32(1)
memory usage: 2.0 GB


None

In [None]:
# Build, for every satellite, a regular grid of EPOCH timestamps spaced `freq`
# apart and fill the remaining columns on that grid by interpolating between
# the raw observations.

# first and last observed EPOCH of each satellite
# (named aggregation yields the flat column names EPOCH_min / EPOCH_max directly)
reindexed_df = (
    raw_df.reset_index()
    .groupby(by="NORAD_CAT_ID")
    .agg(EPOCH_min=('EPOCH', 'min'), EPOCH_max=('EPOCH', 'max'))
    .reset_index()
)

# constant join key: merging on it pairs every satellite with every grid EPOCH
reindexed_df['key'] = 0

# generate candidate EPOCH entries for each satellite.
# floor() is monotonic, so flooring the minimum equals the minimum of the
# floored index, without flooring every index entry first.
reindexed_df = reindexed_df.merge(
    pd.DataFrame({
        'EPOCH': pd.date_range(
            start=raw_df.index.min().floor(freq),
            end=raw_df.index.max(),
            freq=freq,
        ),
        'key': 0,
    }),
    on='key',
).drop(columns='key')

# keep only grid points strictly between a satellite's first and last observation;
# after the sort below every generated row therefore sits between two raw rows
# of the same satellite
reindexed_df = reindexed_df[(reindexed_df.EPOCH_min < reindexed_df.EPOCH) & (reindexed_df.EPOCH_max > reindexed_df.EPOCH)][['NORAD_CAT_ID','EPOCH']]

# save this value for later use: with ignore_index=True below, the generated
# rows receive the index labels 0 .. num_rows - 1
num_rows = len(reindexed_df)

# combine the generated table with the raw data (generated rows first)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
reindexed_df = pd.concat(
    [reindexed_df, raw_df.reset_index()],
    ignore_index=True,
).sort_values(by=['NORAD_CAT_ID','EPOCH'])

# interpolate values for the generated EPOCH rows; `limit` caps the number of
# consecutive missing rows that are filled from each side
# NOTE(review): method='linear' treats rows as equally spaced, i.e. it interpolates
# by row position rather than by the actual time between EPOCH values - confirm
# this approximation is acceptable for irregularly spaced raw observations.
reindexed_df = reindexed_df.interpolate(method='linear', limit_area="inside", limit=interpolate_lim, limit_direction='both')

# remove the raw data (index labels >= num_rows)
reindexed_df = reindexed_df[reindexed_df.index < num_rows]

# set index
reindexed_df = reindexed_df.set_index('EPOCH')

In [None]:
# Preview the first rows of the reindexed data.
display(reindexed_df.head(4))
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
reindexed_df.info()

In [None]:
# Persist the reindexed frame as a pickle at out_path + out_file_name.
reindexed_df.to_pickle(out_path + out_file_name)

In [None]:
# Count how many rows of reindexed_df carry each EPOCH index value.
idx_vals = reindexed_df.index.value_counts()

In [None]:
# Plot the values held in idx_vals; the title lets the figure stand on its own
# and the trailing semicolon keeps the Axes repr out of the cell output.
idx_vals.plot(title='idx_vals');