# Preprocessing Validation

In [1]:
import utils.constants as c
import os
import pandas as pd
import utils.io as io

from tqdm import tqdm

Recombine data splits

In [2]:
# Resolve the dataset root, then the directory of each split.
data_dir = io.resolve_path(c.DATA_DIR)
train_dir, eval_dir, test_dir = (
    data_dir / subdir for subdir in (c.TRAIN_SUBDIR, c.EVAL_SUBDIR, c.TEST_SUBDIR)
)

# One frame per split: a row per directory entry, tagged with the split's sub-directory name.
train_orbits, eval_orbits, test_orbits = (
    pd.DataFrame({"dir": subdir, "file": os.listdir(split_dir)})
    for subdir, split_dir in (
        (c.TRAIN_SUBDIR, train_dir),
        (c.EVAL_SUBDIR, eval_dir),
        (c.TEST_SUBDIR, test_dir),
    )
)

# Stack the splits and order by file name; each row keeps its per-split index.
all_orbits = pd.concat([train_orbits, eval_orbits, test_orbits]).sort_values(by="file")
all_orbits.head()

Unnamed: 0,dir,file
395,eval,messenger-0002.csv
596,eval,messenger-0003.csv
612,train,messenger-0004.csv
179,train,messenger-0005.csv
2084,train,messenger-0006.csv


In [3]:
# Read every file listed in `all_orbits` (in that frame's row order) and stack
# the results into one frame indexed by the parsed `c.DATE_COL` column.
# `iterrows()` is a generator without a length, so `total` is passed explicitly
# to let tqdm show a real progress bar and ETA instead of a bare counter.
df_total = pd.concat(
    pd.read_csv(
        data_dir / row["dir"] / row["file"],
        index_col=c.DATE_COL,
        parse_dates=True,
    )
    for _, row in tqdm(all_orbits.iterrows(), total=len(all_orbits))
)

3153it [09:02,  5.81it/s]


Summarize basic information

In [4]:
# Print the index range, column names and dtypes of the recombined frame.
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 99126933 entries, 2011-03-24 07:42:00 to 2015-04-30 06:45:32
Data columns (total 30 columns):
 #   Column        Dtype  
---  ------        -----  
 0   X_MSO         float64
 1   Y_MSO         float64
 2   Z_MSO         float64
 3   BX_MSO        float64
 4   BY_MSO        float64
 5   BZ_MSO        float64
 6   DBX_MSO       float64
 7   DBY_MSO       float64
 8   DBZ_MSO       float64
 9   RHO_DIPOLE    float64
 10  PHI_DIPOLE    float64
 11  THETA_DIPOLE  float64
 12  BABS_DIPOLE   float64
 13  BX_DIPOLE     float64
 14  BY_DIPOLE     float64
 15  BZ_DIPOLE     float64
 16  RHO           float64
 17  RXY           float64
 18  X             float64
 19  Y             float64
 20  Z             float64
 21  VX            float64
 22  VY            float64
 23  VZ            float64
 24  VABS          float64
 25  D             float64
 26  COSALPHA      float64
 27  EXTREMA       int64  
 28  ORBIT         int64  
 29  LABEL      

In [5]:
# Distinct orbit identifiers found in the recombined frame.
orbit_column = df_total[c.ORBIT_COL]
orbit_ids = orbit_column.unique()
print(f"#orbits: {len(orbit_ids)}")

#orbits: 3153


At first glance, there appears to be a high deviation in orbit length.

In [6]:
# Number of rows recorded for each orbit, indexed by orbit id.
rows_by_orbit = df_total.groupby(c.ORBIT_COL)
sizes = rows_by_orbit.size()
sizes.describe()

count     3153.000000
mean     31438.925785
std       5314.769120
min      27937.000000
25%      28806.000000
50%      28809.000000
75%      29572.000000
max      43466.000000
dtype: float64

The reason for this is that MESSENGER changed from a 12-hour orbit to an 8-hour orbit in April 2012.

In [7]:
# The orbit whose row count differs most from its predecessor marks the change.
size_jumps = sizes.diff().abs()
drop_idx = size_jumps.idxmax()

# Timestamp of the first row belonging to that orbit, formatted as a date.
first_row_time = df_total.index[df_total[c.ORBIT_COL] == drop_idx][0]
drop_day = first_row_time.strftime("%Y-%m-%d")
print(f"orbit lengths dropped starting with orbit #{drop_idx} on {drop_day}")

# Show the orbits around the change (label-based slice, both ends inclusive).
sizes.loc[drop_idx - 5 : drop_idx + 5]

orbit lengths dropped starting with orbit #790 on 2012-04-16


ORBIT
785    41778
786    41777
787    41778
788    41777
790    32687
791    32685
792    32686
793    32686
794    32686
795    32686
dtype: int64

When the orbits before and after this change are summarized separately, the large deviation seen initially disappears.

In [8]:
# Orbits strictly before `drop_idx`; `.loc` slices include the end label, hence the -1.
sizes_before_drop = sizes.loc[: drop_idx - 1]
sizes_before_drop.describe()

count      566.000000
mean     42762.409894
std        558.423890
min      41777.000000
25%      42432.000000
50%      42486.000000
75%      43204.000000
max      43466.000000
dtype: float64

In [9]:
# Orbits from `drop_idx` onward (the start label is included).
sizes_from_drop = sizes.loc[drop_idx:]
sizes_from_drop.describe()

count     2587.000000
mean     28961.503286
std        396.403636
min      27937.000000
25%      28805.000000
50%      28808.000000
75%      28813.000000
max      32687.000000
dtype: float64

Verify that no NaN values exist.

In [10]:
# Count missing entries per column, then add the column counts into one total.
nan_per_column = df_total.isna().sum()
nan_count = nan_per_column.sum()
print(f"There are {nan_count} NaN values.")

There are 0 NaN values.
