# Training Data Validation

In [1]:
import os
import pandas as pd
import utils.constants as c
import utils.io as io

from tqdm import tqdm

Recombine data splits

In [2]:
data_dir = io.resolve_path(c.DATA_DIR)
train_dir = data_dir / c.TRAIN_SUBDIR
eval_dir = data_dir / c.EVAL_SUBDIR
test_dir = data_dir / c.TEST_SUBDIR
train_orbits = pd.DataFrame({"dir": c.TRAIN_SUBDIR, "file": os.listdir(train_dir)})
eval_orbits = pd.DataFrame({"dir": c.EVAL_SUBDIR, "file": os.listdir(eval_dir)})
test_orbits = pd.DataFrame({"dir": c.TEST_SUBDIR, "file": os.listdir(test_dir)})
all_orbits = pd.concat([train_orbits, eval_orbits, test_orbits])
all_orbits.sort_values(by="file", inplace=True)
all_orbits.head()

Unnamed: 0,dir,file
395,eval,messenger-0002.csv
596,eval,messenger-0003.csv
612,train,messenger-0004.csv
179,train,messenger-0005.csv
2084,train,messenger-0006.csv


In [None]:
df_total = pd.concat(pd.read_csv(
    data_dir / row["dir"] / row["file"],
    index_col=c.DATE_COL,
    parse_dates=True
) for _, row in tqdm(all_orbits.iterrows()))

680it [02:24,  6.52it/s]

Summarize basic information

In [None]:
df_total.info()

In [None]:
orbit_ids = df_total[c.ORBIT_COL].unique()
print(f"#orbits: {len(orbit_ids)}")

At first glance, there appears to be a high deviation in orbit length.

In [None]:
sizes = df_total.groupby(c.ORBIT_COL).size()
sizes.describe()

The reason for this is that MESSENGER changed from a 12-hour orbit to 8-hour orbit in April 2012.

In [None]:
drop_idx = sizes.diff().abs().idxmax()
drop_day = df_total.index[df_total[c.ORBIT_COL] == drop_idx][0].strftime("%Y-%m-%d")
print(f"orbit lengths dropped starting with orbit #{drop_idx} on {drop_day}")
sizes.loc[drop_idx-5:drop_idx+5]

When taking this into account, the initial deviation disappears.

In [None]:
sizes.loc[:drop_idx-1].describe()

In [None]:
sizes.loc[drop_idx:].describe()

Verify that no NaN values exist.

In [None]:
nan_count = df_total.isnull().sum().sum()
print(f"There are {nan_count} NaN values.")