# Training Data Generation

In [1]:
import os
import pandas as pd

from tqdm import tqdm
from constants import *

Define main procedures

In [2]:
def prepare_orbit(orbit_id):
    """Load the specified orbit and preprocess it for merging."""

    file = os.path.join(ORBIT_DIR, ORBIT_FILE(orbit_id))
    df_stats.loc[orbit_id, "does_not_exist"] = not os.path.exists(file)

    if not df_stats.loc[orbit_id, "does_not_exist"]:
        df_orbit = pd.read_csv(file, index_col=DATE_COL, parse_dates=True)

        # determine validity criteria for orbit
        df_stats.loc[orbit_id, "total_nan_count"] = (
                df_orbit.isnull().sum().sum()
        )
        df_stats.loc[orbit_id, "missing_entry_count"] = (
                (df_orbit.index[-1] - df_orbit.index[0]).total_seconds() + 1 - len(df_orbit)
        )
        df_stats.loc[orbit_id, "has_outside_labels"] = (
                df_labels.loc[orbit_id, "SK outer in"] < df_orbit.index[0]
                or df_orbit.index[-1] < df_labels.loc[orbit_id, "SK outer out"]
        )
        df_stats.loc[orbit_id, "flux_norm_maximum"] = (
                df_orbit[FLUX_COLS].pow(2).sum(axis=1).pow(0.5).max()
        )
        
        if df_stats.loc[orbit_id].iloc[0:-1].sum() == 0:
            df_orbit[ORBIT_COL] = orbit_id  # add orbit id
            return df_orbit
        else:
            return None  # rule out invalid orbit

In [3]:
def assign_labels(df_train, df_labels):
    """Assign labels to the training time series using event boundary labels."""

    # interplanetary magnetic field
    df_train[LABEL_COL] = 0

    for row in tqdm(range(len(df_labels))):
        event = lambda col: df_labels.iloc[row][col]

        # bow shock crossings
        df_train.loc[event("SK outer in"):event("SK inner in"), LABEL_COL] = 1
        df_train.loc[event("SK inner out"):event("SK outer out"), LABEL_COL] = 1

        # magnetosheath
        df_train.loc[event("SK inner in"):event("MP outer in"), LABEL_COL] = 2
        df_train.loc[event("MP outer out"):event("SK inner out"), LABEL_COL] = 2

        # magnetopause crossings
        df_train.loc[event("MP outer in"):event("MP inner in"), LABEL_COL] = 3
        df_train.loc[event("MP inner out"):event("MP outer out"), LABEL_COL] = 3

        # magnetosphere
        df_train.loc[event("MP inner in"):event("MP inner out"), LABEL_COL] = 4

Load label descriptor file

In [4]:
df_labels = pd.read_csv(os.path.join(LABEL_DIR, LABEL_FILE),
                        index_col=ORBIT_COL,
                        parse_dates=EVENT_COLS)
df_labels = df_labels.dropna()  # disregard incompletely labeled orbits
print(f"#healthy orbits: {len(df_labels)}")

#healthy orbits: 4019


Prepare orbit validity statistics

In [5]:
df_stats = pd.DataFrame(
    index=df_labels.index,
    columns=["does_not_exist",
             "total_nan_count",
             "missing_entry_count",
             "has_outside_labels",
             "flux_norm_maximum"]
)

Combine orbit data into one big frame

In [6]:
df_train = pd.concat(map(prepare_orbit, tqdm(df_labels.index)))
df_train.iloc[43462:43467]

100%|██████████| 4019/4019 [20:22<00:00,  3.29it/s]


Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Y,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-03-24 19:46:22,6897.941,5403.502,-15374.343,29.343,-2.064,-9.552,0.299,1.637,1.395,18118.125361,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049779,55.749454,49155130.0,0.670984,0,2
2011-03-24 19:46:23,6898.235,5403.903,-15374.07,29.54,-1.713,-8.633,0.809,1.914,2.789,18118.117945,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049796,55.749455,49155130.0,0.670984,2,2
2011-03-24 19:46:23,6898.235,5403.903,-15374.07,29.54,-1.713,-8.633,0.809,1.914,2.789,18118.117945,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049796,55.749455,49155130.0,0.670984,2,3
2011-03-24 19:46:24,6898.53,5404.304,-15373.797,29.849,-0.977,-10.833,1.065,4.469,3.514,18118.110927,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049746,55.749453,49155130.0,0.670984,0,3
2011-03-24 19:46:25,6898.825,5404.705,-15373.523,29.333,-2.752,-12.014,1.25,5.516,5.414,18118.103052,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049724,55.749452,49155130.0,0.670984,0,3


Remove extreme outliers

In [24]:
outlier_orbits = df_stats.index[df_stats["flux_norm_maximum"] > df_stats["flux_norm_maximum"].mean() + 3 * df_stats["flux_norm_maximum"].std()]
print(outlier_orbits)

Int64Index([386, 1375, 1615, 1729, 1899, 1946, 3713, 3714, 3718, 4068], dtype='int64', name='ORBIT')


In [25]:
df_train = df_train[~df_train[ORBIT_COL].isin(outlier_orbits)]

Save orbit validity statistics

In [27]:
df_stats.to_excel(os.path.join(MERGED_DIR, "orbit_statistics.xlsx"))

Assign labels to the instances of the training data

In [None]:
assign_labels(df_train, df_labels)
df_train.iloc[16605:16610]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[LABEL_COL] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
 60%|█████▉    | 2400/4019 [2:28:22<1:37:45,  3.62s/it]

Save the final training dataframe

In [None]:
df_train.to_csv(os.path.join(MERGED_DIR, TRAIN_FILE))

Generate descriptive overall statistics

In [None]:
df_train_description = df_train.describe()
df_train_description.to_excel(os.path.join(MERGED_DIR, "df_train_total_description.xlsx"))
df_train_description

Generate descriptive label-wise statistics

In [None]:
df_train_description = df_train.groupby([LABEL_COL]).describe()
df_train_description.to_excel(os.path.join(MERGED_DIR, "df_train_label_description.xlsx"))
df_train_description