# Training Data Generation

In [1]:
import os
import pandas as pd

from tqdm import tqdm
from constants import *

Define main procedures

In [2]:
def prepare_labels():
    """Load the label descriptor file and preprocess it for merging."""

    df_labels = pd.read_csv(os.path.join(LABEL_DIR, LABEL_FILE),
                            index_col=ORBIT_COL,
                            parse_dates=EVENT_COLS)

    # disregard incompletely labeled orbits
    df_labels = df_labels.dropna()

    # add columns for predecessor and successor events
    df_labels["Prev out"] = df_labels["SK outer out"].shift()
    df_labels["Next in"] = df_labels["SK outer in"].shift(-1)

    return df_labels

In [3]:
def prepare_orbit(orbit_id):
    """Load the specified orbit and preprocess it for merging."""

    file = os.path.join(ORBIT_DIR, ORBIT_FILE(orbit_id))
    df_stats.loc[orbit_id, "does_not_exist"] = not os.path.exists(file)

    if not df_stats.loc[orbit_id, "does_not_exist"]:
        df_orbit = pd.read_csv(file, index_col=DATE_COL, parse_dates=True)

        # determine validity criteria for orbit
        df_stats.loc[orbit_id, "total_nan_count"] = (
                df_orbit.isnull().sum().sum()
        )
        df_stats.loc[orbit_id, "missing_entry_count"] = (
                (df_orbit.index[-1] - df_orbit.index[0]).total_seconds() + 1 - len(df_orbit)
        )
        df_stats.loc[orbit_id, "has_special_conditions"] = (
                df_labels.loc[orbit_id, "SK outer in"] <= df_orbit.index[0]
                or df_orbit.index[0] <= df_labels.loc[orbit_id, "Prev out"]
                or df_labels.loc[orbit_id, "Next in"] <= df_orbit.index[-1]
                or df_orbit.index[-1] <= df_labels.loc[orbit_id, "SK outer out"]
        )
        df_stats.loc[orbit_id, "flux_norm_maximum"] = (
                df_orbit[FLUX_COLS].pow(2).sum(axis=1).pow(0.5).max()
        )
        
        if df_stats.loc[orbit_id].iloc[0:-1].sum() == 0:
            df_orbit[ORBIT_COL] = orbit_id  # add orbit id
            return df_orbit
        else:
            return None  # rule out invalid orbit

In [4]:
def assign_labels(df_train, df_labels):
    """Assign labels to the training time series using event boundary labels."""

    # interplanetary magnetic field
    df_train[LABEL_COL] = 0

    for row in tqdm(range(len(df_labels))):
        event = lambda col: df_labels.iloc[row, df_labels.columns.get_loc(col)]

        # bow shock crossings
        df_train.loc[event("SK outer in"):event("SK inner in"), LABEL_COL] = 1
        df_train.loc[event("SK inner out"):event("SK outer out"), LABEL_COL] = 1

        # magnetosheath
        df_train.loc[event("SK inner in"):event("MP outer in"), LABEL_COL] = 2
        df_train.loc[event("MP outer out"):event("SK inner out"), LABEL_COL] = 2

        # magnetopause crossings
        df_train.loc[event("MP outer in"):event("MP inner in"), LABEL_COL] = 3
        df_train.loc[event("MP inner out"):event("MP outer out"), LABEL_COL] = 3

        # magnetosphere
        df_train.loc[event("MP inner in"):event("MP inner out"), LABEL_COL] = 4

Prepare label descriptor file

In [5]:
df_labels = prepare_labels()
print(f"#healthy orbits: {len(df_labels)}")

#healthy orbits: 4019


Set up orbit validity statistics

In [6]:
df_stats = pd.DataFrame(
    index=df_labels.index,
    columns=["does_not_exist",
             "total_nan_count",
             "missing_entry_count",
             "has_special_conditions",
             "flux_norm_maximum"]
)

Combine orbit data into one big frame

In [7]:
df_train = pd.concat(map(prepare_orbit, tqdm(df_labels.index)))
df_train.iloc[43462:43467]

100%|██████████| 4019/4019 [19:36<00:00,  3.42it/s]


Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Y,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-03-24 19:46:22,6897.941,5403.502,-15374.343,29.343,-2.064,-9.552,0.299,1.637,1.395,18118.125361,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049779,55.749454,49155130.0,0.670984,0,2
2011-03-24 19:46:23,6898.235,5403.903,-15374.07,29.54,-1.713,-8.633,0.809,1.914,2.789,18118.117945,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049796,55.749455,49155130.0,0.670984,2,2
2011-03-24 19:46:23,6898.235,5403.903,-15374.07,29.54,-1.713,-8.633,0.809,1.914,2.789,18118.117945,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049796,55.749455,49155130.0,0.670984,2,3
2011-03-24 19:46:24,6898.53,5404.304,-15373.797,29.849,-0.977,-10.833,1.065,4.469,3.514,18118.110927,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049746,55.749453,49155130.0,0.670984,0,3
2011-03-24 19:46:25,6898.825,5404.705,-15373.523,29.333,-2.752,-12.014,1.25,5.516,5.414,18118.103052,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049724,55.749452,49155130.0,0.670984,0,3


Remove extreme outliers

In [8]:
df_max = df_stats["flux_norm_maximum"]
outlier_orbits = df_stats.index[df_max > df_max.mean() + 3 * df_max.std()]
print(list(outlier_orbits))

[386, 1375, 1615, 1729, 1899, 1946, 3713, 3714, 3718, 4068]


In [9]:
df_train = df_train[~df_train[ORBIT_COL].isin(outlier_orbits)]

Save orbit validity statistics

In [10]:
df_stats.to_excel(os.path.join(MERGED_DIR, "orbit_statistics.xlsx"))

Assign labels to the instances of the training data

In [11]:
assign_labels(df_train, df_labels)
df_train.iloc[16605:16610]

100%|██████████| 4019/4019 [2:21:59<00:00,  2.12s/it]  


Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT,LABEL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-03-24 12:18:45,5804.272,7081.276,-2428.74,-44.922,16.665,4.784,4.239,6.375,3.369,9608.230719,...,5855501.0,-48.713393,-27.442143,2.22768,55.955593,48946400.0,0.633532,0,2,0
2011-03-24 12:18:46,5803.633,7080.696,-2427.421,-37.371,8.136,7.607,2.316,3.672,3.443,9607.017439,...,5855501.0,-48.713393,-27.442143,2.227663,55.955593,48946400.0,0.633532,0,2,0
2011-03-24 12:18:47,5802.993,7080.117,-2426.101,-34.104,4.656,14.43,1.949,0.97,1.545,9605.804096,...,5855501.0,-48.713393,-27.442143,2.22766,55.955593,48946400.0,0.633532,0,2,1
2011-03-24 12:18:48,5802.353,7079.538,-2424.782,-37.385,-0.449,23.1,1.797,3.131,4.607,9604.59116,...,5855501.0,-48.713393,-27.442143,2.227669,55.955593,48946400.0,0.633532,0,2,1
2011-03-24 12:18:49,5801.714,7078.958,-2423.462,-31.174,2.786,35.948,3.817,7.081,2.152,9603.377895,...,5855501.0,-48.713393,-27.442143,2.227612,55.955591,48946400.0,0.633532,0,2,1


Save the final training dataframe

In [12]:
df_train.to_csv(os.path.join(MERGED_DIR, TRAIN_FILE))

Generate descriptive overall statistics

In [13]:
df_train_description = df_train.describe()
df_train_description.to_excel(os.path.join(MERGED_DIR, "df_train_total_description.xlsx"))
df_train_description

Unnamed: 0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT,LABEL
count,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,...,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0,99126930.0
mean,133.6537,-173.5532,-7902.264,0.3670526,-0.1850193,-4.418589,1.061097,1.399393,1.398633,10517.57,...,-1468158.0,-0.06708391,3.648269,0.3042887,47.52801,58952990.0,-0.004050004,6.361541e-05,1863.071,0.9588953
std,3663.897,3896.013,5243.407,40.66517,40.02489,56.44536,1.652641,2.090043,2.095408,3981.771,...,4869809.0,34.23781,33.26465,4.137488,7.097806,8470830.0,0.6861735,0.02110413,1171.023,1.468669
min,-10743.48,-11314.78,-16627.32,-395.574,-554.096,-579.198,0.0,0.0,0.0,2015.986,...,-7654416.0,-58.4862,-46.61918,-4.894014,38.85789,45999970.0,-1.0,-2.0,2.0,0.0
25%,-2286.245,-2820.123,-11967.75,-17.583,-11.151,-8.23,0.18,0.216,0.238,7762.218,...,-6257032.0,-32.68717,-27.5676,-3.702871,40.59049,50276540.0,-0.6640699,0.0,813.0,0.0
50%,-87.185,-80.93,-9314.936,-0.419,0.078,0.567,0.514,0.697,0.71,11377.14,...,-2151302.0,9.515335,4.450861,-0.4182666,46.09395,60099970.0,-0.009113506,0.0,1800.0,0.0
75%,2541.941,2522.989,-4028.148,15.535,11.102,9.935,1.286,1.772,1.749,13029.24,...,3342049.0,32.50964,35.94925,4.23545,54.65855,67378280.0,0.6696617,0.0,2800.0,2.0
max,11430.29,11110.89,3154.425,558.988,520.669,395.668,333.282,296.084,286.698,18120.33,...,6239501.0,38.94385,50.89473,7.036499,58.97771,69817820.0,0.9999954,2.0,4093.0,4.0


Generate descriptive label-wise statistics

In [14]:
df_train_description = df_train.groupby([LABEL_COL]).describe()
df_train_description.to_excel(os.path.join(MERGED_DIR, "df_train_label_description.xlsx"))
df_train_description

Unnamed: 0_level_0,X_MSO,X_MSO,X_MSO,X_MSO,X_MSO,X_MSO,X_MSO,X_MSO,Y_MSO,Y_MSO,...,EXTREMA,EXTREMA,ORBIT,ORBIT,ORBIT,ORBIT,ORBIT,ORBIT,ORBIT,ORBIT
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,64868418.0,1328.709071,3400.127286,-9972.731,-1200.15475,1092.349,3729.939,11430.286,64868418.0,-364.065138,...,0.0,2.0,64868418.0,1836.70306,1205.649831,2.0,739.0,1752.0,2827.0,4093.0
1,3667178.0,-1804.555668,3155.25971,-10142.408,-3668.66075,-2098.407,51.3635,10403.462,3667178.0,22.846575,...,0.0,1.0,3667178.0,1994.269877,1121.437101,2.0,1075.0,1997.0,2883.0,4093.0
2,14346807.0,-2541.306124,3273.380119,-10743.482,-4682.836,-2921.892,-467.993,10397.185,14346807.0,309.575659,...,0.0,1.0,14346807.0,1889.098537,1109.776997,2.0,972.0,1837.0,2745.0,4093.0
3,2286557.0,-2237.364815,3367.251966,-10611.648,-4787.901,-2492.33,569.021,5104.713,2286557.0,230.131479,...,0.0,1.0,2286557.0,1935.025817,1127.884413,2.0,989.0,1917.0,2810.0,4093.0
4,13957973.0,-1773.140865,2566.189258,-10289.062,-3773.34,-1442.585,265.639,4869.525,13957973.0,97.515342,...,0.0,1.0,13957973.0,1912.601005,1080.17329,2.0,1047.0,1871.0,2724.0,4093.0
