# Training Data Generation

In [1]:
import os
import pandas as pd

from tqdm import tqdm
from constants import *

Define main procedures

In [2]:
def prepare_orbit(orbit_id):
    """Load the specified orbit and preprocess it for merging."""

    file = os.path.join(ORBIT_DIR, ORBIT_FILE(orbit_id))
    df_stats.loc[orbit_id, "does_not_exist"] = not os.path.exists(file)

    if not df_stats.loc[orbit_id, "does_not_exist"]:
        df_orbit = pd.read_csv(file, index_col=DATE_COL, parse_dates=True)

        # determine validity criteria for orbit
        df_stats.loc[orbit_id, "total_nan_count"] = (
                df_orbit.isnull().sum().sum()
        )
        df_stats.loc[orbit_id, "missing_entry_count"] = (
                (df_orbit.index[-1] - df_orbit.index[0]).total_seconds() + 1 - len(df_orbit)
        )
        df_stats.loc[orbit_id, "has_outside_labels"] = (
                df_labels.loc[orbit_id, "SK outer in"] < df_orbit.index[0]
                or df_orbit.index[-1] < df_labels.loc[orbit_id, "SK outer out"]
        )
        
        if df_stats.loc[orbit_id].sum() == 0:
            df_orbit[ORBIT_COL] = orbit_id  # add orbit id
            return df_orbit
        else:
            return None  # rule out invalid orbit

In [3]:
def assign_labels(df_train, df_labels):
    """Assign labels to the training time series using event boundary labels."""

    # interplanetary magnetic field
    df_train[LABEL_COL] = 0

    for row in tqdm(range(len(df_labels))):
        event = lambda col: df_labels.iloc[row][col]

        # bow shock crossings
        df_train.loc[event("SK outer in"):event("SK inner in"), LABEL_COL] = 1
        df_train.loc[event("SK inner out"):event("SK outer out"), LABEL_COL] = 1

        # magnetosheath
        df_train.loc[event("SK inner in"):event("MP outer in"), LABEL_COL] = 2
        df_train.loc[event("MP outer out"):event("SK inner out"), LABEL_COL] = 2

        # magnetopause crossings
        df_train.loc[event("MP outer in"):event("MP inner in"), LABEL_COL] = 3
        df_train.loc[event("MP inner out"):event("MP outer out"), LABEL_COL] = 3

        # magnetosphere
        df_train.loc[event("MP inner in"):event("MP inner out"), LABEL_COL] = 4

Load label descriptor file

In [4]:
df_labels = pd.read_csv(os.path.join(LABEL_DIR, LABEL_FILE),
                        index_col=ORBIT_COL,
                        parse_dates=list(range(1, 9)))
df_labels = df_labels.dropna()  # disregard incompletely labeled orbits
print(f"#healthy orbits: {len(df_labels)}")

#healthy orbits: 4019


Combine orbit data into one big frame

In [5]:
df_stats = pd.DataFrame(
    index=df_labels.index,
    columns=["does_not_exist",
             "total_nan_count",
             "missing_entry_count",
             "has_outside_labels"]
)

df_train = pd.concat(map(prepare_orbit, tqdm(df_labels.index)))
df_train.iloc[43462:43467]

100%|██████████| 4019/4019 [14:23<00:00,  4.66it/s]


Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Y,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-03-24 19:46:22,6897.941,5403.502,-15374.343,29.343,-2.064,-9.552,0.299,1.637,1.395,18118.125361,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049779,55.749454,49155130.0,0.670984,0,2
2011-03-24 19:46:23,6898.235,5403.903,-15374.07,29.54,-1.713,-8.633,0.809,1.914,2.789,18118.117945,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049796,55.749455,49155130.0,0.670984,2,2
2011-03-24 19:46:23,6898.235,5403.903,-15374.07,29.54,-1.713,-8.633,0.809,1.914,2.789,18118.117945,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049796,55.749455,49155130.0,0.670984,2,3
2011-03-24 19:46:24,6898.53,5404.304,-15373.797,29.849,-0.977,-10.833,1.065,4.469,3.514,18118.110927,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049746,55.749453,49155130.0,0.670984,0,3
2011-03-24 19:46:25,6898.825,5404.705,-15373.523,29.333,-2.752,-12.014,1.25,5.516,5.414,18118.103052,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049724,55.749452,49155130.0,0.670984,0,3


Save orbit validity statistics

In [7]:
df_stats.to_excel(os.path.join(MERGED_DIR, "orbit_statistics.xlsx"))

Assign labels to the instances of the training data

In [11]:
assign_labels(df_train, df_labels)
df_train.iloc[16605:16610]

100%|██████████| 4019/4019 [5:55:17<00:00,  5.30s/it]  


Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT,LABEL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-03-24 12:18:45,5804.272,7081.276,-2428.74,-44.922,16.665,4.784,4.239,6.375,3.369,9608.230719,...,5855501.0,-48.713393,-27.442143,2.22768,55.955593,48946400.0,0.633532,0,2,0
2011-03-24 12:18:46,5803.633,7080.696,-2427.421,-37.371,8.136,7.607,2.316,3.672,3.443,9607.017439,...,5855501.0,-48.713393,-27.442143,2.227663,55.955593,48946400.0,0.633532,0,2,0
2011-03-24 12:18:47,5802.993,7080.117,-2426.101,-34.104,4.656,14.43,1.949,0.97,1.545,9605.804096,...,5855501.0,-48.713393,-27.442143,2.22766,55.955593,48946400.0,0.633532,0,2,1
2011-03-24 12:18:48,5802.353,7079.538,-2424.782,-37.385,-0.449,23.1,1.797,3.131,4.607,9604.59116,...,5855501.0,-48.713393,-27.442143,2.227669,55.955593,48946400.0,0.633532,0,2,1
2011-03-24 12:18:49,5801.714,7078.958,-2423.462,-31.174,2.786,35.948,3.817,7.081,2.152,9603.377895,...,5855501.0,-48.713393,-27.442143,2.227612,55.955591,48946400.0,0.633532,0,2,1


Save the final training dataframe

In [12]:
df_train.to_csv(os.path.join(MERGED_DIR, "df_train.csv"))

Generate descriptive overall statistics

In [13]:
df_train_description = df_train.describe()
df_train_description.to_excel(os.path.join(MERGED_DIR, "df_train_total_description.xlsx"))
df_train_description

Unnamed: 0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT,LABEL
count,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,...,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0,102342500.0
mean,74.59365,-157.9842,-7887.372,0.2998895,-0.2245389,-4.415617,1.068106,1.406365,1.404429,10501.74,...,-1420325.0,0.2152367,2.984551,0.2241546,47.4435,59049860.0,-0.004905159,6.372717e-05,1883.164,0.9728163
std,3672.142,3881.64,5234.757,40.87478,40.15112,56.69018,3.951213,4.972515,5.02986,3972.189,...,4876415.0,34.05189,33.39439,4.128471,7.06254,8431919.0,0.6877777,0.0211226,1171.942,1.470978
min,-10743.48,-11314.78,-16627.32,-395.574,-9754.34,-1320.918,0.0,0.0,0.0,2015.986,...,-7654416.0,-58.4862,-46.61941,-4.894014,38.85789,45999970.0,-1.0,-2.0,2.0,0.0
25%,-2353.351,-2792.669,-11958.03,-17.652,-11.253,-8.293,0.182,0.218,0.24,7753.175,...,-6238016.0,-31.99752,-28.63935,-3.773057,40.59378,50518640.0,-0.6690732,0.0,833.0,0.0
50%,-138.502,-76.876,-9301.12,-0.429,0.054,0.582,0.52,0.702,0.714,11365.94,...,-2047131.0,9.954904,3.146155,-0.5571596,45.94845,60282840.0,-0.01029296,0.0,1832.0,0.0
75%,2490.67,2533.189,-4018.852,15.593,11.155,10.027,1.295,1.779,1.755,13024.81,...,3388680.0,32.41589,35.51466,4.118227,54.42669,67373730.0,0.6697104,0.0,2838.0,2.0
max,11430.29,11110.89,3154.425,6960.477,520.669,8817.431,17002.65,24369.53,22448.95,18120.33,...,6239501.0,38.94385,50.89473,7.036499,58.97771,69817820.0,0.9999954,2.0,4093.0,4.0


Generate descriptive label-wise statistics

In [14]:
df_train_description = df_train.groupby([LABEL_COL]).describe()
df_train_description.to_excel(os.path.join(MERGED_DIR, "df_train_label_description.xlsx"))
df_train_description

Unnamed: 0_level_0,X_MSO,X_MSO,X_MSO,X_MSO,X_MSO,X_MSO,X_MSO,X_MSO,Y_MSO,Y_MSO,...,EXTREMA,EXTREMA,ORBIT,ORBIT,ORBIT,ORBIT,ORBIT,ORBIT,ORBIT,ORBIT
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,66215306.0,1293.78526,3401.199762,-9972.731,-1219.836,1061.517,3689.65275,11430.286,66215306.0,-359.347332,...,0.0,2.0,66215306.0,1850.1585,1206.457428,2.0,745.0,1773.0,2877.0,4093.0
1,4116617.0,-1803.073803,3138.79858,-10142.408,-3688.732,-2071.174,52.715,10403.462,4116617.0,72.923658,...,0.0,2.0,4116617.0,2033.333254,1115.099951,2.0,1096.0,2078.0,2897.0,4093.0
2,15102125.0,-2587.614541,3301.847868,-10743.482,-4761.905,-2973.792,-491.634,10397.185,15102125.0,334.036583,...,0.0,2.0,15102125.0,1920.799025,1113.688286,2.0,1007.0,1884.0,2813.0,4093.0
3,2394351.0,-2303.91785,3390.13902,-10611.648,-4873.474,-2579.606,512.3675,5104.713,2394351.0,251.150979,...,0.0,1.0,2394351.0,1964.880551,1130.236703,2.0,1008.0,1957.0,2866.0,4093.0
4,14514142.0,-1792.517474,2588.160964,-10289.062,-3807.19875,-1465.7615,266.321,4869.525,14514142.0,115.720813,...,0.0,1.0,14514142.0,1938.505964,1083.136946,2.0,1075.0,1912.0,2779.0,4093.0
