# Training Data Generation

In [1]:
import pandas as pd
import constants as c
import utils

from tqdm import tqdm

Define main procedures

In [2]:
def prepare_labels():
    """Load the label descriptor file and preprocess it for merging."""

    df_labels = pd.read_csv(utils.resolve_path(c.DATA_DIR) / c.LABEL_SUBDIR / c.LABEL_FILE,
                            index_col=c.ORBIT_COL,
                            parse_dates=c.EVENT_COLS)

    # disregard incompletely labeled orbits
    df_labels = df_labels.dropna()

    # add columns for predecessor and successor events
    df_labels["Prev out"] = df_labels["SK outer out"].shift()
    df_labels["Next in"] = df_labels["SK outer in"].shift(-1)

    return df_labels

In [3]:
def prepare_orbit(orbit_id):
    """Load the specified orbit and preprocess it for merging."""

    orbit_file = utils.resolve_path(c.DATA_DIR) / c.RAW_SUBDIR / c.ORBIT_FILE(orbit_id)
    df_health.loc[orbit_id, "does_not_exist"] = not orbit_file.exists()

    if not df_health.loc[orbit_id, "does_not_exist"]:
        df_orbit = pd.read_csv(orbit_file, index_col=c.DATE_COL, parse_dates=True)

        # determine validity criteria for orbit
        df_health.loc[orbit_id, "total_nan_count"] = (
                df_orbit.isnull().sum().sum()
        )
        df_health.loc[orbit_id, "missing_entry_count"] = (
                (df_orbit.index[-1] - df_orbit.index[0]).total_seconds() + 1 - len(df_orbit)
        )
        df_health.loc[orbit_id, "has_special_conditions"] = (
                df_labels.loc[orbit_id, "SK outer in"] <= df_orbit.index[0]
                or df_orbit.index[0] <= df_labels.loc[orbit_id, "Prev out"]
                or df_labels.loc[orbit_id, "Next in"] <= df_orbit.index[-1]
                or df_orbit.index[-1] <= df_labels.loc[orbit_id, "SK outer out"]
        )
        df_health.loc[orbit_id, "flux_norm_maximum"] = (
                df_orbit[c.FLUX_COLS].pow(2).sum(axis=1).pow(0.5).max()
        )
        
        if df_health.loc[orbit_id].iloc[0:-1].sum() == 0:
            df_orbit[c.ORBIT_COL] = orbit_id  # add orbit id
            return df_orbit

    return None  # rule out invalid orbit

In [4]:
def assign_labels(df_total, df_labels):
    """Assign labels to the training time series using event boundary labels."""

    # interplanetary magnetic field
    df_total[c.LABEL_COL] = 0

    for row in tqdm(range(len(df_labels))):
        event = lambda col: df_labels.iloc[row, df_labels.columns.get_loc(col)]

        # bow shock crossings
        df_total.loc[event("SK outer in"):event("SK inner in"), c.LABEL_COL] = 1
        df_total.loc[event("SK inner out"):event("SK outer out"), c.LABEL_COL] = 1

        # magnetosheath
        df_total.loc[event("SK inner in"):event("MP outer in"), c.LABEL_COL] = 2
        df_total.loc[event("MP outer out"):event("SK inner out"), c.LABEL_COL] = 2

        # magnetopause crossings
        df_total.loc[event("MP outer in"):event("MP inner in"), c.LABEL_COL] = 3
        df_total.loc[event("MP inner out"):event("MP outer out"), c.LABEL_COL] = 3

        # magnetosphere
        df_total.loc[event("MP inner in"):event("MP inner out"), c.LABEL_COL] = 4

Prepare label descriptor file

In [5]:
df_labels = prepare_labels()
print(f"# validly labeled orbits: {len(df_labels)}")

# validly labeled orbits: 4019


Set up orbit validity statistics

In [6]:
df_health = pd.DataFrame(
    index=df_labels.index,
    columns=["does_not_exist",
             "total_nan_count",
             "missing_entry_count",
             "has_special_conditions",
             "flux_norm_maximum"]
)

Combine orbit data into one big frame

In [7]:
df_total = pd.concat(map(prepare_orbit, tqdm(df_labels.index)))
df_total.iloc[43462:43467]

100%|██████████| 4019/4019 [12:17<00:00,  5.45it/s]


Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Y,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-03-24 19:46:22,6897.941,5403.502,-15374.343,29.343,-2.064,-9.552,0.299,1.637,1.395,18118.125361,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049779,55.749454,49155130.0,0.670984,0,2
2011-03-24 19:46:23,6898.235,5403.903,-15374.07,29.54,-1.713,-8.633,0.809,1.914,2.789,18118.117945,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049796,55.749455,49155130.0,0.670984,2,2
2011-03-24 19:46:23,6898.235,5403.903,-15374.07,29.54,-1.713,-8.633,0.809,1.914,2.789,18118.117945,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049796,55.749455,49155130.0,0.670984,2,3
2011-03-24 19:46:24,6898.53,5404.304,-15373.797,29.849,-0.977,-10.833,1.065,4.469,3.514,18118.110927,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049746,55.749453,49155130.0,0.670984,0,3
2011-03-24 19:46:25,6898.825,5404.705,-15373.523,29.333,-2.752,-12.014,1.25,5.516,5.414,18118.103052,...,38027530.0,5912986.0,-47.808573,-28.603154,2.049724,55.749452,49155130.0,0.670984,0,3


Remove extreme outliers by three-sigma rule

In [8]:
df_max = df_health["flux_norm_maximum"]
outlier_orbits = df_health.index[df_max > df_max.mean() + 3 * df_max.std()]
df_total = df_total[~df_total[c.ORBIT_COL].isin(outlier_orbits)]
print(list(outlier_orbits))

[386, 1375, 1615, 1729, 1899, 1946, 3713, 3714, 3718, 4068]


Save orbit validity statistics

In [9]:
df_health.to_csv(utils.resolve_path(c.DATA_DIR) / c.HEALTH_FILE)
orbit_ids = df_total[c.ORBIT_COL].unique()
print(f"# healthy orbits: {len(orbit_ids)}")

# healthy orbits: 3153


Assign labels to the instances of the training data

In [10]:
assign_labels(df_total, df_labels)
df_total.iloc[16605:16610]

100%|██████████| 4019/4019 [5:01:42<00:00,  4.50s/it]  


Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT,LABEL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-03-24 12:18:45,5804.272,7081.276,-2428.74,-44.922,16.665,4.784,4.239,6.375,3.369,9608.230719,...,5855501.0,-48.713393,-27.442143,2.22768,55.955593,48946400.0,0.633532,0,2,0
2011-03-24 12:18:46,5803.633,7080.696,-2427.421,-37.371,8.136,7.607,2.316,3.672,3.443,9607.017439,...,5855501.0,-48.713393,-27.442143,2.227663,55.955593,48946400.0,0.633532,0,2,0
2011-03-24 12:18:47,5802.993,7080.117,-2426.101,-34.104,4.656,14.43,1.949,0.97,1.545,9605.804096,...,5855501.0,-48.713393,-27.442143,2.22766,55.955593,48946400.0,0.633532,0,2,1
2011-03-24 12:18:48,5802.353,7079.538,-2424.782,-37.385,-0.449,23.1,1.797,3.131,4.607,9604.59116,...,5855501.0,-48.713393,-27.442143,2.227669,55.955593,48946400.0,0.633532,0,2,1
2011-03-24 12:18:49,5801.714,7078.958,-2423.462,-31.174,2.786,35.948,3.817,7.081,2.152,9603.377895,...,5855501.0,-48.713393,-27.442143,2.227612,55.955591,48946400.0,0.633532,0,2,1


Save class distribution information

In [11]:
df_freqs = df_total.groupby(c.LABEL_COL).size()
df_freqs.to_csv(utils.resolve_path(c.DATA_DIR) / c.FREQS_FILE)
df_freqs

LABEL
0    64868418
1     3667178
2    14346807
3     2286557
4    13957973
dtype: int64

Partition data into $90\%$ training and $10\%$ testing set

In [12]:
test_orbits = pd.Series(orbit_ids).sample(len(orbit_ids) // 10)
df_train = df_total[~df_total[c.ORBIT_COL].isin(test_orbits)]
df_test = df_total[df_total[c.ORBIT_COL].isin(test_orbits)]

Save descriptive statistics about the training set

In [13]:
df_stats = df_train.describe()
df_stats.index.name = c.STAT_COL
df_stats.to_csv(utils.resolve_path(c.DATA_DIR) / c.STATS_FILE)
df_stats

Unnamed: 0_level_0,X_MSO,Y_MSO,Z_MSO,BX_MSO,BY_MSO,BZ_MSO,DBX_MSO,DBY_MSO,DBZ_MSO,RHO_DIPOLE,...,Z,VX,VY,VZ,VABS,D,COSALPHA,EXTREMA,ORBIT,LABEL
STAT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
count,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,...,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0,89172890.0
mean,131.5765,-163.9584,-7899.505,0.4337857,-0.2653793,-4.454915,1.060072,1.400845,1.399623,10512.22,...,-1477714.0,-0.02943107,3.703317,0.3053322,47.52608,58951490.0,0.002349638,6.366285e-05,1862.997,0.9604381
std,3665.622,3885.022,5242.04,40.71519,39.99325,56.46241,1.653099,2.094693,2.100094,3978.662,...,4885600.0,34.17986,33.31217,4.125495,7.075591,8439905.0,0.687476,0.02111009,1169.493,1.469557
min,-10743.48,-11314.78,-16627.32,-395.574,-554.096,-579.198,0.0,0.0,0.0,2015.986,...,-7654416.0,-58.4862,-46.61918,-4.894014,38.85789,45999970.0,-1.0,-2.0,2.0,0.0
25%,-2319.364,-2806.297,-11964.99,-17.516,-11.217,-8.23,0.18,0.215,0.237,7759.019,...,-6329423.0,-32.4912,-27.89521,-3.679371,40.6504,50282070.0,-0.6630508,0.0,818.0,0.0
50%,-82.944,-74.301,-9311.913,-0.248,-0.053,0.537,0.513,0.697,0.71,11373.09,...,-2101660.0,9.672237,5.127849,-0.4220127,45.99979,60218270.0,0.001899174,0.0,1790.0,0.0
75%,2554.231,2530.306,-4026.452,15.565,11.011,9.863,1.286,1.775,1.751,13027.37,...,3339362.0,32.31478,35.9944,4.215396,54.65329,67295110.0,0.6773884,0.0,2794.0,2.0
max,11423.45,11110.89,3154.425,558.988,520.669,395.668,333.282,296.084,286.698,18120.33,...,6239501.0,38.94385,50.89473,7.036499,58.97771,69817820.0,0.9999954,2.0,4092.0,4.0


Save the final training data frames

In [14]:
train_dir = utils.resolve_path(c.DATA_DIR) / c.TRAIN_SUBDIR
utils.ensure_directory(train_dir)

for n, orbit in tqdm(df_train.groupby(c.ORBIT_COL)):
    orbit.to_csv(train_dir / c.ORBIT_FILE(n))

100%|██████████| 2838/2838 [36:08<00:00,  1.31it/s]


Save the final testing data frames

In [15]:
test_dir = utils.resolve_path(c.DATA_DIR) / c.TEST_SUBDIR
utils.ensure_directory(test_dir)

for n, orbit in tqdm(df_test.groupby(c.ORBIT_COL)):
    orbit.to_csv(test_dir / c.ORBIT_FILE(n))

100%|██████████| 315/315 [04:02<00:00,  1.30it/s]
