# Orbit-Label Merger

In [1]:
import os
import pandas as pd

Specify relevant file system directories

In [2]:
# All three paths are relative, so they resolve against the notebook's working directory.
SOURCE_DIR = "orbits"  # input: directory holding the per-orbit data files
LABELS_DIR = "labels"  # input: directory holding the label mapping file
TARGET_DIR = "merged"  # output: directory the merged data and statistics are written to

Specify required column names

In [3]:
# Label-file columns holding the event boundaries of one orbit.
# "SK" columns delimit the bow shock crossings, "MP" columns the magnetopause
# crossings; "in"/"out" distinguish the inbound and outbound pass.
EVENT_COLUMNS = [
    "SK outer in",
    "SK inner in",
    "MP outer in",
    "MP inner in",
    "MP inner out",
    "MP outer out",
    "SK inner out",
    "SK outer out",
]

# Orbit-file columns that are loaded, interpolated and normalized.
DATA_COLUMNS = [
    "BX_MSO", "BY_MSO", "BZ_MSO",
    "X_MSO", "Y_MSO", "Z_MSO",
    "VX", "VY", "VZ",
]

Define main procedures

In [4]:
def normalize(df, columns):
    """Standardize the given columns of ``df`` in place.

    Each selected column is shifted by its own mean and scaled by its own
    standard deviation (pandas default, i.e. the sample standard deviation).
    The frame is modified directly; nothing is returned.
    """
    selected = df[columns]
    center = selected.mean()
    spread = selected.std()
    df[columns] = selected.sub(center).div(spread)

In [5]:
def prepare_orbit(orbit_id):
    """Load the specified orbit and preprocess it for merging.

    Reads ``messenger-<orbit_id as 4 digits>.csv`` from ``SOURCE_DIR``, keeping
    only ``DATA_COLUMNS`` plus "DATE". Gaps in the data columns are filled by
    interpolation, an "ORBIT" column carrying ``orbit_id`` is added, and the
    data columns are normalized per orbit.

    Parameters
    ----------
    orbit_id : int
        Orbit number; it is zero-padded to four digits to build the file name.

    Returns
    -------
    pandas.DataFrame or None
        The preprocessed orbit, or ``None`` when the orbit file does not exist.
    """
    file = os.path.join(SOURCE_DIR, f"messenger-{orbit_id:04d}.csv")
    if not os.path.exists(file):
        # Best effort: a missing orbit file is skipped rather than treated as an error.
        return None

    # NOTE(review): parse_dates=True presumably only affects the index, so "DATE"
    # stays as read from the file -- confirm before relying on a datetime dtype.
    df_orbit = pd.read_csv(file, usecols=DATA_COLUMNS + ["DATE"], parse_dates=True)

    # interpolate() returns a new object instead of modifying the frame, so the
    # result has to be assigned back; only the numeric data columns are touched.
    df_orbit[DATA_COLUMNS] = df_orbit[DATA_COLUMNS].interpolate()

    df_orbit["ORBIT"] = orbit_id  # add orbit id
    normalize(df_orbit, DATA_COLUMNS)
    return df_orbit

In [6]:
def apply_labels(df_train, df_labels):
    """Write a "LABEL" column into ``df_train`` based on the event boundaries.

    Every row of ``df_labels`` describes one set of boundaries. A sample is
    assigned to a region when its "DATE" lies in the half-open interval
    ``[start, stop)`` of that region:

    * 0 -- default (interplanetary magnetic field)
    * 1 -- bow shock crossing ("SK outer"/"SK inner", inbound or outbound)
    * 2 -- magnetosheath (between bow shock and magnetopause crossing)
    * 3 -- magnetopause crossing ("MP outer"/"MP inner", inbound or outbound)
    * 4 -- magnetosphere ("MP inner in" up to "MP inner out")

    ``df_train`` is modified in place; nothing is returned.
    """
    df_train["LABEL"] = 0  # interplanetary magnetic field is default
    timestamps = df_train["DATE"]

    def within(start, stop):
        """Mask of the samples whose DATE satisfies start <= DATE < stop."""
        return (timestamps >= start) & (timestamps < stop)

    for events in (df_labels.iloc[pos] for pos in range(len(df_labels))):
        # The masks only depend on DATE, so they can all be built up front;
        # they are applied in the order bow shock, magnetosheath,
        # magnetopause, magnetosphere.
        regions = (
            # bow shock crossing
            (1, within(events["SK outer in"], events["SK inner in"])
                | within(events["SK inner out"], events["SK outer out"])),
            # magnetosheath
            (2, within(events["SK inner in"], events["MP outer in"])
                | within(events["MP outer out"], events["SK inner out"])),
            # magnetopause crossing
            (3, within(events["MP outer in"], events["MP inner in"])
                | within(events["MP inner out"], events["MP outer out"])),
            # magnetosphere
            (4, within(events["MP inner in"], events["MP inner out"])),
        )
        for code, mask in regions:
            df_train.loc[mask, "LABEL"] = code

Load label descriptor file

In [7]:
# Read the label table and keep only rows without any missing value
# (rows with missing entries are treated as invalid orbits).
labels_path = os.path.join(LABELS_DIR, "messenger-0001_-0200_labelled.csv")
df_labels = pd.read_csv(labels_path, parse_dates=True).dropna()
print(len(df_labels["Orbit"]))  # number of label rows that remain

178


Combine orbit data into one big frame

In [8]:
# Load and preprocess every labelled orbit, then stack the frames vertically.
# Each orbit keeps its own row index, so index values repeat across orbits.
df_train = pd.concat(prepare_orbit(orbit_id) for orbit_id in df_labels["Orbit"])
print(df_train.iloc[43460:43465])  # window around the boundary between two orbits

                      DATE     X_MSO     Y_MSO     Z_MSO    BX_MSO    BY_MSO  \
43460  2011-03-24 07:41:58  0.814358  0.462644 -0.958839 -0.186325 -0.292972   
43461  2011-03-24 07:41:59  0.814447  0.462742 -0.958792 -0.171665 -0.239008   
43462  2011-03-24 07:42:00  0.814535  0.462840 -0.958745 -0.238541 -0.270739   
0      2011-03-24 07:42:00  0.711619  0.531844 -0.958634 -0.645540  0.151359   
1      2011-03-24 07:42:01  0.711702  0.531946 -0.958587 -0.658417  0.113636   

         BZ_MSO        VX        VY        VZ  ORBIT  
43460  0.528521  1.746449 -1.715732 -1.729294      1  
43461  0.512033  1.746449 -1.715732 -1.729697      1  
43462  0.530772  1.751370 -1.720399 -1.734349      1  
0      0.263814 -1.714806  1.744409  1.730366      2  
1      0.231000 -1.714806  1.744409  1.730804      2  


Assign labels to the instances of the training data

In [9]:
# apply_labels works in place: it writes the "LABEL" column directly into df_train.
apply_labels(df_train=df_train, df_labels=df_labels)
print(df_train.iloc[32245:32255])  # spot-check a window of labelled samples

                      DATE     X_MSO     Y_MSO     Z_MSO    BX_MSO    BY_MSO  \
32245  2011-03-24 04:35:03 -0.622700 -0.901712 -0.735512 -0.052880  0.478132   
32246  2011-03-24 04:35:04 -0.622538 -0.901576 -0.735612 -0.049653  0.508592   
32247  2011-03-24 04:35:05 -0.622375 -0.901440 -0.735713 -0.136198  0.471295   
32248  2011-03-24 04:35:06 -0.622213 -0.901304 -0.735813 -0.123536  0.479493   
32249  2011-03-24 04:35:07 -0.622051 -0.901168 -0.735914 -0.070214  0.497914   
32250  2011-03-24 04:35:08 -0.621889 -0.901032 -0.736014 -0.127132  0.555599   
32251  2011-03-24 04:35:09 -0.621726 -0.900896 -0.736115 -0.131035  0.585636   
32252  2011-03-24 04:35:10 -0.621564 -0.900760 -0.736216 -0.078850  0.622509   
32253  2011-03-24 04:35:11 -0.621402 -0.900624 -0.736316 -0.040648  0.656600   
32254  2011-03-24 04:35:12 -0.621240 -0.900488 -0.736417 -0.098734  0.788999   

         BZ_MSO        VX        VY        VZ  ORBIT  LABEL  
32245 -0.052872  0.837614 -0.842127 -0.840231      1     

Save data and generate descriptive statistics

In [10]:
# Create the output directory if needed; the writers below do not create it
# themselves and would fail on a fresh checkout.
os.makedirs(TARGET_DIR, exist_ok=True)

# Full merged and labelled data set
df_train.to_csv(os.path.join(TARGET_DIR, "df_train.csv"))

# Descriptive statistics of the data columns over all samples
df_train_description = df_train[DATA_COLUMNS].describe()
df_train_description.to_excel(os.path.join(TARGET_DIR, "df_train_total_description.xlsx"))

# Descriptive statistics of the data columns per label
# (the variable is reused, so it holds the per-label table afterwards)
df_train_description = df_train.groupby(["LABEL"])[DATA_COLUMNS].describe()
df_train_description.to_excel(os.path.join(TARGET_DIR, "df_train_label_description.xlsx"))