# Baseline

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from fastcore.basics import Path, AttrDict
from evaluation import NodeDetectionEvaluator
import os

In [2]:
config = AttrDict(
    challenge_dir = Path('../output'),
    valid_ratio = 0.1,
    lag_steps = 5,
    evaluation_tolerance = 6
)

In [3]:
# Define the list of feature columns
feature_cols = ['Eccentricity', 'Semimajor axis (km)', 'Inclination (deg)', 
                'RAAN (deg)', 'Argument of periapsis (deg)', 'True anomaly (deg)', 
                'Latitude (deg)', 'Longitude (deg)', 'Altitude (km)', 'J2k X (km)', 
                'J2k Y (km)', 'J2k Z (km)', 'J2k Vx (km/s)', 'J2k Vy (km/s)', 
                'J2k Vz (km/s)']

In [4]:
# Define the function to prepare the data
def prepare_data(data_dir, feature_cols, ground_truth=None, lag_steps=1, fill_na=True):
    merged_data = pd.DataFrame()
    for data_file in Path(data_dir).glob('*.csv'):
        data_df = pd.read_csv(data_file)
        data_df['ObjectID'] = int(data_file.stem)
        data_df['Time Index'] = range(len(data_df))
    
        lagged_features = []
        new_feature_cols = list(feature_cols)  # Create a copy of feature_cols
        # Create lagged features for each column in feature_cols
        for col in feature_cols:
            for i in range(1, lag_steps+1):
                lag_col_name = f'{col}_lag_{i}'
                data_df[lag_col_name] = data_df.groupby('ObjectID')[col].shift(i)
                new_feature_cols.append(lag_col_name)  # Add the lagged feature to new_feature_cols
        
        # Add the lagged features to the DataFrame all at once
        data_df = pd.concat([data_df] + lagged_features, axis=1)

        if ground_truth is None:
            merged_df = data_df
        else:
            ground_truth_object = ground_truth[ground_truth['ObjectID'] == data_df['ObjectID'][0]].copy()
            # Separate the 'EW' and 'NS' types in the ground truth
            ground_truth_EW = ground_truth_object[ground_truth_object['Direction'] == 'EW'].copy()
            ground_truth_NS = ground_truth_object[ground_truth_object['Direction'] == 'NS'].copy()
            
            # Create 'EW' and 'NS' labels and fill 'unknown' values
            ground_truth_EW['EW'] = ground_truth_EW['Node'] + '-' + ground_truth_EW['Type']
            ground_truth_NS['NS'] = ground_truth_NS['Node'] + '-' + ground_truth_NS['Type']
            ground_truth_EW.drop(['Node', 'Type', 'Direction'], axis=1, inplace=True)
            ground_truth_NS.drop(['Node', 'Type', 'Direction'], axis=1, inplace=True)

            # Merge the input data with the ground truth
            merged_df = pd.merge(data_df, 
                                ground_truth_EW.sort_values('Time Index'), 
                                on=['Time Index', 'ObjectID'],
                                how='left')
            merged_df = pd.merge_ordered(merged_df, 
                                        ground_truth_NS.sort_values('Time Index'), 
                                        on=['Time Index', 'ObjectID'],
                                        how='left')

            # Fill 'unknown' values in 'EW' and 'NS' columns that come before the first valid observation
            merged_df['EW'].fillna(method='ffill', inplace=True)
            merged_df['NS'].fillna(method='ffill', inplace=True)
            
        merged_data = pd.concat([merged_data, merged_df])

    # Fill missing values (for the lagged features)
    if fill_na:
        merged_data.fillna(method='bfill', inplace=True)
    
    return merged_data, new_feature_cols

In [5]:
# Define the directory paths
train_data_dir = config.challenge_dir / "train_data"

# Load the ground truth data
ground_truth = pd.read_csv(config.challenge_dir / 'ground_truth_train.csv')

# Apply the function to the ground truth data
data, updated_feature_cols = prepare_data(train_data_dir, feature_cols, 
                                          ground_truth, lag_steps=config.lag_steps)

# For each ObjectID, show the first rows of the columns Time Index, ObjectID, EW, and NS
data[['ObjectID', 'Time Index' , 'EW', 'NS']].groupby('ObjectID').head(2).head(10)

  merged_df['EW'].fillna(method='ffill', inplace=True)
  merged_df['NS'].fillna(method='ffill', inplace=True)
  merged_df['EW'].fillna(method='ffill', inplace=True)
  merged_df['NS'].fillna(method='ffill', inplace=True)
  merged_df['EW'].fillna(method='ffill', inplace=True)
  merged_df['NS'].fillna(method='ffill', inplace=True)
  merged_data.fillna(method='bfill', inplace=True)


Unnamed: 0,ObjectID,Time Index,EW,NS
0,5,0,SS-CK,SS-NK
1,5,1,SS-CK,SS-NK
0,1,0,SS-HK,SS-NK
1,1,1,SS-HK,SS-NK
0,2,0,SS-NK,SS-NK
1,2,1,SS-NK,SS-NK


In [6]:
# Create a validation set without mixing the ObjectIDs
object_ids = data['ObjectID'].unique()
train_ids, valid_ids = train_test_split(object_ids, 
                                        test_size=config.valid_ratio, 
                                        random_state=42)

train_data = data[data['ObjectID'].isin(train_ids)].copy()
valid_data = data[data['ObjectID'].isin(valid_ids)].copy()

ground_truth_train = ground_truth[ground_truth['ObjectID'].isin(train_ids)].copy()
ground_truth_valid = ground_truth[ground_truth['ObjectID'].isin(valid_ids)].copy()

# Count the number of objects in the training and validation sets
print('Number of objects in the training set:', len(train_data['ObjectID'].unique()))
print('Number of objects in the validation set:', len(valid_data['ObjectID'].unique()))

Number of objects in the training set: 2
Number of objects in the validation set: 1


In [7]:
# Get the unique values of EW and NS in train and test data
train_EW = set(train_data['EW'].unique())
train_NS = set(train_data['NS'].unique())
test_EW = set(train_data['EW'].unique())
test_NS = set(valid_data['NS'].unique())

# Get the values of EW and NS that are in test data but not in train data
missing_EW = test_EW.difference(train_EW)
missing_NS = test_NS.difference(train_NS)

# Check if all the values in EW are also present in NS
if not set(train_data['EW'].unique()).issubset(set(train_data['NS'].unique())):
    # Get the values of EW that are not present in NS
    missing_EW_NS = set(train_data['EW'].unique()).difference(
        set(train_data['NS'].unique())
    )
else:
    missing_EW_NS = None

# Print the missing values of EW and NS
print("Missing values of EW in test data:", missing_EW)
print("Missing values of NS in test data:", missing_NS)
print("Values of EW not present in NS:", missing_EW_NS)


Missing values of EW in test data: set()
Missing values of NS in test data: set()
Values of EW not present in NS: {'SS-HK', 'AD-NK'}


In [8]:
# Convert categorical data to numerical data
le_EW = LabelEncoder()
le_NS = LabelEncoder()

# Encode the 'EW' and 'NS' columns
train_data['EW_encoded'] = le_EW.fit_transform(train_data['EW'])
train_data['NS_encoded'] = le_NS.fit_transform(train_data['NS'])

# Define the Random Forest model for EW
model_EW = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit the model to the training data for EW
model_EW.fit(train_data[updated_feature_cols], train_data['EW_encoded'])

# Define the Random Forest model for NS
model_NS = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit the model to the training data for NS
model_NS.fit(train_data[updated_feature_cols], train_data['NS_encoded'])

In [9]:
# Make predictions on the training data for EW
train_data['Predicted_EW'] = le_EW.inverse_transform(
    model_EW.predict(train_data[updated_feature_cols])
)

# Make predictions on the validation data for NS
train_data['Predicted_NS'] = le_NS.inverse_transform(
    model_NS.predict(train_data[updated_feature_cols])
)

# Print the first few rows of the test data with predictions for both EW and NS
train_data[['Time Index', 'ObjectID', 'EW', 
            'Predicted_EW', 'NS', 'Predicted_NS']].groupby('ObjectID').head(3)

Unnamed: 0,Time Index,ObjectID,EW,Predicted_EW,NS,Predicted_NS
0,0,1,SS-HK,SS-HK,SS-NK,SS-NK
1,1,1,SS-HK,SS-HK,SS-NK,SS-NK
2,2,1,SS-HK,SS-HK,SS-NK,SS-NK
0,0,2,SS-NK,SS-NK,SS-NK,SS-NK
1,1,2,SS-NK,SS-NK,SS-NK,SS-NK
2,2,2,SS-NK,SS-NK,SS-NK,SS-NK


In [10]:
if config.valid_ratio > 0:
    # Make predictions on the validation data for EW
    valid_data['Predicted_EW'] = le_EW.inverse_transform(
        model_EW.predict(valid_data[updated_feature_cols])
    )

    # Make predictions on the validation data for NS
    valid_data['Predicted_NS'] = le_NS.inverse_transform(
        model_NS.predict(valid_data[updated_feature_cols])
    )

In [11]:
def convert_classifier_output(classifier_output):
    # Split the 'Predicted_EW' and 'Predicted_NS' columns into 'Node' and 'Type' columns
    ew_df = classifier_output[['Time Index', 'ObjectID', 'Predicted_EW']].copy()
    ew_df[['Node', 'Type']] = ew_df['Predicted_EW'].str.split('-', expand=True)
    ew_df['Direction'] = 'EW'
    ew_df.drop(columns=['Predicted_EW'], inplace=True)

    ns_df = classifier_output[['Time Index', 'ObjectID', 'Predicted_NS']].copy()
    ns_df[['Node', 'Type']] = ns_df['Predicted_NS'].str.split('-', expand=True)
    ns_df['Direction'] = 'NS'
    ns_df.drop(columns=['Predicted_NS'], inplace=True)

    # Concatenate the processed EW and NS dataframes
    final_df = pd.concat([ew_df, ns_df], ignore_index=True)

    # Sort dataframe based on 'ObjectID', 'Direction' and 'Time Index'
    final_df.sort_values(['ObjectID', 'Direction', 'Time Index'], inplace=True)

    # Apply the function to each group of rows with the same 'ObjectID' and 'Direction'
    groups = final_df.groupby(['ObjectID', 'Direction'])
    keep = groups[['Node', 'Type']].apply(lambda group: group.shift() != group).any(axis=1)

    # Filter the DataFrame to keep only the rows we're interested in
    keep.index = final_df.index
    final_df = final_df[keep]

    # Reset the index and reorder the columns
    final_df = final_df.reset_index(drop=True)
    final_df = final_df[['ObjectID', 'Time Index', 'Direction', 'Node', 'Type']]
    final_df = final_df.sort_values(['ObjectID', 'Time Index', 'Direction'])

    return final_df



In [12]:
train_results = convert_classifier_output(train_data)
evaluator = NodeDetectionEvaluator(ground_truth_train, 
                                   train_results, 
                                   config.evaluation_tolerance)
precision, recall, f2, rmse = evaluator.score()
print(f'Precision for the train set: {precision:.2f}')
print(f'Recall for the train set: {recall:.2f}')
print(f'F2 for the train set: {f2:.2f}')
print(f'RMSE for the train set: {rmse:.2f}')


Precision for the train set: 1.00
Recall for the train set: 0.73
F2 for the train set: 0.77
RMSE for the train set: 0.00


In [13]:
# Loop over the Object IDs in the training set and call the evaluation
# function for each object and aggregate the results
total_tp = 0
total_fp = 0
total_fn = 0
for oid in train_data['ObjectID'].unique():
    tp, fp, fn, gt_object, p_object = evaluator.evaluate(oid)
    total_tp += tp
    total_fp += fp
    total_fn += fn

print(f'Total true positives: {total_tp}')
print(f'Total false positives: {total_fp}')
print(f'Total false negatives: {total_fn}')

Total true positives: 8
Total false positives: 0
Total false negatives: 3


In [14]:
if config.valid_ratio > 0:
    valid_results = convert_classifier_output(valid_data)
    evaluator = NodeDetectionEvaluator(ground_truth_valid, valid_results,  
                                         config.evaluation_tolerance)
precision, recall, f2, rmse = evaluator.score()
print(f'Precision for the validation set: {precision:.2f}')
print(f'Recall for the validation set: {recall:.2f}')
print(f'F2 for the validation set: {f2:.2f}')
print(f'RMSE for the validation set: {rmse:.2f}')

Precision for the validation set: 0.01
Recall for the validation set: 0.25
F2 for the validation set: 0.06
RMSE for the validation set: 0.00


In [15]:
test_data_dir = config.challenge_dir / "test_data"

test_data, _ = prepare_data(test_data_dir, feature_cols, 
                            lag_steps=config.lag_steps)

# Make predictions on the test data for EW
test_data['Predicted_EW'] = le_EW.inverse_transform(
    model_EW.predict(test_data[updated_feature_cols])
)

# Make predictions on the test data for NS
test_data['Predicted_NS'] = le_NS.inverse_transform(
    model_NS.predict(test_data[updated_feature_cols])
)

# save the predictions to a csv file
test_data.to_csv(config.challenge_dir / 'baseline_results.csv', index=False)

# Print the first few rows of the test data with predictions for both EW and NS
test_results = convert_classifier_output(test_data)
test_results


  merged_data.fillna(method='bfill', inplace=True)


Unnamed: 0,ObjectID,Time Index,Direction,Node,Type
0,3,0,EW,SS,HK
1,3,0,NS,IK,HK
2,3,3,NS,SS,NK
3,3,146,NS,IK,HK
4,3,147,NS,SS,NK
...,...,...,...,...,...
237,4,1845,NS,IK,HK
238,4,1856,NS,SS,NK
239,4,1857,NS,IK,HK
240,4,1868,NS,SS,NK


In [16]:
ground_truth_test = pd.read_csv(config.challenge_dir / 'ground_truth_test.csv')
evaluator = NodeDetectionEvaluator(ground_truth_test, test_results,
                                         config.evaluation_tolerance)
precision, recall, f2, rmse = evaluator.score()
print(f'Precision for the validation set: {precision:.2f}')
print(f'Recall for the validation set: {recall:.2f}')
print(f'F2 for the validation set: {f2:.2f}')
print(f'RMSE for the validation set: {rmse:.2f}')

Precision for the validation set: 0.00
Recall for the validation set: 0.07
F2 for the validation set: 0.02
RMSE for the validation set: 0.00


In [17]:
# Save the test results to a csv file to be submitted to the challenge
test_results.to_csv(config.challenge_dir / 'baseline_results.csv', index=False)

In [None]:
# Table comparing results for the train, validation, and test sets
