# This Notebook trains on as much data as possible so as to optimize final prediction performance

## Loading the Data

In [37]:
import pandas as pd

# Load the labelled training set; the path is relative to the notebook's working directory.
train_df = pd.read_csv('Train_set.csv')

## Data Exploration

In [38]:
# Preview the first five rows to check the column layout before preprocessing.
train_df.head()

Unnamed: 0,ID,T0,T1,T2,T3,T4,T5,T6,T7,T8,...,T178,T179,T180,T181,T182,T183,T184,T185,T186,Class
0,0,0.965812,0.792023,0.116809,0.0,0.162393,0.213675,0.264957,0.247863,0.270655,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,1.0,0.597015,0.0,0.109453,0.094527,0.084577,0.074627,0.094527,0.114428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2,0.831382,0.714286,0.491803,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,3,1.0,0.837705,0.236066,0.037705,0.252459,0.329508,0.319672,0.306557,0.304918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,4,0.86859,0.448718,0.490385,0.477564,0.461538,0.455128,0.416667,0.304487,0.182692,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


## Data Preprocessing

In [39]:
# Impute missing feature values with the median of the same column among rows sharing the same label.
def impute_values(df, label_col='Class'):
    """Fill NaNs in the feature columns with per-class column medians.

    The feature columns are everything between the first and the last column
    of ``df`` (``df.columns[1:-1]``), minus ``label_col`` should it fall
    inside that range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the label column.
    label_col : str, default 'Class'
        Name of the column whose values define the groups.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with missing feature values replaced. ``df`` itself
        is not modified.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of a non-empty ``df``.

    Notes
    -----
    A value stays NaN when every row of its class is missing in that column.
    Rows whose label is itself missing are left untouched, because groupby
    drops NaN keys by default.
    """
    filled_df = df.copy()
    feature_cols = [col for col in df.columns[1:-1] if col != label_col]
    if not feature_cols or df.empty:
        return filled_df

    # transform('median') broadcasts each group's median back onto the rows of
    # that group, so one aligned fillna replaces the per-class/per-column loop.
    group_medians = df.groupby(label_col)[feature_cols].transform('median')
    filled_df[feature_cols] = df[feature_cols].fillna(group_medians)
    return filled_df

# Imputed copy of the training data; train_df itself keeps its NaNs.
filled_df = impute_values(train_df)

In [40]:
# no hold-out split is made in this cell: all labelled rows are used for training (train_test_split is imported but not called here)
from sklearn.model_selection import train_test_split
# split the data into features and labels, taking them from the imputed frame
# (slicing the raw train_df here would discard the imputation computed above)
# NOTE(review): training rows are filled with per-class medians while the test set is
# filled with plain column medians further down - confirm this asymmetry is intended.
X_train = filled_df.iloc[:, 1:-1]
y_train = filled_df['Class']

In [47]:
# Class distribution of the training labels, to gauge how imbalanced the problem is.
print(y_train.value_counts())

Class
0    72471
4     6431
2     5788
1     2223
3      641
Name: count, dtype: int64


## Resampling train data 

In [21]:
# undersample the majority class (label 0) significantly; no oversampling is applied in this cell even though SMOTE/SMOTEENN are imported
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Only label 0 is listed in the strategy, so only that class is cut down (to 35,000 rows).
undersampling_strategy = {0: 35000}
# Fixed seed so that Restart & Run All selects the same rows every time
# (same seed value as the final model uses).
under = RandomUnderSampler(sampling_strategy=undersampling_strategy, random_state=42)
steps = [('u', under)]
pipeline = Pipeline(steps=steps)

# apply the pipeline to the data
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

## Final Model

In [42]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Multi-class gradient-boosted model, fitted on the full (non-resampled) training data.
xgb_params = dict(
    n_estimators=500,
    random_state=42,
    objective='multi:softmax',
    num_class=5,
    eval_metric='mlogloss',
)
final_model = XGBClassifier(**xgb_params)
final_model.fit(X_train, y_train)

## Producing Predictions File

In [44]:
# produce csv file with predictions on test data
test_df = pd.read_csv('Test_set.csv')
# the test set is filled with the plain per-column median computed on the test set itself
# (no per-label grouping is applied here)
column_medians = test_df.median()
test_df = test_df.fillna(column_medians)

In [45]:
# Drop the first column and use every remaining column as model input.
# NOTE(review): assumes the first column of Test_set.csv is the ID and the rest
# line up with the training features - confirm.
X_test_final = test_df.iloc[:, 1:]

y_pred = final_model.predict(X_test_final)
# Bare expression so the notebook displays the predicted labels.
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [46]:
# Write the predictions to submission.csv with columns ID and Pred_Class (no index column).
# NOTE(review): the earlier note said the file should match sample_submission.csv with
# columns ID and Class - confirm whether the header should be 'Class' rather than 'Pred_Class'.
submission = pd.DataFrame({'ID': test_df['ID'], 'Pred_Class': y_pred})
submission.to_csv('submission.csv', index=False)