In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/cs4780-spring-2023-kaggle-competition/sample_submission.csv
/kaggle/input/cs4780-spring-2023-kaggle-competition/LH_train.csv
/kaggle/input/cs4780-spring-2023-kaggle-competition/LF_test.csv
/kaggle/input/cs4780-spring-2023-kaggle-competition/LF_train.csv
/kaggle/input/cs4780-spring-2023-kaggle-competition/RF_test.csv
/kaggle/input/cs4780-spring-2023-kaggle-competition/RH_test.csv
/kaggle/input/cs4780-spring-2023-kaggle-competition/General info for Killian.pptx
/kaggle/input/cs4780-spring-2023-kaggle-competition/LH_test.csv
/kaggle/input/cs4780-spring-2023-kaggle-competition/RH_train.csv
/kaggle/input/cs4780-spring-2023-kaggle-competition/form_prediction.py
/kaggle/input/cs4780-spring-2023-kaggle-competition/RF_train.csv


In [2]:
# The four training CSVs, listed in the same order as `labels` below.
train_csv_paths = [
    "/kaggle/input/cs4780-spring-2023-kaggle-competition/LF_train.csv",
    "/kaggle/input/cs4780-spring-2023-kaggle-competition/LH_train.csv",
    "/kaggle/input/cs4780-spring-2023-kaggle-competition/RF_train.csv",
    "/kaggle/input/cs4780-spring-2023-kaggle-competition/RH_train.csv",
]

# Read every file once, then expose each frame under its own name as well.
datasets = [pd.read_csv(csv_path) for csv_path in train_csv_paths]
LF_training, LH_training, RF_training, RH_training = datasets

# labels[i] is the column of datasets[i] that later cells read as the target.
labels = ['LF', 'LH', 'RF', 'RH']

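Preprocessing for each training dataset: fill missing values with 0, separate the label column, drop the id, date and gait columns, and replace the text placeholders with numbers.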

In [3]:
# Build one [features, target] pair per training frame, in the order of `datasets`.
training_sets = []
for raw_frame, target_column in zip(datasets, labels):
    # Missing cells are filled with 0 before the target is read, so the target
    # column goes through the same filling as the features.
    filled_frame = raw_frame.fillna(0)
    target = filled_frame[target_column]

    # Drop the columns that are not used as features, including the target itself.
    dropped_columns = ['id', 'dob', target_column, 'gait', 'Gait', 'forceplate_date']
    features = (
        filled_frame
        .drop(columns=dropped_columns)
        # Text placeholders become numbers: "not able" -> -1, absent data -> 0.
        .replace('Not able to trot', -1)
        .replace('Not able to walk', -1)
        .replace('no data', 0)
        .replace('no valid trials', 0)
    )

    training_sets.append([features, target])



Trains a classifier on one random train/test split of a dataset and returns its training and testing accuracy.

In [4]:
def TrainModelSingleInstance(dataset, clf, random_state=None):
    """Fit ``clf`` on one random train/test split and score it on both parts.

    Parameters
    ----------
    dataset : pair ``(X, y)``
        Features and target; unpacked and handed to ``train_test_split``.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's object holds the fitted state afterwards.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous behaviour (a different split on every call); pass an int to
        make a single run reproducible.

    Returns
    -------
    (training_accuracy, testing_accuracy) : tuple
        ``accuracy_score`` of the fitted model on the training part and on the
        held-out part of the split.
    """
    X, y = dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )
    clf.fit(X_train, y_train)
    training_accuracy = accuracy_score(y_train, clf.predict(X_train))
    testing_accuracy = accuracy_score(y_test, clf.predict(X_test))
    return training_accuracy, testing_accuracy


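Trains the model on $k$ different random train/test splits and averages the resulting accuracies, $\frac{1}{k}\sum_{i=1}^{k} \mathrm{acc}(h_i)$, where $h_i$ is the model fitted on the $i$-th split. This gives a rough expectation of model performance instead of relying on just one instance.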

In [5]:
#printing version 
# def TrainModel(dataset,clf,iter):
#     training_accuracy = 0
#     testing_accuracy = 0
#     for i in range(iter): 
#         x,y = TrainModelSingleInstance(dataset,clf)
#         training_accuracy += x
#         testing_accuracy += y
#     training_accuracy = training_accuracy / iter
#     testing_accuracy = testing_accuracy / iter
#     string1 = "The training accuracy_score is " + str(training_accuracy) 
#     string2 = "The testing accuracy_score " + str(testing_accuracy)
#     print(string1)
#     print(string2)
def TrainModel(dataset, clf, k):
    """Average training and testing accuracy of ``clf`` over ``k`` random splits.

    Parameters
    ----------
    dataset : pair ``(X, y)``
        Forwarded unchanged to ``TrainModelSingleInstance``.
    clf : estimator
        Classifier that is re-fitted on every split.
    k : int
        Number of splits to average over; must be at least 1.

    Returns
    -------
    (training_accuracy, testing_accuracy) : tuple
        Mean of the per-split accuracies.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. Previously ``k == 0`` failed with a bare
        ZeroDivisionError and a negative ``k`` silently returned ``-0.0``.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    training_total = 0
    testing_total = 0
    for _ in range(k):
        split_training, split_testing = TrainModelSingleInstance(dataset, clf)
        training_total += split_training
        testing_total += split_testing
    return training_total / k, testing_total / k

Used to compare the performance of several classifiers on a dataset. (The `names` and `classifiers` lists it iterates over have since been deleted from the notebook.)

In [6]:
def TrainAllClassifiers(df, names=None, classifiers=None, k=50):
    """Compare several classifiers on one dataset and report the best one.

    Every classifier is scored with ``TrainModel(df, classifier, k)``; the one
    with the highest averaged testing accuracy wins.

    Parameters
    ----------
    df : pair ``(X, y)``
        Dataset forwarded unchanged to ``TrainModel``.
    names, classifiers : sequences, optional
        Display names and the matching estimators, paired by position. When
        omitted, the module-level ``names`` / ``classifiers`` variables are
        used, which is what the function always did before.
    k : int, default=50
        Number of random splits averaged per classifier.

    Returns
    -------
    (best_name, training_accuracy, best_accuracy) : tuple
        Name of the winner with its averaged training and testing accuracy,
        or ``(None, 0, 0)`` when there was nothing to compare.

    Raises
    ------
    NameError
        If a list is neither passed in nor defined at module level.
    """
    if names is None or classifiers is None:
        module_scope = globals()
        try:
            if names is None:
                names = module_scope['names']
            if classifiers is None:
                classifiers = module_scope['classifiers']
        except KeyError as missing:
            raise NameError(
                "%s is not defined: pass it as an argument or define it at module level"
                % missing
            ) from None

    best_accuracy = 0
    training_accuracy = 0
    best_name = None
    for name, classifier in zip(names, classifiers):
        train, cur_accuracy = TrainModel(df, classifier, k)
        # ">=" keeps the original tie-breaking: a later classifier that equals
        # the best score so far replaces the earlier one.
        if cur_accuracy >= best_accuracy:
            best_accuracy = cur_accuracy
            best_name = name
            training_accuracy = train

    if best_name is None:
        # Nothing was evaluated; the old code crashed here concatenating None.
        print("No classifier was evaluated.")
    else:
        print("The best model is %s with %s" % (best_name, best_accuracy))
    return best_name, training_accuracy, best_accuracy

Since AdaBoost performed best among the classifiers, grid search is used to tune its `n_estimators` hyperparameter.

In [7]:
# hyperparameter candidates to evaluate
grid = {'n_estimators': [10, 50, 100, 500]}
# grid['learning_rate'] = [1.0]
# grid['learning_rate'] =  [1,1.5,2,2.5,3,3.5]

# define the model; the n_estimators given here is replaced by each grid
# candidate during the search. A fixed random_state makes reruns reproducible.
model = AdaBoostClassifier(n_estimators=500, random_state=0)

# define the grid search procedure (GridSearchCV defaults for cv and scoring)
grid_search = GridSearchCV(model, grid)


# execute the grid search on the fourth training set (label 'RH')
X, y = training_sets[3]
grid_result = grid_search.fit(X, y)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Best: 0.836364 using {'n_estimators': 100}
0.763636 (0.078203) with: {'n_estimators': 10}
0.772727 (0.064282) with: {'n_estimators': 50}
0.836364 (0.061658) with: {'n_estimators': 100}
0.827273 (0.066804) with: {'n_estimators': 500}


Pre-processing of testing data:

In [8]:
# Load the four test CSVs (LF, LH, RF, RH order) and fill missing cells with 0.
LF_testing = pd.read_csv("/kaggle/input/cs4780-spring-2023-kaggle-competition/LF_test.csv").fillna(0)
LH_testing = pd.read_csv("/kaggle/input/cs4780-spring-2023-kaggle-competition/LH_test.csv").fillna(0)
RF_testing = pd.read_csv("/kaggle/input/cs4780-spring-2023-kaggle-competition/RF_test.csv").fillna(0)
RH_testing = pd.read_csv("/kaggle/input/cs4780-spring-2023-kaggle-competition/RH_test.csv").fillna(0)
test_datasets_pre = [LF_testing, LH_testing, RF_testing, RH_testing]

# Keep the ids for the submission files. Only the id column is copied; the
# previous version deep-copied every whole frame just to read this one column.
df_id1, df_id2, df_id3, df_id4 = (
    frame["id"].to_numpy(copy=True) for frame in test_datasets_pre
)

# Apply the same column drops and placeholder encoding as for the training
# data (there is no label column to remove here).
test_datasets = []
for test_frame in test_datasets_pre:
    # NaNs were already filled right after read_csv, so no second fillna is
    # needed. drop/replace return new frames, leaving the loaded ones untouched.
    features = (
        test_frame
        .drop(columns=['id', 'dob', 'gait', 'Gait', 'forceplate_date'])
        .replace('Not able to trot', -1)
        .replace('Not able to walk', -1)
        .replace('no data', 0)
        .replace('no valid trials', 0)
    )
    test_datasets.append(features)


In [9]:
# Earlier experiment with a different classifier per dataset, kept for reference:
# models = [RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),AdaBoostClassifier(),SVC(gamma=2, C=1),AdaBoostClassifier()]

# One separate AdaBoost model per training set, all with the same settings.
# The fixed random_state makes the submitted predictions reproducible.
# NOTE(review): the recorded grid-search output for the RH set selected
# n_estimators=100, while 500 (with learning_rate=1.5) is used here - confirm
# this is intentional.
models = [
    AdaBoostClassifier(n_estimators=500, learning_rate=1.5, random_state=0)
    for _ in range(len(training_sets))
]

# Fit each model on its own training set and predict the matching test set;
# models, test_datasets and training_sets are paired by position.
predictions = []
for model, test_dataset, training_set in zip(models, test_datasets, training_sets):
    X_train, y_train = training_set
    model.fit(X_train, y_train)
    predictions.append(model.predict(test_dataset))

In [10]:
# Pair every block of test ids with the predictions made for the same dataset.
test_id_arrays = [df_id1, df_id2, df_id3, df_id4]
final_frames = [
    pd.DataFrame({'id': id_array, label_column: predicted})
    for label_column, id_array, predicted
    in zip(('LF', 'LH', 'RF', 'RH'), test_id_arrays, predictions)
]
LF_final, LH_final, RF_final, RH_final = final_frames

# Write one CSV per dataset into the Kaggle working directory.
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column by default - confirm whatever consumes these files expects that.
output_paths = (
    "/kaggle/working/LF_test_labels.csv",
    "/kaggle/working/LH_test_labels.csv",
    "/kaggle/working/RF_test_labels.csv",
    "/kaggle/working/RH_test_labels.csv",
)
for final_frame, output_path in zip(final_frames, output_paths):
    final_frame.to_csv(output_path)
