<a href="https://colab.research.google.com/github/muhammedbalogun/HDSC22-TeamNeuralNetwork/blob/main/Injury_pred_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Injury Prediction for Competitive Runners Using Day Approach Data

## Import Required Libraries

In [1]:
#Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Preprocessing libraries
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


#Performance evaluation libraries
from sklearn.metrics import confusion_matrix, classification_report

## Load Data

In [2]:
df_day = pd.read_csv('/content/day_approach_maskedID_timeseries.csv')

## Accessing Data

In [3]:
# preliminary assessment function

def assess_data(df):
    
    # check header
    print("The First Five Observations in the DataFrame \n")
    display(df.head())
    print('\n')

    # check tail
    print("The Last Five Observations in the DataFrame \n")
    display(df.tail())
    print('\n')
    
    # check shape of df
    print("The Shape of the DataFrame")
    print(df.shape)
    print('\n')
    
    # check info of df
    print("Basic Information of the DataFrame \n")
    display(df.info())
    print('\n')
    
    # check number of unique values in df
    print("Number of Unique Values in the DataFrame \n")
    print(df.nunique())
    print('\n')
    
    # check number of missing values in df
    print("Number of Missing Values in the DataFrame \n")
    print(df.isnull().sum())
    print('\n')
    
    # check number duplicates in df
    print("Number of Duplicates in DataFrame")
    print("Number of duplicates: ", df.duplicated().sum())
    print('\n')

In [4]:
# explore data set
assess_data(df_day)

The First Five Observations in the DataFrame 



Unnamed: 0,nr. sessions,total km,km Z3-4,km Z5-T1-T2,km sprinting,strength training,hours alternative,perceived exertion,perceived trainingSuccess,perceived recovery,...,km Z5-T1-T2.6,km sprinting.6,strength training.6,hours alternative.6,perceived exertion.6,perceived trainingSuccess.6,perceived recovery.6,Athlete ID,injury,Date
0,1.0,5.8,0.0,0.6,1.2,0.0,0.0,0.11,0.0,0.18,...,0.0,0.0,0.0,1.0,0.1,0.0,0.15,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,...,0.5,1.2,0.0,0.0,0.1,0.0,0.17,0,0,1
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.1,0.0,0.17,...,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,0,0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,...,0.0,0.0,1.0,0.0,0.1,0.0,0.17,0,0,3
4,1.0,0.0,0.0,0.0,0.0,0.0,1.08,0.08,0.0,0.18,...,0.0,0.0,0.0,0.0,0.11,0.0,0.17,0,0,4




The Last Five Observations in the DataFrame 



Unnamed: 0,nr. sessions,total km,km Z3-4,km Z5-T1-T2,km sprinting,strength training,hours alternative,perceived exertion,perceived trainingSuccess,perceived recovery,...,km Z5-T1-T2.6,km sprinting.6,strength training.6,hours alternative.6,perceived exertion.6,perceived trainingSuccess.6,perceived recovery.6,Athlete ID,injury,Date
42761,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.54,0.22,0.3,...,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,71,1,2143
42762,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,...,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,71,1,2286
42763,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.94,0.88,0.11,...,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,71,1,2483
42764,1.0,15.1,0.0,0.0,0.0,0.0,0.0,0.87,0.86,0.2,...,0.0,0.0,0.0,0.0,0.79,0.8,0.11,71,1,2647
42765,1.0,12.2,0.0,0.0,0.0,0.0,0.0,0.96,0.95,0.2,...,0.0,0.0,0.0,0.0,0.51,0.83,0.15,71,1,2673




The Shape of the DataFrame
(42766, 73)


Basic Information of the DataFrame 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42766 entries, 0 to 42765
Data columns (total 73 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   nr. sessions                 42766 non-null  float64
 1   total km                     42766 non-null  float64
 2   km Z3-4                      42766 non-null  float64
 3   km Z5-T1-T2                  42766 non-null  float64
 4   km sprinting                 42766 non-null  float64
 5   strength training            42766 non-null  float64
 6   hours alternative            42766 non-null  float64
 7   perceived exertion           42766 non-null  float64
 8   perceived trainingSuccess    42766 non-null  float64
 9   perceived recovery           42766 non-null  float64
 10  nr. sessions.1               42766 non-null  float64
 11  total km.1                   42766 non-null  float64

None



Number of Unique Values in the DataFrame 

nr. sessions                      3
total km                        384
km Z3-4                         174
km Z5-T1-T2                     134
km sprinting                     78
                               ... 
perceived trainingSuccess.6     102
perceived recovery.6            100
Athlete ID                       74
injury                            2
Date                           2614
Length: 73, dtype: int64


Number of Missing Values in the DataFrame 

nr. sessions                   0
total km                       0
km Z3-4                        0
km Z5-T1-T2                    0
km sprinting                   0
                              ..
perceived trainingSuccess.6    0
perceived recovery.6           0
Athlete ID                     0
injury                         0
Date                           0
Length: 73, dtype: int64


Number of Duplicates in DataFrame
Number of duplicates:  0




In [5]:
df_day.describe()

Unnamed: 0,nr. sessions,total km,km Z3-4,km Z5-T1-T2,km sprinting,strength training,hours alternative,perceived exertion,perceived trainingSuccess,perceived recovery,...,km Z5-T1-T2.6,km sprinting.6,strength training.6,hours alternative.6,perceived exertion.6,perceived trainingSuccess.6,perceived recovery.6,Athlete ID,injury,Date
count,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,...,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0,42766.0
mean,0.829561,7.038187,0.691381,0.57993,0.073016,0.116237,0.163492,0.247788,0.349802,0.195898,...,0.580347,0.072595,0.11612,0.162308,0.24755,0.349503,0.196224,34.550858,0.013632,1228.039892
std,0.580696,7.473216,2.317657,1.811938,0.48348,0.32601,0.549664,0.257262,0.3683,0.190321,...,1.814538,0.483691,0.326016,0.554031,0.256718,0.368042,0.190568,19.050033,0.11596,807.021168
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,...,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,...,0.0,0.0,0.0,0.0,-0.01,-0.01,-0.01,20.0,0.0,436.0
50%,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.16,0.26,0.16,...,0.0,0.0,0.0,0.0,0.16,0.26,0.17,34.0,0.0,1256.0
75%,1.0,12.0,0.0,0.0,0.0,0.0,0.0,0.44,0.72,0.3,...,0.0,0.0,0.0,0.0,0.44,0.72,0.3,50.0,0.0,1913.0
max,2.0,55.9,42.2,48.0,40.0,2.0,10.22,1.0,1.0,1.0,...,48.0,40.0,2.0,20.0,1.0,1.0,1.0,73.0,1.0,2673.0


In [6]:
# copy df for cleaning
df_day_copy = df_day.copy()

In [7]:
# drop Date and Athlete columns
#
df_day_copy.drop(columns=['Date', 'Athlete ID'], inplace=True) 

# confirm drop
df_day_copy.columns.to_list()

['nr. sessions',
 'total km',
 'km Z3-4',
 'km Z5-T1-T2',
 'km sprinting',
 'strength training',
 'hours alternative',
 'perceived exertion',
 'perceived trainingSuccess',
 'perceived recovery',
 'nr. sessions.1',
 'total km.1',
 'km Z3-4.1',
 'km Z5-T1-T2.1',
 'km sprinting.1',
 'strength training.1',
 'hours alternative.1',
 'perceived exertion.1',
 'perceived trainingSuccess.1',
 'perceived recovery.1',
 'nr. sessions.2',
 'total km.2',
 'km Z3-4.2',
 'km Z5-T1-T2.2',
 'km sprinting.2',
 'strength training.2',
 'hours alternative.2',
 'perceived exertion.2',
 'perceived trainingSuccess.2',
 'perceived recovery.2',
 'nr. sessions.3',
 'total km.3',
 'km Z3-4.3',
 'km Z5-T1-T2.3',
 'km sprinting.3',
 'strength training.3',
 'hours alternative.3',
 'perceived exertion.3',
 'perceived trainingSuccess.3',
 'perceived recovery.3',
 'nr. sessions.4',
 'total km.4',
 'km Z3-4.4',
 'km Z5-T1-T2.4',
 'km sprinting.4',
 'strength training.4',
 'hours alternative.4',
 'perceived exertion.4',
 '

## Split Data

In [8]:
X = df_day_copy.drop('injury', axis=1)
y = df_day_copy['injury']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print("Training data size: ", X_train.shape)
print("Test data size: ", X_test.shape)

Training data size:  (34212, 70)
Test data size:  (8554, 70)


## KNN Model

In [11]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [12]:
y_pred = knn.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
# check model performance
con_mat = confusion_matrix(y_test, y_pred)
con_mat

array([[8434,    0],
       [ 120,    0]])

In [15]:
classification_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.99      1.00      0.99      8434\n           1       0.00      0.00      0.00       120\n\n    accuracy                           0.99      8554\n   macro avg       0.49      0.50      0.50      8554\nweighted avg       0.97      0.99      0.98      8554\n'