In [None]:
import pandas as pd
import numpy as np

# utility
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from data_cleaning import clean_raw_data, create_dataset, get_all_results
# parallel
import ray
# `ignore_reinit_error=True` makes re-running this cell harmless when Ray is
# already up, while genuine start-up failures still surface instead of being
# swallowed by a bare `except` and reported as "already started".
ray.init(ignore_reinit_error=True)

# viz
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

# feature selection / preprocessing
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from augmentdatalib_source.knnor.data_augment import KNNOR
from mlxtend.feature_selection import SequentialFeatureSelector
from feature_selection import FeatureSelector

# models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB, ComplementNB
from lineartree import LinearTreeClassifier
from xgboost import XGBClassifier, DMatrix
from catboost import CatBoostClassifier
from parallel_train import Trainer
from tuning import Tuner

# analysis
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_columns", None) # show all cols

# reload modules in py files
%load_ext autoreload
%autoreload 2

In [None]:
def weight_to_int(row):
    """Convert a raw weight cell to an integer.

    Values that ``int()`` accepts directly (``180``, ``"180"``, ``180.7``) are
    converted as-is. Anything else is expected to be a string whose first
    space-separated token is the number, e.g. ``"180 lbs"``.
    """
    try:
        return int(row)
    except (TypeError, ValueError):
        # Not directly convertible: keep only the leading token.
        return int(row.split(" ")[0])


def height_to_inches(row):
    """Convert a raw height cell to a whole number of inches.

    Supported inputs:
      * str   ``6' 10"`` -> feet and inches separated by one space (82 inches)
      * float ``6.11``   -> digits after the decimal point are read as inches (83)
      * int   ``6``      -> whole feet (72 inches)

    A NaN float (missing value) is returned unchanged so it stays missing.
    Any other type yields ``None``.

    NOTE(review): a float cannot tell 6.1 from 6.10 (both print as "6.1"), so
    a height entered as 6.10 is read as 6 ft 1 in. Confirm how the sheet
    encodes 10-inch values.
    """
    if isinstance(row, str):
        feet, inches = row.split(" ")
        feet = feet.replace("'", '')
        inches = inches.replace('"', '')
        return int(feet) * 12 + int(inches)

    # isinstance (not type ==) so float subclasses such as numpy.float64 match.
    if isinstance(row, float):
        if row != row:  # NaN is the only value not equal to itself
            return row
        feet, inches = str(row).split(".")
        return int(feet) * 12 + int(inches)

    # bool is a subclass of int but is never a height.
    if isinstance(row, int) and not isinstance(row, bool):
        return row * 12

    return None


# Load the raw workbook and drop identifier / bookkeeping columns
# (one of them is a column whose header is the integer 123).
players_df = pd.read_excel("Brdi_db_march.xlsx", engine="openpyxl").drop(columns=[123, "id", "Data Initials", "Code Name", "draft status", ])

# if no prev concussions "# of concussions" = 0
players_df.loc[players_df["previous concussions?"] == "NO", '# of concussions'] = 0

# "previous concussions?": "YES" -> 1, anything else (including "NO") -> 0
players_df["previous concussions?"] = players_df["previous concussions?"].apply(lambda x: 1 if x=="YES" else 0)

# weight -> int
players_df["weight"] = players_df["weight"].apply(weight_to_int)

# height -> inches as int
players_df["height"] = players_df["height"].apply(height_to_inches)

# draft year -> int; null or 0 means "not drafted" and is encoded as -1
players_df["draft year"] = players_df["draft year"].apply(lambda x: int(x) if pd.notnull(x) and x != 0 else -1)

# draft number -> int; null or 0 means "not drafted" and is encoded as -1
players_df["draft number"] = players_df["draft number"].apply(lambda x: int(x) if pd.notnull(x) and x != 0 else -1)

# create the binary "drafted" column (0 when draft number is -1, else 1)
# and move it to column position 8
players_df["drafted"] = players_df["draft number"].apply(lambda x: 0 if x == -1 else 1)
column_to_move = players_df.pop("drafted")
players_df.insert(8, "drafted", column_to_move)

# Persist the cleaned frame, then display its first 123 rows as the cell output.
players_df.to_excel("Brdi_db_clean.xlsx")
players_df.head(123)

In [None]:
# Summary statistics of the cleaned frame (rendered as the cell output).
players_df.describe()

In [None]:
def autopct_format(values):
    """Build an ``autopct`` callback for ``plt.pie`` that shows percent and count.

    Parameters
    ----------
    values : iterable of numbers
        The wedge sizes passed to ``plt.pie``.

    Returns
    -------
    callable
        ``f(pct) -> str`` rendering ``"<pct>%\\n(<count>)"``, where the count is
        the wedge's share of ``sum(values)`` rounded to the nearest integer.
    """
    # The total is the same for every wedge, so compute it once instead of
    # once per callback invocation.
    total = sum(values)

    def my_format(pct):
        val = int(round(pct * total / 100.0))
        return '{:.1f}%\n({v:d})'.format(pct, v=val)

    return my_format

# Number of players per position, shown as a pie chart with percent + count labels.
positions = players_df.groupby("Position")["year"].count()
colors = sns.color_palette("pastel")[:5]

plt.title(label="Distribution of Positions")
plt.pie(
    positions,
    labels=positions.index,
    colors=colors,
    autopct=autopct_format(positions),
)
plt.show()

In [None]:
# Number of players per age (as of June 1), shown as a pie chart.
ages = players_df.groupby("age as of June 1")["year"].count()
colors = sns.color_palette("pastel")[:5]

plt.title(label="Distribution of Ages")
plt.pie(
    ages,
    labels=ages.index,
    colors=colors,
    autopct=autopct_format(ages),
)
plt.show()

In [None]:
# Players without (0) and with (1) a previous concussion, shown as a pie chart.
concussions = players_df.groupby("previous concussions?")["year"].count()
colors = sns.color_palette("pastel")[:5]

plt.title(label="Distribution of Concussions")
plt.pie(
    concussions,
    labels=["No", "Yes"],
    colors=colors,
    autopct=autopct_format(concussions),
)
plt.show()

In [None]:
# Number of players per draft year (-1 = not drafted), shown as a pie chart.
drafted = players_df.groupby("draft year").year.count()
colors = sns.color_palette('pastel')[0:5]

# The title used to read "Distribution of Played in NHL" (copied from another
# cell); the chart is grouped by draft year.
plt.title(label="Distribution of Draft Years")
plt.pie(drafted, labels = drafted.index, colors = colors, autopct=autopct_format(drafted))
plt.show()

In [None]:
def seaborn_conf_matrix(cm):
    """Draw a 2x2 confusion matrix as a heatmap.

    Each cell is annotated with its role (TN / FP / FN / TP), its raw count
    and its share of all samples.
    """
    cell_names = ("True Neg", "False Pos", "False Neg", "True Pos")
    flat_counts = cm.flatten()
    flat_shares = flat_counts / np.sum(cm)

    annotations = []
    for cell_name, count, share in zip(cell_names, flat_counts, flat_shares):
        annotations.append(f"{cell_name}\n{count:0.0f}\n{share:.2%}")

    annotations = np.asarray(annotations).reshape(2, 2)
    sns.heatmap(cm, annot=annotations, fmt='', cmap='Blues')

## Weighted SVM for Draft Classification

### Types of scaling

standard: $\frac{x_k(d) - \mu_k}{ \sigma_k}$

In [None]:
def print_metrics(y_test, y_pred):
    """Print precision, recall, accuracy and F1 of ``y_pred`` against ``y_test``."""
    report_lines = [
        f"precision: {precision_score(y_test, y_pred)}",
        f"recall: {recall_score(y_test, y_pred)}",
        f"accuracy: {accuracy_score(y_test, y_pred)}",
        f"f1: {f1_score(y_test, y_pred)}",
    ]
    print("\n".join(report_lines))


In [None]:
def print_metrics(y_test, y_pred):
    """Print one line each for precision, recall, accuracy and F1."""
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(
        f"precision: {precision}",
        f"recall: {recall}",
        f"accuracy: {accuracy}",
        f"f1: {f1}",
        sep="\n",
    )

def print_all_metrics(y_test, yhat, classifier=True):
    """Print a metrics banner and, for classifiers, the scores plus a confusion-matrix heatmap.

    When ``classifier`` is false only the banner is printed.
    """
    mode = "CLASSIFICATION" if classifier else "PREDICTION"
    print(f'\n\n-----MODULE {mode} METRICS-----')
    if not classifier:
        return
    print_metrics(y_test, yhat)
    seaborn_conf_matrix(confusion_matrix(y_test, yhat))

In [None]:
def create_dataset(df, target_col="NHL"):
    """Split ``df`` into a feature frame and a target series.

    The bookkeeping / draft columns and the target itself are removed from the
    features, and missing feature values are replaced by their column mean.

    Returns
    -------
    (X, y) : the feature DataFrame and the ``target_col`` Series.
    """
    y = df[target_col]

    excluded = [
        "year", "DOB", "draft year", "shoots", "Position", "drafted", "draft number",
        target_col,
    ]
    features = df.drop(columns=excluded)
    column_means = features.mean()
    X = features.fillna(column_means)
    return X, y

In [None]:
def scale_and_split(df, scaler="standard", target_col="drafted", test_size=0.2, shuffle=False, print_columns=False, return_feature_names=False):
    """Build features/target from ``df``, make a stratified split and optionally standardise.

    Parameters
    ----------
    df : DataFrame with the cleaned player data.
    scaler : ``"standard"`` applies a ``StandardScaler``; any other value
        leaves the features unscaled.
    target_col : name of the target column.
    test_size : fraction of rows held out for testing.
    shuffle : when true the split seed is drawn at random
        (``np.random.randint(25)``); otherwise the fixed seed 50 is used.
    print_columns : print the feature column names.
    return_feature_names : also return the feature column index.

    Returns
    -------
    ``X_train, X_test, y_train, y_test`` and, if requested, ``feature_names``.
    """
    X, y = create_dataset(df, target_col=target_col)
    feature_names = X.columns
    if print_columns:
        print(feature_names)

    seed = np.random.randint(25) if shuffle else 50
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=seed)

    if scaler == "standard":
        # Fit the scaler on the training rows only. Fitting it on the full
        # data set before splitting lets test-set statistics leak into training.
        standardizer = StandardScaler()
        X_train = standardizer.fit_transform(X_train)
        X_test = standardizer.transform(X_test)

    if return_feature_names:
        return X_train, X_test, y_train, y_test, feature_names
    return X_train, X_test, y_train, y_test


    

In [None]:
# Hard-coded feature names, passed as `feature_names` to the tree plots below.
# NOTE(review): the list contains 'NHL'; verify that it matches, in order, the
# columns scale_and_split() actually returns for the target being plotted.
feature_cols = ['age as of June 1', 'height', 'weight', 'NHL',
       'previous concussions?', '# of concussions', 'bimanual score: washer',
       'Bimanual Score: Button', 'RT_V', 'RT_HR', 'Delta_RT', 'MT_V', 'MT_HR',
       'Delta_MT', 'TMT_V', 'TMT_HR', 'CMT: V', 'CMT: HR', 'cvRT_V', 'cvRT_HR',
       'stdRT_V', 'stdRT_HR', 'Ball Path_V', 'Ball Path_HR', 'Delta_BallPath',
       'FullPath_V', 'FullPath_HR', 'Delta_Fullpath', 'Corrective_V',
       'Corrective_HR', 'PeakV_V', 'PeakV_HR', 'Delta_PV', 'AE_V', 'AE_HR',
       'Delta_AE', 'VE_V', 'VE_HR', 'Delta: VE', 'AbsOnAxis_V', 'AbsOnAxis_HR',
       'Delta_OnAxis', 'AbsOffAxis_V', 'AbsOffAxis_HR', 'Delta_OffAxis']

In [None]:
# SVM baseline on the default ("drafted") target with standardised features.
X_train, X_test, y_train, y_test = scale_and_split(
    players_df, scaler="standard", test_size=0.2
)

model = SVC(gamma="auto").fit(X_train, y_train)
yhat = model.predict(X_test)

seaborn_conf_matrix(confusion_matrix(y_test, yhat))





In [None]:
from sklearn.linear_model import LinearRegression
from lineartree import LinearTreeRegressor
# Linear tree (logistic regression in the leaves) on the default "drafted" target.
X_train, X_test, y_train, y_test = scale_and_split(players_df, "standard", test_size=.2)
model = LinearTreeClassifier(base_estimator=LogisticRegression())

model.fit(X_train, y_train)
# NOTE(review): feature_cols is a hard-coded list; confirm it lines up with the
# columns scale_and_split() produced for this target.
display(model.plot_model(feature_names=feature_cols))
yhat = model.predict(X_test)

# Confusion-matrix heatmap followed by precision / recall / accuracy / F1.
seaborn_conf_matrix(confusion_matrix(y_test, yhat))
print_metrics(y_test, yhat)

In [None]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree

# Decision tree on the "drafted" target. Ask scale_and_split for the real
# feature names so the plotted split labels cannot drift from the data.
X_train, X_test, y_train, y_test, feature_names = scale_and_split(
    players_df, "standard", test_size=.2, return_feature_names=True
)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
yhat = clf.predict(X_test)

# plot_tree expects class names in ascending order of the class label:
# "drafted" is encoded 0 = not drafted, 1 = drafted.
tree.plot_tree(clf,
               feature_names=list(feature_names),
               class_names=["not drafted", "drafted"],
               filled=True)

print_metrics(y_test, yhat)
print(yhat)
print(list(y_test))
plt.show()
seaborn_conf_matrix(confusion_matrix(y_test, yhat))

_____

In [None]:
class GeneralClassifier:
    """Fluent wrapper around one estimator: split/scale -> fit -> predict -> score.

    Every step except ``display_metrics`` returns ``self`` so calls can be
    chained, e.g. ``clf.train_test_split().fit().predict().get_accuracy_metrics()``.
    """

    def __init__(self, base_classifier=None, data=None, target="NHL") -> None:
        """
        Parameters
        ----------
        base_classifier : estimator exposing ``fit``/``predict``; a fresh
            ``LogisticRegression`` is created when omitted.
        data : DataFrame holding features and the target column.
        target : name of the target column.
        """
        self.df = data
        self.target = target
        self.features = []

        # A default instance written in the signature is created once and then
        # shared (and refit) by every GeneralClassifier; build it per instance.
        self.model = LogisticRegression() if base_classifier is None else base_classifier

        self.X_train, self.X_test, self.y_train, self.y_test = [], [], [], []
        self.y_pred = []

        self.accuracy_metrics = {"precision": 0, "recall": 0, "f1": 0, "accuracy": 0}

    def train_test_split(self, scaler="standard", test_size=0.2, shuffle=False):
        """Build X/y from ``self.df``, make a stratified split and optionally standardise.

        ``scaler="standard"`` applies a ``StandardScaler`` fitted on the
        training rows only; any other value leaves the features unscaled.
        With ``shuffle`` the split seed is random, otherwise it is fixed at 50.
        """
        non_feature_cols = ["year", "DOB", "draft year", "shoots", "Position"]
        target_col = self.target

        y = self.df[target_col]

        dropped = non_feature_cols + [target_col]
        if target_col == "drafted":
            dropped = dropped + ["draft number", "# of concussions", "previous concussions?"]
        X = self.df.drop(columns=dropped)

        self.features = X.columns
        X = X.fillna(X.mean())

        seed = np.random.randint(25) if shuffle else 50
        X_train, X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=seed
        )

        if scaler == "standard":
            # Fit on the training rows only to keep test statistics out of
            # training. Any other `scaler` value previously crashed because
            # `.fit_transform` was called on the string itself.
            standardizer = StandardScaler()
            X_train = standardizer.fit_transform(X_train)
            X_test = standardizer.transform(X_test)

        self.X_train, self.X_test = X_train, X_test
        return self

    def fit(self):
        """Fit the wrapped model on the training split."""
        self.model.fit(self.X_train, self.y_train)
        return self

    def predict(self,):
        """Predict the test split and store the result in ``self.y_pred``."""
        self.y_pred = self.model.predict(self.X_test)
        return self

    def get_accuracy_metrics(self):
        """Compute precision, recall, F1 and accuracy into ``self.accuracy_metrics``."""
        self.accuracy_metrics["precision"] = precision_score(self.y_test, self.y_pred)
        self.accuracy_metrics["recall"] = recall_score(self.y_test, self.y_pred)
        self.accuracy_metrics["f1"] = f1_score(self.y_test, self.y_pred)
        self.accuracy_metrics["accuracy"] = accuracy_score(self.y_test, self.y_pred)

        return self

    def accuracy_heatmap(self):
        """Show the 2x2 confusion matrix of the stored predictions as an annotated heatmap."""
        cm = confusion_matrix(self.y_test, self.y_pred)
        group_names = ["True Neg", "False Pos", "False Neg", "True Pos"]
        group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
        group_percentages = ["{0:.2%}".format(value) for value in cm.flatten()/np.sum(cm)]
        labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
        labels = np.asarray(labels).reshape(2, 2)
        sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')
        plt.show()
        return self

    def display_metrics(self):
        """Print the metrics gathered by ``get_accuracy_metrics`` (returns ``None``)."""
        output = f"""----- Classifier: {type(self.model).__name__}-----\n  * precision: {self.accuracy_metrics["precision"]}\n  * recall: {self.accuracy_metrics["recall"]}\n  * f1: {self.accuracy_metrics["f1"]}\n  * accuracy: {self.accuracy_metrics["accuracy"]}"""
        print(output)

        



### 1. Can categorize athletes who actually played on field based on their performance metrics: Binary Classification where the target is to predict if played NHL or not (column J) using performance metrics as input

### Class distributions

In [None]:
# Class balance of the NHL target, shown as a pie chart.
played_in_nhl = players_df.groupby("NHL")["year"].count()
colors = sns.color_palette("pastel")[:5]

plt.title(label="Distribution of Played in NHL")
plt.pie(
    played_in_nhl,
    labels=["No", "Yes"],
    colors=colors,
    autopct=autopct_format(played_in_nhl),
)
plt.show()

### Baseline Classifiers

In [None]:
# Logistic-regression baseline on the NHL target, run through the fluent wrapper.
clf = (
    GeneralClassifier(base_classifier=LogisticRegression(), data=players_df, target="NHL")
    .train_test_split()
    .fit()
    .predict()
    .get_accuracy_metrics()
)

clf.display_metrics()

In [None]:
# Baseline: logistic regression on the NHL target.
X_train, X_test, y_train, y_test = scale_and_split(
    players_df, scaler="standard", test_size=0.2, target_col="NHL"
)
model = LogisticRegression().fit(X_train, y_train)
yhat = model.predict(X_test)

print_all_metrics(y_test, yhat, classifier=True)

In [None]:
# Baseline: decision tree on the NHL target.
X_train, X_test, y_train, y_test = scale_and_split(
    players_df, scaler="standard", test_size=0.2, target_col="NHL"
)
model = DecisionTreeClassifier().fit(X_train, y_train)
yhat = model.predict(X_test)

print_all_metrics(y_test, yhat, classifier=True)

In [None]:
# Baseline: multi-layer perceptron on the NHL target.
X_train, X_test, y_train, y_test = scale_and_split(
    players_df, scaler="standard", test_size=0.2, target_col="NHL"
)
model = MLPClassifier().fit(X_train, y_train)
yhat = model.predict(X_test)

print_all_metrics(y_test, yhat, classifier=True)

In [None]:
# Baseline: support vector classifier on the NHL target.
X_train, X_test, y_train, y_test = scale_and_split(
    players_df, scaler="standard", test_size=0.2, target_col="NHL"
)
model = SVC().fit(X_train, y_train)
yhat = model.predict(X_test)

print_all_metrics(y_test, yhat, classifier=True)

In [None]:
# Baseline: Gaussian naive Bayes on the NHL target.
X_train, X_test, y_train, y_test = scale_and_split(
    players_df, scaler="standard", test_size=0.2, target_col="NHL"
)
model = GaussianNB().fit(X_train, y_train)
yhat = model.predict(X_test)

print_all_metrics(y_test, yhat, classifier=True)

In [None]:
# Baseline: XGBoost on unscaled features ("None" skips the StandardScaler).
X_train, X_test, y_train, y_test = scale_and_split(players_df, scaler="None", test_size=.3, target_col="NHL")

model = XGBClassifier(base_score=0.6, booster='gbtree', max_depth=10, n_estimators=200)

# input matrix form for XGBoost
# NOTE(review): not consumed by the sklearn-style fit below; kept in case a
# later cell reads `data_matrix`.
data_matrix = DMatrix(data=X_train, label=y_train)

# 5-fold cross-validation on the training portion. This line was commented out
# while the print below still read `scores`, which raises NameError on a fresh
# kernel.
scores = cross_val_score(model, X_train, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print_all_metrics(y_test, y_pred, classifier=True)

In [None]:
# Baseline: CatBoost on unscaled features.
X_train, X_test, y_train, y_test = scale_and_split(players_df, scaler="None", test_size=.3, target_col="NHL")

# No eval_set here: handing CatBoost the test split lets it keep the iteration
# that scores best on that split, so metrics reported on the same split would
# be optimistically biased.
model = CatBoostClassifier()
model.fit(X_train, y_train, verbose=False)

y_pred = model.predict(X_test)
print_all_metrics(y_test, y_pred, classifier=True)


------
DataFrame to keep track of all the results

-----
## Out of the box classifiers

### Evaluate all classifiers

In [None]:
# Estimators compared in the experiments below (each loop clones them before fitting).
# Full candidate list, currently disabled:
# classifiers = [LogisticRegression(), LinearTreeClassifier(base_estimator=LogisticRegression()), DecisionTreeClassifier(), MLPClassifier(), SVC(),GaussianNB(), XGBClassifier(), CatBoostClassifier(), ]
classifiers = [LogisticRegression(), MLPClassifier(), SVC(), CatBoostClassifier()]
# Collects one column of mean F1 scores per experiment, indexed by model name.
all_f1_results = pd.DataFrame()


In [None]:
# Out-of-the-box evaluation: every classifier is scored on the same 5
# stratified 80/20 shuffle splits; per-model means are collected below.
kf = StratifiedShuffleSplit(test_size=.2, n_splits=5, random_state=0)

X, y = create_dataset(players_df)
# NOTE(review): the scaler is fit on the full data set before splitting, so
# test-fold statistics influence the training features.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# One entry per classifier (mean over the splits).
f1_scores = []
df_f1_scores = []
df_models = []
df_precisions = []
df_recalls = []
df_accuracys = []


for clf in classifiers:
    # Work on a fresh, unfitted copy so the shared `classifiers` list stays untouched.
    clf = clone(clf)
    # Per-split scores for this classifier (indiv_df_scores / indiv_df_models are never filled).
    indiv_f1_scores = []
    indiv_df_scores = []
    indiv_df_models = []
    indiv_df_precisions = []
    indiv_df_recalls = []
    indiv_df_accuracys = []
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        

        # X is positionally indexed directly; y is a pandas object, hence .iloc.
        X_train = X[train_index]
        y_train = y.iloc[train_index]

        X_test= X[test_index]
        y_test = y.iloc[test_index]    


        model_name = clf.__class__.__name__
        # NOTE(review): CatBoost receives the test fold as eval_set and is then
        # scored on that same fold — confirm this is intended.
        if  model_name == "CatBoostClassifier": 
            clf.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)
        else:
            clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        f1 = f1_score(y_test, y_pred)


        indiv_f1_scores.append(f1)
        indiv_df_precisions.append(precision_score(y_test, y_pred))
        indiv_df_recalls.append(recall_score(y_test, y_pred))
        indiv_df_accuracys.append(accuracy_score(y_test, y_pred))
    


    # Average the per-split scores for this classifier.
    f1_scores.append({model_name : np.mean(indiv_f1_scores)})
    df_f1_scores.append(np.mean(indiv_f1_scores))
    df_precisions.append(np.mean(indiv_df_precisions))
    df_recalls.append(np.mean(indiv_df_recalls))
    df_models.append(model_name)
    df_accuracys.append(np.mean(indiv_df_accuracys))

# Full metric table for this experiment, best F1 first.
results_otb = pd.DataFrame({"Model" : df_models, "Precision" : df_precisions, "Recall" : df_recalls, "Accuracy" : df_accuracys, "F1" : df_f1_scores}).set_index("Model").sort_values(by="F1", ascending=False).round(3)
# Record the mean F1 per model (in `classifiers` order) and set the model names as the index.
all_f1_results['f1_otb'] = df_f1_scores
all_f1_results.index = df_models
all_f1_results




___

### Dealing with Class Imbalance

1. SMOTE
2. ADASyn
3. KNN OveRsampling

In [None]:
def balance_dataset(X, y, spl_type="SMOTE"):
    """Oversample ``X``/``y`` with the chosen technique and return the resampled pair.

    ``spl_type`` must be one of ``"SMOTE"``, ``"ADASYN"`` or ``"RANDOM"``;
    every sampler is seeded with ``random_state=0``. An unknown name raises
    ``KeyError``.
    """
    sampler_classes = {
        "SMOTE": SMOTE,
        "ADASYN": ADASYN,
        "RANDOM": RandomOverSampler,
    }
    sampler = sampler_classes[spl_type](random_state=0)

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
# Evaluation with class balancing: every classifier is scored on the same 5
# stratified 80/20 shuffle splits, with SMOTE applied to the training fold only.
kf = StratifiedShuffleSplit(test_size=.2, n_splits=5, random_state=0)

X, y = create_dataset(players_df, target_col="NHL")

scaler = StandardScaler()
X = scaler.fit_transform(X)

# One entry per classifier (mean over the splits).
f1_scores = []
df_f1_scores = []
df_models = []
df_precisions = []
df_recalls = []
df_accuracys = []

print(X.shape, y.shape)
for clf in classifiers:
    # Work on a fresh, unfitted copy so the shared `classifiers` list stays untouched.
    clf = clone(clf)
    model_name = clf.__class__.__name__

    # Per-split scores for this classifier.
    indiv_f1_scores = []
    indiv_df_precisions = []
    indiv_df_recalls = []
    indiv_df_accuracys = []
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        # Oversample the training fold only. Resampling the whole data set
        # before splitting puts synthetic points (interpolated from samples
        # that end up in training) into the test folds, and the test folds
        # then no longer reflect the real class distribution.
        X_train, y_train = balance_dataset(X[train_index], y.iloc[train_index], spl_type="SMOTE")

        X_test = X[test_index]
        y_test = y.iloc[test_index]

        if model_name == "CatBoostClassifier":
            # No eval_set: passing the test fold would let CatBoost select its
            # best iteration on the data it is scored on.
            clf.fit(X_train, y_train, verbose=False)
        else:
            clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        f1 = f1_score(y_test, y_pred)

        indiv_f1_scores.append(f1)
        indiv_df_precisions.append(precision_score(y_test, y_pred))
        indiv_df_recalls.append(recall_score(y_test, y_pred))
        indiv_df_accuracys.append(accuracy_score(y_test, y_pred))

    # Average the per-split scores for this classifier.
    f1_scores.append({model_name : np.mean(indiv_f1_scores)})
    df_f1_scores.append(np.mean(indiv_f1_scores))
    df_precisions.append(np.mean(indiv_df_precisions))
    df_recalls.append(np.mean(indiv_df_recalls))
    df_models.append(model_name)
    df_accuracys.append(np.mean(indiv_df_accuracys))

# Full metric table for this experiment, best F1 first.
results_balance = pd.DataFrame({"Model" : df_models, "Precision" : df_precisions, "Recall" : df_recalls, "Accuracy" : df_accuracys, "F1" : df_f1_scores}).set_index("Model").sort_values(by="F1", ascending=False).round(3)
all_f1_results['f1_balanced'] = df_f1_scores
all_f1_results


Conclusion: Balancing does not seem to help the accuracy of the models at all. I do not understand the reason for this. 

--------
## Feature Selection 

### Drop correlated features using Pearson Correlation

Features that are very highly correlated with one another don't all need to be in the dataset. Too many features invite the curse of dimensionality, so if we reduce the feature set carefully, we may gain model accuracy as the feature space shrinks.

<br>
Note:<br> 

- Chi-square didn't work because values were negative

In [None]:
# Pairwise feature correlations, drawn as one large annotated heatmap.
X, y = create_dataset(players_df, target_col="NHL")
cor = pd.DataFrame(X).corr()

plt.figure(figsize=(25, 25))
sns.heatmap(data=cor, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()

def correlation(dataset, threshold):
    """
    Return the set of columns to drop because they are strongly correlated
    with an earlier column.

    For every pair of columns whose absolute correlation exceeds ``threshold``,
    the column that appears later in ``dataset`` is added to the result.
    """
    corr_abs = dataset.corr().abs()
    # Strictly-lower triangle: entry (i, j) with j < i compares column i with
    # an earlier column j, exactly once per pair.
    exceeds = np.tril(corr_abs.to_numpy() > threshold, k=-1)
    flagged_rows = exceeds.any(axis=1)
    return set(corr_abs.columns[flagged_rows])


# Columns flagged for removal at an absolute-correlation threshold of 0.7
# (displayed as the cell output).
corr_features = correlation(X, .7)
corr_features

In [None]:
# Evaluation after removing highly correlated features: every classifier is
# scored on the same 5 stratified 80/20 shuffle splits.
kf = StratifiedShuffleSplit(test_size=.2, n_splits=5, random_state=0)

X, y = create_dataset(players_df)

# drop correlated features
# DataFrame.drop returns a new frame; the result used to be discarded, so the
# "reduced" experiment silently ran on the full feature set.
corr_features = correlation(X, .7)
X = X.drop(columns=list(corr_features))

scaler = StandardScaler()
X = scaler.fit_transform(X)

# One entry per classifier (mean over the splits).
f1_scores = []
df_f1_scores = []
df_models = []
df_precisions = []
df_recalls = []
df_accuracys = []

print(X.shape, y.shape)
for clf in classifiers:
    # Work on a fresh, unfitted copy so the shared `classifiers` list stays untouched.
    clf = clone(clf)
    model_name = clf.__class__.__name__

    # Per-split scores for this classifier.
    indiv_f1_scores = []
    indiv_df_precisions = []
    indiv_df_recalls = []
    indiv_df_accuracys = []
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        X_train = X[train_index]
        y_train = y.iloc[train_index]

        X_test = X[test_index]
        y_test = y.iloc[test_index]

        if model_name == "CatBoostClassifier":
            # No eval_set: passing the test fold would let CatBoost select its
            # best iteration on the data it is scored on.
            clf.fit(X_train, y_train, verbose=False)
        else:
            clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        f1 = f1_score(y_test, y_pred)

        indiv_f1_scores.append(f1)
        indiv_df_precisions.append(precision_score(y_test, y_pred))
        indiv_df_recalls.append(recall_score(y_test, y_pred))
        indiv_df_accuracys.append(accuracy_score(y_test, y_pred))

    # Average the per-split scores for this classifier.
    f1_scores.append({model_name : np.mean(indiv_f1_scores)})
    df_f1_scores.append(np.mean(indiv_f1_scores))
    df_precisions.append(np.mean(indiv_df_precisions))
    df_recalls.append(np.mean(indiv_df_recalls))
    df_models.append(model_name)
    df_accuracys.append(np.mean(indiv_df_accuracys))

# Full metric table for this experiment, best F1 first.
results_feat_sel = pd.DataFrame({"Model" : df_models, "Precision" : df_precisions, "Recall" : df_recalls, "Accuracy" : df_accuracys, "F1" : df_f1_scores}).set_index("Model").sort_values(by="F1", ascending=False).round(3)
all_f1_results['f1_rm_corr_features'] = df_f1_scores
all_f1_results


## Ranking Columns based on ${Chi}^2$

In [None]:
from sklearn.preprocessing import MinMaxScaler

# chi2 needs non-negative inputs and we need column names for the ranking.
# The `X` / `X_train` left behind by the previous cell are standardised arrays
# (negative values, no `.columns`), so rebuild the features here and rescale
# every column to [0, 1].
X_chi2, y_chi2 = create_dataset(players_df, target_col="NHL")
X_chi2_scaled = MinMaxScaler().fit_transform(X_chi2)

f_p_values = chi2(X_chi2_scaled, y_chi2)
p_values = pd.Series(f_p_values[1], index=X_chi2.columns)
# Rank by p-value (most significant first) rather than by column name.
p_values = p_values.sort_values()
p_values

## Keeping only the top N=10 features using an extra-trees classifier

Notes: <br>

* Trains differently every time and I am not sure why. It might be because of the `ExtraTreesClassifier`, although that should be seeded

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
def get_n_important_features(X, y, n_features=10):
    """Rank features with a seeded ``ExtraTreesClassifier`` and keep the best ones.

    Returns a DataFrame indexed by feature name with a single ``importance``
    column, holding the ``n_features`` largest importances in descending order.
    """
    forest = ExtraTreesClassifier(random_state=0)
    forest.fit(X, y)

    ranked = pd.Series(forest.feature_importances_, index=X.columns).nlargest(n_features)
    selected_columns = ranked.keys().to_list()

    importance_table = pd.DataFrame({'importance' : ranked}, index=selected_columns)
    return importance_table.sort_values(by="importance", ascending=False)




In [None]:
# Evaluation on the 10 most important features: every classifier is scored on
# the same 5 stratified 80/20 shuffle splits.
kf = StratifiedShuffleSplit(test_size=.2, n_splits=5, random_state=0)

X, y = create_dataset(players_df)

# NOTE(review): the importances are computed on the full data set (test folds
# included) before cross-validation — confirm this is acceptable.
important_features = get_n_important_features(X, y, 10)

# drop all but 10 most important features
X = X.drop(list(set(X.columns) - set(important_features.index)), axis=1)
print(X.columns)
# NOTE(review): the scaler is likewise fit on all rows before splitting.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# One entry per classifier (mean over the splits).
f1_scores = []
df_f1_scores = []
df_models = []
df_precisions = []
df_recalls = []
df_accuracys = []

for clf in classifiers:
    # Work on a fresh, unfitted copy so the shared `classifiers` list stays untouched.
    clf = clone(clf)
    # Per-split scores for this classifier (indiv_df_scores / indiv_df_models are never filled).
    indiv_f1_scores = []
    indiv_df_scores = []
    indiv_df_models = []
    indiv_df_precisions = []
    indiv_df_recalls = []
    indiv_df_accuracys = []
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        

        # X is positionally indexed directly; y is a pandas object, hence .iloc.
        X_train = X[train_index]
        y_train = y.iloc[train_index]

        X_test= X[test_index]
        y_test = y.iloc[test_index]    


        model_name = clf.__class__.__name__
        # NOTE(review): CatBoost receives the test fold as eval_set and is then
        # scored on that same fold — confirm this is intended.
        if  model_name == "CatBoostClassifier": 
            clf.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)
        else:
            clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        f1 = f1_score(y_test, y_pred)


        indiv_f1_scores.append(f1)
        indiv_df_precisions.append(precision_score(y_test, y_pred))
        indiv_df_recalls.append(recall_score(y_test, y_pred))
        indiv_df_accuracys.append(accuracy_score(y_test, y_pred))
    


    # Average the per-split scores for this classifier.
    f1_scores.append({model_name : np.mean(indiv_f1_scores)})
    df_f1_scores.append(np.mean(indiv_f1_scores))
    df_precisions.append(np.mean(indiv_df_precisions))
    df_recalls.append(np.mean(indiv_df_recalls))
    df_models.append(model_name)
    df_accuracys.append(np.mean(indiv_df_accuracys))

# Full metric table for this experiment, best F1 first (overwrites the
# `results_feat_sel` produced by the correlation experiment).
results_feat_sel = pd.DataFrame({"Model" : df_models, "Precision" : df_precisions, "Recall" : df_recalls, "Accuracy" : df_accuracys, "F1" : df_f1_scores}).set_index("Model").sort_values(by="F1", ascending=False).round(3)
all_f1_results['f1_top_n_features'] = df_f1_scores
all_f1_results


-----
Feature selection via Forward selection.

This was used in the SOA draft by the numbers and is a brute force technique for feature selection

In [None]:
def feature_num_to_name(indicies, df):
    """Translate positional column indices into the column names of ``df``."""
    column_names = df.columns
    return [column_names[position] for position in indicies]

# Ad-hoc lookup of the names behind a hand-picked set of column positions.
# NOTE(review): the X_train left by the evaluation loop above comes from
# indexing the StandardScaler output, which presumably has no `.columns` —
# confirm this call still works on a fresh Restart & Run All.
feature_num_to_name([0, 3, 4, 7, 8, 9, 11, 30], X_train)

In [None]:
import json

def do_sequential_selection(estimator, df, forward=True, floating=True, scoring="f1"):
    """Run mlxtend sequential feature selection for ``estimator`` on ``df``.

    The data is split 70/30 (stratified, seed 1) and only the training part is
    used: features are standardised and a ``SequentialFeatureSelector`` looks
    for the best subset of 7 to 15 features with 5-fold cross-validation.

    Parameters
    ----------
    estimator : model used to score candidate feature subsets.
    df : cleaned player DataFrame, passed to ``create_dataset``.
    forward, floating : forwarded to ``SequentialFeatureSelector``.
    scoring : metric name (or scorer) used to rank subsets.

    Returns
    -------
    The fitted ``SequentialFeatureSelector``. The best subset, all evaluated
    subsets and a performance plot are emitted as side effects.
    """
    X, y = create_dataset(df)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.3, random_state=1
    )

    sfs1 = SequentialFeatureSelector(
        estimator=estimator,
        k_features=(7, 15),
        forward=forward,
        floating=floating,
        scoring=scoring,
        cv=5,
    )

    pipe = make_pipeline(StandardScaler(), sfs1)
    pipe.fit(X_train, y_train)

    # Label the score with the metric actually used; it was hard-coded as
    # "ACC" even though the default scoring is F1.
    best_features = feature_num_to_name(sfs1.k_feature_idx_, X_train)
    print('best combination (%s: %.3f): %s\n' % (scoring, sfs1.k_score_, best_features), end="\n\n")
    print('all subsets:\n', sfs1.subsets_)
    plot_sfs(sfs1.get_metric_dict(), kind='std_err')

    return sfs1


# Real booleans: the string "True" only worked because any non-empty string is
# truthy (so "False" would have enabled the options too).
floating = forward = True
scoring = "f1"
estimator = MLPClassifier()
# No trailing comma: it used to wrap the selector in a 1-tuple.
sfs1 = do_sequential_selection(estimator=estimator, df=players_df, floating=floating, forward=forward, scoring=scoring)

In [None]:
# Materialise the splits. Previously `kf` held the one-shot iterator returned
# by split(), which `list(kf)` consumed — leaving `kf` empty for any later use.
kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=5, random_state=0).split(X, y))
kf

In [None]:

from feature_selection import FeatureSelector
from data_cleaning import clean_raw_data, create_dataset
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Reload and clean the raw workbook through the project helpers, then build X / y.
df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")
X, y = create_dataset(df, target_col="NHL")

# Pre-computed list of 5 stratified 80/20 splits, handed to the selector as `cv`.
kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=5, random_state=0).split(X, y))

# Forward floating selection scored by F1 with a decision tree as the estimator.
ESTIMATOR = DecisionTreeClassifier()
ftsl = FeatureSelector(ESTIMATOR, selection_type="forward", floating=True, scoring="f1", k_features=2, cv=kf)
ftsl.fit(X, y)

ftsl.get_results()


In [None]:
# Hold-out F1 of an MLP restricted to a hand-picked feature subset.
df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")
X, y = create_dataset(df, target_col="NHL")
X = X[["previous concussions?", "DR Errors: V", "TMT_V", "cvRT_HR", "PeakV_V", "Delta: VE", "AbsOffAxis_V", "age as of June 1"]]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=1
)

pipe = make_pipeline(StandardScaler(), MLPClassifier())
# Fit on the training split only. Fitting on the full X / y meant the model had
# already seen every test row it was then scored on.
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

f1_score(y_test, y_pred)

In [None]:
# Display the results of the most recently fitted FeatureSelector (`ftsl`).
ftsl.get_results()

In [None]:
# Scratch check: append the selector results to an initially empty frame twice.
test = pd.DataFrame()

# test.append(ftsl.get_results())

test = pd.concat([test,ftsl.get_results()] )
test = pd.concat([test,ftsl.get_results()] )
test


In [None]:
import uuid

import os

df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")
X, y = create_dataset(df, target_col="NHL")
kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=5, random_state=0).split(X, y))

@ray.remote
def parrallell_feature_selection(estimator):
    """Ray task: backward floating feature selection (F1, k_features=25) for one estimator.

    Uses the module-level ``X``, ``y`` and ``kf`` and returns whatever
    ``FeatureSelector.get_results()`` produces.
    """
    ftsl = FeatureSelector(estimator, selection_type="backward", floating=True, scoring="f1", k_features=25, cv=kf)
    ftsl.fit(X, y)
    return ftsl.get_results()


classifiers = [DecisionTreeClassifier(), RandomForestClassifier(), SVC()]
# classifiers = [LogisticRegression(), MLPClassifier(), XGBClassifier()]

# Launch one remote task per estimator.
result_ids = []
for cls in classifiers:
    print(type(cls).__name__)
    result_ids.append(parrallell_feature_selection.remote(cls))

# Collect all task results and concatenate them in one call instead of growing
# a DataFrame inside a loop.
all_parallel_results = pd.concat(ray.get(result_ids))

# Make sure the output folder exists so a long run is not lost at write time.
os.makedirs("./training_output", exist_ok=True)
all_parallel_results.sort_values(by="score", ascending=False).to_excel(f"./training_output/DT_RF_SVC_25_backward_floating_f1{uuid.uuid4()}.xlsx", index=False)

In [None]:
# Do feature selection on 3 different models

df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")
X, y = create_dataset(df, target_col="NHL")
# Pre-computed list of 5 stratified 80/20 splits, passed on as `cv`.
kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=5, random_state=0).split(X, y))
models = [LogisticRegression(), MLPClassifier(), XGBClassifier()]
# Selection settings handed to the Trainer (forward floating selection scored by F1).
kwargs = {"selection_type":"forward", "floating":True, "scoring":"f1", "k_features": 2, "cv":kf}

trainer = Trainer(X, y, models)
trainer.train(how="feature_selection", kwargs=kwargs)

# NOTE(review): `filename` presumably makes the Trainer persist the results — confirm in parallel_train.
output = trainer.get_results(filename="LR_MLP_XGB_All")
output

## Tune Decision Tree

In [None]:
def get_cross_val_score(model, kf, features):
    """Return the mean F1 score of ``model`` over the given train/test splits.

    The dataset is reloaded from ``./Implementation/Brdi_db_march.xlsx`` on
    every call and restricted to ``features``. For each split a
    ``StandardScaler`` + ``model`` pipeline is fitted on the training rows
    and scored with ``f1_score`` on the test rows.

    Parameters
    ----------
    model
        Scikit-learn style estimator instance (not a class).
    kf
        Iterable of ``(train_index, test_index)`` pairs of positional row
        indices.
    features
        Column labels of ``X`` to keep.

    Returns
    -------
    float
        Mean of the per-split F1 scores.
    """
    df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")

    X, y = create_dataset(df, target_col="NHL")

    X = X[features]

    def _rows(target, positions):
        # The split indices are positional (X is sliced with .iloc), so slice
        # y positionally as well; plain y[idx] is label-based on a pandas
        # Series and would pick wrong rows if its index is not 0..n-1.
        return target.iloc[positions] if hasattr(target, "iloc") else target[positions]

    scores = []
    for train_index, test_index in kf:
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = _rows(y, train_index), _rows(y, test_index)

        pipe = make_pipeline(StandardScaler(), model)
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        scores.append(f1_score(y_test, y_pred))

    return np.mean(scores)
        

In [None]:
df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")

X, y = create_dataset(df, target_col="NHL")

# Feature subset from the feature-selection output DT_RF_SVC_25Backwardf_floating_f1721...
features = ['age as of June 1', 'height', 'weight', 'previous concussions?', '# of concussions', 'Bimanual Score: Button', 'DR Errors: V', 'DR Errors: HR', 'RT_HR', 'MT_HR', 'Delta_MT', 'TMT_V', 'TMT_HR', 'Ball Path_V', 'Delta_BallPath', 'FullPath_V', 'FullPath_HR', 'Delta_PV', 'VE_V', 'Delta: VE', 'AbsOffAxis_HR', 'Delta_OffAxis']

kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=3, random_state=0).split(X, y))


# models = [LogisticRegression(), MLPClassifier(), XGBClassifier()]


# Baseline: untuned decision tree scored on the selected features.
print("f1 score: ", get_cross_val_score(DecisionTreeClassifier(), kf, features=features))

model = DecisionTreeClassifier()
hyperparams = {
    "criterion" : ["gini", "entropy"],
    "max_depth" : range(1, 12),
    # scikit-learn only accepts an integer min_samples_split >= 2, so the
    # grid starts at 2; a value of 1 makes every such candidate fail to fit.
    "min_samples_split" : range(2, 12),
    "min_samples_leaf" : range(1, 12),
    "max_features" : ["sqrt", "log2", None],
    "class_weight" : [None, "balanced"],

}


# NOTE(review): the tuner receives the full X, not X[features], so the tuned
# score is not on the same columns as the baseline above; confirm intended.
model_tuner = Tuner(model, hyperparams, cv=kf)
model_tuner.tune(X, y)

best_params = model_tuner.get_best_params()
best_estimator = model_tuner.get_best_estimator()
results = model_tuner.get_results()

print("best after model tuning")
print(model_tuner.get_best_estimator(), model_tuner.get_best_score(), model_tuner.get_best_params())






## Tune Random Forest

In [None]:
# Note: this took 106 mins to run with the grid recorded at the bottom of this cell


df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")

X, y = create_dataset(df, target_col="NHL")

# Best Decision Tree features from the feature-selection output DT_RF_SVC_25Backwardf_floating_f1721...
features = ['age as of June 1', 'height', 'weight', 'previous concussions?', '# of concussions', 'Bimanual Score: Button', 'DR Errors: V', 'DR Errors: HR', 'RT_HR', 'MT_HR', 'Delta_MT', 'TMT_V', 'TMT_HR', 'Ball Path_V', 'Delta_BallPath', 'FullPath_V', 'FullPath_HR', 'Delta_PV', 'VE_V', 'Delta: VE', 'AbsOffAxis_HR', 'Delta_OffAxis']
kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=3, random_state=0).split(X, y))


# models = [LogisticRegression(), MLPClassifier(), XGBClassifier()]


# Baseline: untuned random forest scored on the selected features.
print("f1 score: ", get_cross_val_score(RandomForestClassifier(), kf, features=features))

model = RandomForestClassifier()
hyperparams = {
    "bootstrap" : [True, False],
    "max_depth" : range(1, 12),
    # scikit-learn only accepts an integer min_samples_split >= 2, so the
    # grid starts at 2; a value of 1 makes every such candidate fail to fit
    # and only adds run time to an already very long search.
    "min_samples_split" : range(2, 12),
    "min_samples_leaf" : range(1, 12),
    "max_features" : ["sqrt", "log2", None],
    "class_weight" : [None, "balanced"],
    "n_estimators" : np.linspace(10, 100, 10, dtype=int),
}


# NOTE(review): the tuner receives the full X, not X[features], so the tuned
# score is not on the same columns as the baseline above; confirm intended.
model_tuner = Tuner(model, hyperparams, cv=kf)
model_tuner.tune(X, y)

best_params = model_tuner.get_best_params()
best_estimator = model_tuner.get_best_estimator()
results = model_tuner.get_results()

print("best after model tuning")
print(model_tuner.get_best_estimator(), model_tuner.get_best_score(), model_tuner.get_best_params())

# Saved output (recorded with the earlier grid that still included min_samples_split=1):
# f1 score:  0.6868236900494965
# Fitting 3 folds for each of 159720 candidates, totalling 479160 fits
# best after model tuning
# RandomForestClassifier(class_weight='balanced', max_depth=3, max_features=None,
#                        min_samples_leaf=8, min_samples_split=6,
#                        n_estimators=40) 0.7733333333333334 {'RandomForestClassifier': {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 8, 'min_samples_split': 6, 'n_estimators': 40}}


## Tune Logistic Regression

In [None]:
df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")

X, y = create_dataset(df, target_col="NHL")

# # from feature selection DT_RF_SVC_25Backwardf_floating_f1721...
# features = ['age as of June 1', 'height', 'weight', 'previous concussions?', '# of concussions', 'Bimanual Score: Button', 'DR Errors: V', 'DR Errors: HR', 'RT_HR', 'MT_HR', 'Delta_MT', 'TMT_V', 'TMT_HR', 'Ball Path_V', 'Delta_BallPath', 'FullPath_V', 'FullPath_HR', 'Delta_PV', 'VE_V', 'Delta: VE', 'AbsOffAxis_HR', 'Delta_OffAxis']
# X = X[features]

kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=3, random_state=0).split(X, y))


# models = [LogisticRegression(), MLPClassifier(), XGBClassifier()]

model = LogisticRegression(max_iter=500, solver="liblinear")

# Baseline: untuned model scored on the selected features. The call used to
# target `ple_model`, a name not defined in this notebook section; the other
# tuning cells use get_cross_val_score with the same arguments.
print("f1 score: ",
      get_cross_val_score(clone(model), kf, features=['age as of June 1', 'height', 'weight', 'previous concussions?', '# of concussions', 'Bimanual Score: Button', 'DR Errors: V', 'DR Errors: HR', 'RT_HR', 'MT_HR', 'Delta_MT', 'TMT_V', 'TMT_HR', 'Ball Path_V', 'Delta_BallPath', 'FullPath_V', 'FullPath_HR', 'Delta_PV', 'VE_V', 'Delta: VE', 'AbsOffAxis_HR', 'Delta_OffAxis']))

# "penalty" : ["l1", "l2", "elasticnet",],
hyperparams = {
    "dual" : [True, False],
    "C" : np.linspace(0.1, 1, 10),
    "class_weight" : [None, "balanced"],
    # "solver" : ["liblinear"]
}


# Here the tuner is run on every column of X (the feature subset above is commented out).
model_tuner = Tuner(model, hyperparams, cv=kf)
model_tuner.tune(X, y)

best_params = model_tuner.get_best_params()
best_estimator = model_tuner.get_best_estimator()
results = model_tuner.get_results()

print("best after model tuning")
print(model_tuner.get_best_estimator(), model_tuner.get_best_score(), model_tuner.get_best_params())


## Feature Selection Sanity Check

In [None]:
from feature_selection import FeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Sanity check: forward floating selection with a single logistic regression.
df = clean_raw_data(filename="./Implementation/Brdi_db_march.xlsx")
X, y = create_dataset(df, target_col="NHL")

# Keep 10% of the rows aside as a validation split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

# kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=2, random_state=0).split(X_train, y_train))
model = LogisticRegression(solver="liblinear", max_iter=400)
kwargs = dict(
    selection_type="forward",
    floating=True,
    scoring="f1",
    k_features=(10, 20),
    cv=2,
)

# trainer = Trainer(X, y, models)
# self.training_args = dict(kwargs)
ftsl = FeatureSelector(model, **kwargs).fit(X_train, y_train)
results = ftsl.get_results()

# output = trainer.get_results(filename="LR_MLP_XGB_All")

# Plot the per-step metrics collected by the underlying selector.
plot_sfs(metric_dict=ftsl.selector.get_metric_dict(), kind="ci")

In [None]:
kf = list(StratifiedShuffleSplit(test_size=.2, n_splits=3, random_state=0).split(X, y))
# Pass an estimator instance: get_cross_val_score puts the model into a
# pipeline and calls fit on it, which cannot work on the bare class.
get_cross_val_score(LogisticRegression(), kf, features=features)

In [None]:
# Display the results of the feature selector `ftsl`, which is defined in an earlier cell.
ftsl.get_results()

In [None]:
sfs1 = ftsl.selector

# Reduce both splits to the feature subset chosen by the selector.
# The hold-out rows are X_val / y_val, the counterpart of the X_train /
# y_train split the model is fitted on below. X_test / y_test are not
# created alongside that split, so they would come from some other cell
# and may overlap the training rows.
X_train_sfs = sfs1.transform(X_train)
X_val_sfs = sfs1.transform(X_val)

# Fit the estimator using the new feature subset
# and make a prediction on the held-out data
model.fit(X_train_sfs, y_train)
y_pred = model.predict(X_val_sfs)

acc = float((y_val == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc * 100))

In [None]:
import mlxtend
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

# Two separate estimator objects: lr1 is used inside the feature selector,
# lr2 is the final pipeline step.
lr1 = LogisticRegression()
lr2 = LogisticRegression()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

sfs = SFS(
    estimator=lr1,
    k_features=(10, 20),
    scoring='accuracy',
    clone_estimator=False,
    cv=2,
    n_jobs=-1,
)

pipe = Pipeline(steps=[("scaler", StandardScaler()), ("sfs", sfs), ("lr", lr2)])

# Only C of the estimator inside the selector is searched.
param_grid = {'sfs__estimator__C': [0.1, 1.0, 10.0]}

# Build the grid search and run it on the training split.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=2,
    verbose=1,
    refit=True,
).fit(X_train, y_train)

In [None]:
# One line per grid candidate: its parameters and mean cross-validated score.
cv_results = gs.cv_results_
for candidate, mean_score in zip(cv_results['params'], cv_results['mean_test_score']):
    print(candidate, 'test acc.:', mean_score)

In [None]:
# GridSearchCV fits clones of `pipe`, so the notebook-level `sfs` object is
# not fitted by gs.fit(...). Read the metrics from the selector inside the
# refitted best pipeline instead (available because refit=True).
gs.best_estimator_.named_steps["sfs"].get_metric_dict()

In [None]:
# Report the winning grid point, then refit the original pipeline object with it.
gs_best_params = gs.best_params_
print("Best parameters via GridSearch", gs_best_params)

pipe.set_params(**gs_best_params).fit(X_train, y_train)

In [None]:
# Collect the training outputs and save them as one workbook.
all_training_results_df = get_all_results("./training_output/")
all_training_results_df.to_excel("./aggregate_results/aggregate_best_features.xlsx", index=False)

# Show only the rows that have a non-null "training_args" value.
has_training_args = all_training_results_df["training_args"].notnull()
all_training_results_df[has_training_args]

