In [None]:
import pandas as pd
import numpy as np

# utility
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from data_cleaning import clean_raw_data, create_dataset, get_all_results

#viz
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

# parallel
import ray
# Start the local Ray runtime used for parallel work.
# `ignore_reinit_error=True` makes re-running this cell harmless when Ray is
# already up, while any other start-up failure is still raised instead of being
# swallowed by a bare `except:`.
ray.init(ignore_reinit_error=True)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, StratifiedShuffleSplit, LeaveOneOut
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from augmentdatalib_source.knnor.data_augment import KNNOR
from mlxtend.feature_selection import SequentialFeatureSelector, ExhaustiveFeatureSelector
from feature_selection import FeatureSelector
from tuning import Tuner

# models
from sklearn.svm import SVC

# analysis
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_columns", None) # show all cols


SEED = 42

# reload modules in py files
%load_ext autoreload
%autoreload 2

# Research Question
Can we identify which athletes actually played in the NHL based on their performance metrics? This is a binary classification problem: the target is whether the athlete played in the NHL or not (column J), and the inputs are the performance metrics.

## 1. Exploration / Cleaning

In [None]:
# Build the working frame from the Excel file, then derive the feature matrix X
# and the target y, using the "NHL" column as the target.
df = clean_raw_data("Brdi_db_march.xlsx")

X, y = create_dataset(df, target_col="NHL")
print(X.columns)
# Sanity checks: each line prints True when no NaN is present.
print("No missing values in X: ", (X.isna().sum() == 0).all())
print("No missing values in y: ", (y.isna().sum() == 0).all())
# NOTE(review): this renders X and y in full; consider .head() if they are large.
display(X, y)

In [None]:
# Number of samples with target label 1 and with target label 0.
count_one = y.value_counts()[1]
count_zero = y.value_counts()[0]

# Inverse class counts (printed as: class 0, class 1) ...
print(1 / count_zero, 1 / count_one)
# ... and inverse class frequencies (printed as: class 1, class 0 — note the swapped order).
print(1 / (count_one / len(y)), 1 / (count_zero / len(y)))

In [None]:
def seaborn_conf_matrix(cm):
    """Draw a 2x2 confusion matrix as an annotated seaborn heatmap.

    Every cell is labelled with its role (True Neg / False Pos / False Neg /
    True Pos, in row-major order), its raw count, and its share of all entries
    in `cm`.
    """
    flat_counts = cm.flatten()
    shares = flat_counts / np.sum(cm)
    cell_roles = ["True Neg", "False Pos", "False Neg", "True Pos"]

    annotations = []
    for role, count, share in zip(cell_roles, flat_counts, shares):
        count_text = "{0:0.0f}".format(count)
        share_text = "{0:.2%}".format(share)
        annotations.append(f"{role}\n{count_text}\n{share_text}")

    annotation_grid = np.asarray(annotations).reshape(2, 2)
    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cm, annot=annotation_grid, fmt='', cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")

In [None]:
def autopct_format(values):
    """Build a matplotlib ``autopct`` callback that shows percent and count.

    The returned function receives a wedge percentage and returns a two-line
    label: the percentage with one decimal, then the absolute count in
    parentheses. The count is recovered from the percentage and the sum of
    `values`.
    """
    def my_format(pct):
        # The sum is taken at call time, exactly when matplotlib asks for a label.
        wedge_count = int(round(pct * sum(values) / 100.0))
        return '{:.1f}%\n({v:d})'.format(pct, v=wedge_count)

    return my_format


def plot_pie(df, col, title):
    """Plot a pie chart of how many rows of `df` fall into each value of `col`.

    Args:
        df: DataFrame that contains the column `col`.
        col: Name of the column to group by. The wedges are labelled "No" and
            "Yes" in sorted group order, so a two-valued column is expected.
        title: Title drawn above the chart.

    Each wedge shows its percentage and absolute count (see autopct_format).
    """
    # size() counts the rows of each group directly, so the chart does not
    # depend on an unrelated "height" column being present and non-null.
    class_counts = df.groupby(col).size()
    colors = sns.color_palette('pastel')[0:5]

    plt.title(label=title)
    plt.pie(class_counts, labels=["No", "Yes"], colors=colors, autopct=autopct_format(class_counts))
    plt.show()


# Class balance of the "NHL" column in the cleaned frame `df`.
plot_pie(df, "NHL", "Played in the NHL")

## 2. Sampling

* **SMOTE**
* **ADASYN**
* **RANDOM** Over Sampling
* **KNNOR** - K-nearest neighbors oversampling

In [None]:
def balance_dataset(X, y, type="SMOTE"):
    """Rebalance a binary dataset by oversampling, or compute per-sample class weights.

    Args:
        X: Feature matrix. Must be a DataFrame for ``type="KNNOR"`` (its column
            names are restored after resampling).
        y: Target labels. Must be a Series for ``type="KNNOR"`` (its name is
            restored). For ``type="weighted"`` any 1-D-like container of 0/1
            labels works, whatever its name.
        type: "SMOTE", "ADASYN", "RANDOM", "KNNOR" or "weighted".

    Returns:
        A tuple ``(X, y)``.
        For the samplers this is the resampled data.
        For "weighted", ``X`` is returned untouched and ``y`` is an ``(n, 1)``
        float array in which every label 1 is replaced by ``1 / count_one`` and
        every label 0 by ``1 / count_zero``.

    Raises:
        KeyError: if `type` is unknown, or if "weighted" is requested and one
            of the two classes is absent from `y`.
    """
    if type == "weighted":
        labels = pd.Series(np.ravel(y))
        class_counts = labels.value_counts()
        count_one = class_counts[1]
        count_zero = class_counts[0]

        # Both masks are taken from the original labels and written into a
        # float copy, so the second assignment can never match values produced
        # by the first and no int column has to be upcast in place.
        original = labels.to_numpy(dtype=float)
        weights = original.copy()
        weights[original == 1] = 1 / count_one
        weights[original == 0] = 1 / count_zero

        return X, weights.reshape(-1, 1)

    # Factories instead of instances: only the requested sampler is built.
    sampler_factories = {
        "SMOTE": lambda: SMOTE(random_state=SEED),
        "ADASYN": lambda: ADASYN(random_state=SEED),
        "RANDOM": lambda: RandomOverSampler(random_state=SEED),
        "KNNOR": lambda: KNNOR(),
    }
    if type not in sampler_factories:
        raise KeyError(
            f"Unknown balancing type {type!r}; expected one of "
            f"{sorted(sampler_factories) + ['weighted']}"
        )
    sampler = sampler_factories[type]()

    if type == "KNNOR":
        X_cols = X.columns
        y_name = y.name

        # KNNOR works on raw arrays and returns four values; only the first two
        # are used here.
        X, y, _, _ = sampler.fit_resample(X.values, y.values)

        # because of how the library is set up, convert back to DataFrame/Series
        X = pd.DataFrame(X, columns=X_cols)
        y = pd.Series(y, name=y_name)
    else:
        X, y = sampler.fit_resample(X, y)

    return X, y


# One rebalanced copy of the data per technique. The pie-chart cell that follows
# reads X_smote / X_adasyn / X_random / X_knnor, so these assignments have to
# run; leaving them commented out made that cell fail with NameError on a fresh
# kernel.
X_smote, y_smote = balance_dataset(X, y, type="SMOTE")
X_adasyn, y_adasyn = balance_dataset(X, y, type="ADASYN")
X_random, y_random = balance_dataset(X, y, type="RANDOM")
X_knnor, y_knnor = balance_dataset(X, y, type="KNNOR")

# Class-weight alternative: X is unchanged, y_weighted holds per-sample weights.
X_weighted, y_weighted = balance_dataset(X, y, type="weighted")

In [None]:
# Class balance after each oversampling technique. Each call re-joins the
# resampled features and target so plot_pie can group by the "NHL" column.
# Expects the X_*/y_* pairs from balance_dataset to already exist in the kernel.
plot_pie(pd.concat([X_smote, y_smote], axis=1), "NHL", "Played in the NHL: SMOTE")
plot_pie(pd.concat([X_adasyn, y_adasyn], axis=1), "NHL", "Played in the NHL: ADASYN")
plot_pie(pd.concat([X_random, y_random], axis=1), "NHL", "Played in the NHL: Random")
plot_pie(pd.concat([X_knnor, y_knnor], axis=1), "NHL", "Played in the NHL: KNNOR")

## 3. Feature Selection


* **Pearson Correlation** - Two features that are highly correlated with each other are redundant
* **Extra Trees Classifier** - Randomized decision trees on subsamples of the dataset to determine feature importance
* **Forward Feature Selection** - S.O.A, parallelized

### 3.1 Pearson Correlation

In [None]:
def plot_correlation_heatmap(X):
    """Show the pairwise correlation matrix of X's columns as an annotated heatmap."""
    corr_matrix = pd.DataFrame(X).corr()

    # Large canvas so the per-cell annotations stay readable.
    plt.figure(figsize=(25, 25))
    sns.heatmap(corr_matrix, annot=True, cmap=plt.cm.CMRmap_r)
    plt.show()

def correlation(dataset, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    For every pair of columns whose absolute correlation is strictly greater
    than `threshold`, the later column of the pair (in column order) is added
    to the returned set, i.e. it is the candidate to drop.
    """
    corr_matrix = dataset.corr()
    strength = corr_matrix.abs().to_numpy()

    flagged = set()
    for position, name in enumerate(corr_matrix.columns):
        # Only compare against columns that come before this one (lower triangle).
        if (strength[position, :position] > threshold).any():
            flagged.add(name)
    return flagged



# Visualise all pairwise correlations, then collect one column from every pair
# whose absolute correlation is above 0.7.
plot_correlation_heatmap(X)
# NOTE(review): despite the variable name, these are the highly *correlated*
# (redundant) columns, not orthogonal ones.
orthogonal_features = correlation(X, .7)
print("Features that exhibit pearson correlation of .7 or more:\n",)
display(orthogonal_features)

### 3.2 Extra Trees 

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
def get_n_important_features(X, y, n_features=10):
    """Rank features with an ExtraTreesClassifier and keep the most important ones.

    Args:
        X: DataFrame of features; its column names label the result.
        y: Target labels used to fit the classifier.
        n_features: How many of the highest-importance features to return.

    Returns:
        DataFrame indexed by feature name with a single "importance" column,
        sorted from most to least important.
    """
    model = ExtraTreesClassifier(random_state=SEED)
    model.fit(X, y)
    feat_importances = pd.Series(model.feature_importances_, index=X.columns).nlargest(n_features)

    top_n_features = pd.DataFrame({'importance' : feat_importances}).sort_values(by="importance", ascending=False)
    return top_n_features

# The 15 features with the highest ExtraTrees importance.
top_n_features = get_n_important_features(X, y, n_features=15)
display(top_n_features)

# Overlap between that ranking and the columns in `orthogonal_features`.
print("\nFeatures that are orthogonal and in the top 15 features:")
set(list(top_n_features.index)).intersection(set(list(orthogonal_features)))

### 3.3 Forward Selection

*See ./training_output for all results*

In [None]:
def get_cross_validation(X, y, test_size=.2, n_splits=3, type="stratified"):
    """Materialise cross-validation splits as a list of (train_idx, test_idx) pairs.

    Args:
        X: Features (only used to generate row indices).
        y: Target labels (used for stratification).
        test_size: Held-out fraction per split; used by "stratified" only.
        n_splits: Number of shuffle splits; used by "stratified" only.
        type: "stratified" for a seeded StratifiedShuffleSplit, or
            "leave_one_out" for LeaveOneOut.

    Returns:
        list of ``(train_indices, test_indices)`` tuples.

    Raises:
        ValueError: if `type` is not one of the supported strategies (instead
            of silently returning None).
    """
    if type == "stratified":
        return list(StratifiedShuffleSplit(test_size=test_size, n_splits=n_splits, random_state=SEED).split(X, y))
    if type == "leave_one_out":
        return list(LeaveOneOut().split(X, y))
    raise ValueError(
        f"Unknown cross-validation type {type!r}; expected 'stratified' or 'leave_one_out'"
    )

# Default splitter settings: 3 stratified shuffle splits, 20% held out in each.
kf = get_cross_validation(X, y)

### 3.3.1 Forward Feature Selection

* Forward Feature selection, 10-20 features, floating, f1 scoring
* 90% training, 10% validation, stratified 
* Cross Validation: 80% training, 20% testing, stratified, 2 folds

Positive that get_cross_val_score is correct

Things to note:
k = 5 CV

test size = .1

fit and transforms on training data, applies same transformation to test

balances on scaled training data

In [None]:
def get_cross_val_score(model, X, y, display_cm=False):
    """Cross-validate `model` and return its mean F1 score.

    Uses 5 stratified shuffle splits with 10% held out each time. Within every
    split the StandardScaler is fit on the training fold only, the scaled
    training fold is SMOTE-balanced, the model is refit, and it is scored on
    the scaled (but otherwise untouched) test fold.

    Args:
        model: Estimator with ``fit`` / ``predict``. It is refit in place on
            every fold and is left fitted on the last one.
        X: Features, DataFrame or array.
        y: Binary target, Series/DataFrame or array.
        display_cm: If True, draw one confusion matrix pooled over all folds.

    Returns:
        Mean of the per-fold F1 scores. Each fold's score is also printed.
    """
    kf = get_cross_validation(X, y, n_splits=5, test_size=.1)

    # Positional indexing below needs plain arrays.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, (pd.DataFrame, pd.Series)):
        y = y.values

    scores = []
    all_ytest = []
    all_ypred = []

    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the scaler on the training fold and reuse it for the test fold.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Oversample the training fold only; the test fold keeps its real class balance.
        X_train_scaled, y_train = balance_dataset(X_train_scaled, y_train, type="SMOTE")
        model.fit(X_train_scaled, y_train)

        y_pred = model.predict(X_test_scaled)
        score = f1_score(y_test, y_pred)
        print(score)
        scores.append(score)

        all_ytest.extend(y_test)
        all_ypred.extend(y_pred)

    if display_cm:
        seaborn_conf_matrix(confusion_matrix(all_ytest, all_ypred))
    return np.mean(scores)

### Train SVC

Steps: 
1) Split into train/val 80/20 stratified split 
2) Scale X_train 
3) Balance X_train, y_train
4) Feature Selection using cross-validation and all features
5) Keep only best features in X_val
6) Scale X_val (separately from X_train)
7) Retrain model on X_val using CV and best features

### SVC

In [None]:
# RandomForestClassifier is used here but is otherwise only imported in a later
# cell, so import it locally to keep this cell runnable on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier

X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
# NOTE(review): this cell sits under the "SVC" heading but fits a random forest — confirm which is intended.
model = RandomForestClassifier(random_state=SEED)

# Baseline on all features and all rows; scaling and SMOTE are applied inside
# each CV fold by get_cross_val_score.
get_cross_val_score(model, X, y, display_cm=False)

We have to guard against data leakage: data used during feature selection must not end up in the validation data later on.
Make sure that the X_train used for feature selection and the X_train used for model tuning are **exactly** the same.

In [None]:
# Hold out 20% (stratified) as validation data; only X_train / y_train are used below.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)
# X_train_not_scaled = X_train.copy().values
# X_val_not_scaled = X_val.copy().values
# model = SVC(random_state=SEED)
model = SVC(random_state=SEED, kernel="linear")
# NOTE(review): the "cv" splits are computed here, from X_train *before* it is
# scaled and SMOTE-balanced below, so their row indices refer to the
# pre-balancing rows — confirm FeatureSelector is meant to receive these.
# "k_features" is the full column count, i.e. the search may grow to all features.
kwargs = {"selection_type":"forward", "floating":True, "scoring":"f1", "k_features": len(X.columns), "cv": get_cross_validation(X_train, y_train)}
cols = X.columns

# scale training data
X_train = StandardScaler().fit_transform(X_train)

# balance data after scaling
X_train, y_train = balance_dataset(X_train, y_train, type="SMOTE")


# perform feature selection
ftsl = FeatureSelector(model, **kwargs)
ftsl = ftsl.fit(X_train, y_train)
results = ftsl.get_results(cols)
# Feature names from the first row of `results`, with any single quotes stripped.
features_svc = list(map(lambda x: x.replace("'", ""), list(results.iloc[0].features)))
features_svc

In [None]:
# Preview the first rows of the feature-selection results table.
results.head()

In [None]:
# Re-create the unscaled, unbalanced 80/20 split: the feature-selection cell
# overwrote X_train / y_train with scaled + SMOTE-balanced data. The same SEED
# and stratification reproduce the same rows.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

# Keep only the columns chosen by feature selection.
X_sfs = X_train.iloc[:, ftsl.get_feature_idx()]
y_sfs = y_train

X_val_sfs = X_val.iloc[:, ftsl.get_feature_idx()]

# Cross-validate on the training split only; get_cross_val_score scales and
# balances inside each fold. The validation split is not touched here.
mean_f1 = get_cross_val_score(model, X_sfs, y_sfs, display_cm=True)
print("Mean F1 Score: ", mean_f1,)
print("Train-Val Split 80/20")
print("CV, done on train only: stratified shuffle splits = 5, test = 10%")
print("Features: ", features_svc)



----

## Decision Tree

In [None]:
from feature_selection import FeatureSelector
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% (stratified) as validation data; only X_train / y_train are used below.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

model = DecisionTreeClassifier(random_state=SEED)
# NOTE(review): the "cv" splits are computed from X_train *before* it is scaled
# and SMOTE-balanced below, so their row indices refer to the pre-balancing
# rows — confirm FeatureSelector is meant to receive these.
kwargs = {"selection_type":"forward", "floating":True, "scoring":"f1", "k_features": len(X.columns), "cv": get_cross_validation(X_train, y_train)}
cols = X.columns

# scale training data
X_train = StandardScaler().fit_transform(X_train)

# balance data after scaling
X_train, y_train = balance_dataset(X_train, y_train, type="SMOTE")


# perform feature selection
ftsl = FeatureSelector(model, **kwargs)
ftsl = ftsl.fit(X_train, y_train)
results = ftsl.get_results(cols)
# Feature names from the first row of `results`, with any single quotes stripped.
features_dt = list(map(lambda x: x.replace("'", ""), list(results.iloc[0].features)))
features_dt

In [None]:
# Preview the first rows of the feature-selection results table.
results.head()

In [None]:
# Re-create the unscaled, unbalanced 80/20 split: the feature-selection cell
# overwrote X_train / y_train with scaled + SMOTE-balanced data. The same SEED
# and stratification reproduce the same rows.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

# Keep only the columns chosen by feature selection.
X_sfs = X_train.iloc[:, ftsl.get_feature_idx()]
y_sfs = y_train

X_val_sfs = X_val.iloc[:, ftsl.get_feature_idx()]

# Cross-validate on the training split only; get_cross_val_score scales and
# balances inside each fold. The validation split is not touched here.
mean_f1 = get_cross_val_score(model, X_sfs, y_sfs, display_cm=True)
print("Mean F1 Score: ", mean_f1,)
print("Train-Val Split 80/20")
print("CV, done on train only: stratified shuffle splits = 5, test = 10%")
print("Features: ", features_dt)



-------

## Train Random Forest

In [None]:
from feature_selection import FeatureSelector
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% (stratified) as validation data; only X_train / y_train are used below.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

model = RandomForestClassifier(random_state=SEED)
# model = SVC(random_state=SEED, kernel="linear")
# NOTE(review): the "cv" splits are computed from X_train *before* it is scaled
# and SMOTE-balanced below, so their row indices refer to the pre-balancing
# rows — confirm FeatureSelector is meant to receive these.
kwargs = {"selection_type":"forward", "floating":True, "scoring":"f1", "k_features": len(X.columns), "cv": get_cross_validation(X_train, y_train)}
cols = X.columns

# scale training data
X_train = StandardScaler().fit_transform(X_train)

# balance data after scaling
X_train, y_train = balance_dataset(X_train, y_train, type="SMOTE")


# perform feature selection
ftsl = FeatureSelector(model, **kwargs)
ftsl = ftsl.fit(X_train, y_train)
results = ftsl.get_results(cols)
# Feature names from the first row of `results`, with any single quotes stripped.
features_rf = list(map(lambda x: x.replace("'", ""), list(results.iloc[0].features)))
features_rf

In [None]:
# Preview the first rows of the feature-selection results table.
results.head()

In [None]:
# Re-create the unscaled, unbalanced 80/20 split: the feature-selection cell
# overwrote X_train / y_train with scaled + SMOTE-balanced data. The same SEED
# and stratification reproduce the same rows.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

# Keep only the columns chosen by feature selection.
X_sfs = X_train.iloc[:, ftsl.get_feature_idx()]
y_sfs = y_train

X_val_sfs = X_val.iloc[:, ftsl.get_feature_idx()]

# Cross-validate on the training split only; get_cross_val_score scales and
# balances inside each fold. The validation split is not touched here.
mean_f1 = get_cross_val_score(model, X_sfs, y_sfs, display_cm=True)
print("Mean F1 Score: ", mean_f1,)
print("Train-Val Split 80/20")
print("CV, done on train only: stratified shuffle splits = 5, test = 10%")
print("Features: ", features_rf)



## Train MLP


In [None]:
from feature_selection import FeatureSelector
from sklearn.neural_network import MLPClassifier

# Hold out 20% (stratified) as validation data; only X_train / y_train are used below.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

model = MLPClassifier(random_state=SEED, max_iter=1000)
# NOTE(review): the "cv" splits are computed from X_train *before* it is scaled
# and SMOTE-balanced below, so their row indices refer to the pre-balancing
# rows — confirm FeatureSelector is meant to receive these.
kwargs = {"selection_type":"forward", "floating":True, "scoring":"f1", "k_features": len(X.columns), "cv": get_cross_validation(X_train, y_train)}
cols = X.columns

# scale training data
X_train = StandardScaler().fit_transform(X_train)

# balance data after scaling
X_train, y_train = balance_dataset(X_train, y_train, type="SMOTE")


# perform feature selection
ftsl = FeatureSelector(model, **kwargs)
ftsl = ftsl.fit(X_train, y_train)
results = ftsl.get_results(cols)
# Feature names from the first row of `results`, with any single quotes stripped.
features_mlp = list(map(lambda x: x.replace("'", ""), list(results.iloc[0].features)))
features_mlp


In [None]:
# Preview the first rows of the feature-selection results table.
results.head()

In [None]:
# Re-create the unscaled, unbalanced 80/20 split: the feature-selection cell
# overwrote X_train / y_train with scaled + SMOTE-balanced data. The same SEED
# and stratification reproduce the same rows.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

# Keep only the columns chosen by feature selection.
X_sfs = X_train.iloc[:, ftsl.get_feature_idx()]
y_sfs = y_train

X_val_sfs = X_val.iloc[:, ftsl.get_feature_idx()]

# Cross-validate on the training split only; get_cross_val_score scales and
# balances inside each fold. The validation split is not touched here.
mean_f1 = get_cross_val_score(model, X_sfs, y_sfs, display_cm=True)
print("Mean F1 Score: ", mean_f1,)
print("Train-Val Split 80/20")
print("CV, done on train only: stratified shuffle splits = 5, test = 10%")
print("Features: ", features_mlp)



-----

## Train Logistic Regression

In [None]:
from feature_selection import FeatureSelector
from sklearn.linear_model import LogisticRegression

# Hold out 20% (stratified) as validation data; only X_train / y_train are used below.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

model = LogisticRegression(random_state=SEED)
# NOTE(review): the "cv" splits are computed from X_train *before* it is scaled
# and SMOTE-balanced below, so their row indices refer to the pre-balancing
# rows — confirm FeatureSelector is meant to receive these.
kwargs = {"selection_type":"forward", "floating":True, "scoring":"f1", "k_features": len(X.columns), "cv": get_cross_validation(X_train, y_train)}
cols = X.columns

# scale training data
X_train = StandardScaler().fit_transform(X_train)

# balance data after scaling
X_train, y_train = balance_dataset(X_train, y_train, type="SMOTE")


# perform feature selection
ftsl = FeatureSelector(model, **kwargs)
ftsl = ftsl.fit(X_train, y_train)
results = ftsl.get_results(cols)
# Feature names from the first row of `results`, with any single quotes stripped.
features_lr = list(map(lambda x: x.replace("'", ""), list(results.iloc[0].features)))
features_lr


In [None]:
# Preview the first rows of the feature-selection results table.
results.head()

In [None]:
# Re-create the unscaled, unbalanced 80/20 split: the feature-selection cell
# overwrote X_train / y_train with scaled + SMOTE-balanced data. The same SEED
# and stratification reproduce the same rows.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

# Keep only the columns chosen by feature selection.
X_sfs = X_train.iloc[:, ftsl.get_feature_idx()]
y_sfs = y_train

X_val_sfs = X_val.iloc[:, ftsl.get_feature_idx()]

# Cross-validate on the training split only; get_cross_val_score scales and
# balances inside each fold. The validation split is not touched here.
mean_f1 = get_cross_val_score(model, X_sfs, y_sfs, display_cm=True)
print("Mean F1 Score: ", mean_f1,)
print("Train-Val Split 80/20")
print("CV, done on train only: stratified shuffle splits = 5, test = 10%")
print("Features: ", features_lr)



----  

## Train XG Boost

In [None]:
from feature_selection import FeatureSelector
from xgboost import XGBClassifier

# Hold out 20% (stratified) as validation data; only X_train / y_train are used below.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

model = XGBClassifier(random_state=SEED)
# NOTE(review): the "cv" splits are computed from X_train *before* it is scaled
# and SMOTE-balanced below, so their row indices refer to the pre-balancing
# rows — confirm FeatureSelector is meant to receive these.
kwargs = {"selection_type":"forward", "floating":True, "scoring":"f1", "k_features": len(X.columns), "cv": get_cross_validation(X_train, y_train)}
cols = X.columns

# scale training data
X_train = StandardScaler().fit_transform(X_train)

# balance data after scaling
X_train, y_train = balance_dataset(X_train, y_train, type="SMOTE")


# perform feature selection
ftsl = FeatureSelector(model, **kwargs)
ftsl = ftsl.fit(X_train, y_train)
results = ftsl.get_results(cols)
# Feature names from the first row of `results`, with any single quotes stripped.
features_xgb = list(map(lambda x: x.replace("'", ""), list(results.iloc[0].features)))
features_xgb


In [None]:
# Preview the first rows of the feature-selection results table.
results.head()

In [None]:
# Re-create the unscaled, unbalanced 80/20 split: the feature-selection cell
# overwrote X_train / y_train with scaled + SMOTE-balanced data. The same SEED
# and stratification reproduce the same rows.
X, y = create_dataset(clean_raw_data("Brdi_db_march.xlsx"), target_col="NHL")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=SEED, stratify=y)

# Keep only the columns chosen by feature selection.
X_sfs = X_train.iloc[:, ftsl.get_feature_idx()]
y_sfs = y_train

X_val_sfs = X_val.iloc[:, ftsl.get_feature_idx()]

# Cross-validate on the training split only; get_cross_val_score scales and
# balances inside each fold. The validation split is not touched here.
mean_f1 = get_cross_val_score(model, X_sfs, y_sfs, display_cm=True)
print("Mean F1 Score: ", mean_f1,)
print("Train-Val Split 80/20")
print("CV, done on train only: stratified shuffle splits = 5, test = 10%")
print("Features: ", features_xgb)



------

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
# model1 drives the sequential feature search; model2 is the final classifier
# fitted on the selected columns.
model1 = SVC(random_state=SEED)
model2 = SVC(random_state=SEED)

# The selector has to wrap the SVC (model1): the grid below tunes the selector
# estimator's `kernel` and `C`, which only exist on an SVC. Wrapping the
# leftover `model` variable from an earlier cell searched the wrong estimator.
sfs1 = SFS(estimator=model1, k_features=10, forward=True, floating=True, scoring='f1', cv=2)
pipe = Pipeline([('sfs', sfs1), ('svc', model2)])

param_grid =  {
    'sfs__k_features': [1, 2, 3],
    'sfs__estimator__kernel': ['linear', 'rbf'],
    'sfs__estimator__C': [0.1, 1, 10, 100, 1000],

}

# refit=False: only the search results are needed, no final refit on all of X_train.
gs = GridSearchCV(estimator=pipe, 
                  param_grid=param_grid, 
                  scoring='f1', 
                  n_jobs=-1, 
                  cv=2,
                  refit=False)

gs = gs.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean CV score in the grid search `gs`.
print("Best parameters via GridSearch", gs.best_params_)


In [None]:
# Plot the selection metrics of whichever `ftsl` was fitted most recently in the kernel.
# NOTE(review): relies on `ftsl.selector` exposing mlxtend's get_metric_dict() — confirm in feature_selection.py.
plot_sfs(ftsl.selector.get_metric_dict(), kind='ci')

## Model Tuning
* Grid Search 


First, let's get baselines before tuning, using models with the features from feature selection and with balancing

In [None]:
# standardized cross validation for each search
# (default settings: 3 stratified shuffle splits, 20% held out in each)
kf = get_cross_validation(X, y)

In [None]:
# top features for SVC
# Hard-coded list of column names; X_top_n_features is X restricted to these columns.
top_n_features =['weight', 'bimanual score: washer', 'Bimanual Score: Button', 'RT_HR', 'CMT: HR', 'cvRT_HR', 'Ball Path_V', 'Delta_Fullpath', 'Corrective_V', 'AE_HR', 'Delta_AE', 'Delta: VE', 'AbsOnAxis_HR', 'Delta_OnAxis', 'Delta_OffAxis']
X_top_n_features = X[top_n_features]

In [None]:
# One oversampled copy of the *whole* dataset per technique.
# NOTE(review): resampling happens here before any train/test split, so
# cross-validating on these copies can put synthetic or duplicated samples in
# the test folds — treat scores on them as optimistic.
X_smote, y_smote = balance_dataset(X, y, type="SMOTE")
X_adasyn, y_adasyn = balance_dataset(X, y, type="ADASYN")
X_random, y_random = balance_dataset(X, y, type="RANDOM")
X_knnor, y_knnor = balance_dataset(X, y, type="KNNOR")

In [None]:
# Baseline SVC scores. get_cross_val_score(model, X, y) builds its own
# stratified CV and scales + SMOTE-balances every training fold, so no CV object
# is passed: its signature is (model, X, y, display_cm), and the extra
# positional CV argument used before shifted X and y into the wrong parameters.

# all features, original rows (training folds are still SMOTE-balanced inside the helper)
f1_all_features = get_cross_val_score(SVC(random_state=SEED), X, y)

# top n features
f1_top_n_features = get_cross_val_score(SVC(random_state=SEED), X_top_n_features, y)

# NOTE(review): the four datasets below were oversampled before splitting, so
# their test folds may contain synthetic or duplicated samples.

# smote, all features
f1_smote = get_cross_val_score(SVC(random_state=SEED), X_smote, y_smote)

# adasyn, all features
f1_adasyn = get_cross_val_score(SVC(random_state=SEED), X_adasyn, y_adasyn)

# random, all features
f1_random = get_cross_val_score(SVC(random_state=SEED), X_random, y_random)

# knnor, all features
f1_knnor = get_cross_val_score(SVC(random_state=SEED), X_knnor, y_knnor)

print(f"f1 all features, unbalanced: {f1_all_features}")
print(f"f1 top n features: {f1_top_n_features}")
print(f"f1 smote: {f1_smote}")
print(f"f1 adasyn: {f1_adasyn}")
print(f"f1 random: {f1_random}")
print(f"f1 knnor: {f1_knnor}")

In [None]:

model = SVC(random_state=SEED)
hyperparams = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "C": [0.1, 1, 10, 100, 1000],
    "degree": [1, 2, 3, 4, 5, 6],
    "gamma" : ["scale", "auto"],
    "class_weight": ["balanced", None],
    "break_ties": [True, False],
}

features = ['age as of June 1', "bimanual score: washer", "Delta_PV", "MT_HR"]
# Tune on the selected feature subset. Previously `features` was only used to
# build the CV splits (which depend on rows, not columns) while the search
# itself ran on every column of X.
X_tune = X[features]
model_tuner = Tuner(model, hyperparams, cv=get_cross_validation(X_tune, y))
model_tuner.tune(X_tune, y)

best_params = model_tuner.get_best_params()
best_estimator = model_tuner.get_best_estimator()
# Stored under its own name so the feature-selection `results` table, which a
# later cell reads via results["features"], is not overwritten.
tuning_results = model_tuner.get_results()

print("best after model tuning")
print(model_tuner.get_best_estimator(), model_tuner.get_best_score(), model_tuner.get_best_params())

In [None]:
# feature_set = ['age as of June 1', 'height', 'weight', 'previous concussions?', '# of concussions', 'bimanual score: washer', 'Bimanual Score: Button', 'RT_V', 'Delta_MT', 'CMT: V', 'CMT: HR', 'cvRT_V', 'cvRT_HR', 'stdRT_V', 'stdRT_HR', 'Corrective_V', 'Corrective_HR', 'PeakV_HR', 'AE_HR', 'Delta_AE', 'Delta: VE', 'AbsOnAxis_V', 'Delta_OnAxis', 'AbsOffAxis_V']
feature_set = ['age as of June 1', "bimanual score: washer", "Delta_PV", "MT_HR"]
X2 = X[feature_set]

# Score the tuned SVC on the reduced feature set. get_cross_val_score takes
# (model, X, y) and builds its own CV, so no CV object is passed.
f1_all_features = get_cross_val_score(
    SVC(kernel="poly", random_state=SEED, gamma="scale", C=100, class_weight="balanced", break_ties=False),
    X2,
    y,
)
f1_all_features

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Score one tuned random forest on every feature subset listed in the
# "features" column of the feature-selection `results` table.
f1s = []
for fs in results["features"]:
    # Strip stray single quotes from the stored feature names.
    fs = list(map(lambda x: x.replace("'", ""), list(fs)))
    X2 = X[fs]
    # get_cross_val_score takes (model, X, y) and builds its own CV. The forest
    # is seeded so the sweep is reproducible across runs.
    f_score = get_cross_val_score(
        RandomForestClassifier(class_weight='balanced', max_depth=11, max_features=None,
                               min_samples_leaf=4, min_samples_split=5,
                               n_estimators=10, random_state=SEED),
        X2, y)
    f1s.append(f_score)
    print(fs)


f1s