# **Advanced Data Science for Innovation - Assignment 1**


## NBA Career Prediction: Predict 5-Year Longevity for NBA Rookies
**Student Name:** Sean Williams

**Team Name:** Group 1
* Nuwan Munasinghe
* Wenying Wu
* Nathan Fragar
* Sean Williams
* Carol Myhill

## Environment Setup

In [9]:
import pandas as pd
import numpy as np
from importlib.machinery import SourceFileLoader
# Load the project helper modules straight from their source files.
# Each module is given its own name: load_module() reuses an existing
# sys.modules entry of the same name, so loading prepare.py as 'sets' left
# `dataprep` and `sets` pointing at one merged module object.
dataprep = SourceFileLoader('dataprep', '../src/data/prepare.py').load_module()
sets = SourceFileLoader('sets', '../src/data/sets.py').load_module()
base = SourceFileLoader('base', '../src/models/null.py').load_module()
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV,StratifiedKFold
import joblib as job
from sklearn.preprocessing import MinMaxScaler

In [2]:
def score_base(y_train_preds, y_train, y_val_preds, y_val, f1_average='weighted'):
    """Display train and validation metrics for a set of baseline predictions.

    Accuracy, precision, recall and F1 are computed for the training pair
    (``y_train`` vs ``y_train_preds``) and for the validation pair
    (``y_val`` vs ``y_val_preds``), then shown as a one-row DataFrame
    labelled ``'Base'``. Precision and recall are called with their default
    arguments; only F1 receives ``f1_average``. Nothing is returned.
    """
    def _metrics(actual, predicted):
        # Same order as the column layout below: accuracy, precision, recall, F1.
        return [
            accuracy_score(actual, predicted),
            precision_score(actual, predicted),
            recall_score(actual, predicted),
            f1_score(actual, predicted, average=f1_average),
        ]

    score_columns = [
        'model',
        't_accuracy', 't_precision', 't_recall', 't_F1',
        'v_accuracy', 'v_precision', 'v_recall', 'v_F1',
    ]
    score_row = ['Base', *_metrics(y_train, y_train_preds), *_metrics(y_val, y_val_preds)]
    display(pd.DataFrame([score_row], columns=score_columns))
    
def fit_score_models(models, X_t, y_t, X_v, y_v, dump_model="NO"):
    """Fit each model, display a train/validation score table, return the best.

    Args:
        models: Mapping of display name -> unfitted classifier. Each
            classifier must provide ``fit``, ``predict`` and ``predict_proba``.
        X_t, y_t: Training features and labels.
        X_v, y_v: Validation features and labels.
        dump_model: When equal to ``"YES"``, every fitted model is saved with
            joblib to ``../models/williams_sean-week2_<name>.joblib``.

    Returns:
        The fitted classifier with the highest validation accuracy (the first
        such model wins ties), or ``None`` when ``models`` is empty.
    """
    score_columns = [
        'model',
        't_accuracy', 't_precision', 't_recall', 't_F1', 't_auc',
        'v_accuracy', 'v_precision', 'v_recall', 'v_F1', 'v_auc',
    ]
    model_scores = []
    best_acc = None
    best_clf = None
    for name, clf in models.items():
        clf.fit(X_t, y_t)
        # Persist only after fitting; dumping first would save an untrained model.
        if dump_model == "YES":
            job.dump(clf, "../models/williams_sean-week2_" + name + ".joblib", compress=3)

        # Training-set metrics (AUC uses the probability of class 1).
        t_preds = clf.predict(X_t)
        t_acc = accuracy_score(y_t, t_preds)
        t_prec = precision_score(y_t, t_preds)
        t_rec = recall_score(y_t, t_preds)
        t_f1 = f1_score(y_t, t_preds)
        t_auc = roc_auc_score(y_t, clf.predict_proba(X_t)[:, 1])

        # Validation-set metrics.
        v_preds = clf.predict(X_v)
        v_acc = accuracy_score(y_v, v_preds)
        v_prec = precision_score(y_v, v_preds)
        v_rec = recall_score(y_v, v_preds)
        v_f1 = f1_score(y_v, v_preds)
        v_auc = roc_auc_score(y_v, clf.predict_proba(X_v)[:, 1])

        # Rank models on held-out accuracy: training accuracy favours whichever
        # model memorises the training data most closely.
        if best_acc is None or v_acc > best_acc:
            best_acc = v_acc
            best_clf = clf

        model_scores.append([name, t_acc, t_prec, t_rec, t_f1, t_auc, v_acc, v_prec, v_rec, v_f1, v_auc])

    df_model_scores = pd.DataFrame(model_scores, columns=score_columns)
    display(df_model_scores)
    return best_clf

# 3. Modelling

**<u>Load saved data sets</u>**

In [3]:
# Load the saved train / validation / test features and targets from ../data/processed/.
X_train, y_train, X_val, y_val, X_test, y_test = sets.load_sets(path='../data/processed/')

**<u>Assess Baseline of Train and Validation datasets</u>**

In [4]:
# Baseline: score the project's NullModel predictions against the train and validation labels.
base_model = base.NullModel(target_type="classification")
y_base_train_preds = base_model.fit_predict(y_train)
# NOTE(review): fit_predict is also called on y_val, so the validation baseline appears to be
# refit on the validation labels rather than reusing the training fit — confirm this is intended.
y_base_val_preds = base_model.fit_predict(y_val)
score_base(y_base_train_preds, y_train, y_base_val_preds, y_val)

Unnamed: 0,model,t_accuracy,t_precision,t_recall,t_F1,v_accuracy,v_precision,v_recall,v_F1
0,Base,0.833594,0.833594,1.0,0.757942,0.83375,0.83375,1.0,0.758161


---
**<u>Train various models with default parameters to determine which model is the most performant</u>**

In [5]:
# Candidate classifiers, keyed by the name shown in the score table.
models_to_fit = {
    "Logistic Regression": LogisticRegression(random_state=8, solver='liblinear'),
    "KNN Euclidian": KNeighborsClassifier(metric='euclidean'),
    "KNN Manhattan": KNeighborsClassifier(metric='manhattan'),
    "XGBoost": xgb.XGBClassifier(random_state=8, use_label_encoder=False),
}
# Fit every candidate, display its metrics, and keep the model that fit_score_models returns.
clf1 = fit_score_models(models_to_fit, X_train, y_train, X_val, y_val)



Unnamed: 0,model,t_accuracy,t_precision,t_recall,t_F1,t_auc,v_accuracy,v_precision,v_recall,v_F1,v_auc
0,Logistic Regression,0.833906,0.835638,0.996813,0.909138,0.706177,0.833125,0.834903,0.997001,0.90878,0.699612
1,KNN Euclidian,0.848594,0.861185,0.975633,0.914843,0.850319,0.82,0.843627,0.962519,0.89916,0.596633
2,KNN Manhattan,0.852344,0.864134,0.976382,0.916835,0.856547,0.8175,0.844577,0.957271,0.8974,0.588099
3,XGBoost,0.978125,0.974429,1.0,0.987049,0.999054,0.815625,0.836682,0.967766,0.897463,0.631926


*Observations:*
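* XGBoost seems to be the best performer on the training set, but its validation scores are noticeably lower, which indicates overfitting. Next step: tune hyperparameters to reduce overfitting.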

---
**<u>Perform a grid search on XGBoost to determine which hyperparameters result in the best performance</u>**

In [6]:
# Grid search over the XGBoost learning rate and number of estimators.
# random_state=8 matches the seed given to the other XGBoost models in this notebook,
# so the search evaluates the same configuration that is trained afterwards.
clf1 = xgb.XGBClassifier(random_state=8, use_label_encoder=False)
n_estimators = [100, 200, 300, 400, 500]
learning_rate = [0.0001, 0.001, 0.01, 0.1]
param_grid = {"learning_rate": learning_rate, "n_estimators": n_estimators}
# Shuffled, stratified 5-fold split with a fixed seed so the folds are repeatable.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)
grid_search = GridSearchCV(clf1, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)
# Print best score and parameters
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.835000 using {'learning_rate': 0.01, 'n_estimators': 300}


---
**<u>Train XGBoost with the best hyperparameters and print performance metrics</u>**

In [7]:
# Refit XGBoost with the learning rate and estimator count reported by the grid search;
# dump_model="YES" asks fit_score_models to write the model under ../models/.
models_to_fit = {
    "XGBoost": xgb.XGBClassifier(
        random_state=8,
        use_label_encoder=False,
        learning_rate=0.01,
        n_estimators=300,
    ),
}
clf2 = fit_score_models(models_to_fit, X_train, y_train, X_val, y_val, dump_model="YES")



Unnamed: 0,model,t_accuracy,t_precision,t_recall,t_F1,t_auc,v_accuracy,v_precision,v_recall,v_F1,v_auc
0,XGBoost,0.855313,0.852213,0.999813,0.920131,0.872544,0.8275,0.835234,0.988006,0.90522,0.676218


# Kaggle Submission File

In [10]:
# Build the Kaggle submission: prepare the raw test set the same way as the
# modelling data, score it with clf2, and write Id / probability pairs to CSV.
df_test = pd.read_csv('../data/raw/test.csv')
df_cleaned = df_test.copy()
df_cleaned = dataprep.remove_invalid_rows(df_cleaned, ['GP','FT%'])
# Work on a copy so df_cleaned keeps the 'Id' column needed for the submission.
X_test = df_cleaned.copy()
X_test = dataprep.drop_features(X_test, ['Id'])
# NOTE(review): a fresh MinMaxScaler is passed here, so the test features appear to be
# scaled independently of the training data — confirm against scale_features whether
# the scaler fitted on the training set should be reused instead.
X_test = dataprep.scale_features(X_test, MinMaxScaler(), None)
X_test.info()
# Column 1 of predict_proba is submitted as the TARGET_5Yrs score.
test_probs = clf2.predict_proba(X_test)[:, 1]
df_kaggle = pd.DataFrame({'Id': df_cleaned['Id'], 'TARGET_5Yrs': test_probs})
# Print the shape explicitly: a bare `df_kaggle.shape` expression in the middle
# of a cell is evaluated and discarded without being shown.
print(df_kaggle.shape)
df_kaggle.to_csv("../data/external/williams_sean-week2_xgboost-v1.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3799 entries, 0 to 3798
Data columns (total 19 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   GP       3799 non-null   float64
 1   MIN      3799 non-null   float64
 2   PTS      3799 non-null   float64
 3   FGM      3799 non-null   float64
 4   FGA      3799 non-null   float64
 5   FG%      3799 non-null   float64
 6   3P Made  3799 non-null   float64
 7   3PA      3799 non-null   float64
 8   3P%      3799 non-null   float64
 9   FTM      3799 non-null   float64
 10  FTA      3799 non-null   float64
 11  FT%      3799 non-null   float64
 12  OREB     3799 non-null   float64
 13  DREB     3799 non-null   float64
 14  REB      3799 non-null   float64
 15  AST      3799 non-null   float64
 16  STL      3799 non-null   float64
 17  BLK      3799 non-null   float64
 18  TOV      3799 non-null   float64
dtypes: float64(19)
memory usage: 564.0 KB
