# Modeling

## From the end of EDA:

### Conclusion

So the moral of the story currently is that we have at the minimum a couple of heuristics for choosing players:

- Choose value players, ie players with moderate price tags but good matchups
- Choose players based on Def they play
- Avoid expensive players, since statistically they are unable to produce high scores consistently.

With these guidelines, week 1 will be a total gamble, since we won't have any real data besides salaries. Week 2 will be the first time we can use any defensive data to help with our decision making.

## Goal for this notebook:

Based on the conclusions from the EDA, we want to see if we can find a model that confirms these ideas across seasons, and also has a high enough (cross-validated) accuracy to warrant trying to use this with real money.

### Note:
Sci-kit Learn says, according to https://scikit-learn.org/stable/tutorial/machine_learning_map/, that we should be using the linear SVC classifier, but for the sake of this exercise, we are going to try many different models to see what produces the best result.

## Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

## Helper Functions

In [2]:
def get_weekly_data(week, year):
    file_path = f"./csv's/{year}/year-{year}-week-{week}-DK-player_data.csv"
    df = pd.read_csv(file_path)
    return df

def get_season_data(year):
    df = get_weekly_data(1,year)
    for week in range(2,17):
        try:
            df = df.append(get_weekly_data(week, year), ignore_index=True)
        except:
            print("No data for week: "+str(week))
    df = df.drop(['Unnamed: 0', 'Year'], axis=1)
    return df

def make_confusion_matrix(classifier, X_test, y_test, y_pred):
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    acc_score = accuracy_score(y_test, y_pred)
    return cm, acc_score

def scale_features(sc, X_train, X_test):
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test

def fix_df_cols(df):
#     df['points/1k'] = np.array(df['DK points']) / np.array(df['DK salary']) * 1000
    df['scoring_potential'] = np.where(df['DK points'] >= 20, 1, 0)
    df['scoring_potential'] = np.where(df['DK points'] >= 30, 2, 0)
    return df

def handle_nulls(df):
    # checking metrics per 1k of salary is fairly common,
    # so we are going to make these 0s/NaNs into huge values to minimize
    # the impact they have on the calculated metrics
    df['DK salary'] = df['DK salary'].fillna(0)
    df['DK salary'].replace(0, 15000, inplace=True)
    return df

def train_test_split_dicts(x_dict, y_dict, idx):
    X = x_dict[idx]
    y = y_dict[idx+1]
    X = X.iloc[:,:-1]
    combined = pd.merge(X, y, how="right", on=["Name"])
    combined = combined.dropna()
    x_filt = combined['Week_x']==idx
    y_filt = combined['Week_y']==idx+1, ['Name', 'scoring_potential']
    X_train, X_test, y_train, y_test = train_test_split(combined.loc[x_filt],
                                                        combined.loc[y_filt], 
                                                        test_size=0.5,
                                                        random_state=0)
    return X_train, X_test, y_train, y_test

def undummify(df, prefix_sep="_"):
    # borrowed from https://newbedev.com/reverse-a-get-dummies-encoding-in-pandas
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            undummified = (
                df.filter(like=col)
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

## Import Data

In [3]:
season = 2019
dataset = get_season_data(season)
# dataset

In [4]:
df = handle_nulls(dataset)
df = fix_df_cols(df)
# df

In [5]:
# Remove points, because those won't be available
df_no_points = df.drop(labels='DK points', axis=1)

In [6]:
# create dictionaries to match previous week 
# with "next" week's potential outcomes
x_df_dict={}
y_df_dict={}
for i in range(1,17):
    filt = df['Week'] == i
    x_df_dict[i] = df_no_points.loc[filt]
    y_df_dict[i] = df.loc[filt]

In [7]:
# Establish dependent and independent variables
# These will be non-scaled data for boost models
X_trains_list = []
y_trains_list = []
X_tests_list = []
y_tests_list = []
for num in range(1,17):
    try:
        X_train, X_test, y_train, y_test = train_test_split_dicts(x_df_dict, y_df_dict,num)
        X_train = X_train.drop(labels=['Week_y', 'Pos_y', 'Team_y', 
                               'h/a_y', 'Oppt_y',  'DK points',  
                               'DK salary_y', 'scoring_potential'], 
                               axis=1)
        X_test = X_test.drop(labels=['Week_y', 'Pos_y', 'Team_y', 
                                       'h/a_y', 'Oppt_y',  'DK points',  
                                       'DK salary_y', 'scoring_potential'], 
                               axis=1)
        X_trains_list.append(X_train)
        X_tests_list.append(X_test)
        y_trains_list.append(y_train)
        y_tests_list.append(y_test)
    except KeyError:
        pass

In [8]:
X_trains_list

[     Week_x                  Name Pos_x Team_x h/a_x Oppt_x  DK salary_x
 404     1.0          Jacksonville   Def    jac     h    kan       2300.0
 240     1.0      Sims Jr., Steven    WR    was     a    phi       3000.0
 145     1.0           Logan, T.J.    RB    tam     h    sfo       3000.0
 258     1.0  Arcega-Whiteside, JJ    WR    phi     h    was       3400.0
 391     1.0            Hill, Josh    TE    nor     h    hou       2700.0
 ..      ...                   ...   ...    ...   ...    ...          ...
 340     1.0         Jarwin, Blake    TE    dal     h    nyg       2700.0
 201     1.0         Hollins, Mack    WR    phi     h    was       3000.0
 124     1.0         Bellore, Nick    RB    sea     h    cin       3000.0
 52      1.0          Gurley, Todd    RB    lar     a    car       7900.0
 180     1.0         Cooper, Amari    WR    dal     h    nyg       7000.0
 
 [204 rows x 7 columns],
      Week_x              Name Pos_x Team_x h/a_x Oppt_x  DK salary_x
 248     2.0   

In [9]:
X_tests_list

[     Week_x                    Name Pos_x Team_x h/a_x Oppt_x  DK salary_x
 252     1.0          Willis, Damion    WR    cin     a    sea       3000.0
 314     1.0          Waller, Darren    TE    oak     h    den       3000.0
 316     1.0            Engram, Evan    TE    nyg     a    dal       4800.0
 370     1.0         Lewis, Marcedes    TE    gnb     a    chi       2500.0
 109     1.0        Ogunbowale, Dare    RB    tam     h    sfo       3000.0
 ..      ...                     ...   ...    ...   ...    ...          ...
 30      1.0      Trubisky, Mitchell    QB    chi     h    gnb       5700.0
 2       1.0           Prescott, Dak    QB    dal     h    nyg       5900.0
 284     1.0             Wims, Javon    WR    chi     h    gnb       3000.0
 163     1.0         Kirk, Christian    WR    ari     h    det       4700.0
 228     1.0  Patterson, Cordarrelle    WR    chi     h    gnb       3600.0
 
 [204 rows x 7 columns],
      Week_x                  Name Pos_x Team_x h/a_x Oppt_x 

In [20]:
# Encode data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,2,3,4])], remainder='passthrough')
for num in range(0, len(X_trains_list)-1):
    X_trains_list[num] = np.array(ct.fit_transform(X_trains_list[num]))
for num in range(0, len(X_trains_list)-1):
    X_tests_list[num] = np.array(ct.fit_transform(X_tests_list[num]))
for num in range(0, len(X_trains_list)-1):
    y_trains_list[num] = np.array(ct.fit_transform(y_trains_list[num]))
for num in range(0, len(X_trains_list)-1):
    y_tests_list[num] = np.array(ct.fit_transform(y_tests_list[num]))

In [21]:
X_trains_list

[array([[0., 1., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]),
 array([[1., 0., 0., ..., 0., 1., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 1., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]),
 array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]),
 array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 1., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]),
 arr

In [19]:
# Scaled Data
scaled_X_trains = []
scaled_X_tests = []
sc = StandardScaler()
for num in range(0,len(X_trains_list)-1):
    print(num)
    scaled_X_train, scaled_X_test = scale_features(sc, X_trains_list[num], X_tests_list[num])
    scaled_X_trains.append(scaled_X_train)
    scaled_X_tests.append(scaled_X_test)

0


ValueError: X has 338 features, but this StandardScaler is expecting 333 features as input.

## Non-Boost Methods (using scaled data)

In [None]:
# Logistic Regression
def make_log_reg(scaled_X_train, y_train, scaled_X_test, y_test):
    classifier = LogisticRegression(random_state=0)
    classifier.fit(scaled_X_train, y_train)
    y_pred = classifier.predict(scaled_X_test)
    cm, acc_score = make_confusion_matrix(classifier, scaled_X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# K-NN 
def make_knn(scaled_X_train, y_train, scaled_X_test, y_test):
    classifier = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2)
    classifier.fit(scaled_X_train, y_train)
    y_pred = classifier.predict(scaled_X_test)
    cm, acc_score = make_confusion_matrix(classifier, scaled_X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# SVM 
def make_svm(scaled_X_train, y_train, scaled_X_test, y_test):
    classifier = SVC(kernel = 'linear', random_state = 0)
    classifier.fit(scaled_X_train, y_train)
    y_pred = classifier.predict(scaled_X_test)
    cm, acc_score = make_confusion_matrix(classifier, scaled_X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# Kernel SVM
def make_k_svm(scaled_X_train, y_train, scaled_X_test, y_test):
    classifier = SVC(kernel = 'rbf', random_state = 0)
    classifier.fit(scaled_X_train, y_train)
    y_pred = classifier.predict(scaled_X_test)
    cm, acc_score = make_confusion_matrix(classifier, scaled_X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# Naive Bayes
def make_nb(scaled_X_train, y_train, scaled_X_test, y_test):
    classifier = GaussianNB()
    classifier.fit(scaled_X_train, y_train)
    y_pred = classifier.predict(scaled_X_test)
    cm, acc_score = make_confusion_matrix(classifier, scaled_X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# Decision Tree
def make_tree(scaled_X_train, y_train, scaled_X_test, y_test):
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(scaled_X_train, y_train)
    y_pred = classifier.predict(scaled_X_test)
    cm, acc_score = make_confusion_matrix(classifier, scaled_X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# Random Forest
def make_forest(scaled_X_train, y_train, scaled_X_test, y_test):
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
    classifier.fit(scaled_X_train, y_train)
    y_pred = classifier.predict(scaled_X_test)
    cm, acc_score = make_confusion_matrix(classifier, scaled_X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# Summary

## Boost Methods (using non-scaled data)

In [None]:
# AdaBoost
def make_adaboost(X_train, y_train, X_test, y_test):
    classifier = AdaBoostClassifier()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    cm, acc_score = make_confusion_matrix(classifier, X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# GradientBoost
def make_gradientboost(X_train, y_train, X_test, y_test):
    classifier = GradientBoostingClassifier()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    cm, acc_score = make_confusion_matrix(classifier, X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# XGBoost
def make_xgboost(X_train, y_train, X_test, y_test):
    classifier = XGBClassifier()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    cm, acc_score = make_confusion_matrix(classifier, X_test, y_test, y_pred)
    print("Confusion Matrix: \n")
    print(cm)
    print("\n")
    print("Accuracy: \n")
    print(acc_score)

In [None]:
# Summary

## Results