In [None]:
import dask.array as da
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

from dask_ml.model_selection import train_test_split
from dask_ml import preprocessing
import dask_ml.metrics as metrics

from dask_ml.linear_model import LogisticRegression
from dask_ml.xgboost import XGBClassifier

import pandas as pd
import math
import pickle
import paths

In [None]:
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

In [None]:
# Start a local dask cluster with 8 workers and connect a client to it.
# NOTE(review): every execution of this cell constructs another LocalCluster;
# restart the kernel (or close the old client/cluster) before re-running it.
cluster = LocalCluster(n_workers=8)
client = Client(cluster)

In [None]:
# Load all "*.part" CSV shards of the engineered training data into a single
# dask DataFrame.
sg_rd = dd.read_csv(
    paths.processed + "/EngineeredTrainData/*.part"
)

In [None]:
# Preview the first rows of the frame exactly as it was read from disk.
sg_rd.head()

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index written out when the
# CSV shards were saved — TODO confirm). The membership check makes this cell
# idempotent: re-running it after the column is already gone is a no-op instead
# of raising, which matters because the later cells select columns by position.
if "Unnamed: 0" in sg_rd.columns:
    sg_rd = sg_rd.drop("Unnamed: 0", axis=1)

In [None]:
# Preview the frame again to check the result of dropping "Unnamed: 0".
sg_rd.head()

In [None]:
# All candidate training columns, addressed by position in the frame:
# columns 4..39 and columns 40 onwards.
# NOTE(review): these positions assume "Unnamed: 0" has already been dropped.
XR_cols = sg_rd.columns[4:40]
XD_cols = sg_rd.columns[40:]

# Feature columns actually used for training.
P_cols = [
    "Ast_D",
    "Blk_D",
    "FGP_D",
    "OppAst_D",
    "OppDR_D",
    "OppFGP_D",
    "OppRank_D",
    "OppScore_R",
    "Rank_D",
    "Score_R",
    "TO_R",
    "Wins_D",
    "RankedWins_D",
]

# Outcome column: the fourth column of the frame (position 3).
Y_cols = sg_rd.columns[3]

# Feature frame and outcome series used by the modelling cells.
X_df, Y_df = sg_rd[P_cols], sg_rd[Y_cols]


In [None]:
# Preview the selected feature columns.
X_df.head()

In [None]:
# Preview the outcome column.
Y_df.head()

In [None]:
# Convert the feature frame and outcome series to dask arrays. lengths=True
# asks dask to compute the partition lengths so the arrays have known chunk
# sizes.
X = X_df.to_dask_array(lengths=True)
Y = Y_df.to_dask_array(lengths=True)

# Standardise the features.
# NOTE(review): the scaler is fitted on the full dataset *before* the split, so
# the hold-out rows influence the scaling statistics; the fitted scaler is also
# not saved by any cell visible here — confirm both are intended.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)

# 80/20 train/hold-out split with a fixed seed for reproducibility.
X_cv_train, X_cv_test, Y_cv_train, Y_cv_test = train_test_split(X, Y, train_size=0.8, random_state=0)

# Persisting data for efficiency when doing analysis.
# persist() does not work in place: it returns a new collection backed by the
# in-memory results. The names must be rebound, otherwise the persisted data is
# discarded and every later evaluation recomputes the hold-out arrays.
X_cv_test = X_cv_test.persist()
Y_cv_test = Y_cv_test.persist()

In [None]:
def getStats(model, X_test, Y_test):
    """Print log-loss and accuracy of ``model`` on a hold-out set.

    The log-loss value returned by ``metrics.log_loss`` is treated as lazy and
    has ``.compute()`` called on it before printing; the accuracy value is
    printed exactly as ``metrics.accuracy_score`` returns it.

    Args:
        model: fitted estimator exposing ``predict_proba`` and ``predict``.
        X_test: hold-out features passed to both prediction methods.
        Y_test: hold-out labels compared against the predictions.

    Returns:
        None. The two metrics are written to stdout.
    """
    class_probabilities = model.predict_proba(X_test)
    lazy_log_loss = metrics.log_loss(Y_test, class_probabilities)
    print("Log-Loss: ", lazy_log_loss.compute())

    predicted_labels = model.predict(X_test)
    accuracy = metrics.accuracy_score(Y_test, predicted_labels)
    print("Accuracy: ", accuracy)
    
def getStatsSK(model, X_test, Y_test):
    """Print log-loss and accuracy of ``model`` on a hold-out set.

    Unlike ``getStats``, the log-loss value is printed exactly as
    ``metrics.log_loss`` returns it, with no ``.compute()`` call — presumably
    for estimators whose predictions are already in-memory (TODO confirm).

    Args:
        model: fitted estimator exposing ``predict_proba`` and ``predict``.
        X_test: hold-out features passed to both prediction methods.
        Y_test: hold-out labels compared against the predictions.

    Returns:
        None. The two metrics are written to stdout.
    """
    class_probabilities = model.predict_proba(X_test)
    log_loss_value = metrics.log_loss(Y_test, class_probabilities)
    print("Log-Loss: ", log_loss_value)

    predicted_labels = model.predict(X_test)
    accuracy = metrics.accuracy_score(Y_test, predicted_labels)
    print("Accuracy: ", accuracy)


In [None]:
# Hold-out evaluation of logistic regression: fit on the training split only,
# then report log-loss and accuracy on the held-out split.
lr = LogisticRegression()
lr.fit(X_cv_train, Y_cv_train)
getStats(model=lr, X_test=X_cv_test, Y_test=Y_cv_test)


In [None]:
# Final logistic regression: refit on the full (X, Y) arrays rather than only
# the training split, then serialise the fitted model to disk.
# NOTE(review): X has been standardised by `scaler`, which no visible cell
# saves; anyone loading this model must apply the same scaling — confirm.
lr_final = LogisticRegression()
lr_final.fit(X, Y)

# Write through a context manager so the file handle is flushed and closed even
# if pickling fails; the original passed an open() result that was never closed.
with open(paths.models + "/lr_final.sav", "wb") as model_file:
    pickle.dump(lr_final, model_file)

In [None]:
# Hold-out evaluation of the XGBoost classifier (max tree depth 4): fit on the
# training split only, then report log-loss and accuracy on the held-out split.
xgbc = XGBClassifier(max_depth=4)
xgbc.fit(X_cv_train, Y_cv_train)

getStats(model=xgbc, X_test=X_cv_test, Y_test=Y_cv_test)


In [None]:
# Final XGBoost classifier: same max_depth=4 setting as the hold-out run, but
# fitted on the full (X, Y) arrays rather than only the training split.
xgbc_final = XGBClassifier(max_depth=4)
xgbc_final.fit(X, Y)

In [None]:
# Serialise the final XGBoost model. The context manager guarantees the file
# handle is flushed and closed even if pickling fails; the original passed an
# open() result that was never closed.
with open(paths.models + "/xgb_final.sav", "wb") as model_file:
    pickle.dump(xgbc_final, model_file)