In [24]:
from path import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import os
import pickle
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [25]:
def get_training_df(year):
    
    training_path = Path(f"training_dataframes/clean_{year}.csv")
    train_df = pd.read_csv(training_path, index_col = "Date", infer_datetime_format = True)
    
    return train_df

In [26]:
def get_xy(df):
    
    #getting team for future verification purposes
    team_df = df[["Tm","Opp"]]
    
    #getting X
    X = df.copy()
    X.drop(columns = ["W/L",], inplace = True)
    
    #Getting y
    y = df["W/L"]
    
    #Cleanning X
    string_col = [x for x in X.columns if X[x].dtype == "O" or X[x].dtype == "str"]
    X.drop(columns = string_col, inplace = True)
    
    return {"X":X, "y":y,"Team":team_df}  

In [27]:
def traintestsplit(X,y,team_df, split_at = 0.5):
   
    mid = int(X.shape[0]*split_at)
    
    #Train
    X_train = X[ : mid]
    y_train = y[ : mid]
    team_df_train = team_df[ : mid]
    
    #test
    X_test = X[mid : ]
    y_test = y[mid : ] 
    team_df_test = team_df[mid : ]
    
    return {"X_train":X_train, "y_train":y_train, "Team_train":team_df_train,
           "X_test":X_test, "y_test":y_test, "Team_test":team_df_test}    

In [28]:
def get_pca(X):
    pca = PCA(n_components= 20)
    #found 20 as the most optimal number of components

    X_reduced = pca.fit_transform(X)
    return [X_reduced,pca.explained_variance_ratio_.sum()]

In [29]:
#pca.explained_variance_ratio_.sum()

In [123]:
def random_forest(X_train,y_train,X_test):
    
    rand_for_clf = RandomForestClassifier(n_estimators = 1000, min_samples_split = 4, 
                                      max_leaf_nodes = 10, max_depth = 25)
    
    rand_for_clf.fit(X_train,y_train)
    if type(X_train) == np.ndarray: feat = np.array(range(0,20))
    else: feat = X_train.columns
    
    return {"model":rand_for_clf,
            "y_pred":rand_for_clf.predict(X_test),
            "feature_importance":
            pd.Series(rand_for_clf.feature_importances_,index=feat).sort_values(ascending=False)
           }    

In [102]:
def svm(X_train,y_train,X_test):
    svm = Pipeline((
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel = "poly", degree = 4, coef0=1.5, C=30, gamma = "auto"))#Best without PCA
    #("svm_clf", SVC(kernel = "poly", degree = 4, coef0=1.6, C=13, gamma = "auto"))#Best with PCA
    ))
    svm.fit(X_train,y_train)
    
    return {"model":svm,
            "y_pred":svm.predict(X_test)}

In [32]:
def get_raw_col(columns):
    
    raw_col = []
    for x in columns:
        
        x = x.split("_")
        if len(x)>1:
            if len(x)>2: raw_col.append("_".join(x[1:]))
            else: raw_col.append(x[1])
                        
        else: raw_col.append(*x)
        
    return raw_col

In [33]:
def process_feat_imp(feature_imp):

    imp_features = set(get_raw_col(feature_imp.index.values))

    relevance_raw_feat = []
    for feat in imp_features:

        if feat == "Streak": stats = ["Streak"]
        else: 

            stats = [x for x in feature_imp.index if (feat == x[3:] or 
                                                      x == "team_"+feat or 
                                                      x == "opponet_"+feat or
                                                      x == "Home_"+feat   )]


        batter_stats = [x for x in stats 
                        if (x.startswith("01_") or x.startswith("02_") or x.startswith("03_") or x.startswith("04_") or 
                            x.startswith("05_") or x.startswith("06_") or x.startswith("07_") or x.startswith("08_") or
                            x.startswith("09_") or x.startswith("10_") or x.startswith("11_") or x.startswith("12_") or
                            x.startswith("13_") or x.startswith("14_") or x.startswith("15_") or x.startswith("16_") or
                            x.startswith("17_") or x.startswith("18_"))]

        pitcher_stats = [x for x in stats if x.startswith("team_pitcher_") or x.startswith("opponet_pitcher_") ]

        total_relevance = 0
        if len(batter_stats)>0:
            kind = "Batter"
            for stat in batter_stats: total_relevance += feature_imp[stat]
            relevance_raw_feat.append([feat,total_relevance, kind])   

        total_relevance = 0
        if len(pitcher_stats)>0: 
            kind = "Pitcher"
            for stat in pitcher_stats: total_relevance += feature_imp[stat]
            relevance_raw_feat.append([feat,total_relevance, kind]) 

        if len(batter_stats)==0 and len(pitcher_stats)==0:
            kind = "Team"

            relevance_raw_feat.append([stats[0],feature_imp[stats[0]], kind])

    relevance_raw_feat = sorted(relevance_raw_feat, key=lambda x: x[1], reverse=True)
    relevance_raw_feat_df = pd.DataFrame(relevance_raw_feat)
    relevance_raw_feat_df.columns = ["Statistic","Importance","Kind of stat"]
    
    return relevance_raw_feat_df

In [34]:
def store_model(year=2017, model=None, accuracy=0.0, relevance_raw_feat_df=None, 
               predictions_df=None,model_type="randomforest"):
    
    os.makedirs(f"Models/{year}/{model_type}/acc_{accuracy:.5f}")
    
    model_file = f"Models/{year}/{model_type}/acc_{accuracy:.5f}/{model_type}_{accuracy:.5f}.sav"
    pred_file = Path(f"Models/{year}/{model_type}/acc_{accuracy:.5f}/prediction_{accuracy:.5f}.csv")

    pickle.dump(model, open(model_file, 'wb'))
    predictions_df.to_csv(pred_file)
    
    if model_type == "randomforest":
        feat_file = Path(f"Models/{year}/{model_type}/acc_{accuracy:.5f}/randomforest_{accuracy:.5f}_features.csv")
        relevance_raw_feat_df.to_csv(feat_file)


In [125]:
def run_model(year=2019, model_type = "randomforest", save_model = False, pca = False):
    
    data = get_training_df(year)
    xy_dict = get_xy(data)
    
    ############## EXPERIMENT ################
    #xy_dict["X"] = xy_dict["X"] ["Streak"].to_frame()
    #print(xy_dict["X"].columns)
    #THIS IS WHAT IS GIVIN AWAY THE RESULTS!!!!!
    #NOW LET'S DROP STREAK:
    xy_dict["X"].drop(columns=["Streak"], inplace = True)
    ##########################################
    
    if pca: 
        X = get_pca(xy_dict["X"])
        print(f"PCA total explained variance: {X[1]}")
        xy_dict["X"] = X[0]
        
    xy_split = traintestsplit( xy_dict["X"] , xy_dict["y"] , xy_dict["Team"], split_at = 0.5 )
          
    if model_type == "randomforest":
        model_dict = random_forest( xy_split["X_train"] , xy_split["y_train"] , xy_split["X_test"] )
        model, y_pred, feat_imp = model_dict["model"],model_dict["y_pred"],model_dict["feature_importance"]
        if not pca:
            relevance_raw_feat_df = process_feat_imp(feat_imp)
            print(relevance_raw_feat_df.head(25))
        
    elif model_type == "svm":
        model_dict = svm( xy_split["X_train"] , xy_split["y_train"] , xy_split["X_test"] )
        model, y_pred = model_dict["model"],model_dict["y_pred"]
        
    accuracy = metrics.accuracy_score(xy_split["y_test"], y_pred)
    print(f"accuracy: {accuracy}")
    
    y_test_df = xy_split["y_test"].to_frame()
    predictions_df = pd.concat([xy_split["Team_test"],y_test_df, pd.DataFrame({"predicted":y_pred},index = y_test_df.index)], axis=1,ignore_index=False)
   
    
    if save_model: 
       
        if model_type == "randomforest" and not pca:
            store_model(year, model , accuracy ,relevance_raw_feat_df, predictions_df ,model_type)
        else:
            store_model(year ,model ,  accuracy ,predictions_df = predictions_df,model_type = model_type )

In [128]:
for x in range(0,10): run_model(2018,"randomforest",True, False)  
 

          Statistic  Importance Kind of stat
0               WPA    0.018504       Batter
1               RAR    0.018236       Batter
2               WAR    0.017145       Batter
3               Off    0.015816       Batter
4              RE24    0.013776       Batter
5               REW    0.013625       Batter
6              +WPA    0.012896       Batter
7                K%    0.012803       Batter
8               Bat    0.011826       Batter
9            WPA/LI    0.011076       Batter
10                R    0.010250       Batter
11  Z-Contact% (pi)    0.009397       Batter
12           Clutch    0.009341       Batter
13       wCU/C (pi)    0.009286       Batter
14             BB/K    0.009225       Batter
15         vCS (pi)    0.009195       Batter
16              OBP    0.009129       Batter
17              wRC    0.009099       Batter
18           SwStr%    0.009059       Batter
19             wRAA    0.009038       Batter
20    O-Swing% (pi)    0.008954       Batter
21        

In [115]:
run_model(2018,"randomforest",True, False)  

        Statistic  Importance Kind of stat
0             WPA    0.020520       Batter
1             RAR    0.019239       Batter
2             Off    0.018527       Batter
3            +WPA    0.016225       Batter
4             WAR    0.015240       Batter
5            RE24    0.014994       Batter
6             REW    0.014147       Batter
7              K%    0.011900       Batter
8            wRAA    0.010886       Batter
9          WPA/LI    0.010853       Batter
10            Bat    0.010583       Batter
11             AB    0.010237       Batter
12            OBP    0.009744       Batter
13              R    0.009497       Batter
14          Balls    0.009288       Batter
15            wCB    0.009079       Batter
16         SwStr%    0.008793       Batter
17             PA    0.008708       Batter
18  O-Swing% (pi)    0.008703       Batter
19     wCU/C (pi)    0.008668       Batter
20       O-Swing%    0.008626       Batter
21            wRC    0.008602       Batter
22      F-S

In [37]:
test_data = get_training_df(2019)
xy = get_xy(test_data)
X = get_pca(xy["X"])
print(X[1])
loaded_model = pickle.load(open("Models/2018/svm/acc_0.99341/svm_0.99341.sav", 'rb'))

#without PCA:
#pred = loaded_model.predict(xy["X"])

#With PCA:
pred = loaded_model.predict(X[0])

accuracy = metrics.accuracy_score(xy["y"], pred)
print(accuracy)
#result = loaded_model.score(xy["X"], xy["y"])
#print(result)

0.9858285508833632


ValueError: operands could not be broadcast together with shapes (4856,20) (3768,) (4856,20) 

In [21]:
test_data

Unnamed: 0_level_0,W/L,Tm,Home_Away,Opp,Streak,01_Name,01_G,01_AB,01_PA,01_H,...,opponet_pitcher_wSL/C (pi),opponet_pitcher_wXX/C (pi),opponet_pitcher_O-Swing% (pi),opponet_pitcher_Z-Swing% (pi),opponet_pitcher_Swing% (pi),opponet_pitcher_O-Contact% (pi),opponet_pitcher_Z-Contact% (pi),opponet_pitcher_Contact% (pi),opponet_pitcher_Zone% (pi),opponet_pitcher_Pace (pi)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-03-20,L,OAK,0,SEA,-1,0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.0
2019-03-20,W,SEA,1,OAK,1,0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.0
2019-03-21,L,OAK,0,SEA,-2,0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.0
2019-03-21,W,SEA,1,OAK,2,0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.0
2019-03-28,L,ARI,1,LAD,-1,Adam Jones,145.0,580.0,613.0,163.0,...,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.0
2019-03-28,L,WSN,0,NYM,-1,0,0.0,0.0,0.0,0.0,...,2.25,0.00,0.367,0.661,0.518,0.524,0.804,0.708,0.513,21.9
2019-03-28,W,CIN,0,PIT,1,Jesse Winker,89.0,281.0,334.0,84.0,...,1.18,23.80,0.346,0.643,0.487,0.617,0.869,0.775,0.475,22.8
2019-03-28,W,DET,1,TOR,1,Josh Harrison,97.0,344.0,374.0,86.0,...,0.73,0.00,0.274,0.617,0.436,0.591,0.888,0.789,0.473,23.0
2019-03-28,W,MIL,0,STL,1,Lorenzo Cain,141.0,539.0,620.0,166.0,...,3.13,0.00,0.342,0.668,0.521,0.659,0.880,0.815,0.549,22.7
2019-03-28,W,KCR,0,CHW,1,Whit Merrifield,158.0,632.0,707.0,192.0,...,1.75,0.00,0.270,0.664,0.455,0.610,0.887,0.800,0.469,23.2


In [64]:
year = 2017
#whole 90% year 2018 to train
training_data = get_training_df(year)
xy_train = get_xy(training_data)


#whole 90% year 2019 to test
test_data = get_training_df(year+1)
xy_test = get_xy(test_data)

model_dict = svm( X_train = xy_train["X"] ,
                           y_train = xy_train["y"] , 
                           X_test = xy_test["X"] )

#model, y_pred, feat_imp = model_dict["model"],model_dict["y_pred"],model_dict["feature_importance"]
model, y_pred = model_dict["model"],model_dict["y_pred"]
accuracy = metrics.accuracy_score(xy_test["y"], y_pred)
#relevance_raw_feat_df = process_feat_imp(feat_imp)
y_test_df = xy_test["y"].to_frame()
predictions_df = pd.concat([xy_test["Team"],y_test_df, pd.DataFrame({"predicted":y_pred},index = y_test_df.index)], axis=1,ignore_index=False)


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [65]:
accuracy

0.5047344586249486

In [61]:
xy_train["X"]

Date
2018-03-29    L
2018-03-29    L
2018-03-29    L
2018-03-29    L
2018-03-29    W
2018-03-29    L
2018-03-29    W
2018-03-29    W
2018-03-29    L
2018-03-29    L
2018-03-29    W
2018-03-29    L
2018-03-29    W
2018-03-29    W
2018-03-29    W
2018-03-29    L
2018-03-29    W
2018-03-29    L
2018-03-29    L
2018-03-29    W
2018-03-29    W
2018-03-29    L
2018-03-29    W
2018-03-29    W
2018-03-29    L
2018-03-29    W
2018-03-30    W
2018-03-30    W
2018-03-30    W
2018-03-30    L
             ..
2018-09-30    L
2018-09-30    L
2018-09-30    W
2018-09-30    W
2018-09-30    W
2018-09-30    W
2018-09-30    W
2018-09-30    L
2018-09-30    L
2018-09-30    W
2018-09-30    L
2018-09-30    L
2018-09-30    L
2018-09-30    W
2018-09-30    L
2018-09-30    W
2018-09-30    L
2018-09-30    W
2018-09-30    L
2018-09-30    W
2018-09-30    L
2018-09-30    L
2018-09-30    W
2018-09-30    W
2018-09-30    W
2018-09-30    W
2018-09-30    W
2018-09-30    L
2018-09-30    L
2018-09-30    L
Name: W/L, Length: 

In [73]:
loaded_model = pickle.load(open("Models/2019/svm_acc_0.99300/randomforest_0.99300.sav", 'rb'))
test_data = get_training_df(2018)
xy = get_xy(test_data)
pred = loaded_model.predict(xy["X"])
accuracy = metrics.accuracy_score(xy["y"], pred)
print(accuracy)


  Xt = transform.transform(Xt)


0.9967064635652532
