In [578]:
from path import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import os
import pickle
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [496]:
def get_training_df(year):
    
    training_path = Path(f"training_dataframes/clean_{year}.csv")
    train_df = pd.read_csv(training_path, index_col = "Date", infer_datetime_format = True)
    
    return train_df

In [385]:
def get_xy(df):
    
    #getting team for future verification purposes
    team_df = df["01_Team"].to_frame()
    
    #getting X
    X = df.copy()
    X.drop(columns = ["W/L"], inplace = True)
    
    #Getting y
    y = df["W/L"]
    
    #Cleanning X
    string_col = [x for x in X.columns if X[x].dtype == "O" or X[x].dtype == "str"]
    X.drop(columns = string_col, inplace = True)
    
    return {"X":X, "y":y,"Team":team_df}  

In [522]:
def traintestsplit(X,y,team_df, split_at = 0.5):
   
    mid = int(X.shape[0]*split_at)
    
    #Train
    X_train = X[ : mid]
    y_train = y[ : mid]
    team_df_train = team_df[ : mid]
    
    #test
    X_test = X[mid : ]
    y_test = y[mid : ] 
    team_df_test = team_df[mid : ]
    
    return {"X_train":X_train, "y_train":y_train, "Team_train":team_df_train,
           "X_test":X_test, "y_test":y_test, "Team_test":team_df_test}    

In [348]:
pca = PCA(n_components= 20)
#found 20 as the most optimal number of components

X_17_reduced = pca.fit_transform(X_17)

In [349]:
pca.explained_variance_ratio_.sum()

0.9859018720431341

In [553]:
def random_forest(X_train,y_train,X_test):
    
    rand_for_clf = RandomForestClassifier(n_estimators = 1000, min_samples_split = 4, 
                                      max_leaf_nodes = 10, max_depth = 25)
    
    rand_for_clf.fit(X_train,y_train)
    
    return {"model":rand_for_clf,
            "y_pred":rand_for_clf.predict(X_test),
            "feature_importance":
            pd.Series(rand_for_clf.feature_importances_,index=X_train.columns).sort_values(ascending=False)
           }    

In [594]:
def svm(X_train,y_train,X_test):
    svm = Pipeline((
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel = "poly", degree = 3, coef0=.5, C=10, gamma = "auto"))
    ))
    svm.fit(X_train,y_train)
    
    return {"model":svm,
            "y_pred":svm.predict(X_test)}

In [363]:
def get_raw_col(columns):
    
    raw_col = []
    for x in columns:
        
        x = x.split("_")
        if len(x)>1:
            if len(x)>2: raw_col.append("_".join(x[1:]))
            else: raw_col.append(x[1])
                        
        else: raw_col.append(*x)
        
    return raw_col

In [499]:
def process_feat_imp(feature_imp):

    imp_features = set(get_raw_col(feature_imp.index.values))

    relevance_raw_feat = []
    for feat in imp_features:

        if feat == "Streak": stats = ["Streak"]
        else: 

            stats = [x for x in feature_imp.index if (feat == x[3:] or 
                                                      x == "team_"+feat or 
                                                      x == "opponet_"+feat or
                                                      x == "Home_"+feat   )]


        batter_stats = [x for x in stats 
                        if (x.startswith("01_") or x.startswith("02_") or x.startswith("03_") or x.startswith("04_") or 
                            x.startswith("05_") or x.startswith("06_") or x.startswith("07_") or x.startswith("08_") or
                            x.startswith("09_") or x.startswith("10_") or x.startswith("11_") or x.startswith("12_") or
                            x.startswith("13_") or x.startswith("14_") or x.startswith("15_") or x.startswith("16_") or
                            x.startswith("17_") or x.startswith("18_"))]

        pitcher_stats = [x for x in stats if x.startswith("team_pitcher_") or x.startswith("opponet_pitcher_") ]

        total_relevance = 0
        if len(batter_stats)>0:
            kind = "Batter"
            for stat in batter_stats: total_relevance += feature_imp[stat]
            relevance_raw_feat.append([feat,total_relevance, kind])   

        total_relevance = 0
        if len(pitcher_stats)>0: 
            kind = "Pitcher"
            for stat in pitcher_stats: total_relevance += feature_imp[stat]
            relevance_raw_feat.append([feat,total_relevance, kind]) 

        if len(batter_stats)==0 and len(pitcher_stats)==0:
            kind = "Team"

            relevance_raw_feat.append([stats[0],feature_imp[stats[0]], kind])

    relevance_raw_feat = sorted(relevance_raw_feat, key=lambda x: x[1], reverse=True)
    relevance_raw_feat_df = pd.DataFrame(relevance_raw_feat)
    relevance_raw_feat_df.columns = ["Statistic","Importance","Kind of stat"]
    
    return relevance_raw_feat_df

In [607]:
def save_model(year=2017, model=None, accuracy=0.0, relevance_raw_feat_df=None, 
               predictions_df=None,model_type="randomforest"):
    
    os.mkdir(f"Models/{year}/{model_type}_acc_{accuracy:.5f}")
    model_file = f"Models/{year}/{model_type}_acc_{accuracy:.5f}/randomforest_{accuracy:.5f}.sav"
    pred_file = Path(f"Models/{year}/{model_type}_acc_{accuracy:.5f}/prediction_{accuracy:.5f}.csv")

    pickle.dump(model, open(model_file, 'wb'))
    predictions_df.to_csv(pred_file)
    
    if model_type == "randomforest":
        feat_file = Path(f"Models/{year}/{model_type}_acc_{accuracy:.5f}/randomforest_{accuracy:.5f}_features.csv")
        relevance_raw_feat_df.to_csv(feat_file)


In [622]:
def run_model(year=2019, model_type = "randomforest", save_model = False):
    
    data = get_training_df(year)
    xy_dict = get_xy(data)
    xy_split = traintestsplit( xy_dict["X"] , xy_dict["y"] , xy_dict["Team"], split_at = 0.5 )
          
    if model_type == "randomforest":
        model_dict = random_forest( xy_split["X_train"] , xy_split["y_train"] , xy_split["X_test"] )
        model, y_pred, feat_imp = model_dict["model"],model_dict["y_pred"],model_dict["feature_importance"]
        relevance_raw_feat_df = process_feat_imp(feat_imp)
        print(relevance_raw_feat_df.head(25))
        
    elif model_type == "svm":
        model_dict = svm( xy_split["X_train"] , xy_split["y_train"] , xy_split["X_test"] )
        model, y_pred = model_dict["model"],model_dict["y_pred"]
        
    accuracy = metrics.accuracy_score(xy_split["y_test"], y_pred)
    print(f"accuracy: {accuracy}")
    
    y_test_df = xy_split["y_test"].to_frame()
    predictions_df = pd.concat([xy_split["Team_test"],y_test_df, pd.DataFrame({"predicted":y_pred},index = y_test_df.index)], axis=1,ignore_index=False)
   
    
    if save_model: 
        print(predictions_df.head())
        print(model)
        print(model_type)
        print(year)
        print(accuracy)
        if model_type == "randomforest":
            save_model(year, 
                       model , 
                       accuracy ,
                       relevance_raw_feat_df, 
                       predictions_df ,
                       model_type)
        else:
            print(predictions_df.head())
            print(model)
            print(model_type)
            print(year)
            print(accuracy)
            save_model(
                       year ,
                       model , 
                       accuracy ,
                       predictions_df = predictions_df,
                       model_type = model_type
                       )

In [592]:
#for x in range(0,10): run_model(2017,"svm",True)  
 

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


KeyboardInterrupt: 

In [623]:
run_model(2017,"randomforest",True)  

        Statistic  Importance Kind of stat
0          Streak    0.152498         Team
1             GB%    0.010296       Batter
2             UBR    0.010250       Batter
3           GB/FB    0.009559       Batter
4       SI-X (pi)    0.009000       Batter
5          Clutch    0.008989       Batter
6        vFA (pi)    0.008760       Batter
7             SL%    0.008519       Batter
8             SLv    0.008255       Batter
9           wCT/C    0.008209       Batter
10            pLI    0.008066       Batter
11  Z-Swing% (pi)    0.007671       Batter
12     wSI/C (pi)    0.007586       Batter
13           phLI    0.007581       Batter
14         SwStr%    0.007581       Batter
15            LD%    0.007350       Batter
16       SI% (pi)    0.007254       Batter
17             K%    0.007233       Batter
18           TTO%    0.007228       Batter
19            FBv    0.006971       Batter
20    Swing% (pi)    0.006884       Batter
21       wSL (pi)    0.006798       Batter
22       SL

TypeError: 'bool' object is not callable

In [564]:
accuracy

0.8631892015557081

In [562]:
relevance_raw_feat_df  

Unnamed: 0,Statistic,Importance,Kind of stat
0,Streak,0.162675,Team
1,REW,0.029438,Batter
2,WPA,0.023945,Batter
3,RE24,0.015983,Batter
4,wCU (pi),0.014909,Batter
5,wRAA,0.014869,Batter
6,wFA (pi),0.014659,Batter
7,wCU/C (pi),0.014548,Batter
8,wCB,0.014291,Batter
9,Off,0.013881,Batter


In [375]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(xy_split["X_test"], y_17_test)
print(result)

0.9362139917695473


In [389]:
loaded_model

In [545]:
year = 2018
#whole 90% year 2018 to train
training_data = get_training_df(year)
xy_train = get_xy(training_data)
xy_train_split = traintestsplit( xy_train["X"] , xy_train["y"] , xy_train["Team"], 0.9 )

#whole 90% year 2019 to test
test_data = get_training_df(year+1)
xy_test = get_xy(test_data)
xy_test_split = traintestsplit( xy_test["X"] , xy_test["y"] , xy_test["Team"], 0.1 )


model_dict = random_forest( X_train = xy_train_split["X_train"] ,
                           y_train = xy_train_split["y_train"] , 
                           X_test = xy_test_split["X_test"] )
model, y_pred, feat_imp = model_dict["model"],model_dict["y_pred"],model_dict["feature_importance"]
accuracy = metrics.accuracy_score(xy_test_split["y_test"], y_pred)
relevance_raw_feat_df = process_feat_imp(feat_imp)
y_test_df = xy_test_split["y_test"].to_frame()
predictions_df = pd.concat([xy_test_split["Team_test"],y_test_df, pd.DataFrame({"predicted":y_pred},index = y_test_df.index)], axis=1,ignore_index=False)


In [546]:
accuracy

0.8631892015557081

In [544]:
xy_test_split["X_test"]

Unnamed: 0_level_0,Home_Away,Streak,01_G,01_AB,01_PA,01_H,01_1B,01_2B,01_3B,01_HR,...,opponet_pitcher_wSL/C (pi),opponet_pitcher_wXX/C (pi),opponet_pitcher_O-Swing% (pi),opponet_pitcher_Z-Swing% (pi),opponet_pitcher_Swing% (pi),opponet_pitcher_O-Contact% (pi),opponet_pitcher_Z-Contact% (pi),opponet_pitcher_Contact% (pi),opponet_pitcher_Zone% (pi),opponet_pitcher_Pace (pi)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-04-16,1,-2,136.0,520.0,614.0,180.0,96.0,47.0,5.0,32.0,...,0.00,-50.48,0.354,0.680,0.525,0.536,0.815,0.726,0.527,20.1
2019-04-16,0,-2,129.0,396.0,465.0,108.0,75.0,27.0,1.0,5.0,...,3.38,0.00,0.332,0.581,0.452,0.517,0.820,0.705,0.483,24.3
2019-04-16,1,2,55.0,97.0,113.0,13.0,10.0,3.0,0.0,0.0,...,1.12,0.00,0.331,0.629,0.448,0.561,0.892,0.744,0.392,24.9
2019-04-16,1,1,41.0,141.0,152.0,36.0,22.0,11.0,1.0,2.0,...,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.0
2019-04-16,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.02,0.00,0.288,0.641,0.475,0.671,0.842,0.794,0.529,21.5
2019-04-16,0,-3,153.0,488.0,528.0,123.0,99.0,13.0,0.0,11.0,...,0.00,-5.09,0.269,0.599,0.425,0.640,0.897,0.811,0.472,24.4
2019-04-16,1,3,156.0,626.0,696.0,182.0,115.0,31.0,7.0,29.0,...,0.00,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.0
2019-04-16,0,2,141.0,539.0,620.0,166.0,129.0,25.0,2.0,10.0,...,1.13,0.00,0.324,0.627,0.459,0.520,0.828,0.708,0.445,24.0
2019-04-16,0,-3,141.0,477.0,519.0,117.0,75.0,26.0,8.0,8.0,...,0.80,0.00,0.292,0.618,0.454,0.466,0.848,0.724,0.497,23.6
2019-04-16,1,-1,141.0,466.0,515.0,121.0,92.0,14.0,1.0,14.0,...,6.60,0.00,0.261,0.638,0.442,0.466,0.850,0.733,0.481,23.0
