In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from itertools import combinations
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import seaborn as sns

%matplotlib inline

  from pandas.core import datetools


In [None]:
# Lookup from the team codes used in the data to the team's nickname.
# Several codes share a nickname (ANA/LAA, FLA/MIA); '- - -' marks a row
# whose team is unknown (those rows are dropped further down).
teams_abbr = {
    '- - -': '- - -',
    'ANA': 'Angels',
    'ARI': 'Diamondbacks',
    'ATL': 'Braves',
    'BAL': 'Orioles',
    'BOS': 'Red Sox',
    'CHC': 'Cubs',
    'CHW': 'White Sox',
    'CIN': 'Reds',
    'CLE': 'Indians',
    'COL': 'Rockies',
    'DET': 'Tigers',
    'FLA': 'Marlins',
    'HOU': 'Astros',
    'KCR': 'Royals',
    'LAA': 'Angels',
    'LAD': 'Dodgers',
    'MIA': 'Marlins',
    'MIL': 'Brewers',
    'MIN': 'Twins',
    'MON': 'Expos',
    'NYM': 'Mets',
    'NYY': 'Yankees',
    'OAK': 'Athletics',
    'PHI': 'Phillies',
    'PIT': 'Pirates',
    'SDP': 'Padres',
    'SEA': 'Mariners',
    'SFG': 'Giants',
    'STL': 'Cardinals',
    'TBD': 'Devil Rays',
    'TBR': 'Rays',
    'TEX': 'Rangers',
    'TOR': 'Blue Jays',
    'WSN': 'Nationals',
}

In [None]:
# Load the four FanGraphs exports: starters, relievers, fielders, batters.
sp = pd.read_csv('FanGraphs_Starters (2).csv')
rp = pd.read_csv('FanGraphs_Relievers (2).csv')
fd = pd.read_csv('FanGraphs_Fielders (2).csv')
bt = pd.read_csv('FanGraphs_Batting (2).csv')

# Row counts for the pitching and fielding files (the batting file is not
# part of this tally).
n_starters, n_relievers, n_fielders = (len(frame) for frame in (sp, rp, fd))
print("""Number of starter pitchers:\t{}
Number of relief pitchers:\t{}
Number of fielders:\t\t{}
-------------------------------------
Total number of records:\t{}""".format(n_starters, n_relievers, n_fielders,
                                       n_starters + n_relievers + n_fielders))

# Translate the fielders' team codes to nicknames. A code that is missing
# from teams_abbr raises KeyError rather than being silently skipped.
fd['Team'] = [teams_abbr[code] for code in fd['Team']]



In [None]:
# Show what the batting export contains, then keep only the identifying
# columns and games played.
print(bt.columns)
batting_columns = ['Season', 'Name', 'Team', 'G', 'playerid']
bt = bt.loc[:, batting_columns]
bt.sample(n=4)

In [None]:
# Show what the reliever export contains, then keep the identifying columns,
# games / games started, and innings pitched.
print(rp.columns)

# Select and rename in one expression instead of assigning a new column on a
# column-sliced frame (which triggers SettingWithCopyWarning). 'IP' is listed
# last so the resulting column order matches the previous
# "add Inn, then drop IP" sequence: ..., GS, playerid, Inn.
rp = (
    rp[['Season', 'Name', 'Team', 'G', 'GS', 'playerid', 'IP']]
    .rename(columns={'IP': 'Inn'})
)
rp.sample(4)

In [None]:
# Show what the starter export contains. A bare `sp.columns` in the middle of
# a cell is never displayed, so print it like the sibling cells do.
print(sp.columns)

# Select and rename in one expression instead of assigning a new column on a
# column-sliced frame (which triggers SettingWithCopyWarning). 'IP' is listed
# last so the resulting column order matches the previous
# "add Inn, then drop IP" sequence: ..., GS, playerid, Inn.
sp = (
    sp[['Season', 'Name', 'Team', 'G', 'GS', 'playerid', 'IP']]
    .rename(columns={'IP': 'Inn'})
)
sp.sample(4)

In [None]:
# Show what the fielding export contains, then keep the identifying columns,
# the fielding position and innings.
print(fd.columns)
fielding_columns = ['Season', 'Name', 'Team', 'Pos', 'Inn', 'playerid']
fd = fd.loc[:, fielding_columns]
fd.sample(n=4)

In [None]:
# Build four sets (one per source frame) of (playerid, Season) tuples so a
# player is counted at most once per season within each source.
fd_season_id = {tuple(pair) for pair in fd[['playerid', 'Season']].values}
rp_season_id = {tuple(pair) for pair in rp[['playerid', 'Season']].values}
sp_season_id = {tuple(pair) for pair in sp[['playerid', 'Season']].values}
bt_season_id = {tuple(pair) for pair in bt[['playerid', 'Season']].values}

a, b, c, d = map(len, (sp_season_id, rp_season_id, fd_season_id, bt_season_id))

# Union across all four sources: distinct player-seasons overall.
total = sp_season_id | rp_season_id | fd_season_id | bt_season_id

print("Only counting players once per season.\n")
print("""Number of starter pitchers:\t{}
Number of relief pitchers:\t{}
Number of fielders:\t\t{}
Number of batters:\t\t{}
-------------------------------------
Total number of records:\t{}""".format(a, b, c, d, len(total)))


In [None]:
# Number of player-seasons shared by each pair of sources, printed in the
# order: bt/fd, bt/rp, bt/sp, rp/fd, rp/sp, sp/fd.
source_pairs = [
    (bt_season_id, fd_season_id),
    (bt_season_id, rp_season_id),
    (bt_season_id, sp_season_id),
    (rp_season_id, fd_season_id),
    (rp_season_id, sp_season_id),
    (sp_season_id, fd_season_id),
]
for left_ids, right_ids in source_pairs:
    print(len(left_ids & right_ids))


## Combine the pitchers first

The three players below are the only ones who switched between starting and relief pitching mid-season.

In [None]:
# Print every row each of these three pitchers has in the four source frames.
# The last frame of each player is followed by a blank line so the players
# are visually separated.
labelled_frames = (('sp', sp), ('rp', rp), ('fd', fd), ('bt', bt))
for name in ('Scot Shields', 'Tim Wakefield', 'Mike Montgomery'):
    for label, frame in labelled_frames:
        terminator = '\n\n' if label == 'bt' else '\n'
        print(label, frame[frame.Name == name], end=terminator)


In [None]:
# Outer-join relievers and starters on (playerid, Season). Columns that exist
# in both frames get the suffixes _x (reliever side) and _y (starter side).
temp = rp.merge(sp, how='outer', on=['playerid', 'Season'])
for column in ('Name_x', 'Name_y', 'Season'):
    print(temp[column].count())

# Rows with a name on both sides are player-seasons present in both frames.
in_both_frames = temp.Name_x.notnull() & temp.Name_y.notnull()
print(temp[in_both_frames])

In [None]:
# Resolve by hand the few pitchers that appear as both starter and reliever
# in the same season: keep the row for the role they spent more time in.
sp['Pos'] = 'SP'
rp['Pos'] = 'RP'

# Scot Shields spent more time as a relief pitcher -> drop his starter row.
sp.drop(sp[(sp['Name'] == 'Scot Shields') & (sp['Season'] == 2003)].index, inplace=True)

# Tim Wakefield spent more time as a starter -> drop his reliever row.
rp.drop(rp[(rp['Name'] == 'Tim Wakefield') & (rp['Season'] == 2002)].index, inplace=True)

# Mike Montgomery spent more time as a starter -> drop his reliever row.
# The mask has to be built from rp itself: a mask computed on sp is indexed
# by sp's row labels, so applying it to rp either fails to align or removes
# whichever reliever happens to share that label.
rp.drop(rp[(rp['Name'] == 'Mike Montgomery') & (rp['Season'] == 2017)].index, inplace=True)


In [None]:
# Stack starters on top of relievers with a fresh 0..n-1 index, then display
# the result ordered by player and season (the sorted copy is only shown,
# `pitchers` itself keeps the stacked order).
pitchers = pd.concat(objs=(sp, rp), ignore_index=True)
pitchers.sort_values(['playerid', 'Season'])

## Clean the batters data

In [None]:
# Pitchers occasionally play a field position, so remove from the fielding
# frame every row whose playerid also appears in the pitchers frame.
pitcher_ids = pitchers['playerid'].unique()
pitchers_playing_field = fd['playerid'].isin(pitcher_ids)
print(fd.loc[pitchers_playing_field])
fd.drop(index=fd.index[pitchers_playing_field], inplace=True)

# Then remove whatever is still listed at position 'P'.
# Author's note from the original cell: "there are 63 pitchers in fd that
# aren't in rp or sp" -- presumably these are the rows removed here; TODO confirm.
still_listed_as_pitcher = fd['Pos'].eq('P')
fd.drop(index=fd.index[still_listed_as_pitcher], inplace=True)

In [None]:
# Collapse every infield/outfield position into the single label 'FD';
# any other value of Pos (e.g. catcher 'C') is left as it is.
field_positions = ['1B', '2B', '3B', 'RF', 'LF', 'CF', 'SS']
plays_the_field = fd['Pos'].isin(field_positions)
fd.loc[plays_the_field, 'Pos'] = 'FD'

In [None]:
# Keep only the player-seasons present in both the batting and the fielding
# frame (inner join on season, name, team and playerid).
# NOTE(review): fd.Team was translated to team nicknames when the data was
# loaded, while bt.Team is used exactly as read from the CSV; the join only
# matches if the batting file already stores nicknames -- confirm.
join_keys = ['Season', 'Name', 'Team', 'playerid']
fielders = bt.merge(fd, how='inner', on=join_keys)
fielders.sort_values(['playerid', 'Season'])

In [None]:
# Fielders have no "games started" figure; give them GS = 0 so the column
# exists in both frames, then stack pitchers and fielders into one frame.
fielders = fielders.assign(GS=0)
p = pd.concat(objs=[pitchers, fielders], ignore_index=True)

## Old Code for Cleaning

In [None]:
# Legacy de-duplication: tag each row with a temporary (playerid, Season)
# tuple, drop relievers whose player-season also exists among the starters,
# then remove the temporary column again.
for frame in (fd, rp, sp):
    frame.insert(2, 'Seasonid',
                 [tuple(pair) for pair in frame[['playerid', 'Season']].values])

# Only the reliever/starter overlap is removed; the equivalent filters for
# fielders against starters/relievers are intentionally not applied here.
mask = rp.Seasonid.isin(sp_season_id)
rp.drop(rp[mask].index, inplace=True)

for frame in (fd, sp, rp):
    frame.drop('Seasonid', axis=1, inplace=True)

In [None]:
# Collapse every infield/outfield position (everything except catcher) to 'FD'.
fd.loc[fd.Pos.isin(['1B', '2B', '3B', 'RF', 'LF', 'CF', 'SS']), 'Pos'] = 'FD'

# Add a column with the position to prepare for combining the data.
# DataFrame.insert raises ValueError when the column already exists, which is
# the case once the manual starter/reliever clean-up cell has set 'Pos'; in
# that situation just (re)assign the label instead of inserting.
for frame, label in ((sp, 'SP'), (rp, 'RP')):
    if 'Pos' in frame.columns:
        frame['Pos'] = label
    else:
        frame.insert(2, 'Pos', label)

In [None]:
# Legacy assembly path: stack starters, relievers and fielders directly.
# NOTE(review): this rebinds `p`, replacing the pitchers + merged
# batters/fielders frame built in the "Clean the batters data" section.
p = pd.concat([sp, rp, fd], ignore_index=True)

# Keep seasons from 2002 onwards.
p.drop(p[p.Season < 2002].index, inplace=True)

# reset_index() returns a new frame, so calling it without keeping the result
# did nothing. Renumber the rows in place and discard the old labels.
p.reset_index(drop=True, inplace=True)

# NOTE(review): 'W' and 'L' are not among the columns kept by the column
# selection cells for sp / rp / fd, so this lookup raises KeyError unless
# those cells were skipped -- confirm whether this cell is still needed.
p[['W', 'L']].sample(5)

In [None]:
# Legacy step: remove the raw fielding / pitching statistic columns from p.
# DataFrame.drop raises KeyError if any listed label is missing, in which
# case `p` is left unchanged.
# NOTE(review): the list contains 'G' and 'GS', which the feature-selection
# cell further down reads from p (p[... + ['Stay_Length', 'Career_Length',
# 'GS', 'G', 'Inn']]); running this cell successfully would break that
# cell -- confirm whether this legacy step should still be executed.
p = p.drop(['rSB', 'rGDP', 'rARM', 'rGFP',
       'rPM', 'BIZ', 'Plays', 'RZR', 'OOZ', 'CPP', 'TZL', 'FSR',
       'ARM', 'DPR', 'RngR', 'ErrR', 'UZR/150', 'Def',
        'L', 'SV', 'G', 'GS', 'IP', 'K/9',
       'BB/9', 'HR/9', 'BABIP', 'LOB%', 'GB%', 'HR/FB', 'ERA', 'xFIP',
       ], axis=1)

In [None]:
# Replace missing values in each statistic with that column's mean.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on `p[col]` operates on a temporary Series and is not guaranteed to write
# through to `p` (it never does under pandas Copy-on-Write).
for stat in ['DRS', 'RPP', 'UZR', 'FIP', 'Inn', 'WAR', 'W']:
    p[stat] = p[stat].fillna(p[stat].mean())


In [None]:
# For every position group, print a banner, the position label, and then the
# number of missing values in each column of p.
by_pos = p.groupby('Pos')
banner = '#' * 50
for pos_label in by_pos.groups:
    members = by_pos.get_group(pos_label)
    print(banner)
    print(pos_label)
    for column in p.columns:
        n_missing = members[column].isnull().sum()
        print(column, '\t', n_missing)

In [None]:
# Account for team name changes: fold the old nicknames into the current ones.
p.loc[p.Team == 'Devil Rays', 'Team'] = 'Rays'
p.loc[p.Team == 'Expos', 'Team'] = 'Nationals'

# Delete players from a season if their team is unknown.
p.drop(p[p.Team == '- - -'].index, inplace=True)

# Remove the corresponding codes from the lookup. dict.pop with a default is
# used instead of `del` so that re-running this cell does not raise KeyError
# once the keys are already gone.
for retired_code in ('MON', 'TBD', '- - -'):
    teams_abbr.pop(retired_code, None)

In [None]:
# Remove fully identical rows from p and report the row count on either side.
rows_before = len(p)
p.drop_duplicates(inplace=True)
rows_after = len(p)
print("Number of records before:\t{}".format(rows_before))
print("Number of records after:\t{}".format(rows_after))

In [None]:
# Spot-check five random rows of the combined frame.
p.sample(n=5)

In [None]:
# Display p without any column that contains a missing value.
# The result is only shown; p itself is not modified.
p.dropna(axis='columns', how='any')

## Feature Engineer Stay Length, Career Length

In [None]:
# Start from a clean slate so the cell can be re-run: discard any previously
# engineered columns before recreating them.
for engineered in ('Leave', 'Stay_Length', 'Career_Length'):
    if engineered in p.columns:
        p.drop(engineered, axis=1, inplace=True)

# New columns, all initialised to 0:
#   Leave         - 1 if the player is not with the same team afterwards
#   Stay_Length   - consecutive recorded seasons with the current team so far
#   Career_Length - number of recorded seasons for the player so far
p.insert(1, 'Leave', 0)
p.insert(1, 'Stay_Length', 0)
p.insert(1, 'Career_Length', 0)
by_player = p.groupby('playerid')

# Walk through each player's seasons in chronological order.
for player_id in by_player.groups:
    career = by_player.get_group(player_id)

    stay_counter = 1
    career_counter = 1
    for season in sorted(career['Season'].unique()):
        season_rows = career.index[career['Season'] == season]
        p.loc[season_rows, 'Career_Length'] = career_counter
        p.loc[season_rows, 'Stay_Length'] = stay_counter

        # Next season the player appears in, looking at most two years ahead
        # (i.e. tolerating a single missed season).
        next_season = None
        for gap in (1, 2):
            if (career['Season'] == season + gap).any():
                next_season = season + gap
                break

        if next_season is None:
            # No record in either of the next two seasons: counted as a leave.
            # NOTE(review): this also flags every player's final season in
            # the data, including the most recent season of the dataset --
            # confirm that is intended.
            left_team = True
        else:
            # Compare the team of the first row of each season; a different
            # team (directly, or after missing a season) is a leave.
            team_now = career.loc[career['Season'] == season, 'Team'].values[0]
            team_next = career.loc[career['Season'] == next_season, 'Team'].values[0]
            left_team = team_now != team_next

        if left_team:
            p.loc[season_rows, 'Leave'] = 1
            stay_counter = 1
        else:
            stay_counter += 1
        # Every recorded season extends the career, whatever happened above.
        career_counter += 1
            


In [None]:
# First ten rows ordered by player and season, to eyeball the new columns.
p.sort_values(['playerid', 'Season']).head(n=10)

## Combine records of players playing multiple positions

In [None]:
by_player = p.groupby('playerid')

# One [name, season] pair per player and distinct season; the name is taken
# from the player's first row.
player_seasons = []
for player_id in by_player.groups:
    career = by_player.get_group(player_id)
    first_name_seen = career.Name.values[0]
    player_seasons.extend([first_name_seen, year] for year in career.Season.unique())

df1 = pd.DataFrame(player_seasons, columns=['Name', 'Year'])

# Zero-filled placeholder columns, each inserted at the front (so the last
# one listed ends up as the first column); they are filled in a later cell.
for placeholder in ('Team', 'G', 'playerid', 'Pos', 'Inn', 'GS',
                    'Stay_Length', 'Career_Length'):
    df1.insert(0, placeholder, 0)

In [None]:
# Fill every df1 row from the rows of p that match its name and season.
# NOTE(review): the match is on player *name* and season, not on playerid,
# so two different players sharing a name in the same season would be
# combined -- confirm whether that can occur in this data.
first_row_fields = ('Team', 'Pos', 'playerid', 'G', 'GS',
                    'Stay_Length', 'Career_Length')
for idx in df1.index:
    name = df1.at[idx, 'Name']
    season = df1.at[idx, 'Year']
    records = p[(p.Name == name) & (p.Season == season)]

    # Innings are totalled over all matching rows ...
    df1.loc[idx, 'Inn'] = records.Inn.sum()
    # ... every other field is copied from the first matching row.
    for field in first_row_fields:
        df1.loc[idx, field] = records[field].values[0]

print(df1.head())
    

In [None]:
# Display df1 ordered by player and year (df1 itself is not reordered).
df1.sort_values(['playerid', 'Year'])

In [None]:
# Make the cell re-runnable: remove indicator columns left by a previous run.
# DataFrame.insert refuses to add a column that already exists, so team
# indicators need the same guard as the position indicators.
for team in p.Team.unique():
    if team in p.columns:
        p.drop(team, axis=1, inplace=True)
for col in ['P', 'SP', 'RP', 'FD', 'C']:
    if col in p.columns:
        p.drop(col, axis=1, inplace=True)

# One-hot encode positions: one 0/1 column per distinct value of Pos,
# inserted at the front of the frame.
for position in p.Pos.unique():
    p.insert(0, position, (p.Pos == position).astype(int))

# One-hot encode teams the same way.
for team in p.Team.unique():
    p.insert(0, team, (p.Team == team).astype(int))

p.sample(5)

In [None]:
# Predictors: one indicator column per position, plus tenure and workload
# columns. Team indicators are not included, and the earlier experiments
# with performance statistics and interaction terms (e.g. P*WAR, C*DRS)
# are not used here.
feature_columns = list(p.Pos.unique()) + ['Stay_Length', 'Career_Length', 'GS', 'G', 'Inn']
temp = p.loc[:, feature_columns]
X = temp

# Target: the engineered 0/1 Leave flag.
y = p['Leave']

# Show predictors and target side by side.
pd.concat([X, y], axis=1)

In [None]:
# Ordinary least squares of the 0/1 Leave flag on the predictors. No constant
# is added; X is passed exactly as built above.
results = sm.OLS(endog=y, exog=X).fit()

In [None]:
# Text summary of the fitted OLS model.
ols_summary = results.summary()
print(ols_summary)

In [None]:
# Base rate of the target: the average of the 0/1 Leave flag.
leave_rate = np.average(y)
leave_rate

In [None]:
# Distribution of the OLS fitted values over all rows.
fitted_all = results.predict(exog=X)

plt.hist(fitted_all.to_numpy())

In [None]:
# Relief pitchers: distribution of the OLS fitted values.
is_rp = X.RP == 1
plt.hist(results.predict(X[is_rp]).values)
plt.show()

# Stay length of the relief pitchers who left (Leave == 1).
rp_leaver_stays = X[(y == 1) & is_rp].Stay_Length
plt.hist(rp_leaver_stays)
plt.show()

# Their average stay length.
np.average(rp_leaver_stays)

In [None]:
# Starting pitchers: distribution of the OLS fitted values.
is_sp = X.SP == 1
plt.hist(results.predict(X[is_sp]).values)
plt.show()

# Stay length of the starting pitchers who left (Leave == 1).
sp_leaver_stays = X[(y == 1) & is_sp].Stay_Length
plt.hist(sp_leaver_stays)
plt.show()

# Their average stay length.
np.average(sp_leaver_stays)

In [None]:
# Fielders: distribution of the OLS fitted values.
is_fd = X.FD == 1
plt.hist(results.predict(X[is_fd]).values)
plt.show()

# Stay length of the fielders who left (Leave == 1).
fd_leaver_stays = X[(y == 1) & is_fd].Stay_Length
plt.hist(fd_leaver_stays)
plt.show()

# Their median stay length.
# NOTE(review): the RP / SP / C cells report np.average at this point;
# the median may be deliberate here -- confirm.
np.median(fd_leaver_stays)

In [None]:
# Catchers: distribution of the OLS fitted values.
is_c = X.C == 1
plt.hist(results.predict(X[is_c]).values)
plt.show()

# Stay length of the catchers who left (Leave == 1).
c_leaver_stays = X[(y == 1) & is_c].Stay_Length
plt.hist(c_leaver_stays)
plt.show()

# Their average stay length.
np.average(c_leaver_stays)

In [None]:
# Box plot of Stay_Length for the players who left, one box per position.
box_positions = ['RP', 'SP', 'C', 'FD']
data = [X[(y == 1) & (X[col] == 1)].Stay_Length.values for col in box_positions]
plt.boxplot(data)

# One tick per box. The tick positions are derived from the label list so the
# two always have the same length (five positions for four labels is rejected
# by matplotlib).
plt.xticks(np.arange(1, 1 + len(box_positions)), box_positions)
plt.show()

In [None]:
# Stay_Length values of the rows that left (y == 1) and have X[col] == 1.
# NOTE(review): `col` is not assigned in this cell. It holds whatever an
# earlier top-level `for col in ...` loop left behind (the one-hot encoding
# cell's loop ends on 'C'); the list comprehension in the box-plot cell does
# not leak its loop variable in Python 3. Confirm which position is meant.
X[(y==1)&(X[col]==1)].Stay_Length.values

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

In [None]:
# Hold out 30% of the rows for the final accuracy check. The fixed seed keeps
# the split, and every score derived from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(random_state=42)

# Parameter combinations to try.
# 'auto' was removed from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated grid points) and current scikit-learn
# releases reject it.
parameters = {'n_estimators': [9, 25, 40],
              'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [10, 3, 5, 15],
              'min_samples_split': [3, 5, 8],
              'min_samples_leaf': [5, 1, 8]
             }

# Score parameter combinations by accuracy (same metric as the classifier's
# default score, now stated explicitly instead of leaving the scorer unused).
acc_scorer = make_scorer(accuracy_score)

# Run the grid search.
grid_obj = GridSearchCV(rf, parameters, scoring=acc_scorer, n_jobs=4)
grid_obj = grid_obj.fit(X_train, y_train)

# GridSearchCV refits the best parameter combination on all of X_train by
# default, so best_estimator_ is already trained; no further fit is needed.
rf = grid_obj.best_estimator_
rf

In [None]:
# Hold-out accuracy of the tuned random forest.
rf_test_predictions = rf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=rf_test_predictions)

In [None]:
# Predicted Leave labels of the random forest on the hold-out rows.
rf.predict(X=X_test)

In [None]:
# Horizontal bar chart of the random forest's feature importances, smallest
# at the bottom.
rf_importances = rf.feature_importances_
ind = np.argsort(rf_importances)
bar_slots = range(len(rf_importances))

plt.figure(figsize=(10, 12))
plt.barh(bar_slots, rf_importances[ind])
# The labels must come from the same columns the model was fitted on,
# reordered with the same index, or bars and names will not line up.
plt.yticks(bar_slots, X_train.columns[ind])
plt.title("Variable Importance")
plt.show()

## Extra Trees

In [None]:
# Reuses the train/test split created in the random-forest cell.

et = ExtraTreesClassifier(random_state=42)  # fixed seed: reproducible trees

# Parameter combinations to try.
# 'auto' was removed from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated grid points) and current scikit-learn
# releases reject it.
parameters = {'n_estimators': [9, 4, 6],
              'max_features': ['log2', 'sqrt'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [10, 2, 3, 5],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [5, 1, 8]
             }

# Score parameter combinations by accuracy (same metric as the classifier's
# default score, now stated explicitly instead of leaving the scorer unused).
acc_scorer = make_scorer(accuracy_score)

# Run the grid search.
grid_obj = GridSearchCV(et, parameters, scoring=acc_scorer, n_jobs=4)
grid_obj = grid_obj.fit(X_train, y_train)

# GridSearchCV refits the best parameter combination on all of X_train by
# default, so best_estimator_ is already trained; no further fit is needed.
et = grid_obj.best_estimator_
et

In [None]:
# Hold-out accuracy of the tuned extra-trees model.
et_test_predictions = et.predict(X_test)
accuracy_score(y_true=y_test, y_pred=et_test_predictions)

In [None]:
# Predicted Leave labels of the extra-trees model on the hold-out rows.
et.predict(X=X_test)

In [None]:
# Horizontal bar chart of the extra-trees feature importances, smallest at
# the bottom.
et_importances = et.feature_importances_
ind = np.argsort(et_importances)
bar_slots = range(len(et_importances))

plt.figure(figsize=(10, 12))
plt.barh(bar_slots, et_importances[ind])
# The labels must come from the same columns the model was fitted on,
# reordered with the same index, or bars and names will not line up.
plt.yticks(bar_slots, X_train.columns[ind])
plt.title("Variable Importance")
plt.show()

## AdaBoost

In [None]:
# Reuses the train/test split created in the random-forest cell.

# This section is about AdaBoost, but the cell previously instantiated
# ExtraTreesClassifier again (copied from the section above), so no boosted
# model was ever trained. AdaBoostClassifier is already imported alongside
# the other ensemble classes.
ab = AdaBoostClassifier(random_state=42)  # fixed seed: reproducible fit

# Parameter combinations to try. The tree-specific options of the previous
# grid (max_features, criterion, max_depth, ...) are not AdaBoost parameters;
# the number of boosting rounds and the learning rate are.
parameters = {'n_estimators': [25, 50, 100],
              'learning_rate': [0.1, 0.5, 1.0]
             }

# Score parameter combinations by accuracy.
acc_scorer = make_scorer(accuracy_score)

# Run the grid search.
grid_obj = GridSearchCV(ab, parameters, scoring=acc_scorer, n_jobs=4)
grid_obj = grid_obj.fit(X_train, y_train)

# GridSearchCV refits the best parameter combination on all of X_train by
# default, so best_estimator_ is already trained; no further fit is needed.
ab = grid_obj.best_estimator_
ab

In [None]:
# Hold-out accuracy of the model stored in `ab`.
ab_test_predictions = ab.predict(X_test)
accuracy_score(y_true=y_test, y_pred=ab_test_predictions)

In [None]:
# Predicted Leave labels of the model stored in `ab` on the hold-out rows.
ab.predict(X=X_test)

## Logistic Regression

In [None]:
# Cross-validated logistic regression with default settings, trained on the
# training split (fit returns the estimator itself, which is then displayed).
logreg = LogisticRegressionCV().fit(X_train, y_train)
logreg

In [None]:
# Hold-out accuracy of the logistic regression.
logreg_test_predictions = logreg.predict(X_test)
accuracy_score(y_true=y_test, y_pred=logreg_test_predictions)

## SVM

In [None]:
# Support vector classifier with default settings, trained on the training
# split (fit returns the estimator itself, which is then displayed).
clf = svm.SVC().fit(X_train, y_train)
clf

In [None]:
# Hold-out accuracy of the support vector classifier.
svc_test_predictions = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=svc_test_predictions)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the training split
# (fit returns the estimator itself, which is then displayed).
model = GaussianNB().fit(X_train, y_train)
model

In [None]:
# Hold-out accuracy of the naive Bayes model.
nb_test_predictions = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=nb_test_predictions)

In [None]:
# Save predictors and target side by side; the row index is written as the
# first CSV column.
model_table = pd.concat([X, y], axis=1)
model_table.to_csv('perfect_numeric_data.csv')

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted trees (XGBoost) with default settings, trained on the
# training split (fit returns the estimator itself, which is then displayed).
xmodel = xgb.XGBClassifier().fit(X_train, y_train)
xmodel

In [None]:
# Hold-out accuracy of the XGBoost model.
xgb_test_predictions = xmodel.predict(X_test)
accuracy_score(y_true=y_test, y_pred=xgb_test_predictions)