In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

  from pandas.core import datetools


In [2]:
teams_abbr = {'- - -': '- - -', 'ANA': 'Angels', 'ARI': 'Diamondbacks', 
 'ATL': 'Braves', 'BAL': 'Orioles', 'BOS': 'Red Sox', 'CHC': 'Cubs', 
 'CHW': 'White Sox', 'CIN': 'Reds', 'CLE': 'Indians', 'COL': 'Rockies',
 'DET': 'Tigers', 'FLA': 'Marlins', 'HOU': 'Astros', 'KCR': 'Royals', 
 'LAA': 'Angels', 'LAD': 'Dodgers', 'MIA': 'Marlins', 'MIL': 'Brewers',
 'MIN': 'Twins', 'MON': 'Expos', 'NYM': 'Mets', 'NYY': 'Yankees',
 'OAK': 'Athletics', 'PHI': 'Phillies', 'PIT': 'Pirates', 'SDP': 'Padres',
 'SEA': 'Mariners', 'SFG': 'Giants', 'STL': 'Cardinals', 'TBD': 'Devil Rays', 
 'TBR': 'Rays', 'TEX': 'Rangers', 'TOR': 'Blue Jays', 'WSN': 'Nationals'}

In [3]:
#read in all data
sp = pd.read_csv('FanGraphs_Starters (2).csv')
rp = pd.read_csv('FanGraphs_Relievers (2).csv')
fd = pd.read_csv('FanGraphs_Fielders (2).csv')
bt = pd.read_csv('FanGraphs_Batting (2).csv')

a, b, c = len(sp),len(rp),len(fd)
print("""Number of starter pitchers:\t{}
Number of relief pitchers:\t{}
Number of fielders:\t\t{}
-------------------------------------
Total number of records:\t{}""".format(a, b, c, a+b+c))

#change abbreviation to full team name
fd.loc[[True]*len(fd),'Team'] = [teams_abbr[k] for k in fd.Team]



Number of starter pitchers:	2741
Number of relief pitchers:	1265
Number of fielders:		16122
-------------------------------------
Total number of records:	20128


In [4]:
bt.columns

Index(['Season', 'Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%',
       'K%', 'ISO', 'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'wRC+', 'BsR', 'Off',
       'Def', 'WAR', 'playerid'],
      dtype='object')

In [5]:
rp.columns

Index(['Season', 'Name', 'Team', 'W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9',
       'BB/9', 'HR/9', 'BABIP', 'LOB%', 'GB%', 'HR/FB', 'ERA', 'FIP', 'xFIP',
       'WAR', 'playerid'],
      dtype='object')

In [6]:
sp.columns

Index(['Season', 'Name', 'Team', 'W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9',
       'BB/9', 'HR/9', 'BABIP', 'LOB%', 'GB%', 'HR/FB', 'ERA', 'FIP', 'xFIP',
       'WAR', 'playerid'],
      dtype='object')

In [7]:
print(fd.columns)
fd.sample(5)

Index(['Season', 'Name', 'Team', 'Pos', 'Inn', 'rSB', 'rGDP', 'rARM', 'rGFP',
       'rPM', 'DRS', 'BIZ', 'Plays', 'RZR', 'OOZ', 'CPP', 'RPP', 'TZL', 'FSR',
       'ARM', 'DPR', 'RngR', 'ErrR', 'UZR', 'UZR/150', 'Def', 'playerid'],
      dtype='object')


Unnamed: 0,Season,Name,Team,Pos,Inn,rSB,rGDP,rARM,rGFP,rPM,...,TZL,FSR,ARM,DPR,RngR,ErrR,UZR,UZR/150,Def,playerid
14107,2010,Ervin Santana,Angels,P,222.2,-4.0,,,0,-3.0,...,,,,,,,,,,3200
5264,2012,Adeiny Hechavarria,Blue Jays,3B,156.0,,0.0,,1,-2.0,...,,2.0,,0.3,-1.3,1.1,0.1,-0.1,0.4,10459
15813,2016,Joe Ross,Nationals,P,105.0,0.0,,,1,-2.0,...,,,,,,,,,,12972
13760,2009,Tomo Ohka,Indians,P,71.0,0.0,,,0,2.0,...,,,,,,,,,,788
4208,2011,Donnie Murphy,Marlins,3B,79.2,,0.0,,0,0.0,...,,0.0,,-0.1,1.2,0.0,1.0,14.2,1.2,4704


In [8]:
#create a 3 sets containing tuples of playerids with the season for each record
fd_season_id = set([tuple(row) for row in fd[['playerid','Season']].values])
rp_season_id = set([tuple(row) for row in rp[['playerid','Season']].values])
sp_season_id = set([tuple(row) for row in sp[['playerid','Season']].values])
bt_season_id = set([tuple(row) for row in bt[['playerid','Season']].values])

a, b, c = len(sp_season_id), len(rp_season_id), len(fd_season_id)
total = set()
total.update(sp_season_id);total.update(rp_season_id);total.update(fd_season_id)

print("Only counting players once per season.\n")
print("""Number of starter pitchers:\t{}
Number of relief pitchers:\t{}
Number of fielders:\t\t{}
-------------------------------------
Total number of records:\t{}""".format(a, b, c, len(total)))


Only counting players once per season.

Number of starter pitchers:	2675
Number of relief pitchers:	1265
Number of fielders:		12253
-------------------------------------
Total number of records:	12254


In [9]:
print(len(bt_season_id.intersection(fd_season_id)))
print(len(bt_season_id.intersection(rp_season_id)))
print(len(bt_season_id.intersection(sp_season_id)))
print(len(rp_season_id.intersection(fd_season_id)))
print(len(rp_season_id.intersection(sp_season_id)))
print(len(sp_season_id.intersection(fd_season_id)))


8157
0
590
1264
3
2675


In [10]:
#create temporary Seasonid column
fd.insert(2,'Seasonid',[tuple(row) for row in fd[['playerid','Season']].values])
rp.insert(2,'Seasonid',[tuple(row) for row in rp[['playerid','Season']].values])
sp.insert(2,'Seasonid',[tuple(row) for row in sp[['playerid','Season']].values])

#find and remove duplicates between dataframes
#mask = fd.Seasonid.isin(sp_season_id)
#fd.drop(fd[mask].index,inplace=True)

#mask = fd.Seasonid.isin(rp_season_id)
#fd.drop(fd[mask].index,inplace=True)

mask = rp.Seasonid.isin(sp_season_id)
rp.drop(rp[mask].index,inplace=True)

#remove temporary Seasonid column
fd.drop('Seasonid',axis=1,inplace=True)
sp.drop('Seasonid',axis=1,inplace=True)
rp.drop('Seasonid',axis=1,inplace=True)

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

In [None]:
#change all fielder positions (except catcher) to FD
fd.loc[fd.Pos.isin(['1B','2B','3B','RF','LF','CF','SS']),'Pos'] = 'FD'

#add column with potision to prepare for combining data
sp.insert(2, 'Pos', 'SP')
rp.insert(2,'Pos', 'RP')

In [None]:
p = pd.concat([sp,rp,fd], ignore_index=True)
#p = pd.merge(sp,rp, on=['Season', 'Name']'Seasonid', how='outer')
#p = pd.merge(p, fd, on='Seasonid', how='outer')
p.drop(p[p.Season<2002].index,inplace=True)
p.reset_index()

p.sample(5)

In [None]:
p = p.drop(['rSB', 'rGDP', 'rARM', 'rGFP',
       'rPM', 'BIZ', 'Plays', 'RZR', 'OOZ', 'CPP', 'TZL', 'FSR',
       'ARM', 'DPR', 'RngR', 'ErrR', 'UZR/150', 'Def',
        'L', 'SV', 'G', 'GS', 'IP', 'K/9',
       'BB/9', 'HR/9', 'BABIP', 'LOB%', 'GB%', 'HR/FB', 'ERA', 'xFIP',
       ], axis=1)

In [None]:
p['DRS'].fillna(p['DRS'].mean(), inplace=True)
p['RPP'].fillna(p['RPP'].mean(), inplace=True)
p['UZR'].fillna(p['UZR'].mean(), inplace=True)
p['FIP'].fillna(p['FIP'].mean(), inplace=True)
p['Inn'].fillna(p['Inn'].mean(), inplace=True)
p['WAR'].fillna(p['WAR'].mean(), inplace=True)
p['W'].fillna(p['W'].mean(), inplace=True)


In [None]:
by_pos = p.groupby('Pos')
for g in by_pos.groups:
    group = by_pos.get_group(g)
    print('#'*50)
    print(g)
    for c in p.columns:
        print(c,'\t', len(group)-group[c].count())

In [None]:
#account for team name changes
p.loc[p.Team=='Devil Rays','Team'] = 'Rays'
p.loc[p.Team=='Expos','Team'] = 'Nationals'
del(teams_abbr['MON'])
del(teams_abbr['TBD'])

#delete players from a season if their team is unknown
p.drop(p[p.Team=='- - -'].index,inplace=True)
del(teams_abbr['- - -'])

In [None]:
print("Number of records before:\t{}".format(len(p)))
p.drop_duplicates(inplace=True)
print("Number of records after:\t{}".format(len(p)))

In [None]:
p.sample(5)

In [None]:
p.dropna(axis=1, how='any')

In [None]:
if 'Leave' in p.columns:
    p.drop('Leave', axis=1, inplace=True)
if 'Stay_Length' in p.columns:
    p.drop('Stay_Length', axis=1, inplace=True)

#initialize the new column with value 'No Change'
p.insert(1, 'Leave', 0)
p.insert(1, 'Stay_Length', 0)
by_player = p.groupby('playerid')

#loop through, player by player
for g in by_player.groups:
    one_p = by_player.get_group(g)
    
    counter = 1
    #loop through the years for each player
    for y in sorted(one_p['Season'].unique()):
        p.loc[one_p[one_p.Season==y].index,'Stay_Length'] = counter
        #Case: no next two seasons, counted as leave
        if sum(one_p.Season==y+1)==0:
            if sum(one_p.Season==y+2)==0:
                p.loc[one_p[one_p.Season==y].index,'Leave'] = 1
                counter = 1
            elif (one_p[one_p.Season==y].Team.values[0] != 
                    one_p[one_p.Season==y+2].Team.values[0]):
                p.loc[one_p[one_p.Season==y].index,'Leave'] = 1
                counter = 1
            else:
                counter += 1 
        
        #Case: Leave in next season
        elif (one_p[one_p.Season==y].Team.values[0] != 
                    one_p[one_p.Season==y+1].Team.values[0]):
            p.loc[one_p[one_p.Season==y].index,'Leave'] = 1
            counter = 1
            
        else:
            counter += 1
            


In [None]:
p.sort_values(by=['playerid','Season'])

In [None]:
df1 = []
by_player = p.groupby('playerid')

#loop through, player by player
for g in by_player.groups:
    one_p = by_player.get_group(g)
    for year in one_p.Seasons.unique():
        df1.append([one_p.Name.values[0],year])
df1 = pd.DataFrame(df1)



In [None]:
for col in ['P', 'SP', 'RP', 'FD', 'C']:
    if col in p.columns:
        p.drop(col, axis=1, inplace=True)

#one-hot encode positions
for position in p.Pos.unique():
    p.insert(0, position, (p.Pos==position).apply(int))

#one-hot encode teams
for team in p.Team.unique():
    p.insert(0, team, (p.Team==team).apply(int))

p.sample(5)

In [None]:
temp = p[list(p.Pos.unique())+list(p.Team.unique())+['Stay_Length']]
#temp = p[['Stay_Length', 'FD', 'C', 'P', 'SP', 'RP', 'Inn', 'W', 'WAR', 'DRS', 'RPP', 'UZR', 'FIP']]
#temp['P*WAR'] = temp['P']*temp['WAR']
#temp['P*FIP'] = temp['P']*temp['FIP']
#temp['FD*UZR'] = temp['FD']*temp['UZR']
#temp['C*DRS'] = temp['C']*temp['DRS']

#X = temp[['Stay_Length', 'FD', 'C', 'P', 'RP', 'SP']]#, 'Inn', 'W', 'RPP', 'P*WAR', 'C*DRS', 'FD*UZR', 'P*FIP']]
X = temp

y = p['Leave']

pd.concat([X,y],axis=1)

In [None]:
results = sm.OLS(y,X).fit()

In [None]:
print(results.summary())

In [None]:
np.average(y)

In [None]:
temp = results.predict(X)

plt.hist(temp.values)

In [None]:
temp = results.predict(X[X.P==1])
plt.hist(temp.values)
plt.show()

plt.hist(X[(y==1)&(X.P==1)].Stay_Length)
plt.show()

np.average(X[(y==1)&(X.P==1)].Stay_Length)

In [None]:
temp = results.predict(X[X.RP==1])
plt.hist(temp.values)
plt.show()

plt.hist(X[(y==1)&(X.RP==1)].Stay_Length)
plt.show()

np.average(X[(y==1)&(X.RP==1)].Stay_Length)

In [None]:
temp = results.predict(X[X.SP==1])
plt.hist(temp.values)
plt.show()

plt.hist(X[(y==1)&(X.SP==1)].Stay_Length)
plt.show()

np.average(X[(y==1)&(X.SP==1)].Stay_Length)

In [None]:
temp = results.predict(X[X.FD==1])
plt.hist(temp.values)
plt.show()

plt.hist(X[(y==1)&(X.FD==1)].Stay_Length)
plt.show()

np.median(X[(y==1)&(X.FD==1)].Stay_Length)

In [None]:
temp = results.predict(X[X.C==1])
plt.hist(temp.values)
plt.show()

plt.hist(X[(y==1)&(X.C==1)].Stay_Length)
plt.show()

np.average(X[(y==1)&(X.C==1)].Stay_Length)

In [None]:
#for col in ['P', 'RP','SP','C','FD']:
#    print(col)
data = [X[(y==1)&(X[col]==1)].Stay_Length.values for col in ['P', 'RP','SP','C','FD']]
plt.boxplot(data)
plt.xticks(np.arange(1,1+5), ['P', 'RP','SP','C','FD'])
plt.show()

In [None]:
X[(y==1)&(X[col]==1)].Stay_Length.values

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

In [None]:
da

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

rf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [9, 4, 6],
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [10, 2, 3, 5], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [5, 1,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(rf, parameters, n_jobs=4)#, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
rf = grid_obj.best_estimator_


rf.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, rf.predict(X_test))

In [None]:
rf.predict(X_test)

## Boosted Forest

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

et = ExtraTreesClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [9, 4, 6],
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [10, 2, 3, 5], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [5, 1,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(et, parameters, n_jobs=4)#, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
et = grid_obj.best_estimator_


et.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, et.predict(X_test))

In [None]:
et.predict(X_test)

## Adaboost

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

ab = ExtraTreesClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [9, 4, 6],
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [10, 2, 3, 5], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [5, 1,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(ab, parameters, n_jobs=4)#, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
ab = grid_obj.best_estimator_


ab.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, ab.predict(X_test))

In [None]:
ab.predict(X_test)

## Logistic Regression

In [None]:
logreg = LogisticRegressionCV()

logreg.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, logreg.predict(X_test))

## SVM

In [None]:
clf = svm.SVC()
clf.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, clf.predict(X_test))