In [2]:
import os
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv(os.path.join("data", "db", "Observations.csv"))

## Cleaning and Feature Selection

In [4]:
df['Annual Salary'] = df['Annual Salary'].str.replace("$", "")
df['Annual Salary'] = df['Annual Salary'].str.replace(",", "")
df['Annual Salary'] = pd.to_numeric(df['Annual Salary'])
df['Log Annual Salary'] = np.log(df['Annual Salary'])
#df.head()

# Average Player Salary by year
df_gb_year = df.groupby('Salary Year')
counts = df_gb_year.count()
average_player_salaries = df_gb_year.sum()['Annual Salary'] / df_gb_year.count()['Annual Salary']
avg_player_salaries_df = average_player_salaries.reset_index()
avg_player_salaries_df.columns = ['Salary Year', 'Average Annual Player Salary']
df = pd.merge(df, avg_player_salaries_df, on='Salary Year')

# Adjust salaries by the average annual salary in that year, to account for inflation
df['Scaled Salary'] = df['Annual Salary'] / df['Average Annual Player Salary']

# Pitching metrics
# Wins per Inning Pitched. This might be more useful than wins alone.
df['Pitching_Career_WPIP'] = df['Pitching_Career_W'] / df['Pitching_Career_IP'] 

# Starter or reliever? Games Started Per Full Games Played
df['Pitching_Career_GSPFGP']= df['Pitching_Career_GS'] * 9 / df['Pitching_Career_IP']   
df.head()

# Set position to prior year's
df['Position'] = df['Fielding_POS.1.Year-1']

# Add categorical value for multiple positions
df.loc[df['Num Positions.Year-1'] > 1, ('Position')] = 'MULTIPLE'

# Subset columns
df = pd.DataFrame(df, columns=['Player Id', 'Salary Year', 'Annual Salary', 'Scaled Salary',
                               'Log Annual Salary', 'Contract Years', 'Position', 
                               'Batting_Career_Num_Seasons', 'Batting_Career_G', 'Batting_Career_AVG', 
                               'Batting_Career_PSN', 'Batting_Career_SB', 'Batting_Career_HR',
                               'Batting_Career_RBI', 
                               'Pitching_Career_Num_Seasons', 'Pitching_Career_G', 'Pitching_Career_ER',
                               'Pitching_Career_ERA', 'Pitching_Career_IP', 'Pitching_Career_SO',
                               'Pitching_Career_SHO', 'Pitching_Career_W', 'Pitching_Career_L',
                               'Pitching_Career_WPIP', 'Pitching_Career_GS', 'Pitching_Career_GSPFGP',
                               'Fielding_Career_Num_Seasons', 'Fielding_Career_G', 'Fielding_Career_FPCT', 
                               'Fielding_Career_A', 'Fielding_Career_PO', 'Fielding_Career_E'])
df.head()

Unnamed: 0,Player Id,Salary Year,Annual Salary,Scaled Salary,Log Annual Salary,Contract Years,Position,Batting_Career_Num_Seasons,Batting_Career_G,Batting_Career_AVG,...,Pitching_Career_L,Pitching_Career_WPIP,Pitching_Career_GS,Pitching_Career_GSPFGP,Fielding_Career_Num_Seasons,Fielding_Career_G,Fielding_Career_FPCT,Fielding_Career_A,Fielding_Career_PO,Fielding_Career_E
0,jbverlander,2015,25714285,5.575865,17.062557,7 (2013-19),P,9,17.0,0.0625,...,89.0,0.0769,298.0,1.356875,10,298.0,0.926431,218.0,122.0,27.0
1,dzgreinke,2015,24500000,5.312561,17.014184,6 (2013-18),P,10,118.0,0.21875,...,95.0,0.066282,325.0,1.404899,11,366.0,0.989384,286.0,180.0,5.0
2,jhhamilton,2015,25000000,5.42098,17.034386,5 (2013-17),MULTIPLE,8,977.0,0.291789,...,0.0,,0.0,,8,1786.0,0.979695,88.0,3772.0,80.0
3,rjhoward,2015,25000000,5.42098,17.034386,5 (2012-16),1B,11,1331.0,0.265431,...,0.0,,0.0,,11,1278.0,0.991313,717.0,10923.0,102.0
4,fahernandez,2015,25000000,5.42098,17.034386,7 (2013-19),P,9,14.0,0.117647,...,92.0,0.060736,303.0,1.325009,10,303.0,0.971084,249.0,154.0,12.0


In [5]:
# Cleanup
# Replace mising values
df = df.fillna(0.0)
df = df.replace('-', 0.0)
df = df.replace('', 0.0)
df = df.replace('.---', 0.0)
df = df.replace('nan', 0.0)
df = df.round(3)
df.head()


Unnamed: 0,Player Id,Salary Year,Annual Salary,Scaled Salary,Log Annual Salary,Contract Years,Position,Batting_Career_Num_Seasons,Batting_Career_G,Batting_Career_AVG,...,Pitching_Career_L,Pitching_Career_WPIP,Pitching_Career_GS,Pitching_Career_GSPFGP,Fielding_Career_Num_Seasons,Fielding_Career_G,Fielding_Career_FPCT,Fielding_Career_A,Fielding_Career_PO,Fielding_Career_E
0,jbverlander,2015,25714285,5.576,17.063,7 (2013-19),P,9,17.0,0.062,...,89.0,0.077,298.0,1.357,10,298.0,0.926,218.0,122.0,27.0
1,dzgreinke,2015,24500000,5.313,17.014,6 (2013-18),P,10,118.0,0.219,...,95.0,0.066,325.0,1.405,11,366.0,0.989,286.0,180.0,5.0
2,jhhamilton,2015,25000000,5.421,17.034,5 (2013-17),MULTIPLE,8,977.0,0.292,...,0.0,0.0,0.0,0.0,8,1786.0,0.98,88.0,3772.0,80.0
3,rjhoward,2015,25000000,5.421,17.034,5 (2012-16),1B,11,1331.0,0.265,...,0.0,0.0,0.0,0.0,11,1278.0,0.991,717.0,10923.0,102.0
4,fahernandez,2015,25000000,5.421,17.034,7 (2013-19),P,9,14.0,0.118,...,92.0,0.061,303.0,1.325,10,303.0,0.971,249.0,154.0,12.0


# Replace Categorical variables


In [6]:
# Replace categorical variables with dummy variables
import re
dummy_vars_df = pd.get_dummies(df['Position'])
for dummy_col in dummy_vars_df.columns:
    df[dummy_col] = dummy_vars_df[dummy_col]
    


In [7]:
# Select subset of features
# This is me making an educated guess about which ones are predictive. I'll just use the most common stats
# for now, but will add more later as they become available (and as the amount of data available grows).

df.head()


Unnamed: 0,Player Id,Salary Year,Annual Salary,Scaled Salary,Log Annual Salary,Contract Years,Position,Batting_Career_Num_Seasons,Batting_Career_G,Batting_Career_AVG,...,Fielding_Career_PO,Fielding_Career_E,0.0,1B,2B,3B,C,MULTIPLE,P,SS
0,jbverlander,2015,25714285,5.576,17.063,7 (2013-19),P,9,17.0,0.062,...,122.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,dzgreinke,2015,24500000,5.313,17.014,6 (2013-18),P,10,118.0,0.219,...,180.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,jhhamilton,2015,25000000,5.421,17.034,5 (2013-17),MULTIPLE,8,977.0,0.292,...,3772.0,80.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,rjhoward,2015,25000000,5.421,17.034,5 (2012-16),1B,11,1331.0,0.265,...,10923.0,102.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,fahernandez,2015,25000000,5.421,17.034,7 (2013-19),P,9,14.0,0.118,...,154.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Subset 

We want to look at a few stats:

In [8]:
df = pd.DataFrame(df, columns=['Player Id', 'Salary Year', 'Scaled Salary',
                               'Batting_Career_Num_Seasons', 
                               'Batting_Career_HR',
                               'Batting_Career_SB',
                               'Batting_Career_RBI',
                               'Pitching_Career_SO',
                               'Pitching_Career_ERA',
                               'Fielding_Career_A',
                               'Fielding_Career_PO',
                               'Fielding_Career_E',
                               'Fielding_Career_G',
                               '0.0', '1B', '2B', '3B', 'C', 'MULTIPLE', 'P', 'SS'])
#df.columns
df.head()

Unnamed: 0,Player Id,Salary Year,Scaled Salary,Batting_Career_Num_Seasons,Batting_Career_HR,Batting_Career_SB,Batting_Career_RBI,Pitching_Career_SO,Pitching_Career_ERA,Fielding_Career_A,...,Fielding_Career_E,Fielding_Career_G,0.0,1B,2B,3B,C,MULTIPLE,P,SS
0,jbverlander,2015,5.576,9,0.0,0.0,0.0,1830.0,3.529,218.0,...,27.0,298.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,dzgreinke,2015,5.313,10,4.0,3.0,11.0,1887.0,3.545,286.0,...,5.0,366.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,jhhamilton,2015,5.421,8,192.0,50.0,676.0,0.0,0.0,88.0,...,80.0,1786.0,,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,rjhoward,2015,5.421,11,334.0,12.0,1058.0,0.0,0.0,717.0,...,102.0,1278.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,fahernandez,2015,5.421,9,1.0,0.0,7.0,1951.0,3.079,249.0,...,12.0,303.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Normalization

In [9]:
# Scale values from 0 to 1
for column in df.columns[3:]:
    #print(column)
    df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min()) 
df = df.round(3)
df = df.fillna(0.0)
df.head()

Unnamed: 0,Player Id,Salary Year,Scaled Salary,Batting_Career_Num_Seasons,Batting_Career_HR,Batting_Career_SB,Batting_Career_RBI,Pitching_Career_SO,Pitching_Career_ERA,Fielding_Career_A,...,Fielding_Career_E,Fielding_Career_G,0.0,1B,2B,3B,C,MULTIPLE,P,SS
0,jbverlander,2015,5.576,0.45,0.0,0.0,0.0,0.681,0.118,0.034,...,0.094,0.065,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,dzgreinke,2015,5.313,0.5,0.006,0.005,0.006,0.702,0.118,0.045,...,0.017,0.08,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,jhhamilton,2015,5.421,0.4,0.294,0.085,0.343,0.0,0.0,0.014,...,0.279,0.389,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,rjhoward,2015,5.421,0.55,0.511,0.02,0.537,0.0,0.0,0.113,...,0.355,0.278,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,fahernandez,2015,5.421,0.45,0.002,0.0,0.004,0.726,0.103,0.039,...,0.042,0.066,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


##  Cross-Validation
Split players into test/train sets, then use the corresponding observations.


In [12]:
players = df['Player Id'].unique()

from sklearn import linear_model
from sklearn import cross_validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold

players = df['Player Id'].unique()

columns = ['Batting_Career_Num_Seasons', 
           'Batting_Career_HR',
           'Batting_Career_SB',
           'Batting_Career_RBI',
           'Pitching_Career_SO',
           'Pitching_Career_ERA',
           'Fielding_Career_A',
           'Fielding_Career_PO',
           'Fielding_Career_E',
           'Fielding_Career_G',
           '0.0', '1B', '2B', '3B', 'C', 'MULTIPLE', 'P', 'SS']

regr = linear_model.LinearRegression()

print("{} observations".format(len(df)))
print("{} players".format(players.size))

# Use GroupKFold cross-validation
X = np.asarray(pd.DataFrame(df, columns=columns))
y = np.asarray(df['Scaled Salary'])
regr = linear_model.LinearRegression()

# Simple train/test split
x_train, x_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=55)
regr.fit(x_train, y_train)
score = regr.score(x_test, y_test)
print(score)

# K-fold group cross-validation
df.sort(['Player Id'], inplace=True)
players = list(df['Player Id'].values)

groups = [players.index(row['Player Id']) for index, row in df.iterrows()]
score = cross_val_score(regr, X, y, groups, cv=GroupKFold(n_splits=5))

2259 observations
1088 players
0.554553717179




ImportError: DLL load failed: %1 is not a valid Win32 application.