In [None]:
# Todo:
# 1. Rerun the below once non-numeric fields have been cleaned
# 2. Use pca transformed data with linear SVC (http://scikit-learn.org/stable/modules/svm.html#classification)
#    and test that model out

In [24]:
#Imports
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [16]:
# Render matplotlib figures inline in the notebook output so plots show up
%matplotlib inline

In [17]:
# Notebook-wide settings
# Input CSV, given relative to the notebook's working directory
read_file = '../data/labeled_player_seasons.csv'
# Use a larger default figure size so plots are easy to read
plt.rcParams.update({'figure.figsize': [16.0, 10.0]})

In [19]:
# Load the CSV at `read_file` into a DataFrame.
# NOTE(review): the column listing in the next cell includes 'Unnamed: 0', which
# looks like a row index that was written out with the CSV — consider
# index_col=0 here after confirming against the file.
data = pd.read_csv(read_file)

In [20]:
# Show the column names to see which fields are available
data.columns.values

array(['Unnamed: 0', 'assists', 'gamesPlayed', 'goals', 'otGoals',
       'penaltyMinutes', 'playerFirstName', 'playerId', 'playerLastName',
       'playerName', 'playerPositionCode', 'playerTeamsPlayedFor',
       'plusMinus', 'points', 'pointsPerGame', 'seasonId', 'shiftsPerGame',
       'shootingPctg', 'shots', 'timeOnIcePerGame', 'isSuccessfulSeason'], dtype=object)

In [23]:
# Feature columns fed to the PCA below.
# .copy() makes this an independent frame: a later cell overwrites the
# timeOnIcePerGame column in place, and an explicit copy guarantees that edit
# can neither reach `data` nor trigger a SettingWithCopyWarning.
numeric_data = data.loc[:, ['assists', 'gamesPlayed', 'plusMinus', 'points', 'pointsPerGame',
                            'shiftsPerGame', 'shots', 'timeOnIcePerGame']].copy()
# Target label, kept as a one-column DataFrame
target_data = data.loc[:, ['isSuccessfulSeason']]

In [37]:
# Configure a PCA that keeps three components
pca = PCA(n_components=3)

In [38]:
# Fit the PCA on the selected numeric features exactly as loaded
# (no scaling step is applied to numeric_data before this point in the notebook)
pca_fit = pca.fit(numeric_data)

In [39]:
# Fraction of the total variance explained by each of the three components
pca_fit.explained_variance_ratio_

array([ 0.92096077,  0.04455887,  0.01841544])

In [40]:
# Component loadings: one row per component, one column per input feature
# (columns follow the order of numeric_data)
pca_fit.components_

array([[ 0.14682049,  0.30413369,  0.02063353,  0.24741809,  0.00265006,
         0.04072205,  0.906133  ,  0.03953063],
       [ 0.02156941, -0.945385  ,  0.07927038,  0.12484203,  0.00399045,
        -0.05748895,  0.28194013, -0.03318886],
       [ 0.47877891,  0.04876918,  0.5863311 ,  0.58258151,  0.00846167,
         0.06951683, -0.27285579,  0.07650141]])

In [45]:
# Rerun the PCA after converting timeOnIcePerGame to minutes (presumably from
# seconds, given the division by 60).
# The converted column is derived from the untouched `data` frame rather than
# from numeric_data itself, so re-executing this cell always yields the same
# values instead of dividing by 60 again on every run.
numeric_data['timeOnIcePerGame'] = data['timeOnIcePerGame'].astype(int) / 60
pca = PCA(n_components=3)
pca_fit = pca.fit(numeric_data)
print(pca_fit.explained_variance_ratio_)
print(pca_fit.components_)

[ 0.92294162  0.04467776  0.01839019]
[[  1.46890885e-01   3.04317490e-01   2.06381968e-02   2.47599782e-01
    2.65140698e-03   4.06460960e-02   9.06875283e-01   6.57656159e-04]
 [  2.25170018e-02  -9.46408266e-01   7.98686438e-02   1.25283694e-01
    4.01199152e-03  -5.56074761e-02   2.80394033e-01  -5.22160436e-04]
 [  4.77606909e-01   5.48320752e-02   5.90870938e-01   5.85235541e-01
    8.40985608e-03   5.83191234e-02  -2.71630109e-01   1.10367120e-03]]


In [63]:
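# Looks like shots accounts for the most variance in the set by far: it has a loading of ~0.91 on the
# first component, which alone explains ~92% of the variance. So much for trying to avoid scorer bias...
# At least points isn't the biggest indicator.
# Caveat: the PCA was fit on unscaled features, so a large loading can simply reflect a column's larger
# numeric spread rather than its importance; standardizing the columns first would be a fairer check.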

In [50]:
# Covariance matrix of the input features as estimated by the fitted PCA model
pca_fit.get_covariance()

array([[  2.26270274e+02,   2.62023948e+02,   2.64840085e+01,
          2.24656426e+02,   2.44720783e+00,   3.60172291e+01,
          7.88867396e+02,   5.86664123e-01],
       [  2.62023948e+02,   8.21291257e+02,   2.30810432e+01,
          4.24746206e+02,   4.04488573e+00,   8.41254192e+01,
          1.58577206e+03,   1.28974057e+00],
       [  2.64840085e+01,   2.30810432e+01,   1.05282306e+02,
          4.23055979e+01,   5.32209753e-01,   5.08330609e+00,
          1.11058570e+02,   9.10156835e-02],
       [  2.24656426e+02,   4.24746206e+02,   4.23055979e+01,
          4.68610486e+02,   4.14213211e+00,   5.93682154e+01,
          1.33634543e+03,   9.72761698e-01],
       [  2.44720783e+00,   4.04488573e+00,   5.32209753e-01,
          4.14213211e+00,   9.14807672e+01,   6.09553048e-01,
          1.44436040e+01,   1.02051145e-02],
       [  3.60172291e+01,   8.41254192e+01,   5.08330609e+00,
          5.93682154e+01,   6.09553048e-01,   1.01965872e+02,
          2.15386349e+02,   1.6

In [51]:
# Project the numeric features onto the fitted principal components
transformed_data = pca.transform(numeric_data)

In [56]:
# Wrap the component scores in a DataFrame that shares target_data's index, so
# the label column assigned below lines up row-for-row by construction instead
# of relying on both frames happening to carry the same default RangeIndex.
transformed_data = pd.DataFrame(transformed_data, index=target_data.index)
# Attach the target label next to the component scores
transformed_data['isSuccessfulSeason'] = target_data['isSuccessfulSeason']

In [62]:
# Summary statistics for the component scores and the label column
transformed_data.describe()

Unnamed: 0,0,1,2,isSuccessfulSeason
count,14084.0,14084.0,14084.0,14084.0
mean,2.738233e-13,-3.85138e-15,-2.935706e-15,0.16714
std,77.66767,17.0883,10.96343,0.373114
min,-94.72989,-47.59861,-60.07938,0.0
25%,-69.18526,-12.9866,-5.194858,0.0
50%,-11.09416,0.8612966,-0.05218542,0.0
75%,50.19715,14.64383,3.684261,0.0
max,444.0051,109.9754,90.61178,1.0
