In [1]:
# Todo:
# [x] Rerun the below with scaled (0, 1) numeric values
# [ ] Rerun the below once non-numeric fields have been cleaned
# [ ] Use pca transformed data with linear SVC (http://scikit-learn.org/stable/modules/svm.html#classification)
#    and test that model out

In [20]:
#Imports
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [3]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [4]:
# Global Constants
# Relative path to the labeled player-season CSV that the next cell loads.
read_file = '../data/labeled_player_seasons.csv'
# Enlarge matplotlib's default figure size so every plot in the notebook is readable.
plt.rcParams['figure.figsize'] = [16.0, 10.0] # Make plots visible

In [5]:
# Load the labeled player-season table from the path set in `read_file`.
# NOTE(review): the column listing shown in the following cell includes
# 'Unnamed: 0', which usually means the CSV was written with its index;
# consider index_col=0 here if that column is not needed — confirm first.
data = pd.read_csv(read_file)

In [6]:
# Show the available column names so the feature and label columns can be chosen.
data.columns.values

array(['Unnamed: 0', 'assists', 'gamesPlayed', 'goals', 'otGoals',
       'penaltyMinutes', 'playerFirstName', 'playerId', 'playerLastName',
       'playerName', 'playerPositionCode', 'playerTeamsPlayedFor',
       'plusMinus', 'points', 'pointsPerGame', 'seasonId', 'shiftsPerGame',
       'shootingPctg', 'shots', 'timeOnIcePerGame', 'isSuccessfulSeason'], dtype=object)

In [7]:
# Split the loaded frame into PCA inputs and the label.
# Numeric per-season columns used as model inputs.
feature_columns = [
    'assists', 'gamesPlayed', 'plusMinus', 'points',
    'pointsPerGame', 'shiftsPerGame', 'shots', 'timeOnIcePerGame',
]
# The label is selected with a one-element list so it stays a DataFrame
# rather than collapsing to a Series.
label_columns = ['isSuccessfulSeason']

numeric_data = data.loc[:, feature_columns]
target_data = data.loc[:, label_columns]

In [24]:
# Rescale each numeric feature with MinMaxScaler (default settings) so that
# columns measured on larger scales do not dominate the PCA variance.
# Observed effect: scaling changes how the explained variance is spread
# across components.
# This cell is optional — skip it to compare models on unscaled inputs once
# prediction work starts.
# NOTE(review): the result is rebound to `numeric_data`, which appears to turn
# it from a DataFrame into a NumPy array (column labels dropped) — confirm
# that later cells do not rely on the labels.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(numeric_data)
numeric_data = min_max_scaler.transform(numeric_data)

In [33]:
# Reduce the feature matrix to three principal components.
# The solver is left at scikit-learn's default ('auto'); for a tall, narrow
# matrix like this one (about 14k rows, 8 features) that default can resolve
# to the randomized SVD solver depending on the installed version, so the
# seed is pinned to keep the fitted components reproducible across
# Restart & Run All.
pca = PCA(n_components=3, random_state=0)

In [34]:
# Learn the principal axes from the feature matrix. Per the scikit-learn
# estimator API, fit() returns the estimator itself, so `pca_fit` and `pca`
# should refer to the same fitted object.
pca_fit = pca.fit(numeric_data)

In [35]:
# Share of total variance captured by each retained component. In the recorded
# output the three ratios sum to roughly 0.94, with the first component alone
# at about 0.73.
pca_fit.explained_variance_ratio_

array([ 0.72868668,  0.12300718,  0.09246464])

In [36]:
# Principal axes in feature space: one row per component, one column per input
# feature (3 x 8 in the recorded output; columns presumably follow the order
# in which the features were selected — verify before interpreting loadings).
pca_fit.components_

array([[ 0.27107509,  0.73387955,  0.03779468,  0.330848  ,  0.2178682 ,
         0.26410887,  0.27386728,  0.29125063],
       [ 0.18642117, -0.6254723 ,  0.13121056,  0.19713022,  0.32036883,
         0.42125715,  0.05938281,  0.48408152],
       [ 0.29744886, -0.17711802,  0.16108615,  0.45121892,  0.40243189,
        -0.49639588,  0.23268013, -0.43371164]])

In [37]:
# Feature covariance matrix as estimated by the fitted PCA model
# (8 x 8 in the recorded output, one row/column per input feature).
pca_fit.get_covariance()

array([[ 0.0290389 ,  0.03250568,  0.00269573,  0.01774105,  0.01260821,
         0.01258352,  0.01394328,  0.0142652 ],
       [ 0.03250568,  0.11673608,  0.00317099,  0.03985796,  0.02397116,
         0.03036924,  0.03437488,  0.03308206],
       [ 0.00269573,  0.00317099,  0.01543113,  0.00335625,  0.00281276,
         0.00196468,  0.00232097,  0.00238738],
       [ 0.01774105,  0.03985796,  0.00335625,  0.03655682,  0.01555746,
         0.014713  ,  0.0171829 ,  0.01678492],
       [ 0.01260821,  0.02397116,  0.00281276,  0.01555746,  0.02635294,
         0.0105996 ,  0.01174932,  0.01223794],
       [ 0.01258352,  0.03036924,  0.00196468,  0.014713  ,  0.0105996 ,
         0.03240764,  0.01208038,  0.01921065],
       [ 0.01394328,  0.03437488,  0.00232097,  0.0171829 ,  0.01174932,
         0.01208038,  0.02842829,  0.01359604],
       [ 0.0142652 ,  0.03308206,  0.00238738,  0.01678492,  0.01223794,
         0.01921065,  0.01359604,  0.0355057 ]])

In [38]:
# Project every row of the feature matrix onto the fitted principal components.
transformed_data = pca.transform(numeric_data)

In [39]:
# Wrap the component scores in a DataFrame and attach the label column.
# pandas aligns column assignment on index labels, not on row position. The
# scores therefore reuse target_data's index: with the constructor's default
# 0..n-1 index, any non-default index on the source data would silently leave
# labels as NaN or attach them to the wrong rows. A row-count mismatch now
# raises instead of passing unnoticed.
transformed_data = pd.DataFrame(transformed_data, index=target_data.index)
transformed_data['isSuccessfulSeason'] = target_data['isSuccessfulSeason']

In [40]:
# Summary statistics for the three component scores and the label. In the
# recorded output the label mean is about 0.167 with min 0 and max 1, i.e.
# roughly one season in six is marked successful (assuming a 0/1 label).
transformed_data.describe()

Unnamed: 0,0,1,2,isSuccessfulSeason
count,14084.0,14084.0,14084.0,14084.0
mean,9.620251000000001e-17,-4.1123930000000004e-17,-4.2415740000000004e-17,0.16714
std,0.4369109,0.1795096,0.1556361,0.373114
min,-0.8687819,-0.6188767,-0.4678942,0.0
25%,-0.4048747,-0.1146851,-0.100853,0.0
50%,0.05091692,0.005100424,0.009426242,0.0
75%,0.3553148,0.1206502,0.09941249,0.0
max,1.136343,0.6694412,0.7489517,1.0
