In [1]:
# Packages
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import spearmanr

from feature_functions import spearman_rank, boxplot, density_plot, diff_in_means, diff_in_medians

## Acquire continuous features

In [2]:
# Create the engine to connect to the MySQL database.
# The connection URL is read from the NHL_DB_URL environment variable when it
# is set, so real credentials do not have to be stored in the notebook. The
# original local-development URL is kept as the fallback so the notebook still
# runs unchanged on a machine without that variable.
db_url = os.environ.get('NHL_DB_URL', 'mysql+mysqlconnector://root:root@localhost/nhl')
engine = sqlalchemy.create_engine(db_url)

In [3]:
# Full feature query: starts from every row of skater_games (player_id, date
# and the goal count G) and LEFT JOINs, on (player_id, date), the
# skater_per60_rolling3/5/10/15/20 tables and the point_streak table.
# LEFT JOINs keep games that have no matching row in a feature table.
# NOTE(review): b.* .. g.* select every column of the joined tables; if those
# tables also expose player_id/date the result will carry duplicate column
# names — verify before relying on positional column slicing downstream.
master_query = """
SELECT a.player_id,
    a.date,
    a.G,
    b.*,
    c.*,
    d.*,
    e.*,
    f.*,
    g.*
FROM skater_games a
LEFT JOIN skater_per60_rolling3 b
    ON a.player_id = b.player_id AND a.date = b.date
LEFT JOIN skater_per60_rolling5 c
    ON a.player_id = c.player_id AND a.date = c.date
LEFT JOIN skater_per60_rolling10 d
    ON a.player_id = d.player_id AND a.date = d.date
LEFT JOIN skater_per60_rolling15 e
    ON a.player_id = e.player_id AND a.date = e.date
LEFT JOIN skater_per60_rolling20 f
    ON a.player_id = f.player_id AND a.date = f.date
LEFT JOIN point_streak g
    ON a.player_id = g.player_id AND a.date = g.date
"""

# Reduced query used for testing: only the skater_per60_rolling3 table is
# joined. The INNER JOIN drops games that have no matching row in that table.
per60_3_query = """
SELECT a.player_id,
    a.date,
    a.G,
    b.*
FROM skater_games a
INNER JOIN skater_per60_rolling3 b
    ON a.player_id = b.player_id AND a.date = b.date
"""

In [4]:
# Full run (every joined feature table) — disabled while testing:
# features = pd.read_sql(master_query, con=engine)

# Test run: load only the rolling-3 features
features = pd.read_sql(sql=per60_3_query, con=engine)

In [5]:
# Report the size of the loaded frame, then preview its first five rows
n_rows, n_cols = features.shape
print(f'Num rows: {n_rows}\nNum columns: {n_cols}')
display(features.head())

Num rows: 117367
Num columns: 22


Unnamed: 0,player_id,date,G,G60_3,A60_3,P60_3,rating60_3,PIM60_3,EVG60_3,PPG60_3,...,EVA60_3,PPA60_3,SHA60_3,S60_3,shifts60_3,HIT60_3,BLK60_3,FOW60_3,FOL60_3,avgTOI_3
0,/a/abruzni01,2022-04-12,0,0.0,0.0,0.0,-2.180497,0.0,0.0,0.0,...,0.0,0.0,0.0,4.360994,87.219888,4.360994,4.360994,0.0,0.0,9.17222
1,/a/abruzni01,2022-04-17,0,0.0,0.0,0.0,-2.187121,0.0,0.0,0.0,...,0.0,0.0,0.0,6.561362,80.923461,4.374241,4.374241,0.0,0.0,9.144443
2,/a/abruzni01,2022-04-19,0,0.0,0.0,0.0,-2.608696,0.0,0.0,0.0,...,0.0,0.0,0.0,2.608696,88.695652,0.0,2.608696,0.0,0.0,7.666667
3,/a/abruzni01,2022-04-24,0,0.0,0.0,0.0,-2.03275,4.065499,0.0,0.0,...,0.0,0.0,0.0,4.065499,89.440984,4.065499,0.0,4.065499,8.130999,9.83889
4,/a/abruzni01,2022-04-26,0,0.0,0.0,0.0,-1.873047,3.746094,0.0,0.0,...,0.0,0.0,0.0,1.873047,91.779291,5.61914,1.873047,3.746094,7.492187,10.67779


In [6]:
# Predictors: every column from position 3 onward
# (positions 0-2 are player_id, date and the goal count G)
x_train = features.iloc[:, 3:]

# Targets derived from the column at position 2:
#   y_train        - raw values
#   y_train_binary - True where the value is greater than 0
#   y_train_012    - object-typed copy where values of 2 or more become '2+'
y_train = features.iloc[:, 2]
y_train_binary = y_train.gt(0)
y_train_012 = y_train.astype('object')
y_train_012.loc[y_train_012 >= 2] = '2+'

## Calculate Spearman rank correlation

In [None]:
# Run Spearman correlations for all continuous features:
# spearman_rank (from feature_functions) is applied to each feature column
# against the raw target, and the resulting Series is named 'correlation'.
spearman_correlations = (
    x_train
    .apply(lambda col: spearman_rank(feature=col, target=y_train), axis=0)
    .rename('correlation')
)

## Variance and MAD
Features with higher variance (or higher mean absolute deviation, MAD) typically have more discriminatory power. Conversely, a feature with zero variance takes a single value and therefore cannot discriminate between levels of the target variable.

In [None]:
# Per-feature variance (pandas default, i.e. sample variance), named 'variance'
variances = x_train.var(axis='index').rename('variance')
# To inspect: variances.sort_values(ascending=False)

In [None]:
# Per-feature mean absolute deviation around the mean, named 'MAD'
mads = (
    x_train
    .apply(lambda col: (col - col.mean()).abs().mean(), axis=0)
    .rename('MAD')
)
# To inspect: mads.sort_values(ascending=False)

## Calculate difference in mean/median between target levels

In [None]:
# Apply diff_in_means (from feature_functions) to each feature against the
# binary target, then order by absolute value, largest first.
diff_means = (
    x_train
    .apply(lambda col: diff_in_means(feature=col, target=y_train_binary), axis=0)
    .sort_values(key=abs, ascending=False)
    .rename('diff_mean')
)
# To inspect: diff_means

In [None]:
# Apply diff_in_medians (from feature_functions) to each feature against the
# binary target, then order by absolute value, largest first.
diff_medians = (
    x_train
    .apply(lambda col: diff_in_medians(feature=col, target=y_train_binary), axis=0)
    .sort_values(key=abs, ascending=False)
    .rename('diff_med')
)
# To inspect: diff_medians

## Save results to CSV

In [None]:
# Combine every filter metric into a single frame: one row per feature
# (aligned on the Series index), one column per metric.
filter_method_results = pd.concat(
    [spearman_correlations, diff_means, diff_medians, variances, mads],
    axis='columns',
)

In [None]:
# Save to CSV.
# The frame's index holds the feature names, so it must be written out;
# with index=False the rows of the file could not be tied back to a feature.
# The names are stored in a leading column called 'feature'.
filter_method_results.to_csv(
    './continuous_filter_methods.csv',
    header=True,
    index=True,
    index_label='feature',
)

## Correlation matrix (between features)

In [None]:
# Pairwise correlation between all features (Pearson, the pandas default)
correlation_matrix = x_train.corr(method='pearson')

In [None]:
# Write the feature-by-feature correlation matrix to disk.
# Row labels are omitted (index=False); rows appear in the same order as the
# header columns because the matrix is square over the same features.
correlation_matrix.to_csv(
    path_or_buf='continuous_correlations.csv',
    header=True,
    index=False,
)

## Plots

In [None]:

# Write one PDF per feature with four pages, in this order:
#   1. boxplot vs. binary target      2. boxplot vs. 0/1/2+ target
#   3. density plot vs. binary target 4. density plot vs. 0/1/2+ target
#
# Every page is drawn on its own fresh figure. Reusing plt.figure(i) for all
# four pages hands back the same figure each time, so later plots were drawn on
# top of earlier ones. Each figure is closed once saved so that open figures do
# not pile up across features; as a result the plots are written to the PDFs
# rather than left open for inline display.
# NOTE(review): assumes boxplot/density_plot draw on the current pyplot figure
# (the original code relied on the same assumption) — confirm in feature_functions.
plot_specs = (
    (boxplot, y_train_binary),
    (boxplot, y_train_012),
    (density_plot, y_train_binary),
    (density_plot, y_train_012),
)

for col in x_train.columns:
    # Open PDF file for this feature
    with PdfPages(f'../feature_plots/{col}.pdf') as pdf_pages:
        for plot_func, target in plot_specs:
            fig = plt.figure()
            try:
                plot_func(x_train[col], target)
                pdf_pages.savefig(fig)
            finally:
                # Release the figure even if plotting or saving fails
                plt.close(fig)