## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('bmh')

from time import sleep
from datetime import datetime

import sys
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

## Reading in CSV
---

In [2]:
df = pd.read_csv('../data/Combined_Final_Data/recommender_data.csv')
print(df.shape)
df.head(2)

(799, 36)


Unnamed: 0,team,year,games_played,wins,loss,win_pct,off_pts,def_pts,SRS,SOS,...,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot,league
0,New England Patriots,2018,16,11,5,0.688,27.25,20.3125,5.2,-1.8,...,22.9375,112.6875,0.4375,62.625,359.125,5.7,0.625,1.125,1.75,NFL
1,Miami Dolphins,2018,16,7,9,0.438,19.9375,27.0625,-8.8,-1.7,...,30.3125,145.3125,1.0625,64.0,391.0625,6.1,0.4375,1.3125,1.75,NFL


In [3]:
df.tail(2)

Unnamed: 0,team,year,games_played,wins,loss,win_pct,off_pts,def_pts,SRS,SOS,...,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot,league
797,Idaho,2014,11,1,10,0.091,25.0,37.3,-19.67,-7.21,...,43.9,245.9,2.9,66.8,462.1,6.9,0.8,0.7,1.5,College
798,Georgia State,2014,12,1,11,0.083,22.7,43.3,-20.41,-4.5,...,48.5,303.4,3.4,75.7,497.1,6.6,0.4,0.3,0.7,College


In [4]:
df.drop(columns=['wins','loss'], inplace=True)

## Recommender

In [5]:
rec_df = df.copy()

In [6]:
rec_df.shape

(799, 34)

In [7]:
sys.getsizeof(rec_df)

309722

There are 799 rows and 34 columns in the original DataFrame. The memory allocation of this DataFrame is very large at 309,722 bytes. Creating a sparse matrix will reduce this tremendously.

In [8]:
rec_df.set_index(['team','year','league'], inplace=True)

Setting my object columns of interest as the index, to allow vector calculations for all of the remaining numerical columns.

In [9]:
print(rec_df.shape)
rec_df.head(2)

(799, 31)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,games_played,win_pct,off_pts,def_pts,SRS,SOS,pass_comp,pass_att,comp_pct,pass_yds,...,opp_pass_TD,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot
team,year,league,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
New England Patriots,2018,NFL,16,0.688,27.25,20.3125,5.2,-1.8,23.625,35.875,65.9,266.1,...,1.8125,22.9375,112.6875,0.4375,62.625,359.125,5.7,0.625,1.125,1.75
Miami Dolphins,2018,NFL,16,0.438,19.9375,27.0625,-8.8,-1.7,18.25,28.4375,64.2,181.3,...,1.9375,30.3125,145.3125,1.0625,64.0,391.0625,6.1,0.4375,1.3125,1.75


In [10]:
rec_df.dtypes

games_played       int64
win_pct          float64
off_pts          float64
def_pts          float64
SRS              float64
SOS              float64
pass_comp        float64
pass_att         float64
comp_pct         float64
pass_yds         float64
pass_TD          float64
rush_att         float64
rush_yds         float64
rush_TD          float64
Fum              float64
Int              float64
TO_Tot           float64
opp_pass_comp    float64
opp_pass_att     float64
opp_comp_pct     float64
opp_pass_yds     float64
opp_pass_TD      float64
opp_rush_att     float64
opp_rush_yds     float64
opp_rush_TD      float64
opp_plays        float64
opp_yds          float64
opp_yds_play     float64
opp_Fum          float64
opp_Int          float64
opp_TO_Tot       float64
dtype: object

In [11]:
sparse_df = sparse.csr_matrix(rec_df)

In [12]:
sys.getsizeof(sparse_df)

64

In [13]:
print(sparse_df[:1])

  (0, 0)	16.0
  (0, 1)	0.688
  (0, 2)	27.25
  (0, 3)	20.3125
  (0, 4)	5.2
  (0, 5)	-1.8
  (0, 6)	23.625
  (0, 7)	35.875
  (0, 8)	65.9
  (0, 9)	266.1
  (0, 10)	1.8125
  (0, 11)	29.875
  (0, 12)	127.3
  (0, 13)	1.125
  (0, 14)	0.6875
  (0, 15)	0.6875
  (0, 16)	1.375
  (0, 17)	23.125
  (0, 18)	37.8125
  (0, 19)	0.6115702479338843
  (0, 20)	246.4375
  (0, 21)	1.8125
  (0, 22)	22.9375
  (0, 23)	112.6875
  (0, 24)	0.4375
  (0, 25)	62.625
  (0, 26)	359.125
  (0, 27)	5.7
  (0, 28)	0.625
  (0, 29)	1.125
  (0, 30)	1.75


### Pairwise Distance

In [14]:
recommender = pairwise_distances(sparse_df, metric = 'cosine')

In [15]:
recommender[:2]

array([[0.        , 0.0168141 , 0.00872815, ..., 0.0665135 , 0.03244053,
        0.05790882],
       [0.0168141 , 0.        , 0.00862556, ..., 0.04374958, 0.02269921,
        0.03926055]])

In [16]:
rec_df.index

MultiIndex([('New England Patriots', 2018,     'NFL'),
            (      'Miami Dolphins', 2018,     'NFL'),
            (       'Buffalo Bills', 2018,     'NFL'),
            (       'New York Jets', 2018,     'NFL'),
            (    'Baltimore Ravens', 2018,     'NFL'),
            ( 'Pittsburgh Steelers', 2018,     'NFL'),
            (    'Cleveland Browns', 2018,     'NFL'),
            (  'Cincinnati Bengals', 2018,     'NFL'),
            (      'Houston Texans', 2018,     'NFL'),
            (  'Indianapolis Colts', 2018,     'NFL'),
            ...
            (   'Appalachian State', 2014, 'College'),
            (           'Louisiana', 2014, 'College'),
            (      'Arkansas State', 2014, 'College'),
            (         'Texas State', 2014, 'College'),
            (       'South Alabama', 2014, 'College'),
            (    'Louisiana-Monroe', 2014, 'College'),
            (                'Troy', 2014, 'College'),
            (    'New Mexico State', 2014, 'Colle

In [17]:
recommender_df = pd.DataFrame(recommender, index = rec_df.index, columns = rec_df.index)

In [18]:
recommender_df

Unnamed: 0_level_0,Unnamed: 1_level_0,team,New England Patriots,Miami Dolphins,Buffalo Bills,New York Jets,Baltimore Ravens,Pittsburgh Steelers,Cleveland Browns,Cincinnati Bengals,Houston Texans,Indianapolis Colts,...,Appalachian State,Louisiana,Arkansas State,Texas State,South Alabama,Louisiana-Monroe,Troy,New Mexico State,Idaho,Georgia State
Unnamed: 0_level_1,Unnamed: 1_level_1,year,2018,2018,2018,2018,2018,2018,2018,2018,2018,2018,...,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014
Unnamed: 0_level_2,Unnamed: 1_level_2,league,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,...,College,College,College,College,College,College,College,College,College,College
team,year,league,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
New England Patriots,2018,NFL,0.000000,0.016814,0.008728,0.010548,0.005659,0.008603,0.002763,0.012930,0.002973,0.001609,...,0.037829,0.031240,0.027010,0.032924,0.030543,0.026306,0.047710,0.066514,0.032441,0.057909
Miami Dolphins,2018,NFL,0.016814,0.000000,0.008626,0.001669,0.028029,0.042957,0.006775,0.001369,0.017494,0.025120,...,0.044025,0.023621,0.027536,0.023884,0.017367,0.032701,0.022770,0.043750,0.022699,0.039261
Buffalo Bills,2018,NFL,0.008728,0.008626,0.000000,0.008636,0.010075,0.030693,0.005762,0.010762,0.012477,0.015893,...,0.021635,0.015221,0.011603,0.012514,0.010861,0.028770,0.022566,0.038343,0.019561,0.042022
New York Jets,2018,NFL,0.010548,0.001669,0.008636,0.000000,0.021652,0.031510,0.003175,0.000281,0.010115,0.016629,...,0.045917,0.026251,0.031241,0.029276,0.022797,0.031058,0.032283,0.055252,0.027405,0.046711
Baltimore Ravens,2018,NFL,0.005659,0.028029,0.010075,0.021652,0.000000,0.019123,0.012237,0.025126,0.005931,0.009509,...,0.022741,0.023365,0.025315,0.028638,0.031751,0.045837,0.056254,0.076993,0.045864,0.080107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Louisiana-Monroe,2014,College,0.026306,0.032701,0.028770,0.031058,0.045837,0.028072,0.022128,0.033185,0.041660,0.025167,...,0.066362,0.058460,0.028659,0.042601,0.031687,0.000000,0.030723,0.033775,0.010588,0.015045
Troy,2014,College,0.047710,0.022770,0.022566,0.032283,0.056254,0.080650,0.034678,0.032483,0.058966,0.059985,...,0.036469,0.026059,0.012003,0.011046,0.005451,0.030723,0.000000,0.005368,0.006557,0.013604
New Mexico State,2014,College,0.066514,0.043750,0.038343,0.055252,0.076993,0.095685,0.053414,0.056000,0.085376,0.077861,...,0.052828,0.048557,0.018584,0.023201,0.016719,0.033775,0.005368,0.000000,0.008966,0.009103
Idaho,2014,College,0.032441,0.022699,0.019561,0.027405,0.045864,0.051415,0.023759,0.028612,0.046990,0.038941,...,0.042079,0.034705,0.011080,0.017754,0.010164,0.010588,0.006557,0.008966,0.000000,0.006218


In [19]:
recommender_df.columns

MultiIndex([('New England Patriots', 2018,     'NFL'),
            (      'Miami Dolphins', 2018,     'NFL'),
            (       'Buffalo Bills', 2018,     'NFL'),
            (       'New York Jets', 2018,     'NFL'),
            (    'Baltimore Ravens', 2018,     'NFL'),
            ( 'Pittsburgh Steelers', 2018,     'NFL'),
            (    'Cleveland Browns', 2018,     'NFL'),
            (  'Cincinnati Bengals', 2018,     'NFL'),
            (      'Houston Texans', 2018,     'NFL'),
            (  'Indianapolis Colts', 2018,     'NFL'),
            ...
            (   'Appalachian State', 2014, 'College'),
            (           'Louisiana', 2014, 'College'),
            (      'Arkansas State', 2014, 'College'),
            (         'Texas State', 2014, 'College'),
            (       'South Alabama', 2014, 'College'),
            (    'Louisiana-Monroe', 2014, 'College'),
            (                'Troy', 2014, 'College'),
            (    'New Mexico State', 2014, 'Colle

In [33]:
# movies.loc[movies['title'].str.contains('Gump'), 'title']
df.loc[df['team'].str.contains('State'), 'team']

162    North Carolina State
165           Florida State
188              Iowa State
192          Oklahoma State
193            Kansas State
               ...         
789       Appalachian State
791          Arkansas State
792             Texas State
796        New Mexico State
798           Georgia State
Name: team, Length: 135, dtype: object

In [35]:
recommender_df[('Oklahoma', 2017,     'College')].sort_values()[1:10]

team                  year  league 
Southern Mississippi  2015  College    0.001813
Alabama               2018  College    0.001820
Penn State            2017  College    0.001859
Texas Christian       2015  College    0.002220
Ohio State            2018  College    0.002251
Boise State           2015  College    0.002595
Toledo                2016  College    0.002725
Fresno State          2018  College    0.002774
Oklahoma              2015  College    0.002841
Name: (Oklahoma, 2017, College), dtype: float64

* This recommender is based off of pairwise distance. Meaning the closer to 0 the more similar two teams are.
* A drawback noticed is that understandably so, only College teams are compared to other College teams. With NFL this is the same. 

NEXT STEPS - A few things I will try next include:
* Scaling the data and see how the results change.
* create a recommender using cosine similarity.
* Comparing 1 NFL team at a time to the entire college DataFrame.
* Attempt to reduce the amount of columns passed into my recommender.