## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('bmh')

from time import sleep
from datetime import datetime

import sys
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

## Reading in CSV
---

In [2]:
df = pd.read_csv('../data/Combined_Final_Data/recommender_data.csv')
print(df.shape)
df.head(2)

(799, 36)


Unnamed: 0,team,year,games_played,wins,loss,win_pct,off_pts,def_pts,SRS,SOS,...,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot,league
0,New England Patriots,2018,16,11,5,0.688,27.25,20.3125,5.2,-1.8,...,22.9375,112.6875,0.4375,62.625,359.125,5.7,0.625,1.125,1.75,NFL
1,Miami Dolphins,2018,16,7,9,0.438,19.9375,27.0625,-8.8,-1.7,...,30.3125,145.3125,1.0625,64.0,391.0625,6.1,0.4375,1.3125,1.75,NFL


In [3]:
df.tail(2)

Unnamed: 0,team,year,games_played,wins,loss,win_pct,off_pts,def_pts,SRS,SOS,...,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot,league
797,Idaho,2014,11,1,10,0.091,25.0,37.3,-19.67,-7.21,...,43.9,245.9,2.9,66.8,462.1,6.9,0.8,0.7,1.5,College
798,Georgia State,2014,12,1,11,0.083,22.7,43.3,-20.41,-4.5,...,48.5,303.4,3.4,75.7,497.1,6.6,0.4,0.3,0.7,College


In [4]:
# Reassign rather than mutate in place, and tolerate columns that are already
# gone, so this cell can be re-run without raising a KeyError.
df = df.drop(columns=['wins','loss','games_played'], errors='ignore')

In [5]:
df[df['league'] == 'NFL'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,160.0,2016.0,1.418654,2014.0,2015.0,2016.0,2017.0,2018.0
win_pct,160.0,0.500256,0.191683,0.0,0.375,0.5,0.625,0.938
off_pts,160.0,22.647656,4.232434,14.0,19.921875,22.4375,25.578125,35.3125
def_pts,160.0,22.647656,3.267652,15.625,19.984375,22.53125,25.0,30.0
SRS,160.0,0.000625,5.845048,-11.8,-4.0,-0.15,4.325,12.3
SOS,160.0,0.003125,1.591701,-3.9,-1.2,-0.05,1.3,3.8
pass_comp,160.0,22.098047,2.713474,16.9375,20.09375,22.1875,23.828125,29.5
pass_att,160.0,35.012109,3.444853,26.6875,32.359375,35.1875,37.765625,43.0625
comp_pct,160.0,0.630187,0.038746,0.54,0.6,0.63,0.66,0.73
pass_yds,160.0,236.8525,34.590999,157.7,209.35,236.45,259.225,320.3


In [6]:
df[df['league'] == 'College'].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,639.0,2016.00939,1.416397,2014.0,2015.0,2016.0,2017.0,2018.0
win_pct,639.0,0.51993,0.215435,0.0,0.333,0.538,0.692,1.0
off_pts,639.0,29.273083,7.124658,11.1,24.6,28.8,33.9,48.4
def_pts,639.0,27.698435,6.819518,11.9,22.8,27.1,32.0,50.4
SRS,639.0,0.914491,9.940742,-23.75,-6.38,1.38,8.055,26.45
SOS,639.0,-0.017981,4.575337,-10.62,-4.055,0.39,4.01,9.78
pass_comp,639.0,18.74867,5.107373,1.5,16.0,18.9,21.8,42.5
pass_att,639.0,31.706416,7.340296,5.0,27.85,32.1,36.15,64.3
comp_pct,639.0,0.585696,0.055221,0.31,0.55,0.59,0.62,0.72
pass_yds,639.0,231.516745,61.149896,27.8,192.55,229.8,266.3,477.7


In [7]:
df[['comp_pct','opp_comp_pct']]

Unnamed: 0,comp_pct,opp_comp_pct
0,0.66,0.61
1,0.64,0.66
2,0.54,0.64
3,0.57,0.63
4,0.60,0.58
...,...,...
794,0.60,0.58
795,0.67,0.61
796,0.61,0.62
797,0.58,0.63


In [8]:
# [x*100 for x in df[df['league'] == 'NFL'].opp_comp_pct]

In [9]:
# df['opp_comp_pct'] = [round(x * 100,1) if x < 1 else x for x in df['opp_comp_pct']]

## Recommender

In [10]:
rec_df = df.copy()

In [11]:
rec_df.shape

(799, 33)

In [12]:
sys.getsizeof(rec_df)

303330

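There are 799 rows and 33 columns in the original DataFrame, and `sys.getsizeof` reports its memory footprint as 303,330 bytes. Creating a sparse matrix is intended to reduce this. Note, however, that `sys.getsizeof` on a SciPy sparse matrix measures only the small Python wrapper object, not the underlying data arrays, so the 64 bytes reported below is not a like-for-like comparison with the DataFrame figure.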

In [13]:
# Build the indexed frame from `df` in a single non-mutating step so the cell
# gives the same result however many times it is run (an in-place set_index
# fails on a second run because the columns have already moved to the index).
rec_df = df.set_index(['team','year','league'])

Setting the identifying columns (team, year, league) as the index, so that only the numerical columns remain for the vector calculations.

In [14]:
print(rec_df.shape)
rec_df.head(2)

(799, 30)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,win_pct,off_pts,def_pts,SRS,SOS,pass_comp,pass_att,comp_pct,pass_yds,pass_TD,...,opp_pass_TD,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot
team,year,league,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
New England Patriots,2018,NFL,0.688,27.25,20.3125,5.2,-1.8,23.625,35.875,0.66,266.1,1.8125,...,1.8125,22.9375,112.6875,0.4375,62.625,359.125,5.7,0.625,1.125,1.75
Miami Dolphins,2018,NFL,0.438,19.9375,27.0625,-8.8,-1.7,18.25,28.4375,0.64,181.3,1.625,...,1.9375,30.3125,145.3125,1.0625,64.0,391.0625,6.1,0.4375,1.3125,1.75


In [15]:
rec_df.dtypes

win_pct          float64
off_pts          float64
def_pts          float64
SRS              float64
SOS              float64
pass_comp        float64
pass_att         float64
comp_pct         float64
pass_yds         float64
pass_TD          float64
rush_att         float64
rush_yds         float64
rush_TD          float64
Fum              float64
Int              float64
TO_Tot           float64
opp_pass_comp    float64
opp_pass_att     float64
opp_comp_pct     float64
opp_pass_yds     float64
opp_pass_TD      float64
opp_rush_att     float64
opp_rush_yds     float64
opp_rush_TD      float64
opp_plays        float64
opp_yds          float64
opp_yds_play     float64
opp_Fum          float64
opp_Int          float64
opp_TO_Tot       float64
dtype: object

In [16]:
sparse_df = sparse.csr_matrix(rec_df)

In [17]:
# NOTE: sys.getsizeof does not follow references, so for a SciPy sparse matrix
# it reports only the Python wrapper object, not the data/indices/indptr arrays
# that actually hold the values. For a like-for-like figure, sum the `.nbytes`
# of those three arrays instead.
sys.getsizeof(sparse_df)

64

In [18]:
print(sparse_df[:1])

  (0, 0)	0.688
  (0, 1)	27.25
  (0, 2)	20.3125
  (0, 3)	5.2
  (0, 4)	-1.8
  (0, 5)	23.625
  (0, 6)	35.875
  (0, 7)	0.66
  (0, 8)	266.1
  (0, 9)	1.8125
  (0, 10)	29.875
  (0, 11)	127.3
  (0, 12)	1.125
  (0, 13)	0.6875
  (0, 14)	0.6875
  (0, 15)	1.375
  (0, 16)	23.125
  (0, 17)	37.8125
  (0, 18)	0.61
  (0, 19)	246.4375
  (0, 20)	1.8125
  (0, 21)	22.9375
  (0, 22)	112.6875
  (0, 23)	0.4375
  (0, 24)	62.625
  (0, 25)	359.125
  (0, 26)	5.7
  (0, 27)	0.625
  (0, 28)	1.125
  (0, 29)	1.75


### Pairwise Distance

In [19]:
recommender = pairwise_distances(sparse_df, metric = 'cosine')

In [20]:
recommender[:2]

array([[0.        , 0.0170701 , 0.00882532, ..., 0.06295302, 0.02784271,
        0.05344622],
       [0.0170701 , 0.        , 0.00871986, ..., 0.03984631, 0.01796958,
        0.03452304]])

In [21]:
rec_df.index

MultiIndex([('New England Patriots', 2018,     'NFL'),
            (      'Miami Dolphins', 2018,     'NFL'),
            (       'Buffalo Bills', 2018,     'NFL'),
            (       'New York Jets', 2018,     'NFL'),
            (    'Baltimore Ravens', 2018,     'NFL'),
            ( 'Pittsburgh Steelers', 2018,     'NFL'),
            (    'Cleveland Browns', 2018,     'NFL'),
            (  'Cincinnati Bengals', 2018,     'NFL'),
            (      'Houston Texans', 2018,     'NFL'),
            (  'Indianapolis Colts', 2018,     'NFL'),
            ...
            (   'Appalachian State', 2014, 'College'),
            (           'Louisiana', 2014, 'College'),
            (      'Arkansas State', 2014, 'College'),
            (         'Texas State', 2014, 'College'),
            (       'South Alabama', 2014, 'College'),
            (    'Louisiana-Monroe', 2014, 'College'),
            (                'Troy', 2014, 'College'),
            (    'New Mexico State', 2014, 'Colle

In [22]:
recommender_df = pd.DataFrame(recommender, index = rec_df.index, columns = rec_df.index)

In [23]:
recommender_df

Unnamed: 0_level_0,Unnamed: 1_level_0,team,New England Patriots,Miami Dolphins,Buffalo Bills,New York Jets,Baltimore Ravens,Pittsburgh Steelers,Cleveland Browns,Cincinnati Bengals,Houston Texans,Indianapolis Colts,...,Appalachian State,Louisiana,Arkansas State,Texas State,South Alabama,Louisiana-Monroe,Troy,New Mexico State,Idaho,Georgia State
Unnamed: 0_level_1,Unnamed: 1_level_1,year,2018,2018,2018,2018,2018,2018,2018,2018,2018,2018,...,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014
Unnamed: 0_level_2,Unnamed: 1_level_2,league,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,...,College,College,College,College,College,College,College,College,College,College
team,year,league,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
New England Patriots,2018,NFL,0.000000,0.017070,0.008825,0.010605,0.005709,0.008731,0.002719,0.013009,0.002974,0.001617,...,0.032881,0.025920,0.023031,0.028663,0.025923,0.021284,0.043682,0.062953,0.027843,0.053446
Miami Dolphins,2018,NFL,0.017070,0.000000,0.008720,0.001614,0.028441,0.043623,0.006795,0.001300,0.017726,0.025500,...,0.039207,0.018167,0.023587,0.019483,0.012549,0.027818,0.018295,0.039846,0.017970,0.034523
Buffalo Bills,2018,NFL,0.008825,0.008720,0.000000,0.008565,0.010243,0.031182,0.005642,0.010692,0.012664,0.016140,...,0.016231,0.009466,0.007172,0.007702,0.005706,0.023689,0.017934,0.034098,0.014490,0.037047
New York Jets,2018,NFL,0.010605,0.001614,0.008565,0.000000,0.021743,0.031812,0.003212,0.000281,0.010000,0.016686,...,0.041118,0.020878,0.027499,0.025084,0.018288,0.026159,0.028008,0.051737,0.023020,0.042372
Baltimore Ravens,2018,NFL,0.005709,0.028441,0.010243,0.021743,0.000000,0.019432,0.012187,0.025240,0.006029,0.009663,...,0.017346,0.017748,0.021082,0.024091,0.026907,0.041080,0.052263,0.073361,0.041192,0.075730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Louisiana-Monroe,2014,College,0.021284,0.027818,0.023689,0.026159,0.041080,0.023043,0.017076,0.028332,0.036821,0.020067,...,0.067914,0.059839,0.029119,0.043451,0.032219,0.000000,0.031391,0.034196,0.010556,0.015057
Troy,2014,College,0.043682,0.018295,0.017934,0.028008,0.052263,0.077212,0.030455,0.028226,0.055041,0.056146,...,0.037264,0.026600,0.012177,0.011250,0.005455,0.031391,0.000000,0.005302,0.006501,0.013629
New Mexico State,2014,College,0.062953,0.039846,0.034098,0.051737,0.073361,0.092512,0.049879,0.052522,0.081878,0.074355,...,0.053613,0.049244,0.018881,0.023563,0.016978,0.034196,0.005302,0.000000,0.009107,0.009226
Idaho,2014,College,0.027843,0.017970,0.014490,0.023020,0.041192,0.047014,0.019325,0.024277,0.042321,0.034280,...,0.042675,0.035147,0.011218,0.018011,0.010317,0.010556,0.006501,0.009107,0.000000,0.006320


In [24]:
recommender_df.columns

MultiIndex([('New England Patriots', 2018,     'NFL'),
            (      'Miami Dolphins', 2018,     'NFL'),
            (       'Buffalo Bills', 2018,     'NFL'),
            (       'New York Jets', 2018,     'NFL'),
            (    'Baltimore Ravens', 2018,     'NFL'),
            ( 'Pittsburgh Steelers', 2018,     'NFL'),
            (    'Cleveland Browns', 2018,     'NFL'),
            (  'Cincinnati Bengals', 2018,     'NFL'),
            (      'Houston Texans', 2018,     'NFL'),
            (  'Indianapolis Colts', 2018,     'NFL'),
            ...
            (   'Appalachian State', 2014, 'College'),
            (           'Louisiana', 2014, 'College'),
            (      'Arkansas State', 2014, 'College'),
            (         'Texas State', 2014, 'College'),
            (       'South Alabama', 2014, 'College'),
            (    'Louisiana-Monroe', 2014, 'College'),
            (                'Troy', 2014, 'College'),
            (    'New Mexico State', 2014, 'Colle

In [25]:
# movies.loc[movies['title'].str.contains('Gump'), 'title']
df.loc[df['team'].str.contains('State'), 'team']

162    North Carolina State
165           Florida State
188              Iowa State
192          Oklahoma State
193            Kansas State
               ...         
789       Appalachian State
791          Arkansas State
792             Texas State
796        New Mexico State
798           Georgia State
Name: team, Length: 135, dtype: object

In [26]:
recommender_df[('New England Patriots', 2018,     'NFL')].sort_values()[1:10]

team                  year  league
Los Angeles Chargers  2018  NFL       0.000190
New England Patriots  2017  NFL       0.000304
Philadelphia Eagles   2014  NFL       0.000454
Kansas City Chiefs    2017  NFL       0.000504
Atlanta Falcons       2017  NFL       0.000545
Houston Texans        2015  NFL       0.000603
Pittsburgh Steelers   2016  NFL       0.000618
New England Patriots  2014  NFL       0.000625
New Orleans Saints    2017  NFL       0.000640
Name: (New England Patriots, 2018, NFL), dtype: float64

* This recommender is based on pairwise (cosine) distance, meaning the closer a value is to 0, the more similar two teams are.
* A drawback, although an understandable one, is that the closest matches for a College team are only other College teams, and likewise the closest matches for an NFL team are only other NFL teams.

NEXT STEPS - A few things I will try next include:
* Scaling the data and see if that will impact the results.
* Check for errors.
* Remove obvious polarizing columns (like games_played)
* Create a recommender using cosine similarity.
* Comparing 1 NFL team at a time to the entire college DataFrame. (create a function to accomplish this)
* Attempt to reduce the amount of columns passed into my recommender.

## Scaled Data

### Scaling columns of interest

In [27]:
# rec_df.dtypes

In [28]:
rec_df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,win_pct,off_pts,def_pts,SRS,SOS,pass_comp,pass_att,comp_pct,pass_yds,pass_TD,...,opp_pass_TD,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot
team,year,league,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
New England Patriots,2018,NFL,0.688,27.25,20.3125,5.2,-1.8,23.625,35.875,0.66,266.1,1.8125,...,1.8125,22.9375,112.6875,0.4375,62.625,359.125,5.7,0.625,1.125,1.75
Miami Dolphins,2018,NFL,0.438,19.9375,27.0625,-8.8,-1.7,18.25,28.4375,0.64,181.3,1.625,...,1.9375,30.3125,145.3125,1.0625,64.0,391.0625,6.1,0.4375,1.3125,1.75


In [29]:
ss = StandardScaler()
scaled = ss.fit_transform(rec_df)
# X_scaled = ss.transform(X)

In [30]:
scaled_df = pd.DataFrame(scaled, index=rec_df.index, columns=rec_df.columns)
scaled_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,win_pct,off_pts,def_pts,SRS,SOS,pass_comp,pass_att,comp_pct,pass_yds,pass_TD,...,opp_pass_TD,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot
team,year,league,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
New England Patriots,2018,NFL,0.816013,-0.097386,-0.968227,0.482304,-0.430455,0.856847,0.510762,1.183982,0.589842,0.261813,...,0.54514,-2.078775,-0.987367,-1.537361,-1.215614,-0.48958,0.177385,-0.158509,0.734858,0.493326
Miami Dolphins,2018,NFL,-0.369984,-1.120067,0.057033,-1.028772,-0.406356,-0.238249,-0.572559,0.821876,-0.902593,-0.039091,...,0.82717,-0.94473,-0.289452,-0.636255,-0.961799,0.062932,0.81366,-0.952049,1.314895,0.493326
Buffalo Bills,2018,NFL,-0.668855,-1.55711,-0.503063,-0.823697,-0.06898,-0.505656,-0.172003,-0.988653,-1.020509,-1.34301,...,-0.441964,-1.377205,-0.939235,-0.636255,-1.607873,-1.614068,-1.095166,0.106004,0.348167,0.340382
New York Jets,2018,NFL,-1.261853,-0.997695,0.132978,-0.920838,-0.261767,-0.149113,0.055585,-0.445494,-0.612202,-0.841503,...,0.54514,-1.377205,-0.6959,-0.726366,-0.534929,-0.12196,0.177385,-0.952049,-0.231869,-0.730227
Baltimore Ravens,2018,NFL,0.517142,-0.508206,-1.328966,0.676586,0.147904,0.296566,0.346898,0.097664,-0.179255,-0.841503,...,-0.582979,-2.203712,-1.623779,-1.176919,-1.538651,-1.634611,-1.254235,-1.481075,-0.425215,-1.18906


In [31]:
sparse_df2 = sparse.csr_matrix(scaled_df)

creating sparse matrix of scaled data

In [32]:
scale_rec = pairwise_distances(sparse_df2, metric = 'cosine')

pairwise distance calculation of the created sparse matrix. This is now the created recommender on the scaled data.

In [33]:
scale_rec_df = pd.DataFrame(scale_rec, index = rec_df.index, columns = rec_df.index)

In [34]:
scale_rec_df

Unnamed: 0_level_0,Unnamed: 1_level_0,team,New England Patriots,Miami Dolphins,Buffalo Bills,New York Jets,Baltimore Ravens,Pittsburgh Steelers,Cleveland Browns,Cincinnati Bengals,Houston Texans,Indianapolis Colts,...,Appalachian State,Louisiana,Arkansas State,Texas State,South Alabama,Louisiana-Monroe,Troy,New Mexico State,Idaho,Georgia State
Unnamed: 0_level_1,Unnamed: 1_level_1,year,2018,2018,2018,2018,2018,2018,2018,2018,2018,2018,...,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014
Unnamed: 0_level_2,Unnamed: 1_level_2,league,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,...,College,College,College,College,College,College,College,College,College,College
team,year,league,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
New England Patriots,2018,NFL,0.000000,0.540459,0.911371,0.722085,0.489625,0.376400,0.449504,0.477784,0.147069,0.259505,...,1.104986,0.866419,1.368517,1.087359,1.554721,1.081850,1.410922,1.611096,1.527140,1.528957
Miami Dolphins,2018,NFL,0.540459,0.000000,0.543190,0.338006,0.758318,0.649828,0.517973,0.329256,0.457369,0.490117,...,1.182274,1.130383,1.373549,1.193323,1.256980,0.863980,0.857772,1.055343,1.012163,0.910129
Buffalo Bills,2018,NFL,0.911371,0.543190,0.000000,0.313806,0.452133,0.625996,0.600325,0.901005,0.694783,0.652621,...,1.025374,1.372817,1.387240,1.637919,0.733243,0.913933,1.254804,0.914743,0.903518,0.994356
New York Jets,2018,NFL,0.722085,0.338006,0.313806,0.000000,0.482679,0.423440,0.354068,0.357802,0.545828,0.555746,...,1.133640,1.223666,1.495349,1.435345,1.004092,0.945745,1.174585,1.011157,0.950023,0.779990
Baltimore Ravens,2018,NFL,0.489625,0.758318,0.452133,0.482679,0.000000,0.330659,0.684573,0.769544,0.409241,0.521760,...,0.900522,1.132695,1.485812,1.464817,1.326922,0.957693,1.449638,1.580307,1.531972,1.424419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Louisiana-Monroe,2014,College,1.081850,0.863980,0.913933,0.945745,0.957693,0.829428,1.123337,0.961489,1.187205,0.914865,...,1.270803,1.355496,0.990055,1.018469,0.950927,0.000000,0.551428,0.706261,0.656819,0.584976
Troy,2014,College,1.410922,0.857772,1.254804,1.174585,1.449638,1.441509,1.379213,1.055954,1.384159,1.368327,...,0.847479,0.813916,0.562964,0.557386,0.830556,0.551428,0.000000,0.468681,0.478894,0.427461
New Mexico State,2014,College,1.611096,1.055343,0.914743,1.011157,1.580307,1.278725,1.136314,1.278955,1.592750,1.254154,...,0.943739,1.182578,0.558471,0.829363,0.433751,0.706261,0.468681,0.000000,0.089820,0.196172
Idaho,2014,College,1.527140,1.012163,0.903518,0.950023,1.531972,1.185546,1.200781,1.154230,1.544975,1.203964,...,0.939540,1.094811,0.712784,0.937851,0.458250,0.656819,0.478894,0.089820,0.000000,0.207546


In [47]:
scale_rec_df[('New England Patriots', 2018,     'NFL')].sort_values()[1:10]

team                  year  league
New England Patriots  2014  NFL       0.110430
                      2017  NFL       0.136031
                      2015  NFL       0.145527
Kansas City Chiefs    2017  NFL       0.146318
Houston Texans        2018  NFL       0.147069
Cincinnati Bengals    2015  NFL       0.147356
Pittsburgh Steelers   2014  NFL       0.159661
Atlanta Falcons       2016  NFL       0.163532
Denver Broncos        2014  NFL       0.171624
Name: (New England Patriots, 2018, NFL), dtype: float64

My initial recommenders (unscaled data and scaled data), each using all 30 columns with pairwise distance, are providing interesting results.

![image.png](attachment:image.png)

The image above shows the results of the two recommenders when searching for the 2018 Super Bowl-winning New England Patriots.

### Eliminating Specific Columns
---

In [36]:
scaled_df.T

team,New England Patriots,Miami Dolphins,Buffalo Bills,New York Jets,Baltimore Ravens,Pittsburgh Steelers,Cleveland Browns,Cincinnati Bengals,Houston Texans,Indianapolis Colts,...,Appalachian State,Louisiana,Arkansas State,Texas State,South Alabama,Louisiana-Monroe,Troy,New Mexico State,Idaho,Georgia State
year,2018,2018,2018,2018,2018,2018,2018,2018,2018,2018,...,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014
league,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,NFL,...,College,College,College,College,College,College,College,College,College,College
win_pct,0.816013,-0.369984,-0.668855,-1.261853,0.517142,0.370079,-0.22292,-0.668855,0.816013,0.517142,...,0.317895,0.834989,0.104415,0.317895,-0.256128,-0.868102,-1.261853,-1.655604,-2.016147,-2.054099
off_pts,-0.097386,-1.120067,-1.55711,-0.997695,-0.508206,-0.167313,-0.770432,-0.691764,-0.394575,-0.123608,...,1.084379,0.217285,1.224233,0.818657,-0.761691,-1.097341,-0.859589,-0.481984,-0.412057,-0.733721
def_pts,-0.968227,0.057033,-0.503063,0.132978,-1.328966,-0.635967,-0.332186,0.265882,-1.053665,-0.787857,...,0.093106,-0.104351,0.579155,0.153863,-0.043595,-0.058784,1.44493,1.885411,1.612009,2.52335
SRS,0.482304,-1.028772,-0.823697,-0.920838,0.676586,0.525478,-0.111333,-0.445928,0.331197,0.288023,...,-0.787,-0.248409,-0.506371,-0.772968,-0.902489,-1.138865,-1.662345,-2.073573,-2.202015,-2.281886
SOS,-0.430455,-0.406356,-0.06898,-0.261767,0.147904,0.316593,0.412986,0.485281,-0.35816,-0.526848,...,-2.382417,-1.338961,-1.654648,-2.247466,-1.240158,-1.097978,-1.48355,-1.840205,-1.734173,-1.081109
pass_comp,0.856847,-0.238249,-0.505656,-0.149113,0.296566,1.888275,0.538505,0.258365,0.436636,1.544466,...,-0.574418,-0.228062,0.362781,0.281285,-0.53367,1.422223,0.077546,0.770258,0.790632,0.525772
pass_att,0.510762,-0.572559,-0.172003,0.055585,0.346898,1.557668,0.510762,0.219449,-0.108279,1.148009,...,-0.796506,-0.563455,0.310484,-0.082789,-0.097354,1.708787,-0.374102,0.776585,1.140726,0.703757
comp_pct,1.183982,0.821876,-0.988653,-0.445494,0.097664,1.365035,0.278717,0.278717,1.546087,1.365035,...,0.45977,0.821876,0.45977,1.002929,-1.169706,0.097664,1.365035,0.278717,-0.264441,-0.083389
pass_yds,0.589842,-0.902593,-1.020509,-0.612202,-0.179255,1.415257,0.31353,-0.474926,0.065378,0.813355,...,-0.33413,-0.723078,0.484245,-0.117657,-0.626281,0.832715,-0.601642,0.100577,0.691919,0.816875
pass_TD,0.261813,-0.039091,-1.34301,-0.841503,-0.841503,0.863622,0.261813,0.06121,-0.039091,1.264828,...,0.241753,-0.721141,0.402235,0.241753,-0.560659,-0.560659,-1.042106,-0.079212,-0.721141,0.402235


In [37]:
scaled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
win_pct,799.0,7.169901e-17,1.000626,-2.44785,-0.868102,0.104415,0.834989,2.296137
off_pts,799.0,1.179699e-16,1.000626,-2.356025,-0.713617,-0.104378,0.657825,2.860523
def_pts,799.0,3.540486e-16,1.000626,-2.246003,-0.711912,-0.134729,0.662695,3.601771
SRS,799.0,-2.056483e-17,1.000626,-2.642386,-0.713605,0.039775,0.693315,2.775902
SOS,799.0,4.39087e-17,1.000626,-2.555925,-0.760601,0.037052,0.84314,2.360128
pass_comp,799.0,-8.00361e-17,1.000626,-3.650874,-0.513296,0.036799,0.627641,4.702418
pass_att,799.0,1.169972e-16,1.000626,-3.986384,-0.476061,0.064689,0.622735,4.651049
comp_pct,799.0,-5.843087e-16,1.000626,-5.152869,-0.626547,0.097664,0.640823,2.451352
pass_yds,799.0,-5.838745e-16,1.000626,-3.604111,-0.610442,0.0073,0.560803,4.31389
pass_TD,799.0,-2.097474e-16,1.000626,-2.325965,-0.721141,-0.079212,0.562718,3.611882


In [38]:
scaled_df.columns

Index(['win_pct', 'off_pts', 'def_pts', 'SRS', 'SOS', 'pass_comp', 'pass_att',
       'comp_pct', 'pass_yds', 'pass_TD', 'rush_att', 'rush_yds', 'rush_TD',
       'Fum', 'Int', 'TO_Tot', 'opp_pass_comp', 'opp_pass_att', 'opp_comp_pct',
       'opp_pass_yds', 'opp_pass_TD', 'opp_rush_att', 'opp_rush_yds',
       'opp_rush_TD', 'opp_plays', 'opp_yds', 'opp_yds_play', 'opp_Fum',
       'opp_Int', 'opp_TO_Tot'],
      dtype='object')

In [39]:
drop_df = scaled_df.drop(columns=['SRS','SOS','rush_att','opp_rush_att'])

These are the columns that have an obvious discrepancy between the NFL and College Football. I need to research further why this is the case and see whether there is something I can do to account for it, or whether I have to remove these columns altogether.

In [40]:
drop_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,win_pct,off_pts,def_pts,pass_comp,pass_att,comp_pct,pass_yds,pass_TD,rush_yds,rush_TD,...,opp_pass_yds,opp_pass_TD,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot
team,year,league,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
New England Patriots,2018,NFL,0.816013,-0.097386,-0.968227,0.856847,0.510762,1.183982,0.589842,0.261813,-0.671573,-0.638376,...,0.532965,0.545140,-0.987367,-1.537361,-1.215614,-0.489580,0.177385,-0.158509,0.734858,0.493326
Miami Dolphins,2018,NFL,-0.369984,-1.120067,0.057033,-0.238249,-0.572559,0.821876,-0.902593,-0.039091,-1.016443,-1.554049,...,0.512443,0.827170,-0.289452,-0.636255,-0.961799,0.062932,0.813660,-0.952049,1.314895,0.493326
Buffalo Bills,2018,NFL,-0.668855,-1.557110,-0.503063,-0.505656,-0.172003,-0.988653,-1.020509,-1.343010,-0.732432,-0.888105,...,-1.474498,-0.441964,-0.939235,-0.636255,-1.607873,-1.614068,-1.095166,0.106004,0.348167,0.340382
New York Jets,2018,NFL,-1.261853,-0.997695,0.132978,-0.149113,0.055585,-0.445494,-0.612202,-0.841503,-1.149227,-1.221077,...,0.760577,0.545140,-0.695900,-0.726366,-0.534929,-0.121960,0.177385,-0.952049,-0.231869,-0.730227
Baltimore Ravens,2018,NFL,0.517142,-0.508206,-1.328966,0.296566,0.346898,0.097664,-0.179255,-0.841503,-0.204984,-0.555134,...,-0.554722,-0.582979,-1.623779,-1.176919,-1.538651,-1.634611,-1.254235,-1.481075,-0.425215,-1.189060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Louisiana-Monroe,2014,College,-0.868102,-1.097341,-0.058784,1.422223,1.708787,0.097664,0.832715,-0.560659,-1.737536,-1.071240,...,-1.154722,-0.611182,0.523177,0.427050,0.201134,-0.244356,-0.458891,-1.110756,0.038814,-0.607872
Troy,2014,College,-1.261853,-0.859589,1.444930,0.077546,-0.374102,1.365035,-0.601642,-1.042106,0.010791,-0.138919,...,-0.674125,0.065689,1.845204,2.012997,0.293431,1.101569,1.290867,-0.264314,0.038814,-0.118450
New Mexico State,2014,College,-1.655604,-0.481984,1.885411,0.770258,0.776585,0.278717,0.100577,-0.079212,0.268982,-0.005730,...,-1.614423,-0.611182,3.207875,2.589704,0.902586,1.658623,1.449936,0.582127,0.038814,0.615682
Idaho,2014,College,-2.016147,-0.412057,1.612009,0.790632,1.140726,-0.264441,0.691919,-0.721141,-0.426291,0.527025,...,-0.369647,0.065689,1.862317,2.012997,-0.444940,1.291867,2.086211,0.582127,-0.579891,-0.118450


### Recommender Creation
---

In [41]:
sparse_df3 = sparse.csr_matrix(drop_df)

Creating a sparse matrix of the scaled data, with the SRS, SOS, rush_att and opp_rush_att columns dropped.

In [42]:
col_drop_rec = pairwise_distances(sparse_df3, metric = 'cosine')

Pairwise distance calculation on the created sparse matrix. This is now the recommender built on the scaled data with the reduced set of columns.

In [43]:
pd.DataFrame(col_drop_rec, index = rec_df.index, columns = rec_df.index)[('Indianapolis Colts', 2018,     'NFL')].sort_values()[1:50]


team                  year  league
Pittsburgh Steelers   2016  NFL       0.085115
Detroit Lions         2015  NFL       0.138609
Washington Redskins   2016  NFL       0.173747
Detroit Lions         2017  NFL       0.184045
Green Bay Packers     2016  NFL       0.190905
Pittsburgh Steelers   2014  NFL       0.191527
Baltimore Ravens      2016  NFL       0.193722
New Orleans Saints    2016  NFL       0.194668
Los Angeles Chargers  2018  NFL       0.196712
Pittsburgh Steelers   2018  NFL       0.196970
Carolina Panthers     2018  NFL       0.208398
Dallas Cowboys        2014  NFL       0.212251
Atlanta Falcons       2015  NFL       0.220309
Philadelphia Eagles   2018  NFL       0.221936
Baltimore Ravens      2014  NFL       0.222381
Tampa Bay Buccaneers  2016  NFL       0.225080
Denver Broncos        2014  NFL       0.225939
Pittsburgh Steelers   2015  NFL       0.231485
Los Angeles Rams      2018  NFL       0.238115
Indianapolis Colts    2016  NFL       0.246040
Dallas Cowboys        201

### Comparing one NFL to the whole CFB DF

In [44]:
df[df['league'] == 'NFL']

Unnamed: 0,team,year,win_pct,off_pts,def_pts,SRS,SOS,pass_comp,pass_att,comp_pct,...,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot,league
0,New England Patriots,2018,0.688,27.2500,20.3125,5.2,-1.8,23.6250,35.8750,0.66,...,22.9375,112.6875,0.4375,62.6250,359.1250,5.7,0.6250,1.1250,1.7500,NFL
1,Miami Dolphins,2018,0.438,19.9375,27.0625,-8.8,-1.7,18.2500,28.4375,0.64,...,30.3125,145.3125,1.0625,64.0000,391.0625,6.1,0.4375,1.3125,1.7500,NFL
2,Buffalo Bills,2018,0.375,16.8125,23.3750,-6.9,-0.3,16.9375,31.1875,0.54,...,27.5000,114.9375,1.0625,60.5000,294.1250,4.9,0.6875,1.0000,1.6875,NFL
3,New York Jets,2018,0.250,20.8125,27.5625,-7.8,-1.1,18.6875,32.7500,0.57,...,27.5000,126.3125,1.0000,66.3125,380.3750,5.7,0.4375,0.8125,1.2500,NFL
4,Baltimore Ravens,2018,0.625,24.3125,17.9375,7.0,0.6,20.8750,34.7500,0.60,...,22.1250,82.9375,0.6875,60.8750,292.9375,4.8,0.3125,0.7500,1.0625,NFL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,Tampa Bay Buccaneers,2014,0.125,17.3125,25.6250,-9.8,-1.5,18.8125,33.1875,0.57,...,29.2500,113.6875,0.9375,66.6875,368.9375,5.5,0.6875,0.8750,1.5625,NFL
156,Seattle Seahawks,2014,0.750,24.6250,15.8750,9.5,0.8,17.9375,28.3750,0.63,...,23.7500,81.5000,0.5000,57.7500,267.1250,4.6,0.6875,0.8125,1.5000,NFL
157,Arizona Cardinals,2014,0.688,19.3750,18.6875,2.0,1.3,20.0000,35.5000,0.56,...,24.7500,108.6875,0.5625,63.1250,368.1875,5.8,0.4375,1.1250,1.5625,NFL
158,San Francisco 49ers,2014,0.500,19.1250,21.2500,-1.0,1.2,18.2500,30.4375,0.60,...,25.1250,100.7500,0.4375,61.8125,321.4375,5.2,0.3750,1.4375,1.8125,NFL


In [45]:
# Recommend the college teams most similar to a single NFL team-season.

# Need DataFrame, NFL Team & Year
def college_recommender(df, team, year, top_n=5):
    """Return the college team-seasons closest to one NFL team-season.

    The matching NFL row is stacked on top of every row whose ``league`` is
    'College'; all remaining columns are standardized with ``StandardScaler``
    and the cosine distance from the NFL row to each college row is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``team``, ``year`` and ``league`` columns. Every other
        column is fed to the scaler, so it has to be numeric.
    team : str
        Value to match in the ``team`` column.
    year : int
        Value to match in the ``year`` column.
    top_n : int, default 5
        Number of closest college rows to return.

    Returns
    -------
    pandas.Series
        Cosine distances (smaller means more similar), sorted ascending,
        indexed by (team, year, league) and named ``(team, year, 'NFL')``.

    Raises
    ------
    KeyError
        If ``df`` has no NFL row for the requested team and year.
    """
    # select team of interest; the league filter keeps a same-named college
    # row from being picked up as the query
    is_target = (
        (df['team'] == team) & (df['year'] == year) & (df['league'] == 'NFL')
    )
    team_int = df[is_target].head(1)
    if team_int.empty:
        raise KeyError(f"No NFL row found for team={team!r}, year={year!r}")

    # select all of the college teams from DataFrame
    colleges = df[df['league'] == 'College']

    # query row first, colleges after it; identifiers move into the index so
    # only stat columns reach the scaler
    features = (
        pd.concat([team_int, colleges], ignore_index=True)
        .set_index(['team', 'year', 'league'])
    )

    scaled = StandardScaler().fit_transform(features)

    # Row 0 is the query team. Slice it off by position rather than relying on
    # its self-distance sorting first (ties at 0.0 would make that unreliable).
    # The standardized matrix is dense, so no sparse conversion is needed.
    distances = pairwise_distances(scaled, metric='cosine')[0, 1:]

    sim_teams = pd.Series(
        distances, index=features.index[1:], name=(team, year, 'NFL')
    )

    # return the top_n most similar teams
    return sim_teams.sort_values().head(top_n)

This function pulls a single NFL team-season out of the DataFrame and combines it with all of the college teams, so that the comparison is made against colleges only. It then recommends the college teams most similar to that specific NFL team.

In [48]:
# Smoke test: closest college teams to the 2018 New England Patriots
college_recommender(df, team='New England Patriots', year=2018)

team                  year  league 
North Carolina State  2018  College    0.256362
Louisiana Tech        2015  College    0.290855
North Carolina State  2016  College    0.341058
Georgia State         2017  College    0.351078
Iowa State            2018  College    0.358009
Name: (New England Patriots, 2018, NFL), dtype: float64

### Recommender for specific stats

In [49]:
# Reference only (nothing is executed in this cell): stat column names
# to draw from when building the per-category lists.
# ['win_pct', 'off_pts', 'def_pts', 'SRS', 'SOS', 'pass_comp', 'pass_att',
#        'comp_pct', 'pass_yds', 'pass_TD', 'rush_att', 'rush_yds', 'rush_TD',
#        'Fum', 'Int', 'TO_Tot', 'opp_pass_comp', 'opp_pass_att', 'opp_comp_pct',
#        'opp_pass_yds', 'opp_pass_TD', 'opp_rush_att', 'opp_rush_yds',
#        'opp_rush_TD', 'opp_plays', 'opp_yds', 'opp_yds_play', 'opp_Fum',
#        'opp_Int', 'opp_TO_Tot']

In [85]:
# Column groupings used to restrict a comparison to one category of stats.

# High-level summary columns (record, scoring, touchdowns, schedule, turnovers)
overall_cols = [
    'win_pct', 'off_pts', 'def_pts', 'pass_TD', 'rush_TD',
    'SOS', 'opp_pass_TD', 'opp_rush_TD', 'TO_Tot', 'opp_TO_Tot',
]

# Offensive columns: scoring, passing, rushing and giveaways
offense = [
    'off_pts',
    'pass_comp', 'pass_att', 'comp_pct', 'pass_yds', 'pass_TD',
    'rush_att', 'rush_yds', 'rush_TD',
    'Fum', 'Int', 'TO_Tot',
]

# Defensive columns: points allowed plus the opp_* columns
defense = [
    'def_pts',
    'opp_pass_comp', 'opp_pass_att', 'opp_comp_pct', 'opp_pass_yds', 'opp_pass_TD',
    'opp_rush_att', 'opp_rush_yds', 'opp_rush_TD',
    'opp_plays', 'opp_yds', 'opp_yds_play',
    'opp_Fum', 'opp_Int', 'opp_TO_Tot',
]

# Narrower offensive splits
pass_o = ['pass_att', 'comp_pct', 'pass_yds', 'pass_TD', 'Int']
rush_o = ['rush_att', 'rush_yds', 'rush_TD', 'Fum']

# Narrower defensive splits
pass_d = ['opp_pass_att', 'opp_comp_pct', 'opp_pass_yds', 'opp_pass_TD', 'opp_Int']
rush_d = ['opp_rush_att', 'opp_rush_yds', 'opp_rush_TD', 'opp_Fum']

I have created separate lists, each containing only the stats that relate to a specific category.

In [53]:
# Preview the passing-offense columns
df.loc[:, pass_o]

Unnamed: 0,pass_att,comp_pct,pass_yds,pass_TD,Int
0,35.8750,0.66,266.1,1.8125,0.6875
1,28.4375,0.64,181.3,1.6250,0.8125
2,31.1875,0.54,174.6,0.8125,1.4375
3,32.7500,0.57,197.8,1.1250,1.1875
4,34.7500,0.60,222.4,1.1250,0.5625
...,...,...,...,...,...
794,44.1000,0.60,279.9,1.3000,0.6000
795,29.8000,0.67,198.4,1.0000,0.3000
796,37.7000,0.61,238.3,1.6000,2.0000
797,40.2000,0.58,271.9,1.2000,1.8000


In [55]:
# Narrow a DataFrame down to the columns belonging to one stat category, so a
# recommender can be built on just that category.

def category_selector(df, category = 'all stats'):
    """Return the columns of ``df`` that belong to ``category``.

    Recognized categories: 'offense', 'defense', 'passing offense',
    'rushing offense', 'passing defense', 'rushing defense' and 'general'.
    Each one selects the matching module-level column list. Any other value,
    including the default 'all stats', returns ``df`` itself unchanged.
    """
    if category == 'offense':
        return df[offense]
    if category == 'defense':
        return df[defense]
    if category == 'passing offense':
        return df[pass_o]
    if category == 'rushing offense':
        return df[rush_o]
    if category == 'passing defense':
        return df[pass_d]
    if category == 'rushing defense':
        return df[rush_d]
    if category == 'general':
        return df[overall_cols]

    # no recognized category: hand back the frame as-is
    return df

In [59]:
# Default category keeps every column
category_selector(df, category='all stats')

Unnamed: 0,team,year,win_pct,off_pts,def_pts,SRS,SOS,pass_comp,pass_att,comp_pct,...,opp_rush_att,opp_rush_yds,opp_rush_TD,opp_plays,opp_yds,opp_yds_play,opp_Fum,opp_Int,opp_TO_Tot,league
0,New England Patriots,2018,0.688,27.2500,20.3125,5.20,-1.80,23.6250,35.8750,0.66,...,22.9375,112.6875,0.4375,62.6250,359.1250,5.7,0.6250,1.1250,1.7500,NFL
1,Miami Dolphins,2018,0.438,19.9375,27.0625,-8.80,-1.70,18.2500,28.4375,0.64,...,30.3125,145.3125,1.0625,64.0000,391.0625,6.1,0.4375,1.3125,1.7500,NFL
2,Buffalo Bills,2018,0.375,16.8125,23.3750,-6.90,-0.30,16.9375,31.1875,0.54,...,27.5000,114.9375,1.0625,60.5000,294.1250,4.9,0.6875,1.0000,1.6875,NFL
3,New York Jets,2018,0.250,20.8125,27.5625,-7.80,-1.10,18.6875,32.7500,0.57,...,27.5000,126.3125,1.0000,66.3125,380.3750,5.7,0.4375,0.8125,1.2500,NFL
4,Baltimore Ravens,2018,0.625,24.3125,17.9375,7.00,0.60,20.8750,34.7500,0.60,...,22.1250,82.9375,0.6875,60.8750,292.9375,4.8,0.3125,0.7500,1.0625,NFL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,Louisiana-Monroe,2014,0.333,20.1000,26.3000,-9.82,-4.57,26.4000,44.1000,0.60,...,42.3000,183.3000,1.8000,70.3000,373.3000,5.3,0.4000,0.9000,1.3000,College
795,Troy,2014,0.250,21.8000,36.2000,-14.67,-6.17,19.8000,29.8000,0.67,...,43.2000,245.1000,2.9000,70.8000,451.1000,6.4,0.6000,0.9000,1.5000,College
796,New Mexico State,2014,0.167,24.5000,39.1000,-18.48,-7.65,23.2000,37.7000,0.61,...,49.2000,308.8000,3.3000,74.1000,483.3000,6.5,0.8000,0.9000,1.8000,College
797,Idaho,2014,0.091,25.0000,37.3000,-19.67,-7.21,23.3000,40.2000,0.58,...,43.9000,245.9000,2.9000,66.8000,462.1000,6.9,0.8000,0.7000,1.5000,College


In [77]:
# Recommend the college teams most similar to a single NFL team-season,
# optionally comparing on one category of stats only.

# Need DataFrame, NFL Team & Year
def college_recommender2(df, team, year, category = 'all stats', top_n=10):
    """Return the college team-seasons closest to one NFL team-season.

    The matching NFL row is stacked on top of every row whose ``league`` is
    'College'. The stat columns are narrowed with ``category_selector``,
    standardized with ``StandardScaler``, and the cosine distance from the NFL
    row to each college row is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``team``, ``year`` and ``league`` columns plus the numeric
        stat columns used by the chosen category.
    team : str
        Value to match in the ``team`` column.
    year : int
        Value to match in the ``year`` column.
    category : str, default 'all stats'
        One of 'offense', 'defense', 'passing offense', 'rushing offense',
        'passing defense', 'rushing defense' or 'general'. Any other value
        compares on every stat column.
    top_n : int, default 10
        Number of closest college rows to return.

    Returns
    -------
    pandas.Series
        Cosine distances (smaller means more similar), sorted ascending,
        indexed by (team, year, league) and named ``(team, year, 'NFL')``.

    Raises
    ------
    KeyError
        If ``df`` has no NFL row for the requested team and year.
    """
    # select team of interest; the league filter keeps a same-named college
    # row from being picked up as the query
    is_target = (
        (df['team'] == team) & (df['year'] == year) & (df['league'] == 'NFL')
    )
    team_int = df[is_target].head(1)
    if team_int.empty:
        raise KeyError(f"No NFL row found for team={team!r}, year={year!r}")

    # select all of the college teams from DataFrame
    colleges = df[df['league'] == 'College']

    # query row first, colleges after it; identifiers move into the index so
    # only stat columns remain as features
    stacked = (
        pd.concat([team_int, colleges], ignore_index=True)
        .set_index(['team', 'year', 'league'])
    )

    # reuse the shared selector instead of repeating its if/elif chain here
    features = category_selector(stacked, category)

    scaled = StandardScaler().fit_transform(features)

    # Row 0 is the query team. Slice it off by position rather than relying on
    # its self-distance sorting first (ties at 0.0 would make that unreliable).
    # The standardized matrix is dense, so no sparse conversion is needed.
    distances = pairwise_distances(scaled, metric='cosine')[0, 1:]

    sim_teams = pd.Series(
        distances, index=stacked.index[1:], name=(team, year, 'NFL')
    )

    # return the top_n most similar teams
    return sim_teams.sort_values().head(top_n)


In [86]:
# Closest college rushing offenses to the 2018 Seattle Seahawks
college_recommender2(
    df,
    team='Seattle Seahawks',
    year=2018,
    category='rushing offense',
)

team                 year  league 
Bowling Green State  2017  College    0.013447
Florida              2017  College    0.019639
Marshall             2015  College    0.023675
Florida Atlantic     2014  College    0.031092
Virginia             2018  College    0.031159
North Texas          2015  College    0.032284
Louisiana Tech       2017  College    0.034459
Connecticut          2014  College    0.038096
Kansas               2016  College    0.038936
West Virginia        2017  College    0.041806
Name: (Seattle Seahawks, 2018, NFL), dtype: float64

Looks like I have a functioning recommender, but I need to work on a way to scale the stats to the level of play. Per-game averages only partially work: certain stats, like yardage, do not translate well between college and the NFL. For instance, the top rushing teams in the NFL are only matching the middle-of-the-pack rushing teams in college.