## Import libraries

In [424]:
import pandas as pd
pd.set_option('display.max_columns',100)

from scipy.spatial.distance import euclidean as euc
import numpy as np
np.random.seed(0)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Original

### Setting up df

In [387]:
# Load the raw player-season table.
df1 = pd.read_pickle('final_df.pickle')

# Rows that share a (pid, age) pair with at least one other row, ordered by
# player name and games played (both descending).
is_repeated = df1.duplicated(subset=['pid', 'age'], keep=False)
dupes = df1.loc[is_repeated].sort_values(by=['player', 'g'], ascending=False)

# Remove every repeated row, then append back only those whose team is 'TOT'
# (presumably the combined line for a player with several teams in one
# season -- TODO confirm against the data source).
df1 = pd.concat([df1.drop(index=dupes.index), dupes.loc[dupes.tm == 'TOT']])

# Column names that start with a digit cannot be used as attributes
# (df1.3p is a syntax error), so give them fg-prefixed names.
df1 = df1.rename(
    columns={
        '3p': 'fg3',
        '3pa': 'fg3a',
        '2p': 'fg2',
        '2pa': 'fg2a',
        '3p_pct': 'fg3_pct',
        '2p_pct': 'fg2_pct',
    }
)

In [388]:
# Columns kept for the similarity search. The two identifying fields come
# first because later cells slice them off positionally with `[2:]`.
cols = ['player', 'pos']
cols += ['mp', 'fg2_pct', 'fg3_pct', 'ft_pct']
cols += ['pts', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'ws']

In [437]:
# Keep only the 2019 season and the features of interest.
# Rows and columns are selected in a single .loc call and copied explicitly,
# so the assignment below edits an independent frame instead of the result of
# chained indexing (which can raise SettingWithCopyWarning).
test = df1.loc[df1.season == 2019, cols].copy()

# Title-case the names so they compare equal to the title-cased user input
# used in the lookup cells.
test['player'] = test['player'].str.title()

In [390]:
# One 0/1 indicator column per position: 1 when the abbreviation occurs
# anywhere in the player's `pos` string, 0 otherwise.
for abbrev in ('PG', 'SG', 'SF', 'PF', 'C'):
    test[abbrev] = [1 if abbrev in listed else 0 for listed in test.pos]

### With position specified, all stats and win-share only

In [391]:
# Position abbreviation used to filter the neighbour list in the
# "Filtered by position" cell below.
position = 'PG'

In [537]:
# Ask for a player, title-case the reply to match the stored names, and keep
# the first matching row as a Series.
requested_name = input('Please enter player name: \n').title()
player = test.loc[test[test.player == requested_name].index[0]]

Please enter player name: 
lebron james


In [538]:
# Distance of every player from the chosen one: the sum of absolute
# differences over all columns after `player` and `pos`.
# NOTE(review): the columns are not rescaled, so large-magnitude stats such as
# `mp` and `pts` dominate the 0-1 percentage columns.
stat_gaps = (test.iloc[:, 2:] - player.iloc[2:]).abs().sum(axis=1)

# Exclude the chosen player by index label instead of assuming they sort
# first: another row tied at distance 0 could otherwise take the first slot
# and leave the chosen player in their own recommendations.
indie = stat_gaps.drop(player.name).sort_values().index

###### Filtered by position

In [539]:
# Season rows reordered from most to least similar to the chosen player.
filt = test.loc[indie]

# Ten closest players whose `pos` equals the requested position.
filt.loc[filt.pos == position].head(10)

Unnamed: 0,player,pos,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,ws
25951,Kyrie Irving,PG,2214,0.533,0.401,0.873,1596,335,464,103,34,172,167,9.1
25773,Mike Conley,PG,2342,0.483,0.364,0.845,1478,239,449,94,22,130,123,8.0
25691,Eric Bledsoe,PG,2272,0.582,0.329,0.75,1241,362,430,116,29,165,156,8.2
26119,Jamal Murray,PG,2447,0.476,0.367,0.848,1367,317,363,67,27,158,153,5.1
25809,Spencer Dinwiddie,PG,1914,0.528,0.335,0.806,1143,166,311,40,17,152,187,4.8
26161,Chris Paul,PG,1857,0.479,0.358,0.862,906,265,473,114,18,152,146,6.6
25789,Stephen Curry,PG,2331,0.525,0.437,0.916,1881,369,361,92,25,192,166,9.7
26202,Ricky Rubio,PG,1899,0.454,0.311,0.855,864,243,416,91,10,180,180,3.7
26211,Dennis Schröder,PG,2314,0.45,0.341,0.819,1224,284,323,65,12,172,189,2.9
26203,D'Angelo Russell,PG,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,5.0


###### Not filtered by position

In [540]:
# Ten closest players regardless of position.
filt.head(10)

Unnamed: 0,player,pos,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,ws
26314,Lou Williams,SG,1993,0.447,0.361,0.876,1498,222,402,57,11,181,80,5.1
26027,Kawhi Leonard,SF,2040,0.542,0.371,0.854,1596,439,199,106,24,121,87,9.5
25811,Luka Dončić,SG,2318,0.503,0.327,0.713,1526,563,429,77,25,247,137,4.9
25951,Kyrie Irving,PG,2214,0.533,0.401,0.873,1596,335,464,103,34,172,167,9.1
26018,Zach Lavine,SG,2171,0.504,0.374,0.832,1492,294,283,60,26,215,140,2.8
25860,Danilo Gallinari,SF,2059,0.484,0.433,0.904,1346,417,178,49,23,99,129,8.2
25773,Mike Conley,PG,2342,0.483,0.364,0.845,1478,239,449,94,22,130,123,8.0
26087,Khris Middleton,SF,2393,0.485,0.378,0.837,1407,461,331,80,7,174,172,6.1
25791,Anthony Davis,C,1850,0.547,0.331,0.794,1452,672,218,88,135,112,132,9.5
25691,Eric Bledsoe,PG,2272,0.582,0.329,0.75,1241,362,430,116,29,165,156,8.2


## Using different df

In [129]:
# Load the second table; the cells below read its `season`, `player`, `ws`
# and `*_preds` (lasso / ridge / linear) columns.
df = pd.read_pickle('stats_and_lr_preds.pickle')

### With all stats and win-share only

In [406]:
# 2019 season only. The drop list removes bookkeeping columns and all three
# prediction columns, leaving `ws` in the frame.
main = df.loc[df.season == 2019].drop(
    columns=['season', 'season_max', 'draft_year', 'lasso_preds', 'ridge_preds',
             'lin_preds', 'age', 'experience', 'g']
)
# Title-case names so they match the title-cased user input below.
main['player'] = main['player'].map(lambda name: name.title())

print('Possible positions: \n PG \n SG \n SF \n PF \n C')

# Row (as a Series) of the first player whose name equals the reply.
wanted_name = input('Please input player name: \n').title()
unavail = main.loc[main[main.player == wanted_name].index[0]]

Possible positions: 
 PG 
 SG 
 SF 
 PF 
 C
Please input player name: 
james harden


In [430]:
# Sum of absolute differences between each player's columns (everything after
# `player`) and the selected player's.
main_gaps = (main.iloc[:, 1:] - unavail.iloc[1:]).abs().sum(axis=1)

# Drop the selected player by index label rather than assuming they occupy
# the first sorted slot (a tie at distance 0 would break that assumption),
# then keep the ten nearest.
ind = main_gaps.drop(unavail.name).sort_values().index[:10]

In [431]:
# Show the rows of the ten nearest players.
# NOTE(review): the saved output for this cell lists `ridge_preds` and no
# `ws`, which does not match the drop list used to build `main`; it looks
# stale -- re-run the notebook top to bottom to confirm.
main.loc[ind]

Unnamed: 0,player,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,pos_PF,pos_PG,pos_SF,pos_SG,ridge_preds
15806,Damian Lillard,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,0,1,0,0,22.789686
15978,Kemba Walker,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,0,1,0,0,22.024131
15693,Paul George,2841,0.484,0.386,0.839,2159,628,318,170,34,205,214,0,0,1,0,24.246018
15565,Bradley Beal,3028,0.548,0.351,0.808,2099,411,448,121,58,224,226,0,0,0,1,21.629501
15662,Kevin Durant,2702,0.587,0.353,0.885,2027,497,457,58,84,225,155,0,0,1,0,22.029555
15985,Russell Westbrook,2630,0.481,0.29,0.656,1675,807,784,142,33,325,245,0,1,0,0,23.333615
15708,Blake Griffin,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,1,0,0,0,17.178479
15936,Ben Simmons,2700,0.566,0.0,0.6,1337,697,610,112,61,274,209,0,1,0,0,18.092748
15921,D'Angelo Russell,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,0,1,0,0,16.759463
15648,Demar Derozan,2688,0.492,0.156,0.83,1635,462,475,86,36,199,177,0,0,0,1,19.016495


### With all stats and win-share and ridge regression predicted win-share

In [416]:
# 2019 season only. Compared with `main`, `ridge_preds` is absent from the
# drop list, so this frame carries both `ws` and `ridge_preds`.
main1 = df.loc[df.season == 2019].drop(
    columns=['season', 'season_max', 'draft_year', 'lasso_preds',
             'lin_preds', 'age', 'experience', 'g']
)
# Title-case names so they match the title-cased user input below.
main1['player'] = main1['player'].map(lambda name: name.title())

print('Possible positions: \n PG \n SG \n SF \n PF \n C')

# Row (as a Series) of the first player whose name equals the reply.
wanted_name = input('Please input player name: \n').title()
unavail = main1.loc[main1[main1.player == wanted_name].index[0]]

Possible positions: 
 PG 
 SG 
 SF 
 PF 
 C
Please input player name: 
james harden


In [432]:
# Sum of absolute differences between each player's columns (everything after
# `player`) and the selected player's.
main1_gaps = (main1.iloc[:, 1:] - unavail.iloc[1:]).abs().sum(axis=1)

# Drop the selected player by index label rather than assuming they occupy
# the first sorted slot (a tie at distance 0 would break that assumption),
# then keep the ten nearest.
ind = main1_gaps.drop(unavail.name).sort_values().index[:10]

In [433]:
# Show the rows of the ten nearest players (frame includes `ws` and `ridge_preds`).
main1.loc[ind]

Unnamed: 0,player,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,ws,pos_PF,pos_PG,pos_SF,pos_SG,ridge_preds
15806,Damian Lillard,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,12.1,0,1,0,0,22.789686
15978,Kemba Walker,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,7.4,0,1,0,0,22.024131
15693,Paul George,2841,0.484,0.386,0.839,2159,628,318,170,34,205,214,11.9,0,0,1,0,24.246018
15565,Bradley Beal,3028,0.548,0.351,0.808,2099,411,448,121,58,224,226,7.6,0,0,0,1,21.629501
15662,Kevin Durant,2702,0.587,0.353,0.885,2027,497,457,58,84,225,155,11.5,0,0,1,0,22.029555
15985,Russell Westbrook,2630,0.481,0.29,0.656,1675,807,784,142,33,325,245,6.8,0,1,0,0,23.333615
15708,Blake Griffin,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,8.0,1,0,0,0,17.178479
15936,Ben Simmons,2700,0.566,0.0,0.6,1337,697,610,112,61,274,209,8.2,0,1,0,0,18.092748
15921,D'Angelo Russell,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,5.0,0,1,0,0,16.759463
15648,Demar Derozan,2688,0.492,0.156,0.83,1635,462,475,86,36,199,177,6.3,0,0,0,1,19.016495


### With all stats and ridge regression predicted win-share only

In [451]:
# 2019 season only. Here `ws` is dropped and `ridge_preds` is kept, so the
# predicted value stands in for the actual win share.
main2 = df.loc[df.season == 2019].drop(
    columns=['season', 'season_max', 'draft_year', 'lasso_preds', 'ws',
             'lin_preds', 'age', 'experience', 'g']
)
# Title-case names so they match the title-cased user input below.
main2['player'] = main2['player'].map(lambda name: name.title())

print('Possible positions: \n PG \n SG \n SF \n PF \n C')

# Row (as a Series) of the first player whose name equals the reply.
wanted_name = input('Please input player name: \n').title()
unavail = main2.loc[main2[main2.player == wanted_name].index[0]]

Possible positions: 
 PG 
 SG 
 SF 
 PF 
 C
Please input player name: 
james harden


In [494]:
# Sum of absolute differences between each player's columns (everything after
# `player`) and the selected player's.
main2_gaps = (main2.iloc[:, 1:] - unavail.iloc[1:]).abs().sum(axis=1)

# Drop the selected player by index label rather than assuming they occupy
# the first sorted slot (a tie at distance 0 would break that assumption),
# then keep the ten nearest.
ind = main2_gaps.drop(unavail.name).sort_values().index[:10]

# Show the rows of the ten nearest players.
main2.loc[ind]

Unnamed: 0,player,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,pos_PF,pos_PG,pos_SF,pos_SG,ridge_preds
15806,Damian Lillard,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,0,1,0,0,22.789686
15978,Kemba Walker,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,0,1,0,0,22.024131
15693,Paul George,2841,0.484,0.386,0.839,2159,628,318,170,34,205,214,0,0,1,0,24.246018
15565,Bradley Beal,3028,0.548,0.351,0.808,2099,411,448,121,58,224,226,0,0,0,1,21.629501
15662,Kevin Durant,2702,0.587,0.353,0.885,2027,497,457,58,84,225,155,0,0,1,0,22.029555
15985,Russell Westbrook,2630,0.481,0.29,0.656,1675,807,784,142,33,325,245,0,1,0,0,23.333615
15708,Blake Griffin,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,1,0,0,0,17.178479
15936,Ben Simmons,2700,0.566,0.0,0.6,1337,697,610,112,61,274,209,0,1,0,0,18.092748
15921,D'Angelo Russell,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,0,1,0,0,16.759463
15648,Demar Derozan,2688,0.492,0.156,0.83,1635,462,475,86,36,199,177,0,0,0,1,19.016495


In [435]:
# Same display as the previous cell: rows of `main2` for the labels in `ind`.
main2.loc[ind]

Unnamed: 0,player,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,pos_PF,pos_PG,pos_SF,pos_SG,ridge_preds
15806,Damian Lillard,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,0,1,0,0,22.789686
15978,Kemba Walker,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,0,1,0,0,22.024131
15693,Paul George,2841,0.484,0.386,0.839,2159,628,318,170,34,205,214,0,0,1,0,24.246018
15565,Bradley Beal,3028,0.548,0.351,0.808,2099,411,448,121,58,224,226,0,0,0,1,21.629501
15662,Kevin Durant,2702,0.587,0.353,0.885,2027,497,457,58,84,225,155,0,0,1,0,22.029555
15985,Russell Westbrook,2630,0.481,0.29,0.656,1675,807,784,142,33,325,245,0,1,0,0,23.333615
15708,Blake Griffin,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,1,0,0,0,17.178479
15936,Ben Simmons,2700,0.566,0.0,0.6,1337,697,610,112,61,274,209,0,1,0,0,18.092748
15921,D'Angelo Russell,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,0,1,0,0,16.759463
15648,Demar Derozan,2688,0.492,0.156,0.83,1635,462,475,86,36,199,177,0,0,0,1,19.016495


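Notice that there is no difference between the recommendation lists. The distance is an unscaled sum of absolute differences, so large-magnitude columns such as `mp` and `pts` (hundreds to thousands) dominate, and swapping `ws` for `ridge_preds` (both in the tens at most) barely changes the ranking.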

## Function

In [474]:
# DataFrame
data = pd.read_pickle('final_df.pickle')

# Rows that share a (pid, age) pair with at least one other row, ordered by
# player name and games played (both descending).
dupes = data[data.duplicated(['pid', 'age'], keep=False)].sort_values(by=['player', 'g'], ascending=False)

# Remove every repeated row, then append back only those whose team is 'TOT'
# (presumably the combined line for a multi-team season -- TODO confirm).
data = data.drop(dupes.index)

data = pd.concat([data, dupes[dupes.tm == 'TOT']])

# Column names starting with a digit cannot be used as attributes.
data = data.rename(columns={'3p': 'fg3',
                            '3pa': 'fg3a',
                            '2p': 'fg2',
                            '2pa': 'fg2a',
                            '3p_pct': 'fg3_pct',
                            '2p_pct': 'fg2_pct'})

# Features of interest: `player` and `pos` first, numeric stats after them
# (rec() slices the first two columns off positionally).
cols = ['player', 'pos', 'mp', 'fg2_pct', 'fg3_pct', 'ft_pct', 'pts', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'ws']

# Setting up df to have only recent players.
# One .loc selection plus an explicit copy avoids assigning into the result of
# chained indexing (which can raise SettingWithCopyWarning).
main_df = data.loc[data.season == 2019, cols].copy()

# Title-case names so they match the title-cased input collected by rec().
main_df['player'] = main_df['player'].str.title()

In [516]:
# helper function for rec
def check_position(pos):
    """Translate a numeric menu choice into a position abbreviation.

    Parameters
    ----------
    pos : str
        The user's reply, expected to be a number from 1 to 5. Surrounding
        whitespace and leading zeros are tolerated (' 04' counts as 4).

    Returns
    -------
    str
        'PG', 'SG', 'SF', 'PF' or 'C'. If the number is out of range the user
        is asked again until they give a number from 1 to 5, a position
        abbreviation (returned upper-cased), or an empty reply (returned as
        '' so the caller can treat it as "no position").

    Raises
    ------
    ValueError
        If ``pos`` is not an integer string. Callers use this to detect
        non-numeric input.
    """
    pos_dict = {'1': 'PG',
                '2': 'SG',
                '3': 'SF',
                '4': 'PF',
                '5': 'C'}

    # Normalise through int() so ' 4' and '04' hit the same key as '4';
    # a non-numeric first argument still raises ValueError as before.
    choice = str(int(pos))

    # Keep asking until the reply is usable. Previously an out-of-range number
    # above 5 prompted once and returned the raw reply unchecked, and 0 or
    # negative numbers raised KeyError.
    while choice not in pos_dict:
        reply = input('Please select a position: \n').strip().upper()
        if reply == '' or reply in pos_dict.values():
            return reply
        try:
            choice = str(int(reply))
        except ValueError:
            # Neither a number nor a known abbreviation: ask again.
            choice = ''

    return pos_dict[choice]

In [541]:
# Recommendation system 
def rec():
    """Interactively recommend the players most similar to a chosen player.

    Prompts for a player name (asking again until it matches a value in
    ``main_df.player``) and an optional position, then ranks every other row
    of ``main_df`` by the sum of absolute differences over the stat columns
    (every column after ``player`` and ``pos``).

    The position may be left blank, given as a number 1-5 (translated by
    ``check_position``), or typed as an abbreviation in any letter case.
    Anything else is asked for again; previously such replies (including
    ``0``) silently produced an empty result.

    Relies on the module-level ``main_df`` frame and ``check_position`` helper.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows of ``main_df``, most similar first, restricted to the
        requested position when one was given.
    """
    # ask for player name that you want to compare
    name = input('Please input player name: \n').strip().title()

    # keep asking until the name is non-empty and present in the dataframe
    known_names = set(main_df.player)
    while name not in known_names:
        if name == '':
            prompt = 'You did not specify player. Please input player name: \n'
        else:
            prompt = 'Player is not in current database. Please input player name: \n'
        name = input(prompt).strip().title()

    print('Possible positions: \n 1. PG \n 2. SG \n 3. SF \n 4. PF \n 5. C')

    # Accept the five menu positions plus any position string present in the
    # data, so combined labels can still be typed in.
    known_positions = set(main_df.pos) | {'PG', 'SG', 'SF', 'PF', 'C'}

    # ask for a position to recommend (blank means "any position")
    position = input('(Optional) \nPlease specify position: \n').strip().upper()
    while position != '' and position not in known_positions:
        try:
            number = int(position)
        except ValueError:
            number = 0  # not numeric: falls through to the re-prompt
        if 1 <= number <= 5:
            # pass the normalised digit so ' 4' / '04' map like '4'
            position = check_position(str(number)).upper()
            break
        position = input('Please select a position: \n').strip().upper()

    # row of the chosen player (first match) and its index label
    player_idx = main_df[main_df.player == name].index[0]
    player = main_df.loc[player_idx]

    # Sum of absolute stat differences between every player and the chosen one.
    # The row Series mixes strings and numbers, so the stat slice is cast to
    # float to keep the arithmetic numeric.
    # NOTE(review): columns are not rescaled, so large-magnitude stats such as
    # `mp` and `pts` dominate the 0-1 percentage columns.
    distances = (main_df.iloc[:, 2:] - player.iloc[2:].astype(float)).abs().sum(axis=1)

    # Remove the chosen player by label rather than assuming they sort first,
    # and use a stable sort so tied players keep a reproducible order.
    indices = distances.drop(player_idx).sort_values(kind='mergesort').index

    # select the players from the main dataframe, nearest first
    neighbors = main_df.loc[indices]

    # return the top 10
    if position == '':
        return neighbors.iloc[:10]
    return neighbors[neighbors.pos == position].iloc[:10]

In [542]:
# Run the interactive recommender; the returned frame is the cell's output.
rec()

Please input player name: 
james harden
Possible positions: 
 1. PG 
 2. SG 
 3. SF 
 4. PF 
 5. C
(Optional) 
Please specify position: 
4


Unnamed: 0,player,pos,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,ws
25891,Blake Griffin,PF,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,8.0
25903,Tobias Harris,PF,2847,0.528,0.397,0.866,1644,645,229,51,37,151,184,7.1
25649,Giannis Antetokounmpo,PF,2358,0.641,0.256,0.729,1994,898,424,92,110,268,232,14.4
26226,Pascal Siakam,PF,2548,0.602,0.369,0.785,1354,549,248,73,52,154,241,9.3
25876,Aaron Gordon,PF,2633,0.499,0.349,0.731,1246,574,289,57,56,162,172,5.1
26184,Julius Randle,PF,2232,0.564,0.344,0.731,1565,634,229,52,45,208,246,6.1
26328,Thaddeus Young,PF,2489,0.564,0.349,0.644,1024,523,204,123,36,123,194,6.9
25881,Jerami Grant,PF,2612,0.555,0.392,0.71,1090,417,79,61,100,67,214,7.3
26013,Kyle Kuzma,PF,2314,0.553,0.303,0.752,1308,382,178,41,26,133,170,3.1
26280,P.J. Tucker,PF,2802,0.449,0.377,0.695,601,479,96,132,39,63,252,5.0


## SKLearn's KNN (did not use)

In [14]:
# Target is the win-share column; the feature matrix is every remaining column
# except the player name and the three prediction columns.
y = df['ws']
X = df.drop(columns=['player', 'season_max', 'lasso_preds', 'ridge_preds', 'lin_preds', 'ws'])

In [16]:
# Hold out 25% of the rows for testing. The split is pinned with an explicit
# random_state so re-running this cell reproduces the same partition instead
# of depending on how far the global NumPy RNG has advanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [22]:
# Reload the raw table and count the non-null values of every column per
# position label.
# NOTE(review): the variable name `pickle` is also the name of the standard
# library module; it would shadow that module if it were imported here.
pickle = pd.read_pickle('final_df.pickle')
pickle.groupby('pos').count()

Unnamed: 0_level_0,rk,player,age,tm,g,gs,mp,fg,fga,fg_pct,3p,3pa,3p_pct,2p,2pa,2p_pct,efg_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,tov,pf,pts,pid,per,ts_pct,3par,ftr,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws/48,obpm,dbpm,bpm,vorp,season,year,from,to,pk,experience,ftsy_pts
pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
C,3622,3622,3622,3622,3622,3498,3622,3622,3622,3599,3622,3622,3622,3622,3622,3599,3599,3622,3622,3501,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622,3604,3599,3599,3622,3622,3622,3622,3622,3622,3608,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622,3622
C-PF,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25
PF,3716,3716,3716,3716,3716,3576,3716,3716,3716,3703,3716,3716,3716,3716,3716,3699,3703,3716,3716,3600,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716,3704,3703,3703,3716,3716,3716,3716,3716,3716,3707,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716,3716
PF-C,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
PF-SF,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,20,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22
PG,3248,3248,3248,3248,3248,3120,3248,3248,3248,3244,3248,3248,3248,3248,3248,3237,3244,3248,3248,3145,3248,3248,3248,3248,3248,3248,3248,3248,3248,3248,3247,3245,3244,3244,3247,3247,3247,3247,3247,3247,3245,3247,3248,3248,3248,3247,3248,3248,3248,3248,3248,3248,3248,3248,3248,3248,3248
PG-SF,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
PG-SG,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
SF,3328,3328,3328,3328,3328,3191,3328,3328,3328,3317,3328,3328,3328,3328,3328,3310,3317,3328,3328,3213,3328,3328,3328,3328,3328,3328,3328,3328,3328,3328,3326,3318,3317,3317,3326,3326,3326,3326,3326,3326,3320,3326,3328,3328,3328,3326,3328,3328,3328,3328,3328,3328,3328,3328,3328,3328,3328
SF-PF,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19
