## Import libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns',100)

from scipy.spatial.distance import euclidean as euc
import numpy as np
np.random.seed(0)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Original

### Setting up df

In [387]:
# Load the raw player-season table
df1 = pd.read_pickle('final_df.pickle')

# Every row whose (pid, age) pair occurs more than once, ordered by player and games (descending)
dupes = (
    df1[df1.duplicated(['pid', 'age'], keep=False)]
    .sort_values(by=['player', 'g'], ascending=False)
)

# Swap each duplicated group for its tm == 'TOT' row only (presumably the
# combined line for a player with several team rows — TODO confirm), then
# rename the shooting columns so they no longer start with a digit.
df1 = pd.concat(
    [df1.drop(dupes.index), dupes[dupes.tm == 'TOT']]
).rename(columns={
    '3p': 'fg3',
    '3pa': 'fg3a',
    '2p': 'fg2',
    '2pa': 'fg2a',
    '3p_pct': 'fg3_pct',
    '2p_pct': 'fg2_pct',
})

In [388]:
# Columns kept for the comparison: two identifying columns first, then the stats
cols = [
    'player', 'pos',
    'mp', 'fg2_pct', 'fg3_pct', 'ft_pct',
    'pts', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf',
    'ws',
]

In [437]:
# Keep only the 2019 season and the columns of interest.
# A single .loc selection followed by .copy() gives an independent frame, so
# the assignment below does not write to a slice of df1 (which is what
# triggers pandas' SettingWithCopyWarning with chained indexing).
test = df1.loc[df1.season == 2019, cols].copy()

# Title-case the names so they compare equal to title-cased user input
test['player'] = test['player'].str.title()

In [390]:
# One 0/1 indicator column per position code; a row gets 1 when the code
# appears anywhere inside its `pos` string
for code in ('PG', 'SG', 'SF', 'PF', 'C'):
    test[code] = [1 if code in pos_label else 0 for pos_label in test.pos]

### With position specified, all stats and win-share only

In [391]:
# Position code used further down to filter the neighbour list by `pos`
position = 'PG'

In [537]:
# Ask for a player and fetch that player's row from `test`.
# Names in `test` are title-cased, so the input is normalised the same way.
query = input('Please enter player name: \n').strip().title()
matches = test[test.player == query]

# An unknown name used to surface as a bare "index 0 is out of bounds"
# IndexError from `.index[0]`; fail with a message that names the problem.
if matches.empty:
    raise ValueError('Player {!r} was not found in `test`'.format(query))

# First matching row as a Series (its .name is the row label in `test`)
player = matches.iloc[0]

Please enter player name: 
lebron james


In [538]:
# Distance from every row of `test` to the chosen player: sum of absolute
# differences over all columns after the first two (`player`, `pos`).
distances = (test[test.columns[2:]] - player.iloc[2:]).abs().sum(axis=1)

# Remove the chosen player by row label instead of assuming he is the first
# entry after sorting; with a zero-distance tie, slicing off position 0 could
# discard a different player and leave the query player in the list.
indie = distances.drop(player.name, errors='ignore').sort_values().index

###### Filtered by position

In [539]:
# Player table reordered from nearest to farthest neighbour
filt = test.loc[indie]

# Ten nearest neighbours whose `pos` equals the chosen position
filt[filt['pos'].eq(position)].head(10)

Unnamed: 0,player,pos,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,ws
25951,Kyrie Irving,PG,2214,0.533,0.401,0.873,1596,335,464,103,34,172,167,9.1
25773,Mike Conley,PG,2342,0.483,0.364,0.845,1478,239,449,94,22,130,123,8.0
25691,Eric Bledsoe,PG,2272,0.582,0.329,0.75,1241,362,430,116,29,165,156,8.2
26119,Jamal Murray,PG,2447,0.476,0.367,0.848,1367,317,363,67,27,158,153,5.1
25809,Spencer Dinwiddie,PG,1914,0.528,0.335,0.806,1143,166,311,40,17,152,187,4.8
26161,Chris Paul,PG,1857,0.479,0.358,0.862,906,265,473,114,18,152,146,6.6
25789,Stephen Curry,PG,2331,0.525,0.437,0.916,1881,369,361,92,25,192,166,9.7
26202,Ricky Rubio,PG,1899,0.454,0.311,0.855,864,243,416,91,10,180,180,3.7
26211,Dennis Schröder,PG,2314,0.45,0.341,0.819,1224,284,323,65,12,172,189,2.9
26203,D'Angelo Russell,PG,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,5.0


###### Not filtered by position

In [540]:
# Ten nearest neighbours regardless of position
filt.head(10)

Unnamed: 0,player,pos,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,ws
26314,Lou Williams,SG,1993,0.447,0.361,0.876,1498,222,402,57,11,181,80,5.1
26027,Kawhi Leonard,SF,2040,0.542,0.371,0.854,1596,439,199,106,24,121,87,9.5
25811,Luka Dončić,SG,2318,0.503,0.327,0.713,1526,563,429,77,25,247,137,4.9
25951,Kyrie Irving,PG,2214,0.533,0.401,0.873,1596,335,464,103,34,172,167,9.1
26018,Zach Lavine,SG,2171,0.504,0.374,0.832,1492,294,283,60,26,215,140,2.8
25860,Danilo Gallinari,SF,2059,0.484,0.433,0.904,1346,417,178,49,23,99,129,8.2
25773,Mike Conley,PG,2342,0.483,0.364,0.845,1478,239,449,94,22,130,123,8.0
26087,Khris Middleton,SF,2393,0.485,0.378,0.837,1407,461,331,80,7,174,172,6.1
25791,Anthony Davis,C,1850,0.547,0.331,0.794,1452,672,218,88,135,112,132,9.5
25691,Eric Bledsoe,PG,2272,0.582,0.329,0.75,1241,362,430,116,29,165,156,8.2


## Using different df

In [129]:
# Load the second dataset; the cells below rely on it having `season`,
# `player` and the `lasso_preds` / `ridge_preds` / `lin_preds` columns.
df = pd.read_pickle('stats_and_lr_preds.pickle')

### With all stats and win-share only

In [406]:
# 2019 season only, without the bookkeeping columns and without any of the
# three model-prediction columns
main = df[df.season == 2019].drop(
    ['season', 'season_max', 'draft_year', 'lasso_preds', 'ridge_preds',
     'lin_preds', 'age', 'experience', 'g'],
    axis=1,
)
# Title-case the names so they compare equal to the title-cased input below
main['player'] = main['player'].str.title()

print('Possible positions: \n PG \n SG \n SF \n PF \n C')

# Look the requested player up. An unknown name used to surface as a bare
# IndexError from `.index[0]`; raise a message that names the problem instead.
requested = input('Please input player name: \n').strip().title()
matches = main[main.player == requested]
if matches.empty:
    raise ValueError('Player {!r} was not found in `main`'.format(requested))

# First matching row as a Series (its .name is the row label in `main`)
unavail = matches.iloc[0]

Possible positions: 
 PG 
 SG 
 SF 
 PF 
 C
Please input player name: 
james harden


In [430]:
# Distance from every row of `main` to the chosen player: sum of absolute
# differences over all columns after `player`.
distances = (main[main.columns[1:]] - unavail.iloc[1:]).abs().sum(axis=1)

# Exclude the chosen player by row label rather than assuming he sorts first
# (a zero-distance tie could otherwise drop a different player), then keep
# the ten nearest.
ind = distances.drop(unavail.name, errors='ignore').sort_values().index[:10]

In [431]:
# Show the rows of `main` selected by `ind`, in `ind` order
main.loc[ind]

Unnamed: 0,player,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,pos_PF,pos_PG,pos_SF,pos_SG,ridge_preds
15806,Damian Lillard,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,0,1,0,0,22.789686
15978,Kemba Walker,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,0,1,0,0,22.024131
15693,Paul George,2841,0.484,0.386,0.839,2159,628,318,170,34,205,214,0,0,1,0,24.246018
15565,Bradley Beal,3028,0.548,0.351,0.808,2099,411,448,121,58,224,226,0,0,0,1,21.629501
15662,Kevin Durant,2702,0.587,0.353,0.885,2027,497,457,58,84,225,155,0,0,1,0,22.029555
15985,Russell Westbrook,2630,0.481,0.29,0.656,1675,807,784,142,33,325,245,0,1,0,0,23.333615
15708,Blake Griffin,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,1,0,0,0,17.178479
15936,Ben Simmons,2700,0.566,0.0,0.6,1337,697,610,112,61,274,209,0,1,0,0,18.092748
15921,D'Angelo Russell,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,0,1,0,0,16.759463
15648,Demar Derozan,2688,0.492,0.156,0.83,1635,462,475,86,36,199,177,0,0,0,1,19.016495


### With all stats and win-share and ridge regression predicted win-share

In [416]:
# 2019 season only; unlike `main`, `ridge_preds` is NOT dropped here, so this
# frame keeps the ridge predictions alongside the other columns
main1 = df[df.season == 2019].drop(
    ['season', 'season_max', 'draft_year', 'lasso_preds',
     'lin_preds', 'age', 'experience', 'g'],
    axis=1,
)
# Title-case the names so they compare equal to the title-cased input below
main1['player'] = main1['player'].str.title()

print('Possible positions: \n PG \n SG \n SF \n PF \n C')

# Look the requested player up. An unknown name used to surface as a bare
# IndexError from `.index[0]`; raise a message that names the problem instead.
requested = input('Please input player name: \n').strip().title()
matches = main1[main1.player == requested]
if matches.empty:
    raise ValueError('Player {!r} was not found in `main1`'.format(requested))

# First matching row as a Series (its .name is the row label in `main1`)
unavail = matches.iloc[0]

Possible positions: 
 PG 
 SG 
 SF 
 PF 
 C
Please input player name: 
james harden


In [432]:
# Distance from every row of `main1` to the chosen player: sum of absolute
# differences over all columns after `player`.
distances = (main1[main1.columns[1:]] - unavail.iloc[1:]).abs().sum(axis=1)

# Exclude the chosen player by row label rather than assuming he sorts first
# (a zero-distance tie could otherwise drop a different player), then keep
# the ten nearest.
ind = distances.drop(unavail.name, errors='ignore').sort_values().index[:10]

In [433]:
# Show the rows of `main1` selected by `ind`, in `ind` order
main1.loc[ind]

Unnamed: 0,player,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,ws,pos_PF,pos_PG,pos_SF,pos_SG,ridge_preds
15806,Damian Lillard,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,12.1,0,1,0,0,22.789686
15978,Kemba Walker,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,7.4,0,1,0,0,22.024131
15693,Paul George,2841,0.484,0.386,0.839,2159,628,318,170,34,205,214,11.9,0,0,1,0,24.246018
15565,Bradley Beal,3028,0.548,0.351,0.808,2099,411,448,121,58,224,226,7.6,0,0,0,1,21.629501
15662,Kevin Durant,2702,0.587,0.353,0.885,2027,497,457,58,84,225,155,11.5,0,0,1,0,22.029555
15985,Russell Westbrook,2630,0.481,0.29,0.656,1675,807,784,142,33,325,245,6.8,0,1,0,0,23.333615
15708,Blake Griffin,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,8.0,1,0,0,0,17.178479
15936,Ben Simmons,2700,0.566,0.0,0.6,1337,697,610,112,61,274,209,8.2,0,1,0,0,18.092748
15921,D'Angelo Russell,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,5.0,0,1,0,0,16.759463
15648,Demar Derozan,2688,0.492,0.156,0.83,1635,462,475,86,36,199,177,6.3,0,0,0,1,19.016495


### With all stats and ridge regression predicted win-share only

In [451]:
# 2019 season only; `ws` is dropped and `ridge_preds` is kept, so the ridge
# prediction is the only win-share column left in this frame
main2 = df[df.season == 2019].drop(
    ['season', 'season_max', 'draft_year', 'lasso_preds', 'ws',
     'lin_preds', 'age', 'experience', 'g'],
    axis=1,
)
# Title-case the names so they compare equal to the title-cased input below
main2['player'] = main2['player'].str.title()

print('Possible positions: \n PG \n SG \n SF \n PF \n C')

# Look the requested player up. An unknown name used to surface as a bare
# IndexError from `.index[0]`; raise a message that names the problem instead.
requested = input('Please input player name: \n').strip().title()
matches = main2[main2.player == requested]
if matches.empty:
    raise ValueError('Player {!r} was not found in `main2`'.format(requested))

# First matching row as a Series (its .name is the row label in `main2`)
unavail = matches.iloc[0]

Possible positions: 
 PG 
 SG 
 SF 
 PF 
 C
Please input player name: 
james harden


In [494]:
# Distance from every row of `main2` to the chosen player: sum of absolute
# differences over all columns after `player`.
distances = (main2[main2.columns[1:]] - unavail.iloc[1:]).abs().sum(axis=1)

# Exclude the chosen player by row label rather than assuming he sorts first
# (a zero-distance tie could otherwise drop a different player), then keep
# the ten nearest.
ind = distances.drop(unavail.name, errors='ignore').sort_values().index[:10]

# Show the selected rows, nearest first
main2.loc[ind]

Unnamed: 0,player,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,pos_PF,pos_PG,pos_SF,pos_SG,ridge_preds
15806,Damian Lillard,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,0,1,0,0,22.789686
15978,Kemba Walker,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,0,1,0,0,22.024131
15693,Paul George,2841,0.484,0.386,0.839,2159,628,318,170,34,205,214,0,0,1,0,24.246018
15565,Bradley Beal,3028,0.548,0.351,0.808,2099,411,448,121,58,224,226,0,0,0,1,21.629501
15662,Kevin Durant,2702,0.587,0.353,0.885,2027,497,457,58,84,225,155,0,0,1,0,22.029555
15985,Russell Westbrook,2630,0.481,0.29,0.656,1675,807,784,142,33,325,245,0,1,0,0,23.333615
15708,Blake Griffin,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,1,0,0,0,17.178479
15936,Ben Simmons,2700,0.566,0.0,0.6,1337,697,610,112,61,274,209,0,1,0,0,18.092748
15921,D'Angelo Russell,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,0,1,0,0,16.759463
15648,Demar Derozan,2688,0.492,0.156,0.83,1635,462,475,86,36,199,177,0,0,0,1,19.016495


In [435]:
# Show the rows of `main2` selected by `ind`, in `ind` order.
# NOTE(review): this cell's execution count (435) is lower than that of the
# cell that builds `main2` (451), so the table stored under it may come from
# an earlier kernel state — re-run top to bottom to confirm.
main2.loc[ind]

Unnamed: 0,player,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,pos_PF,pos_PG,pos_SF,pos_SG,ridge_preds
15806,Damian Lillard,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,0,1,0,0,22.789686
15978,Kemba Walker,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,0,1,0,0,22.024131
15693,Paul George,2841,0.484,0.386,0.839,2159,628,318,170,34,205,214,0,0,1,0,24.246018
15565,Bradley Beal,3028,0.548,0.351,0.808,2099,411,448,121,58,224,226,0,0,0,1,21.629501
15662,Kevin Durant,2702,0.587,0.353,0.885,2027,497,457,58,84,225,155,0,0,1,0,22.029555
15985,Russell Westbrook,2630,0.481,0.29,0.656,1675,807,784,142,33,325,245,0,1,0,0,23.333615
15708,Blake Griffin,2622,0.525,0.362,0.753,1841,565,402,52,28,253,199,1,0,0,0,17.178479
15936,Ben Simmons,2700,0.566,0.0,0.6,1337,697,610,112,61,274,209,0,1,0,0,18.092748
15921,D'Angelo Russell,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,0,1,0,0,16.759463
15648,Demar Derozan,2688,0.492,0.156,0.83,1635,462,475,86,36,199,177,0,0,0,1,19.016495


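Notice that the recommendation list is identical across all three feature sets above (win-share only, win-share plus ridge-predicted win-share, and ridge-predicted win-share only).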

## Function

In [2]:
# DataFrame
data = pd.read_pickle('final_df.pickle')

# Every row whose (pid, age) pair occurs more than once
dupes = data[data.duplicated(['pid', 'age'], keep=False)].sort_values(
    by=['player', 'g'], ascending=False)

# Replace each duplicated group with its tm == 'TOT' row only
data = data.drop(dupes.index)
data = pd.concat([data, dupes[dupes.tm == 'TOT']])

# Rename the shooting columns so they no longer start with a digit
data = data.rename(columns={'3p': 'fg3',
                            '3pa': 'fg3a',
                            '2p': 'fg2',
                            '2pa': 'fg2a',
                            '3p_pct': 'fg3_pct',
                            '2p_pct': 'fg2_pct'})

# Features of interest: two identifying columns first, then the stats
cols = ['player', 'pos', 'mp', 'fg2_pct', 'fg3_pct', 'ft_pct',
        'pts', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'ws']

# Setting up df to have only recent players.
# A single .loc selection followed by .copy() gives an independent frame, so
# the assignment below does not write to a slice of `data` (which is what
# triggers pandas' SettingWithCopyWarning with chained indexing).
main_df = data.loc[data.season == 2019, cols].copy()

# Title-case the names so they compare equal to title-cased user input
main_df['player'] = main_df['player'].str.title()

In [7]:
# Hard-coded example inputs, normalised the same way user input would be
name = 'james harden'.title()
position = 'pg'.upper()
category = ['trb']

# Row of `data` at the first index label whose `player` equals `name`
playa = data.loc[data[data.player == name].index[0]]



In [11]:
# Preview the first rows only; displaying the whole frame bloats the notebook
data.head(10)

Unnamed: 0,rk,player,pos,age,tm,g,gs,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,fg2,fg2a,fg2_pct,efg_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,tov,pf,pts,pid,per,ts_pct,3par,ftr,orb_pct,drb_pct,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws,ws,ws/48,obpm,dbpm,bpm,vorp,season,year,from,to,pk,experience,ftsy_pts
5991,1,Kareem Abdul-Jabbar,C,32,LAL,82,,3143,835,1383,0.604,0,1,0.000,835,1382,0.604,0.604,364,476,0.765,190,696,886,371,81,280,297,216,2034,abdulka01,25.3,0.639,0.001,0.344,7.2,22.2,15.4,16.5,1.2,4.6,15.7,24.1,9.5,5.3,14.8,0.227,4.3,2.4,6.7,6.8,1980,1969,1970,1989,1,10,4439.7
5992,2,Tom Abernethy,PF,25,GSW,67,,1222,153,318,0.481,0,1,0.000,153,317,0.483,0.481,56,82,0.683,62,129,191,87,35,12,39,118,362,abernto01,11.0,0.511,0.003,0.258,5.4,12.0,8.6,9.3,1.4,0.6,9.9,13.3,1.2,0.8,2.0,0.080,-1.8,0.2,-1.6,0.1,1980,1976,1977,1981,43,3,823.7
5993,3,Alvan Adams,C,25,PHO,75,,2168,465,875,0.531,0,2,0.000,465,873,0.533,0.531,188,236,0.797,158,451,609,322,108,55,218,237,1118,adamsal01,19.2,0.571,0.002,0.270,8.2,22.4,15.4,21.6,2.3,1.4,18.2,21.9,3.1,3.9,7.0,0.155,2.4,2.1,4.4,3.5,1980,1975,1976,1988,4,4,2602.8
5994,4,Tiny Archibald,PG,31,BOS,80,80.0,2864,383,794,0.482,4,18,0.222,379,776,0.488,0.485,361,435,0.830,59,138,197,671,106,10,242,218,1131,architi01,15.3,0.574,0.023,0.548,2.3,5.3,3.8,30.2,1.7,0.2,19.7,17.0,5.9,2.9,8.9,0.148,1.9,-1.9,0.0,1.5,1980,1970,1971,1984,19,9,2479.9
5995,5,Dennis Awtrey,C,31,CHI,26,,560,27,60,0.450,0,0,0.000,27,60,0.450,0.450,32,50,0.640,29,86,115,40,12,15,27,66,86,awtrede01,7.4,0.524,0.000,0.833,6.0,16.9,11.5,9.0,1.0,1.5,24.8,7.9,0.1,0.5,0.6,0.053,-3.3,1.9,-1.4,0.1,1980,1970,1971,1982,46,9,338.0
5996,6,Gus Bailey,SG,28,WSB,20,,180,16,35,0.457,1,1,1.000,15,34,0.441,0.471,5,13,0.385,6,22,28,26,7,4,11,18,38,bailegu01,9.3,0.467,0.029,0.371,3.3,12.4,7.8,17.8,1.8,1.2,21.3,11.3,0.0,0.2,0.2,0.043,-3.8,1.4,-2.4,0.0,1980,1974,1975,1980,23,5,132.6
5997,7,James Bailey,PF,22,SEA,67,,726,122,271,0.450,0,0,0.000,122,271,0.450,0.450,68,101,0.673,71,126,197,28,21,54,79,116,312,baileja01,12.3,0.495,0.000,0.373,10.2,18.3,14.3,5.3,1.4,4.1,20.0,21.4,-0.4,1.4,1.0,0.063,-3.5,0.0,-3.5,-0.3,1980,1979,1980,1988,6,0,736.4
5998,8,Greg Ballard,SF,25,WSB,82,,2438,545,1101,0.495,16,47,0.340,529,1054,0.502,0.502,171,227,0.753,240,398,638,159,90,36,133,197,1277,ballagr01,18.1,0.532,0.043,0.206,9.8,16.6,13.1,9.7,1.7,0.8,10.0,21.6,4.1,2.8,6.9,0.136,1.9,0.6,2.5,2.7,1980,1977,1978,1989,4,2,2526.1
5999,9,Mike Bantom,SF,28,IND,77,,2330,384,760,0.505,1,3,0.333,383,757,0.506,0.506,139,209,0.665,192,264,456,279,85,49,189,268,908,bantomi01,13.7,0.533,0.004,0.275,8.3,12.1,10.1,15.9,1.7,1.1,18.2,17.3,2.1,1.9,3.9,0.081,-0.1,0.5,0.3,1.4,1980,1973,1974,1982,8,6,2086.7
6000,10,Marvin Barnes,PF,27,SDC,20,,287,24,60,0.400,0,0,0.000,24,60,0.400,0.400,16,32,0.500,34,43,77,18,5,12,18,52,64,barnema01,8.2,0.432,0.000,0.533,12.4,16.8,14.5,7.8,0.8,2.3,19.5,12.8,-0.1,0.2,0.1,0.019,-4.3,1.2,-3.1,-0.1,1980,1974,1977,1980,2,3,216.4


In [3]:
# helper function for rec
def check_position(pos):
    """Translate a position choice into a position code.

    Parameters
    ----------
    pos : str or int
        A menu number ('1'..'5', surrounding whitespace and leading zeros
        tolerated) or a position code ('PG', 'SG', 'SF', 'PF', 'C', any case).

    Returns
    -------
    str
        The upper-case position code. If `pos` is not a valid choice the user
        is re-prompted until a valid number or code is entered. An empty
        answer to the re-prompt is returned as '' so that callers which treat
        the position as optional can fall back to "no position".

    Notes
    -----
    Earlier versions raised KeyError for '0', negative numbers, ints and
    zero-padded input, and returned the re-prompt answer without validating
    or translating it.
    """
    pos_dict = {'1': 'PG',
                '2': 'SG',
                '3': 'SF',
                '4': 'PF',
                '5': 'C'}
    valid_codes = set(pos_dict.values())

    choice = str(pos).strip()
    while True:
        # Already a position code: normalise the case and return it
        if choice.upper() in valid_codes:
            return choice.upper()

        # A menu number: str(int(...)) strips leading zeros before the lookup
        if choice.isdecimal() and str(int(choice)) in pos_dict:
            return pos_dict[str(int(choice))]

        # Anything else is invalid: ask again
        choice = input('Please select a position: \n').strip()
        if choice == '':
            return choice

In [9]:
# Recommendation system
def rec():
    """Interactively recommend the ten players most similar to a chosen player.

    Prompts for a player name, re-prompting until it matches an entry in
    ``main_df.player``, and then for an optional position. The position may be
    a code such as ``PG`` or a menu number, which ``check_position``
    translates. Unrecognised positions are re-prompted instead of silently
    producing an empty result.

    Similarity is the sum of absolute differences over every column of
    ``main_df`` after the first two. Missing values are skipped by the sum.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows of ``main_df``, nearest first, restricted to the
        requested position when one was given.
    """
    # Build the lookup once instead of rebuilding a list on every loop pass
    known_players = set(main_df.player)

    # ask for the player to compare against; keep asking until the name is known
    name = input('Please input player name: \n').strip().title()
    while name not in known_players:
        if name == '':
            prompt = 'You did not specify player. Please input player name: \n'
        else:
            prompt = 'Player is not in current database. Please input player name: \n'
        name = input(prompt).strip().title()

    print('Possible positions: \n 1. PG \n 2. SG \n 3. SF \n 4. PF \n 5. C')

    # ask for a position to recommend (blank means "any position")
    position = input('(Optional) \nPlease specify position: \n').strip()
    known_positions = {str(p).upper() for p in main_df.pos.unique()}
    while position:
        if position.isdecimal() and int(position) >= 1:
            # Menu number: hand a normalised value (no leading zeros) to the
            # helper, which translates it or re-prompts when it is above 5.
            # '0' is not a menu entry and falls through to the re-prompt below.
            position = str(check_position(str(int(position)))).strip()
        elif position.upper() in known_positions:
            break
        else:
            position = input('Position not recognised. Please specify position '
                             '(leave blank for all positions): \n').strip()

    # row label of the chosen player
    player_idx = main_df.index[main_df.player == name][0]

    # subtract the chosen player's stats from every player's stats and
    # aggregate the absolute differences into a single 'distance'
    # NOTE(review): columns are not rescaled, so large-magnitude columns
    # (e.g. mp and pts) contribute most of the distance.
    features = main_df.iloc[:, 2:]
    distances = (features - features.loc[player_idx]).abs().sum(axis=1)

    # drop the chosen player by label (not by assuming he sorts first, which
    # breaks on a zero-distance tie) and order the rest nearest first
    indices = distances.drop(player_idx).sort_values().index

    # select the players from the main dataframe
    neighbors = main_df.loc[indices]

    # return the top 10
    if position == '':
        return neighbors.iloc[:10]
    return neighbors[neighbors.pos == position.upper()].iloc[:10]

In [21]:
# Run the interactive recommender; the returned frame is displayed as the cell output
rec()

Please input player name: 
stephen curry
Possible positions: 
 1. PG 
 2. SG 
 3. SF 
 4. PF 
 5. C
(Optional) 
Please specify position: 
1


Unnamed: 0,player,pos,mp,fg2_pct,fg3_pct,ft_pct,pts,trb,ast,stl,blk,tov,pf,ws
25951,Kyrie Irving,PG,2214,0.533,0.401,0.873,1596,335,464,103,34,172,167,9.1
26203,D'Angelo Russell,PG,2448,0.482,0.369,0.78,1712,315,563,100,20,253,141,5.0
25773,Mike Conley,PG,2342,0.483,0.364,0.845,1478,239,449,94,22,130,123,8.0
26119,Jamal Murray,PG,2447,0.476,0.367,0.848,1367,317,363,67,27,158,153,5.1
25691,Eric Bledsoe,PG,2272,0.582,0.329,0.75,1241,362,430,116,29,165,156,8.2
26211,Dennis Schröder,PG,2314,0.45,0.341,0.819,1224,284,323,65,12,172,189,2.9
26031,Damian Lillard,PG,2838,0.499,0.369,0.912,2067,371,551,88,34,212,148,12.1
26295,Kemba Walker,PG,2863,0.494,0.356,0.844,2102,361,484,102,34,211,131,7.4
25961,Reggie Jackson,PG,2289,0.464,0.369,0.864,1260,216,344,55,9,148,208,5.0
26329,Trae Young,PG,2503,0.477,0.324,0.829,1549,301,653,72,15,308,140,3.3
