### Dataset A - Web Scraper client

In [1]:
# UNCOMMENT IF RUNNING FIRST TIME: !pip install basketball_reference_web_scraper

In [2]:
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType

In [3]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [4]:
# Get advanced season statistics for all players, from the 1951-1952 season to the 2018-2019 season (68 seasons).

# UNCOMMENT IF RUNNING FIRST TIME:
# for i in list(range(1952,2020)):
#    i_year_csv = 'season_end_year_' + str(i) + '.csv'
#    client.players_advanced_season_totals(season_end_year=i, output_type= OutputType.CSV, output_file_path= str(i_year_csv))

In [5]:
# Data Cleaning
# Running a loop to:
# 1) Keeping and renaming the essential columns: playerID, name, position, age, PER, minutes played
# 2) Creating a new column for the year (to be used later on to determine the first season played by each player)
# 3) Saving the data for each season into a dataframe, and collating all the dataframes into a dictionary (season_data)
season_data={}
for i in list(range(1952,2020)):

    i_year_csv = 'season_end_year_' + str(i) + '.csv'
    
    df = pd.read_csv(str(i_year_csv))
    df = df[['slug', 'name', 'positions', 'age','player_efficiency_rating', 'minutes_played']]
    df = df.rename(columns={'slug': 'playerID', 'positions': 'position', 'player_efficiency_rating': 'PER', 'minutes_played': 'mp'})
    df['yr'] = int(i)
    
    season_data[i] = df

In [6]:
# Previewing the data
season_data[2004].head()

Unnamed: 0,playerID,name,position,age,PER,mp,yr
0,abdursh01,Shareef Abdur-Rahim,POWER FORWARD,27,21.2,1956,2004
1,abdursh01,Shareef Abdur-Rahim,POWER FORWARD,27,16.5,728,2004
2,allenma01,Malik Allen,POWER FORWARD,25,10.5,616,2004
3,allenra02,Ray Allen,SHOOTING GUARD,28,21.7,2152,2004
4,alstora01,Rafer Alston,POINT GUARD,27,13.7,2581,2004


In [7]:
# Data Cleaning
# Running a loop to ensure there is only one set of player data for each season
# (duplicates were a result of player transferring teams mid-season)
for i in list(range(1952,2020)):
    season_data[i] = season_data[i].groupby('playerID').agg({'name':'first', 'position': 'first', 'age': 'first', 'PER': 'mean', 'mp': 'sum', 'yr': 'first'}).reset_index()

season_data[2004].head()

Unnamed: 0,playerID,name,position,age,PER,mp,yr
0,abdursh01,Shareef Abdur-Rahim,POWER FORWARD,27,18.85,2684,2004
1,allenma01,Malik Allen,POWER FORWARD,25,10.5,616,2004
2,allenra02,Ray Allen,SHOOTING GUARD,28,21.7,2152,2004
3,alstora01,Rafer Alston,POINT GUARD,27,13.7,2581,2004
4,anderch01,Chris Andersen,POWER FORWARD,25,14.4,1029,2004


In [8]:
# Methodology - Identifying key data for each player
# First, creating an empty dataframe for each of the players with data
player_data = {}
for i in list(range(1952,2020)):
    for index, row in season_data[i].iterrows():
        player_data[row['name']] = pd.DataFrame(columns=['playerID', 'name', 'position', 'age', 'PER', 'yr'])

In [9]:
# Methodology
# Second, appending each season's data to the dataframe as a row, based on player name
for i in list(range(1952,2020)):
    for index, row in season_data[i].iterrows():
        player_data[row['name']] = player_data[row['name']].append(row, ignore_index = True)

In [10]:
# Previewing the data (no. of players + sample of 1 player's data)
print(len(player_data))
player_data['Kareem Abdul-Jabbar']

3996


Unnamed: 0,playerID,name,position,age,PER,yr,mp
0,abdulka01,Kareem Abdul-Jabbar,CENTER,22,22.5,1970,3534.0
1,abdulka01,Kareem Abdul-Jabbar,CENTER,23,29.0,1971,3288.0
2,abdulka01,Kareem Abdul-Jabbar,CENTER,24,29.9,1972,3583.0
3,abdulka01,Kareem Abdul-Jabbar,CENTER,25,28.5,1973,3254.0
4,abdulka01,Kareem Abdul-Jabbar,CENTER,26,24.4,1974,3548.0
5,abdulka01,Kareem Abdul-Jabbar,CENTER,27,26.4,1975,2747.0
6,abdulka01,Kareem Abdul-Jabbar,CENTER,28,27.2,1976,3379.0
7,abdulka01,Kareem Abdul-Jabbar,CENTER,29,27.8,1977,3016.0
8,abdulka01,Kareem Abdul-Jabbar,CENTER,30,29.2,1978,2265.0
9,abdulka01,Kareem Abdul-Jabbar,CENTER,31,25.5,1979,3157.0


In [11]:
# Filtering each player's data for the first 3 years
# Finding the first year, average PER and average minutes played in the first 3 years
for key, value in player_data.items():
    player_data[key] = player_data[key].head(3)
    player_data[key] = player_data[key].groupby('playerID').agg({'name':'first', 'position': 'first', 'age': 'first', 'PER': 'mean', 'mp': 'mean', 'yr': 'first'}).reset_index()
    player_data[key] = player_data[key].rename(columns={'age': 'firstage', 'PER': 'avgfirst3PER', 'mp': 'avgfirst3MP', 'yr': 'firstyr'})

In [12]:
player_data['Kareem Abdul-Jabbar']

Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr
0,abdulka01,Kareem Abdul-Jabbar,CENTER,22,27.133333,3468.333333,1970


In [13]:
# Concatenating all the player data into a dataframe
collated_player_data = pd.DataFrame(columns=['playerID', 'name', 'position', 'firstage', 'avgfirst3PER', 'avgfirst3MP', 'firstyr'])
for key,value in player_data.items():
    collated_player_data = pd.concat([collated_player_data,player_data[key]])

In [14]:
# Checking no. of entries: 4011
print(collated_player_data.shape)
collated_player_data.head()

(4011, 7)


Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr
0,arizipa01,Paul Arizin,SMALL FORWARD,23,21.666667,2872.0,1952
0,barkecl01,Cliff Barker,SHOOTING GUARD,31,10.8,494.0,1952
0,barksdo01,Don Barksdale,POWER FORWARD,28,15.566667,1890.0,1952
0,barnhle01,Leo Barnhorst,SMALL FORWARD,27,10.633333,2426.333333,1952
0,behnkel01,Elmer Behnke,CENTER,22,7.8,55.0,1952


### Dataset B - From Part I (peak years)

In [15]:
# Joining Dataset B (Obtained from Part I of the research)
# Contains all the players whose peakage have been identified
peak_year_players = pd.read_csv('Peak Year Players.csv')
peak_year_players = peak_year_players.rename(columns={'age': 'peakage'})
print(peak_year_players.shape)
peak_year_players.head()

(1503, 9)


Unnamed: 0,playerID,name,position,peakage,PER,usage%,gp,yr,draftyr
0,arizipa01,Paul Arizin,SMALL FORWARD,23,25.5,0.0,66,1952,1952
1,cliftna01,Nat Clifton,POWER FORWARD,29,16.4,0.0,62,1952,1952
2,colemja01,Jack Coleman,POWER FORWARD,30,18.2,0.0,72,1955,1952
3,cousybo01,Bob Cousy,POINT GUARD,24,21.7,0.0,71,1953,1952
4,foustla01,Larry Foust,CENTER,27,23.8,0.0,72,1956,1952


In [16]:
# Using a join to add the peakage data (of those that are available) to existing databank of all players
collated_player_data1 = collated_player_data.join(peak_year_players.set_index('playerID'), how = 'left', lsuffix='_caller', rsuffix='_other', on='playerID')

In [17]:
# Cleaning data after the join
collated_player_data1 = collated_player_data1[['playerID', 'name_caller', 'position_caller', 'firstage', 'avgfirst3PER', 'avgfirst3MP', 'firstyr', 'peakage']]

collated_player_data1 = collated_player_data1.reset_index(drop=True)
collated_player_data1 = collated_player_data1.rename(columns = {'name_caller': 'name', 'position_caller': 'position'})

# Previewing data
# Checking to no. of entries: still 4011
print(collated_player_data1.shape)
print(collated_player_data1.head())

# Checking an entry of a player with peakage yet to be identified
print(collated_player_data1.loc[collated_player_data1['name'] == 'Kyle Kuzma'])

(4011, 8)
    playerID           name        position firstage  avgfirst3PER  \
0  arizipa01    Paul Arizin   SMALL FORWARD       23     21.666667   
1  barkecl01   Cliff Barker  SHOOTING GUARD       31     10.800000   
2  barksdo01  Don Barksdale   POWER FORWARD       28     15.566667   
3  barnhle01  Leo Barnhorst   SMALL FORWARD       27     10.633333   
4  behnkel01   Elmer Behnke          CENTER       22      7.800000   

   avgfirst3MP firstyr  peakage  
0  2872.000000    1952     23.0  
1   494.000000    1952      NaN  
2  1890.000000    1952      NaN  
3  2426.333333    1952      NaN  
4    55.000000    1952      NaN  
       playerID        name       position firstage  avgfirst3PER  \
3848  kuzmaky01  Kyle Kuzma  POWER FORWARD       22          14.1   

      avgfirst3MP firstyr  peakage  
3848       2357.5    2018      NaN  


### Dataset C - Player Details I

In [18]:
# Dataset C contains height weight and college of players from 1950 to 2017
datasetc = pd.read_csv('Dataset C - since 1950.csv')
datasetc.head()

Unnamed: 0.1,Unnamed: 0,Player,height,weight,collage,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky


In [19]:
# Cleaning Dataset C
datasetc = datasetc[['Player', 'height', 'weight', 'collage']]
datasetc = datasetc.rename(columns = {'Player': 'name', 'height': 'height1', 'weight': 'weight1', 'collage': 'college1'})
datasetc.head()

Unnamed: 0,name,height1,weight1,college1
0,Curly Armstrong,180.0,77.0,Indiana University
1,Cliff Barker,188.0,83.0,University of Kentucky
2,Leo Barnhorst,193.0,86.0,University of Notre Dame
3,Ed Bartels,196.0,88.0,North Carolina State University
4,Ralph Beard,178.0,79.0,University of Kentucky


In [20]:
# Joining the relevant data from Dataset C to our collated player data
collated_player_data2 = collated_player_data1.join(datasetc.set_index('name'), on='name')

In [21]:
# Check no. of entries: still 4011
# Previewing data
print(collated_player_data2.shape)
collated_player_data2.head()

(4011, 11)


Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height1,weight1,college1
0,arizipa01,Paul Arizin,SMALL FORWARD,23,21.666667,2872.0,1952,23.0,,,
1,barkecl01,Cliff Barker,SHOOTING GUARD,31,10.8,494.0,1952,,188.0,83.0,University of Kentucky
2,barksdo01,Don Barksdale,POWER FORWARD,28,15.566667,1890.0,1952,,,,
3,barnhle01,Leo Barnhorst,SMALL FORWARD,27,10.633333,2426.333333,1952,,193.0,86.0,University of Notre Dame
4,behnkel01,Elmer Behnke,CENTER,22,7.8,55.0,1952,,201.0,95.0,Bradley University


### Dataset D - Player Details II

In [22]:
# Dataset D contains height weight and college of players from 1996 to 2019
datasetd = pd.read_csv('Dataset D - 1996-2019.csv')
datasetd.head()

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Dennis Rodman,CHI,36.0,198.12,99.79024,Southeastern Oklahoma State,USA,1986,2,...,5.7,16.1,3.1,16.1,0.186,0.323,0.1,0.479,0.113,1996-97
1,1,Dwayne Schintzius,LAC,28.0,215.9,117.93392,Florida,USA,1990,1,...,2.3,1.5,0.3,12.3,0.078,0.151,0.175,0.43,0.048,1996-97
2,2,Earl Cureton,TOR,39.0,205.74,95.25432,Detroit Mercy,USA,1979,3,...,0.8,1.0,0.4,-2.1,0.105,0.102,0.103,0.376,0.148,1996-97
3,3,Ed O'Bannon,DAL,24.0,203.2,100.697424,UCLA,USA,1995,1,...,3.7,2.3,0.6,-8.7,0.06,0.149,0.167,0.399,0.077,1996-97
4,4,Ed Pinckney,MIA,34.0,205.74,108.86208,Villanova,USA,1985,1,...,2.4,2.4,0.2,-11.2,0.109,0.179,0.127,0.611,0.04,1996-97


In [23]:
# Cleaning Dataset D
# Previewing data
datasetd = datasetd[['player_name', 'player_height', 'player_weight', 'college']]
datasetd = datasetd.rename(columns = {'player_name': 'name', 'player_height': 'height2', 'player_weight': 'weight2', 'college': 'college2'})
datasetd = datasetd.groupby('name').agg({'height2': 'first', 'weight2': 'first', 'college2': 'first'}).reset_index()
print(datasetd.shape)
datasetd.head()

(2235, 4)


Unnamed: 0,name,height2,weight2,college2
0,A.C. Green,205.74,102.0582,Oregon State
1,A.J. Bramlett,208.28,102.965384,Arizona
2,A.J. Guyton,185.42,81.64656,Indiana
3,AJ Hammons,213.36,117.93392,Purdue
4,AJ Price,187.96,82.100152,Connecticut


In [24]:
# Using a join to extract relevant data from Dataset D to collated player data
collated_player_data3 = collated_player_data2.join(datasetd.set_index('name'), on='name')

In [25]:
# Check no. of entires: still 4011
# Previewing data
print(collated_player_data3.shape)
collated_player_data3.tail()

(4011, 14)


Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height1,weight1,college1,height2,weight2,college2
4006,welshth01,Thomas Welsh,CENTER,22,16.1,36.0,2019,,,,,213.36,115.66596,UCLA
4007,willijo04,Johnathan Williams,CENTER,23,15.1,372.0,2019,,,,,205.74,103.418976,Gonzaga
4008,willike04,Kenrich Williams,SMALL FORWARD,24,9.7,1079.0,2019,,,,,200.66,95.25432,
4009,williro04,Robert Williams,CENTER,21,18.8,283.0,2019,,,,,,,
4010,youngtr01,Trae Young,POINT GUARD,20,17.0,2503.0,2019,,,,,187.96,81.64656,Oklahoma


In [26]:
# Cleaning information obtained from both Dataset C and Dataset D:
# If there was data from both C and D, finding the average of those values
# Ensuring that each player has a maximum of one value for height, weight and college
collated_player_data3['height'] = collated_player_data3[['height1', 'height2']].mean(axis = 1)
collated_player_data3['weight'] = collated_player_data3[['weight1', 'weight2']].mean(axis = 1)
collated_player_data3['college'] = np.nan

collated_player_data3['college'] = collated_player_data3['college'].fillna(collated_player_data3['college1'])
collated_player_data3['college'] = collated_player_data3['college'].fillna(collated_player_data3['college2'])

collated_player_data3 = collated_player_data3.drop(columns = ['height1', 'weight1', 'college1', 'height2', 'weight2', 'college2'])

In [27]:
# Check no. of entries: still 4011
# Previewing data
print(collated_player_data3.shape)
collated_player_data3.tail()

(4011, 11)


Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height,weight,college
4006,welshth01,Thomas Welsh,CENTER,22,16.1,36.0,2019,,213.36,115.66596,UCLA
4007,willijo04,Johnathan Williams,CENTER,23,15.1,372.0,2019,,205.74,103.418976,Gonzaga
4008,willike04,Kenrich Williams,SMALL FORWARD,24,9.7,1079.0,2019,,200.66,95.25432,
4009,williro04,Robert Williams,CENTER,21,18.8,283.0,2019,,,,
4010,youngtr01,Trae Young,POINT GUARD,20,17.0,2503.0,2019,,187.96,81.64656,Oklahoma


In [28]:
collated_player_data3.loc[collated_player_data3['name'] == 'Kyle Kuzma']

Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height,weight,college
3848,kuzmaky01,Kyle Kuzma,POWER FORWARD,22,14.1,2357.5,2018,,205.74,99.79024,Utah


In [29]:
# Splitting players into 2 groups:
# Dataset will be the players to be used in train/test split for machine learning. 
# These players already have their peakages identified from Part I.
dataset = collated_player_data3.loc[collated_player_data3['firstyr']<=2016]
print(dataset.shape)
dataset.to_csv('dataset.csv', index = False)

# Prediction set will be the players without a peak age yet, whose peakage we will try to predict
predictionset = collated_player_data3.loc[collated_player_data3['firstyr']>=2017]
print(predictionset.shape)
predictionset.to_csv('predictionset.csv', index = False)

# Checking the size of each set separately

(3700, 11)
(311, 11)


# Machine Learning

## Preparing Train/Test Set

In [30]:
dataset = pd.read_csv('dataset.csv')
dataset.head()

Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height,weight,college
0,arizipa01,Paul Arizin,SMALL FORWARD,23,21.666667,2872.0,1952,23.0,,,
1,barkecl01,Cliff Barker,SHOOTING GUARD,31,10.8,494.0,1952,,188.0,83.0,University of Kentucky
2,barksdo01,Don Barksdale,POWER FORWARD,28,15.566667,1890.0,1952,,,,
3,barnhle01,Leo Barnhorst,SMALL FORWARD,27,10.633333,2426.333333,1952,,193.0,86.0,University of Notre Dame
4,behnkel01,Elmer Behnke,CENTER,22,7.8,55.0,1952,,201.0,95.0,Bradley University


In [31]:
# Giving the players with no height and weight data the NBA averages of 200 cm and 98 kg. 
# Removing players with no peakage identified in Part I,
# because they did not meet the criteria of enough games/seasons to have their peak age calculated.
values = {'height': 200, 'weight': 98}
dataset = dataset.fillna(value = values)
dataset = dataset[dataset['peakage'].notna()]

# Previewing the dataset, numbers have been reduced
print(dataset.shape)
dataset.head()

(1496, 11)


Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height,weight,college
0,arizipa01,Paul Arizin,SMALL FORWARD,23,21.666667,2872.0,1952,23.0,200.0,98.0,
18,cliftna01,Nat Clifton,POWER FORWARD,29,15.533333,2258.666667,1952,29.0,200.0,98.0,
20,colemja01,Jack Coleman,POWER FORWARD,27,15.833333,2536.0,1952,30.0,201.0,88.0,University of Louisville
22,cousybo01,Bob Cousy,POINT GUARD,23,21.366667,2827.666667,1952,24.0,200.0,98.0,
31,foustla01,Larry Foust,CENTER,23,21.5,2537.0,1952,27.0,206.0,97.0,La Salle University


In [32]:
# Splitting data into the indepedent and dependent variables
X = dataset.drop(columns = ['playerID','name','peakage','position','college'])
Y = dataset['peakage'].astype(int)

In [33]:
# Dropping the columns produced by data collation error in Dataset D

# X = pd.concat([X,pd.get_dummies(dataset['college'])], axis=1)
X = pd.concat([X,pd.get_dummies(dataset['position'])], axis=1)
X = X.drop(columns = ['CENTER-FORWARD','FORWARD-CENTER','GUARD','GUARD-FORWARD'])
X.head()

Unnamed: 0,firstage,avgfirst3PER,avgfirst3MP,firstyr,height,weight,CENTER,POINT GUARD,POWER FORWARD,SHOOTING GUARD,SMALL FORWARD
0,23,21.666667,2872.0,1952,200.0,98.0,0,0,0,0,1
18,29,15.533333,2258.666667,1952,200.0,98.0,0,0,1,0,0
20,27,15.833333,2536.0,1952,201.0,88.0,0,0,1,0,0
22,23,21.366667,2827.666667,1952,200.0,98.0,0,1,0,0,0
31,23,21.5,2537.0,1952,206.0,97.0,1,0,0,0,0


In [34]:
from sklearn import preprocessing
from sklearn import metrics

In [35]:
# Fitting and transforming data for modeling using sklearn tools
X = preprocessing.StandardScaler().fit(X).transform(X)

In [36]:
# Train test split. Train to test ratio will be 4:1
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=4)

# Previewing the numbers
print('Train set:', X_train.shape, Y_train.shape)
print('Test set:', X_test.shape, Y_test.shape)

Train set: (1196, 11) (1196,)
Test set: (300, 11) (300,)


## Applying different models and assessing their performance

In [37]:
# K Nearest Neighbours
from sklearn.neighbors import KNeighborsClassifier

In [38]:
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

for n in range(1,Ks):
    
    # Train Model and Predict
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,Y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(Y_test, yhat)
    
    std_acc[n-1]=np.std(yhat==Y_test)/np.sqrt(yhat.shape[0])

mean_acc
print( "The best accuracy was with", mean_acc.max(), "with k=", mean_acc.argmax()+1) 

The best accuracy was with 0.15333333333333332 with k= 7


In [39]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [40]:
playertree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
playertree

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [41]:
playertree.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [42]:
# Support Vector Machine
from sklearn import svm

In [43]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [44]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

In [45]:
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,Y_train)
LR

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
# Multilinear Regression
from sklearn import linear_model

In [47]:
regr = linear_model.LinearRegression()
regr.fit (X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [48]:
# Prediction for KNN
neigh = KNeighborsClassifier(n_neighbors = 7).fit(X_train,Y_train)
yhatfinal = neigh.predict(X_test)

print("KNN Accuracy Score: ", metrics.accuracy_score(Y_test, yhatfinal))
print("KNN Precision Score: ", metrics.precision_score(Y_test, yhatfinal, average= 'weighted'))
print("KNN Jaccard Similarity Score: ", metrics.jaccard_similarity_score(Y_test, yhatfinal))

KNN Accuracy Score:  0.15333333333333332
KNN Precision Score:  0.14611827964013635
KNN Jaccard Similarity Score:  0.15333333333333332


In [49]:
# Prediction for Decision Tree
predtree = playertree.predict(X_test)

print("Decision Tree Accuracy Score: ", metrics.accuracy_score(Y_test, predtree))
print("Decision Tree Precision Score: ", metrics.precision_score(Y_test, predtree, average= 'weighted'))
print("Decision Tree Jaccard Similarity Score: ", metrics.jaccard_similarity_score(Y_test, predtree))

Decision Tree Accuracy Score:  0.16333333333333333
Decision Tree Precision Score:  0.14911771305829552
Decision Tree Jaccard Similarity Score:  0.16333333333333333


In [50]:
# Prediction for SVM
yhatfinal2 = clf.predict(X_test)

print("SVM Accuracy Score: ", metrics.accuracy_score(Y_test, yhatfinal2))
print("SVM Precision Score: ", metrics.precision_score(Y_test, yhatfinal2, average= 'weighted'))
print("SVM Jaccard Similarity Score: ", metrics.jaccard_similarity_score(Y_test, yhatfinal2))

SVM Accuracy Score:  0.16666666666666666
SVM Precision Score:  0.10620353034540679
SVM Jaccard Similarity Score:  0.16666666666666666


In [51]:
# Prediction for LogRegression
yhatfinal3 = LR.predict(X_test)
yhatfinal3_prob = LR.predict_proba(X_test)

print("LR Accuracy Score: ", metrics.accuracy_score(Y_test, yhatfinal3))
print("LR Precision Score: ", metrics.precision_score(Y_test, yhatfinal3, average= 'weighted'))
print("LR Jaccard Similarity Score: ", metrics.jaccard_similarity_score(Y_test, yhatfinal3))

from sklearn.metrics import log_loss
print("LR LogLoss Score: ", metrics.log_loss(Y_test, yhatfinal3_prob))

LR Accuracy Score:  0.14666666666666667
LR Precision Score:  0.09359044943820224
LR Jaccard Similarity Score:  0.14666666666666667
LR LogLoss Score:  2.629974416833511


In [52]:
# Prediction for Multilinear Regression
yhatfinal4= regr.predict(X_test)

print("MLR Rsquared Score: ", metrics.r2_score(Y_test, yhatfinal4))
#print("MLR Accuracy Score: ", metrics.accuracy_score(Y_test, yhatfinal4))
#print("MLR Precision Score: ", metrics.precision_score(Y_test, yhatfinal4, average= 'weighted'))
#print("MLR Jaccard Similarity Score: ", metrics.jaccard_similarity_score(Y_test, yhatfinal4))
#print("MLR f1 Score: ", metrics.f1_score(Y_test, yhatfinal4,  pos_label = 1, average= 'weighted'))

MLR Rsquared Score:  0.15579688280237203


## Proceeding to use choice model for prediction

In [53]:
# Preparing prediction set - similar steps to the preparation of dataset earlier
predictionset.head()

Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height,weight,college
3700,abrinal01,Álex Abrines,SHOOTING GUARD,23,8.466667,925.666667,2017,,,,
3701,bakerro01,Ron Baker,SHOOTING GUARD,23,6.0,464.666667,2017,,193.02,99.39512,Wichita State University
3702,baldwwa01,Wade Baldwin,POINT GUARD,20,8.133333,193.0,2017,,193.0,91.0,Vanderbilt University
3703,beaslma01,Malik Beasley,SHOOTING GUARD,20,12.3,875.666667,2017,,195.79,88.452016,Florida State University
3704,bembrde01,DeAndre' Bembry,SMALL FORWARD,22,9.066667,919.0,2017,,198.06,95.12716,Saint Joseph's University


In [54]:
values = {'height': 200, 'weight': 98}
predictionset = predictionset.fillna(value = values)

print(predictionset.shape)
predictionset.head()

(311, 11)


Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height,weight,college
3700,abrinal01,Álex Abrines,SHOOTING GUARD,23,8.466667,925.666667,2017,,200.0,98.0,
3701,bakerro01,Ron Baker,SHOOTING GUARD,23,6.0,464.666667,2017,,193.02,99.39512,Wichita State University
3702,baldwwa01,Wade Baldwin,POINT GUARD,20,8.133333,193.0,2017,,193.0,91.0,Vanderbilt University
3703,beaslma01,Malik Beasley,SHOOTING GUARD,20,12.3,875.666667,2017,,195.79,88.452016,Florida State University
3704,bembrde01,DeAndre' Bembry,SMALL FORWARD,22,9.066667,919.0,2017,,198.06,95.12716,Saint Joseph's University


In [55]:
Xfinal = predictionset.drop(columns = ['playerID','name','peakage','position','college'])

In [56]:
# Xfinal = pd.concat([Xfinal,pd.get_dummies(predictionset['college'])], axis=1)
Xfinal = pd.concat([Xfinal,pd.get_dummies(predictionset['position'])], axis=1)
Xfinal.head()

Unnamed: 0,firstage,avgfirst3PER,avgfirst3MP,firstyr,height,weight,CENTER,POINT GUARD,POWER FORWARD,SHOOTING GUARD,SMALL FORWARD
3700,23,8.466667,925.666667,2017,200.0,98.0,0,0,0,1,0
3701,23,6.0,464.666667,2017,193.02,99.39512,0,0,0,1,0
3702,20,8.133333,193.0,2017,193.0,91.0,0,1,0,0,0
3703,20,12.3,875.666667,2017,195.79,88.452016,0,0,0,1,0
3704,22,9.066667,919.0,2017,198.06,95.12716,0,0,0,0,1


In [57]:
Xfinal = preprocessing.StandardScaler().fit(Xfinal).transform(Xfinal)

In [58]:
# Prediction for KNN
#neigh = KNeighborsClassifier(n_neighbors = 7).fit(X_train,Y_train)

predictedlist = neigh.predict(Xfinal)

#Previewing the predicted list of peak ages, arranged in order of index of players in the set
predictedlist

array([26, 26, 26, 24, 23, 24, 29, 23, 27, 31, 23, 26, 24, 23, 25, 24, 25,
       24, 23, 28, 25, 23, 25, 23, 24, 27, 25, 26, 28, 26, 26, 23, 24, 24,
       31, 26, 22, 25, 25, 23, 24, 22, 26, 24, 25, 23, 25, 24, 23, 23, 26,
       26, 24, 26, 24, 23, 28, 24, 24, 24, 24, 25, 25, 25, 27, 23, 23, 25,
       26, 25, 23, 26, 24, 23, 24, 24, 22, 24, 24, 26, 26, 25, 24, 24, 28,
       24, 23, 26, 26, 26, 25, 25, 28, 25, 26, 26, 25, 24, 24, 26, 28, 23,
       29, 24, 28, 24, 26, 25, 26, 23, 28, 21, 26, 26, 27, 25, 26, 24, 28,
       23, 28, 26, 24, 25, 22, 30, 28, 25, 28, 24, 26, 27, 28, 29, 23, 28,
       26, 26, 30, 24, 27, 24, 24, 27, 28, 24, 23, 28, 23, 24, 26, 26, 23,
       25, 26, 28, 25, 26, 25, 27, 27, 24, 24, 26, 23, 23, 26, 23, 27, 25,
       25, 27, 23, 25, 22, 26, 30, 24, 29, 26, 25, 23, 26, 26, 24, 24, 25,
       25, 25, 23, 23, 25, 24, 26, 22, 22, 23, 26, 27, 25, 29, 26, 26, 24,
       26, 23, 24, 26, 24, 28, 24, 28, 24, 26, 25, 26, 24, 26, 24, 25, 21,
       28, 25, 21, 30, 24

In [59]:
# Filling the predicted peak ages into the relevant column in our dataset

prediction_set = pd.read_csv('predictionset.csv')
prediction_set.head()
prediction_set['peakage'] = pd.Series(predictedlist, index=prediction_set.index)
#predictedlist = prediction_set['peakage']
prediction_set

Unnamed: 0,playerID,name,position,firstage,avgfirst3PER,avgfirst3MP,firstyr,peakage,height,weight,college
0,abrinal01,Álex Abrines,SHOOTING GUARD,23,8.466667,925.666667,2017,26,,,
1,bakerro01,Ron Baker,SHOOTING GUARD,23,6.000000,464.666667,2017,26,193.02,99.395120,Wichita State University
2,baldwwa01,Wade Baldwin,POINT GUARD,20,8.133333,193.000000,2017,26,193.00,91.000000,Vanderbilt University
3,beaslma01,Malik Beasley,SHOOTING GUARD,20,12.300000,875.666667,2017,24,195.79,88.452016,Florida State University
4,bembrde01,DeAndre' Bembry,SMALL FORWARD,22,9.066667,919.000000,2017,23,198.06,95.127160,Saint Joseph's University
...,...,...,...,...,...,...,...,...,...,...,...
306,welshth01,Thomas Welsh,CENTER,22,16.100000,36.000000,2019,25,213.36,115.665960,UCLA
307,willijo04,Johnathan Williams,CENTER,23,15.100000,372.000000,2019,26,205.74,103.418976,Gonzaga
308,willike04,Kenrich Williams,SMALL FORWARD,24,9.700000,1079.000000,2019,26,200.66,95.254320,
309,williro04,Robert Williams,CENTER,21,18.800000,283.000000,2019,26,,,


In [60]:
# Saving the predicted data to csv
prediction_set.to_csv('predictedset.csv')