In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, f_regression, f_classif
from sklearn.preprocessing import RobustScaler, OneHotEncoder
import csv
import warnings
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.cluster import hierarchy as hc
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.display.max_rows = 9999

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/nba-career/Data/master/player_data.csv')
df.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University


In [3]:
df.dtypes

name           object
year_start      int64
year_end        int64
position       object
height         object
weight        float64
birth_date     object
college        object
dtype: object

In [4]:
df.isna().sum()

name            0
year_start      0
year_end        0
position        1
height          1
weight          6
birth_date     31
college       302
dtype: int64

In [5]:
df['college'] = df['college'].fillna('none')
df = df.dropna()
df.isna().sum()

name          0
year_start    0
year_end      0
position      0
height        0
weight        0
birth_date    0
college       0
dtype: int64

In [6]:
#convert height to float

new = df["height"].str.split("-", n = 1, expand = True).astype(float)
df['height'] = new[0] + (new[1]*(12/100))

#convert year_end into longevity

df['longevity'] = df['year_end'] - df['year_start'] 

#convert birth_date to start_age

df['start_age'] = df['year_start'] - df['birth_date'].str[-4:].astype(int) 

# convert position to float

position_dict = {'F-C' : 1,
                 'C-F' : 2,
                 'G-F' : 3,
                 'F-G' : 4,
                 'G' : 5,
                 'F' : 6,
                 'C' : 7}

df = df.replace(position_dict)
df.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,longevity,start_age
0,Alaa Abdelnaby,1991,1995,1,7.2,240.0,"June 24, 1968",Duke University,4,23
1,Zaid Abdul-Aziz,1969,1978,2,7.08,235.0,"April 7, 1946",Iowa State University,9,23
2,Kareem Abdul-Jabbar,1970,1989,7,7.24,225.0,"April 16, 1947","University of California, Los Angeles",19,23
3,Mahmoud Abdul-Rauf,1991,2001,5,6.12,162.0,"March 9, 1969",Louisiana State University,10,22
4,Tariq Abdul-Wahad,1998,2003,6,6.72,223.0,"November 3, 1974",San Jose State University,5,24


In [7]:
# Clean up unused columns
rl_df = df[['name', 'year_start', 'start_age', 'position', 'height', 'weight', 'longevity']]
rl_df.head()

Unnamed: 0,name,year_start,start_age,position,height,weight,longevity
0,Alaa Abdelnaby,1991,23,1,7.2,240.0,4
1,Zaid Abdul-Aziz,1969,23,2,7.08,235.0,9
2,Kareem Abdul-Jabbar,1970,23,7,7.24,225.0,19
3,Mahmoud Abdul-Rauf,1991,22,5,6.12,162.0,10
4,Tariq Abdul-Wahad,1998,24,6,6.72,223.0,5


In [8]:
# divide into feature and lable set
df_X = rl_df.drop(columns = ['longevity', 'name'])
df_y = rl_df['longevity']
df_X

Unnamed: 0,year_start,start_age,position,height,weight
0,1991,23,1,7.2,240.0
1,1969,23,2,7.08,235.0
2,1970,23,7,7.24,225.0
3,1991,22,5,6.12,162.0
4,1998,24,6,6.72,223.0
5,1997,21,6,7.08,225.0
6,1977,23,6,6.84,220.0
7,1957,25,5,6.36,180.0
8,1947,28,6,6.36,195.0
9,2017,24,3,6.72,190.0


In [9]:
# split
X_train, X_test, y_train, y_test = train_test_split(df_X,df_y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3385, 5) (1129, 5) (3385,) (1129,)


In [10]:
# Baseline using mode
majority_class = df_y.mode()
y_act = df_y
y_pred = np.full(shape = y_act.shape, fill_value = majority_class)
y_act.shape, y_pred.shape


((4514,), (4514,))

In [11]:
accuracy_score(y_act, y_pred)

0.28444838280903856

In [12]:
df_X_gd = pd.get_dummies(data=df)
print(df_X_gd.shape)



(4514, 9100)


In [13]:
log_reg = LogisticRegression().fit(X_train, y_train.values.ravel())
log_reg.score(X_train, y_train)

0.28862629246676513

In [14]:
m = RandomForestClassifier(n_estimators=100,min_samples_leaf=3 ,n_jobs=-1,max_features=0.25)
%time m.fit(X_train, y_train.values.ravel())
y_pred= m.predict(X_test)
accuracy_score(y_test, y_pred)

CPU times: user 371 ms, sys: 71.7 ms, total: 443 ms
Wall time: 302 ms


0.2887511071744907