# Import and Clean Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('cleaned_data.csv')

In [None]:
df.shape

In [None]:
#df.head()

In [None]:
#df.tail()

In [None]:
#df['Player'].value_counts()

In [None]:
#df.loc[df['Player'] == 'Lou Amundson']

In [None]:
df.keys()

In [None]:
#df[df['WS/48'].isnull()]

In [None]:
#df[df['PER'].isnull()]

In [None]:
#df[df['YrsExperience'].isnull()]

In [None]:
#df[df['FG%'].isnull()]

In [None]:
#df = df.drop(df.index[[842, 1812, 3700, 5472, 5735]])
df = df[np.isfinite(df['WS/48'])]
df = df[np.isfinite(df['FG%'])]
df = df[np.isfinite(df['YrsExperience'])]
df.shape

# Linear Regression Prediction of Player Efficiency Rating

In [None]:
X = df[['WS/48', 'BPM', 'USG%']]
y = df['PER'].values.reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

In [None]:
predictions = model.predict(X_test_scaled)
model.fit(X_train_scaled, y_train_scaled)
plt.scatter(model.predict(X_train_scaled), model.predict(X_train_scaled) - y_train_scaled, c="blue", label="Training Data")
plt.scatter(model.predict(X_test_scaled), model.predict(X_test_scaled) - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("Residual Plot")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# LASSO model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import Lasso

### BEGIN SOLUTION
lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)

predictions = lasso.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = lasso.score(X_test_scaled, y_test_scaled)
### END SOLUTION

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# Ridge model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import Ridge

### BEGIN SOLUTION
ridge = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)

predictions = ridge.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = ridge.score(X_test_scaled, y_test_scaled)
### END SOLUTION

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# ElasticNet model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import ElasticNet

### BEGIN SOLUTION
elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)

predictions = elasticnet.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = elasticnet.score(X_test_scaled, y_test_scaled)
### END SOLUTION

print(f"MSE: {MSE}, R2: {r2}")

# Linear Regression Prediction of Minutes Played

In [None]:
X = df[['GS', 'G', 'PTS']]
y = df['MP'].values.reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

In [None]:
predictions = model.predict(X_test_scaled)
model.fit(X_train_scaled, y_train_scaled)
plt.scatter(model.predict(X_train_scaled), model.predict(X_train_scaled) - y_train_scaled, c="blue", label="Training Data")
plt.scatter(model.predict(X_test_scaled), model.predict(X_test_scaled) - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("Residual Plot")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

# Logistic Regression of Player Making it Past Rookie Season

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Dummy Regression

In [None]:
#dummy_df = pd.get_dummies(df)
#dummy_df.head()

# K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
df['Pos'].value_counts()

In [None]:
df.shape

In [None]:
df = df.reset_index(drop=True)

In [None]:
df.index[df['Pos'] == 'C-SF'].tolist()

In [None]:
df = df.drop(df.index[[df.index[df['Pos'] == 'C-SF'].tolist()]])

In [None]:
df.shape

In [None]:
df = df.reset_index(drop=True)

In [None]:
df.index[df['Pos'] == 'C-SF'].tolist()

In [None]:
df['Pos'].value_counts()

In [None]:
df = df.drop("blanl", axis=1)
df = df.drop("blank2", axis=1)
#df = df.drop(df['Pos'] == 'C-SF')

In [None]:
df = df.dropna(axis=0, how='any')

In [None]:
data = df.drop("Pos", axis=1)
data = data.drop("Tm", axis=1)
data = data.drop("Player", axis=1)
data = data.drop("TrueSalary", axis=1)
data = data.drop("RoundedPosition", axis=1)
data = data.drop("Year", axis=1)
data = data.drop("Age", axis=1)
feature_names = data.columns
target = df['Pos']
target_names = target.value_counts().keys().tolist()
data.head()

In [None]:
data.shape

In [None]:
target.shape

In [None]:
data.keys()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
train_scores = []
test_scores = []
for k in range(1, 100, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
plt.plot(range(1, 100, 2), train_scores, marker='o')
plt.plot(range(1, 100, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

In [None]:
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)
print('k=15 Test Acc: %.3f' % knn.score(X_test, y_test))

# Stratified KNN

In [None]:
#target.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

In [None]:
#X_train.values.reshape(-1,1)

In [None]:
X_scaler = StandardScaler().fit(X_train.values.reshape(-1, 1))

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
train_scores = []
test_scores = []
for k in range(1, 100, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
plt.plot(range(1, 100, 2), train_scores, marker='o')
plt.plot(range(1, 100, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

In [None]:
knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(X_train_scaled, y_train)
print('k=23 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(data, target)
rf.score(data, target)

In [None]:
importances = rf.feature_importances_
importances

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Decision Tree and Random Forests

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Decision Tree and Random Forests: Rookie to Final Year

In [None]:
df = pd.read_csv('cleaned_data.csv')

In [None]:
df.shape

In [None]:
df = df[np.isfinite(df['WS/48'])]
df = df[np.isfinite(df['FG%'])]
df = df[np.isfinite(df['YrsExperience'])]
df.shape

In [None]:
df = df.reset_index(drop=True)

In [None]:
df = df.drop("blanl", axis=1)
df = df.drop("blank2", axis=1)
df = df.dropna(axis=0, how='any')

In [None]:
df.shape

# Player Classification

# Team Classification

In [None]:
df = pd.read_csv('cleaned_data.csv')
df.shape

In [None]:
df = df[np.isfinite(df['WS/48'])]
df = df[np.isfinite(df['FG%'])]
df = df[np.isfinite(df['YrsExperience'])]
df.shape

In [None]:
df = df.reset_index(drop=True)

In [None]:
df = df.drop("blanl", axis=1)
df = df.drop("blank2", axis=1)
df = df.dropna(axis=0, how='any')

In [None]:
df.shape

In [None]:
data = df.drop("Pos", axis=1)
data = data.drop("Tm", axis=1)
data = data.drop("Player", axis=1)
data = data.drop("TrueSalary", axis=1)
#data = data.drop("RoundedPosition", axis=1)
#data = data.drop("Year", axis=1)
data = data.drop("Age", axis=1)
feature_names = data.columns
target = df['Tm']
target_names = target.value_counts().keys().tolist()
data.head()

In [None]:
data.shape

In [None]:
target.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

In [None]:
X_scaler = StandardScaler().fit(X_train.values.reshape(-1, 1))

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
train_scores = []
test_scores = []
for k in range(1, 100, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
plt.plot(range(1, 100, 2), train_scores, marker='o')
plt.plot(range(1, 100, 2), test_scores, marker="x")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.show()

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Combining With DPOY Data

In [2]:
season_stats = pd.read_csv('cleaned_data.csv')

In [3]:
season_stats.columns

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'TrueSalary', 'YrsExperience', 'Height',
       'Weight', 'RoundedPosition'],
      dtype='object')

In [4]:
dpoy_2016 = season_stats.loc[(season_stats['Year'] == 2016.0) & (season_stats['Player'] == 'Kawhi Leonard')]
dpoy_2015 = season_stats.loc[(season_stats['Year'] == 2015.0) & (season_stats['Player'] == 'Kawhi Leonard')]
dpoy_2014 = season_stats.loc[(season_stats['Year'] == 2014.0) & (season_stats['Player'] == 'Joakim Noah')]
dpoy_2013 = season_stats.loc[(season_stats['Year'] == 2013.0) & (season_stats['Player'] == 'Marc Gasol')]
dpoy_2012 = season_stats.loc[(season_stats['Year'] == 2012.0) & (season_stats['Player'] == 'Tyson Chandler')]
dpoy_2011 = season_stats.loc[(season_stats['Year'] == 2011.0) & (season_stats['Player'] == 'Dwight Howard')]
dpoy_2010 = season_stats.loc[(season_stats['Year'] == 2010.0) & (season_stats['Player'] == 'Dwight Howard')]
dpoy_2009 = season_stats.loc[(season_stats['Year'] == 2009.0) & (season_stats['Player'] == 'Dwight Howard')]
dpoy_2008 = season_stats.loc[(season_stats['Year'] == 2008.0) & (season_stats['Player'] == 'Kevin Garnett')]
dpoy_2007 = season_stats.loc[(season_stats['Year'] == 2007.0) & (season_stats['Player'] == 'Marcus Camby')]

dpoys = [dpoy_2007, dpoy_2008, dpoy_2009, dpoy_2010, dpoy_2011, dpoy_2012, dpoy_2013, dpoy_2014, dpoy_2015, dpoy_2016]

dpoys_df = pd.concat(dpoys)
dpoys_df

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,STL,BLK,TOV,PF,PTS,TrueSalary,YrsExperience,Height,Weight,RoundedPosition
77,2007,Marcus Camby,C,32,DEN,70,70,2369,19.1,0.519,...,87,231,122,183,785,19300000,10.0,83.0,220.0,5.0
696,2008,Kevin Garnett,PF,31,BOS,71,71,2328,25.3,0.588,...,100,89,138,163,1337,24200000,12.0,83.0,220.0,4.0
1360,2009,Dwight Howard,C,23,ORL,79,79,2821,25.4,0.6,...,77,231,240,270,1624,19800000,4.0,83.0,240.0,5.0
1938,2010,Dwight Howard,C,24,ORL,82,82,2843,24.0,0.63,...,75,228,274,287,1503,21900000,5.0,83.0,240.0,5.0
2565,2011,Dwight Howard,C,25,ORL,78,78,2935,26.1,0.616,...,107,186,279,258,1784,20900000,6.0,83.0,240.0,5.0
2987,2012,Tyson Chandler,C,29,NYK,62,62,2061,18.7,0.708,...,56,89,102,186,699,17100000,10.0,85.0,235.0,5.0
3623,2013,Marc Gasol,C,28,MEM,80,80,2796,19.5,0.559,...,80,139,157,255,1127,22100000,4.0,85.0,265.0,5.0
4442,2014,Joakim Noah,C,28,CHI,80,80,2820,20.0,0.531,...,99,121,194,245,1007,22400000,6.0,83.0,232.0,4.0
4978,2015,Kawhi Leonard,SF,23,SAS,64,64,2033,22.0,0.567,...,148,48,97,128,1057,21600000,3.0,79.0,230.0,3.0
5604,2016,Kawhi Leonard,SF,24,SAS,72,72,2380,26.0,0.616,...,128,71,105,133,1523,11300000,4.0,79.0,230.0,3.0


In [5]:
df = season_stats
df.columns

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'TrueSalary', 'YrsExperience', 'Height',
       'Weight', 'RoundedPosition'],
      dtype='object')

In [6]:
dpoys_index = dpoys_df.index.values.tolist()
dpoys_index

[77, 696, 1360, 1938, 2565, 2987, 3623, 4442, 4978, 5604]

In [7]:
#years = [2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007]
dpoy = []
#for year in years:
for i in range(0, len(df)):
    if i in dpoys_index:
        value = 1
        dpoy.append(value)
    else:
        value = 0
        dpoy.append(value)
            

In [8]:
len(dpoy)

5860

In [9]:
sum = 0
for i in range(0, len(dpoy)):
    sum = sum + dpoy[i]
print(sum)

10


In [10]:
df['dpoy'] = dpoy

In [11]:
df.shape

(5860, 58)

In [12]:
df.keys()

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'TrueSalary', 'YrsExperience', 'Height',
       'Weight', 'RoundedPosition', 'dpoy'],
      dtype='object')

In [13]:
df.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,BLK,TOV,PF,PTS,TrueSalary,YrsExperience,Height,Weight,RoundedPosition,dpoy
0,2007,Shareef Abdur-Rahim,C,30,SAC,80,45,2015,13.1,0.524,...,40,116,243,793,1800000,10.0,81.0,225.0,4.0,0
1,2007,Hassan Adams,SG,22,NJN,61,8,495,13.0,0.577,...,4,22,47,174,0,0.0,76.0,220.0,2.0,0
2,2007,Maurice Ager,SG,22,DAL,32,1,214,2.9,0.408,...,3,15,27,69,0,0.0,77.0,202.0,2.0,0
3,2007,LaMarcus Aldridge,C,21,POR,63,22,1392,17.1,0.533,...,73,43,186,565,1900000,0.0,83.0,240.0,5.0,0
4,2007,Malik Allen,PF,28,CHI,60,1,638,10.4,0.443,...,16,21,85,242,0,5.0,82.0,255.0,4.0,0


In [14]:
df.loc[df['dpoy'] == 1]

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,BLK,TOV,PF,PTS,TrueSalary,YrsExperience,Height,Weight,RoundedPosition,dpoy
77,2007,Marcus Camby,C,32,DEN,70,70,2369,19.1,0.519,...,231,122,183,785,19300000,10.0,83.0,220.0,5.0,1
696,2008,Kevin Garnett,PF,31,BOS,71,71,2328,25.3,0.588,...,89,138,163,1337,24200000,12.0,83.0,220.0,4.0,1
1360,2009,Dwight Howard,C,23,ORL,79,79,2821,25.4,0.6,...,231,240,270,1624,19800000,4.0,83.0,240.0,5.0,1
1938,2010,Dwight Howard,C,24,ORL,82,82,2843,24.0,0.63,...,228,274,287,1503,21900000,5.0,83.0,240.0,5.0,1
2565,2011,Dwight Howard,C,25,ORL,78,78,2935,26.1,0.616,...,186,279,258,1784,20900000,6.0,83.0,240.0,5.0,1
2987,2012,Tyson Chandler,C,29,NYK,62,62,2061,18.7,0.708,...,89,102,186,699,17100000,10.0,85.0,235.0,5.0,1
3623,2013,Marc Gasol,C,28,MEM,80,80,2796,19.5,0.559,...,139,157,255,1127,22100000,4.0,85.0,265.0,5.0,1
4442,2014,Joakim Noah,C,28,CHI,80,80,2820,20.0,0.531,...,121,194,245,1007,22400000,6.0,83.0,232.0,4.0,1
4978,2015,Kawhi Leonard,SF,23,SAS,64,64,2033,22.0,0.567,...,48,97,128,1057,21600000,3.0,79.0,230.0,3.0,1
5604,2016,Kawhi Leonard,SF,24,SAS,72,72,2380,26.0,0.616,...,71,105,133,1523,11300000,4.0,79.0,230.0,3.0,1


# Logistic Regression of Player Obtaining Defensive Player of Year

In [15]:
df = df[np.isfinite(df['WS/48'])]
df = df[np.isfinite(df['FG%'])]
df = df[np.isfinite(df['YrsExperience'])]
df = df.drop("blanl", axis=1)
df = df.drop("blank2", axis=1)
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True) #RESET INDEX
data = df.drop("Pos", axis=1)
data = data.drop("Tm", axis=1)
data = data.drop("Player", axis=1)
data = data.drop("TrueSalary", axis=1)
data = data.drop("RoundedPosition", axis=1)
data = data.drop("Year", axis=1)
data = data.drop("Age", axis=1)
data = data.drop("dpoy", axis=1)
feature_names = data.columns
target = df['dpoy']
#target_names = target.value_counts().keys().tolist()
data.head()

Unnamed: 0,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,TRB,AST,STL,BLK,TOV,PF,PTS,YrsExperience,Height,Weight
0,80,45,2015,13.1,0.524,0.031,0.358,7.0,16.4,11.6,...,398,109,53,40,116,243,793,10.0,81.0,225.0
1,61,8,495,13.0,0.577,0.007,0.267,8.4,9.9,9.2,...,77,13,17,4,22,47,174,0.0,76.0,220.0
2,32,1,214,2.9,0.408,0.214,0.471,0.6,11.0,5.9,...,21,7,4,3,15,27,69,0.0,77.0,202.0
3,63,22,1392,17.1,0.533,0.004,0.24,12.6,15.3,13.9,...,312,24,22,73,43,186,565,0.0,83.0,240.0
4,60,1,638,10.4,0.443,0.004,0.132,7.6,13.7,10.6,...,119,16,17,16,21,85,242,5.0,82.0,255.0


In [16]:
data.shape

(4780, 48)

In [17]:
#target = target.reshape(-1,1)
target.shape

(4780,)

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1, stratify=target)

In [19]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)



In [20]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)



In [21]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
classifier.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Testing Data Score: 0.9966527196652719


In [24]:
predictions = classifier.predict(X_test)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

Unnamed: 0,Actual,Prediction
2053,1,0
1183,0,1
65,1,0
2385,1,0


In [25]:
act_df_1 = predict_df.loc[predict_df['Actual'] == 1]
act_df_1

Unnamed: 0,Actual,Prediction
2053,1,0
65,1,0
2385,1,0


In [26]:
predict_df_1 = predict_df.loc[predict_df['Prediction'] == 1]
predict_df_1

Unnamed: 0,Actual,Prediction
1183,0,1


In [27]:
predictions.shape

(1195,)

In [28]:
data.iloc[1183]

G                  2.000
GS                 0.000
MP                 9.000
PER               15.700
TS%                0.425
3PAr               0.200
FTr                0.400
ORB%               0.000
DRB%              25.200
TRB%              12.200
AST%               0.000
STL%               0.000
BLK%               8.100
TOV%               0.000
USG%              28.400
OWS                0.000
DWS                0.000
WS                 0.000
WS/48             -0.010
OBPM              -9.900
DBPM              -4.900
BPM              -14.900
VORP               0.000
FG                 2.000
FGA                5.000
FG%                0.400
3P                 0.000
3PA                1.000
3P%                0.000
2P                 2.000
2PA                4.000
2P%                0.500
eFG%               0.400
FT                 1.000
FTA                2.000
FT%                0.500
ORB                0.000
DRB                2.000
TRB                2.000
AST                0.000


In [29]:
df = pd.read_csv('cleaned_data.csv')
df = df[np.isfinite(df['WS/48'])]
df = df[np.isfinite(df['FG%'])]
df = df[np.isfinite(df['YrsExperience'])]
df = df.drop("blanl", axis=1)
df = df.drop("blank2", axis=1)
df = df.dropna(axis=0, how='any')
df.iloc[1183]

Year                           2009
Player             Demetris Nichols
Pos                              SF
Age                              24
Tm                              NYK
G                                 2
GS                                0
MP                                9
PER                            15.7
TS%                           0.425
3PAr                            0.2
FTr                             0.4
ORB%                              0
DRB%                           25.2
TRB%                           12.2
AST%                              0
STL%                              0
BLK%                            8.1
TOV%                              0
USG%                           28.4
OWS                               0
DWS                               0
WS                                0
WS/48                         -0.01
OBPM                           -9.9
DBPM                           -4.9
BPM                           -14.9
VORP                        

In [30]:
from sklearn import tree

In [31]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [40]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.99665271966527191

In [41]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.99832635983263596

In [42]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.10840992960141244, 'DWS'),
 (0.062902843803249897, 'DBPM'),
 (0.048622846937764343, 'BPM'),
 (0.046859016971417242, 'VORP'),
 (0.035951189673640221, 'FTA'),
 (0.033224501920526665, 'TOV'),
 (0.027235585999651136, 'AST'),
 (0.027181849169031734, 'DRB'),
 (0.025500589219477038, 'ORB'),
 (0.025364861236084474, 'OBPM'),
 (0.025232905177779163, 'BLK'),
 (0.023762510253594629, 'TOV%'),
 (0.02365615493872579, 'TRB'),
 (0.022352285968918122, 'WS/48'),
 (0.021521380648744773, 'STL'),
 (0.021049420580808414, 'TS%'),
 (0.020510808035518843, 'FG%'),
 (0.020286792123041759, 'PTS'),
 (0.019942578980950844, 'FGA'),
 (0.019589825175185562, 'WS'),
 (0.01928441887583409, 'PER'),
 (0.018648397615028962, 'USG%'),
 (0.018288769103746386, 'AST%'),
 (0.017397071860511058, 'eFG%'),
 (0.017080014201215077, 'MP'),
 (0.016851539846996083, '3P%'),
 (0.016358080137786054, 'FG'),
 (0.015409195104327346, 'FTr'),
 (0.014590256916190637, 'TRB%'),
 (0.013136531132793885, 'Weight'),
 (0.012502948317668764, 'PF'),
 (

In [43]:
predictions = rf.predict(X_test)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

Unnamed: 0,Actual,Prediction
65,1,0
2053,1,0


In [45]:
#df.iloc[65]

In [46]:
act_df_1 = predict_df.loc[predict_df['Actual'] == 1]
act_df_1

Unnamed: 0,Actual,Prediction
65,1,0
2053,1,0


In [47]:
predict_df_1 = predict_df.loc[predict_df['Prediction'] == 1]
predict_df_1

Unnamed: 0,Actual,Prediction


In [39]:
predict_df.shape

(1195, 2)