# Import and Clean Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.shape

(5860, 57)

In [4]:
# Keep only rows where WS/48, FG% and YrsExperience are all finite (no NaN/inf).
required_cols = ['WS/48', 'FG%', 'YrsExperience']
finite_rows = np.isfinite(df[required_cols]).all(axis=1)
df = df[finite_rows]
df.shape

(5827, 57)

# Linear Regression Prediction of Player Efficiency Rating

In [5]:
X = df[['WS/48', 'BPM', 'USG%']]
y = df['PER'].values.reshape(-1, 1)
print(X.shape, y.shape)

(5827, 3) (5827, 1)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [8]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [9]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
# Residual plot for the PER model: predicted value on x, (prediction - actual) on y.
# The model was already fitted in the previous cell, so it is not refitted here,
# and each split is predicted exactly once.
train_predictions = model.predict(X_train_scaled)
predictions = model.predict(X_test_scaled)  # reused by the metrics cell below
plt.scatter(train_predictions, train_predictions - y_train_scaled, c="blue", label="Training Data")
plt.scatter(predictions, predictions - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("Residual Plot")
plt.savefig("PER_residuals")
plt.show()

TypeError: print_figure() missing 1 required positional argument: 'filename'

In [None]:
print('Coefficients: \n', X.columns, model.coef_)

In [None]:
from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

# Linear Regression Prediction of Minutes Played

In [None]:
X = df[['GS', 'G', 'PTS']]
y = df['MP'].values.reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

In [None]:
# Residual plot for the minutes-played model: predicted value on x, (prediction - actual) on y.
# The model was already fitted in the previous cell, so it is not refitted here,
# and each split is predicted exactly once.
train_predictions = model.predict(X_train_scaled)
predictions = model.predict(X_test_scaled)  # reused by the metrics cell below
plt.scatter(train_predictions, train_predictions - y_train_scaled, c="blue", label="Training Data")
plt.scatter(predictions, predictions - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("Residual Plot")
plt.savefig("MP_residuals")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

# Logistic Regression and Decision Tree/Random Forest of Player Making it Past Rookie Season

In [None]:
# Reload the cleaned data and keep rows with finite WS/48, FG% and YrsExperience.
df = pd.read_csv('cleaned_data.csv')
finite_rows = np.isfinite(df[['WS/48', 'FG%', 'YrsExperience']]).all(axis=1)
df = df[finite_rows]

# Rookie rows (YrsExperience == 0), excluding Year == 2016.
is_rookie = df['YrsExperience'] == 0
not_2016 = df['Year'] != 2016
noExp_df = df.loc[is_rookie & not_2016]
noExp_df

In [None]:
noExp = noExp_df['Player']

In [None]:
# Second-year rows (YrsExperience == 1), excluding Year == 2007.
# NOTE(review): presumably 2007 is the first season in the file, so those players'
# rookie rows would be missing — confirm against the data.
oneExp_df = df.loc[(df['YrsExperience'] == 1) & (df['Year'] != 2007)]
oneExp_df

In [None]:
oneExp = oneExp_df['Player']

In [None]:
# Rookie player names as a plain list, in row order.
playersNone = list(noExp)

In [None]:
# Second-year player names as a plain list, in row order.
playersOne = list(oneExp)

In [None]:
# One 0/1 flag per rookie row: 1 if the rookie's name also appears among the
# second-year players, else 0. A set makes each lookup constant-time instead of
# scanning the whole second-year list for every rookie.
# NOTE(review): the match is by player name only, so two different players who
# share a name would be conflated — confirm names are unique in the data.
second_year_names = set(playersOne)
twoYrs = [1 if player in second_year_names else 0 for player in playersNone]

In [None]:
len(twoYrs)

In [None]:
# noExp_df is a filtered slice of df; assigning a column to it in place triggers
# pandas' SettingWithCopyWarning. assign() returns a new frame with the label added.
noExp_df = noExp_df.assign(atLeast2Years=twoYrs)

In [None]:
noExp_df.head(20)

In [None]:
noExp_df.keys()

In [None]:
# Remove the two filler columns, discard rows with any missing value, and renumber rows.
noExp_df = (
    noExp_df
    .drop(["blanl", "blank2"], axis=1)
    .dropna(axis=0, how='any')
    .reset_index(drop=True)  # RESET INDEX
)

# Features: everything except identifiers, salary, position, season, age and the label.
non_feature_cols = ["Pos", "Tm", "Player", "TrueSalary", "RoundedPosition",
                    "Year", "Age", "atLeast2Years"]
data = noExp_df.drop(non_feature_cols, axis=1)
feature_names = data.columns
target = noExp_df['atLeast2Years']
data.head()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1, stratify=target)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features with statistics learned from the training split only.
# The target is a 0/1 class label and is passed to the classifier unscaled:
# StandardScaler rejects the 1-D label Series ("Expected 2D array"), and the
# scaled labels were never used downstream.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train_scaled, y_train)

In [None]:
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
predictions = classifier.predict(X_test_scaled)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

In [None]:
predictions.shape

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
# Seeded so the fitted tree and its test accuracy are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the test accuracy and the feature importances plotted below are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Bar chart of the random forest's feature importances, one bar per feature column.
x_values = np.arange(len(feature_names))
plt.figure(figsize=(10, 7))

plt.title("Feature Importance")
plt.xlabel("Feature Name")
plt.ylabel("Importance")

plt.bar(x_values, rf.feature_importances_, facecolor="darkblue",
        alpha=0.75, tick_label=feature_names)
plt.xticks(rotation='vertical')
# savefig() needs a target path; calling it with no argument raises TypeError.
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig("atLeast2Years_feature_importance", bbox_inches="tight")
plt.show()

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# K Nearest Neighbors Player Position

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
#df['Pos'].value_counts()

In [None]:
df = df.reset_index(drop=True)

In [None]:
#df.index[df['Pos'] == 'C-SF'].tolist()

In [None]:
# Remove rows with the hybrid 'C-SF' position using a boolean filter; the original
# indexed df.index with a nested list, which newer pandas versions reject.
# NOTE(review): presumably the class is too rare to stratify on — confirm.
df = df[df['Pos'] != 'C-SF']

In [None]:
df = df.reset_index(drop=True)

In [None]:
#df.index[df['Pos'] == 'C-SF'].tolist()

In [None]:
#df['Pos'].value_counts()

In [None]:
df = df.drop("blanl", axis=1)
df = df.drop("blank2", axis=1)
df = df.dropna(axis=0, how='any')

In [None]:
# Features for position classification: drop identifiers, salary, both position
# columns, season and age; the label is the raw 'Pos' column.
non_feature_cols = ["Pos", "Tm", "Player", "TrueSalary", "RoundedPosition", "Year", "Age"]
data = df.drop(non_feature_cols, axis=1)
feature_names = data.columns
target = df['Pos']
# Class labels ordered from most to least frequent.
target_names = target.value_counts().index.tolist()
data.head()

In [None]:
#data.shape

In [None]:
#target.shape

In [None]:
#data.keys()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
# Sweep odd k from 1 to 99 and record train/test accuracy for each value.
k_values = range(1, 100, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Label the two curves so they can be told apart in the saved figure.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.legend()
# savefig() needs a target path; calling it with no argument raises TypeError.
plt.savefig("position_knn_scores")
plt.show()

In [None]:
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)
print('k=15 Test Acc: %.3f' % knn.score(X_test, y_test))

# Stratified KNN Player Position

In [None]:
#target.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

In [None]:
#X_train.values.reshape(-1,1)

In [None]:
# Fit on the 2-D feature matrix so a mean/std is learned per column. Flattening with
# reshape(-1, 1) made the scaler learn one global mean/std for a single "feature",
# which does not match the multi-column frames passed to transform() below.
X_scaler = StandardScaler().fit(X_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Sweep odd k from 1 to 99 on the scaled, stratified split and record accuracies.
k_values = range(1, 100, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Label the two curves so they can be told apart in the saved figure.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.legend()
# savefig() needs a target path; calling it with no argument raises TypeError.
plt.savefig("position_knn_stratified_scores")
plt.show()

In [None]:
knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(X_train_scaled, y_train)
print('k=23 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

# Random Forests Player Position

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seeded so the importances examined below are reproducible.
# NOTE(review): the forest is fitted and scored on the same full data set, so this
# score is training accuracy, not an estimate of generalization.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(data, target)
rf.score(data, target)

In [None]:
importances = rf.feature_importances_
importances

In [None]:
# Bar chart of the random forest's feature importances, one bar per feature column.
x_values = np.arange(len(feature_names))
plt.figure(figsize=(10, 7))

plt.title("Feature Importance")
plt.xlabel("Feature Name")
plt.ylabel("Importance")

plt.bar(x_values, rf.feature_importances_, facecolor="darkblue",
        alpha=0.75, tick_label=feature_names)
plt.xticks(rotation='vertical')
# savefig() needs a target path; calling it with no argument raises TypeError.
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig("position_rf_full_feature_importance", bbox_inches="tight")
plt.show()

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Decision Tree and Random Forests Player Position

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
# Seeded so the fitted tree and its test accuracy are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the test accuracy and the feature importances plotted below are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Bar chart of the random forest's feature importances, one bar per feature column.
x_values = np.arange(len(feature_names))
plt.figure(figsize=(10, 7))

plt.title("Feature Importance")
plt.xlabel("Feature Name")
plt.ylabel("Importance")

plt.bar(x_values, rf.feature_importances_, facecolor="darkblue",
        alpha=0.75, tick_label=feature_names)
plt.xticks(rotation='vertical')
# savefig() needs a target path; calling it with no argument raises TypeError.
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig("position_rf_feature_importance", bbox_inches="tight")
plt.show()

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Team Classification

In [None]:
df = pd.read_csv('cleaned_data.csv')
df.shape

In [None]:
df = df[np.isfinite(df['WS/48'])]
df = df[np.isfinite(df['FG%'])]
df = df[np.isfinite(df['YrsExperience'])]
df.shape

In [None]:
df = df.reset_index(drop=True)

In [None]:
df = df.drop("blanl", axis=1)
df = df.drop("blank2", axis=1)
df = df.dropna(axis=0, how='any')

In [None]:
df.shape

In [None]:
# Features for team classification: drop identifiers, salary, raw position and age.
# RoundedPosition and Year are deliberately kept as features in this section.
non_feature_cols = ["Pos", "Tm", "Player", "TrueSalary", "Age"]
data = df.drop(non_feature_cols, axis=1)
feature_names = data.columns
target = df['Tm']
# Team labels ordered from most to least frequent.
target_names = target.value_counts().index.tolist()
data.head()

In [None]:
data.shape

In [None]:
target.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

In [None]:
# Fit on the 2-D feature matrix so a mean/std is learned per column. Flattening with
# reshape(-1, 1) made the scaler learn one global mean/std for a single "feature",
# which does not match the multi-column frames passed to transform() below.
X_scaler = StandardScaler().fit(X_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Sweep odd k from 1 to 99 on the scaled, stratified split and record accuracies.
k_values = range(1, 100, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Label the two curves so they can be told apart in the saved figure.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
plt.ylabel("Testing accuracy Score")
plt.legend()
# savefig() needs a target path; calling it with no argument raises TypeError.
plt.savefig("team_knn_scores")
plt.show()

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
# Seeded so the fitted tree and its test accuracy are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the test accuracy and the feature importances plotted below are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Bar chart of the random forest's feature importances, one bar per feature column.
x_values = np.arange(len(feature_names))
plt.figure(figsize=(10, 7))

plt.title("Feature Importance")
plt.xlabel("Feature Name")
plt.ylabel("Importance")

plt.bar(x_values, rf.feature_importances_, facecolor="darkblue",
        alpha=0.75, tick_label=feature_names)
plt.xticks(rotation='vertical')
# savefig() needs a target path; calling it with no argument raises TypeError.
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig("team_rf_feature_importance", bbox_inches="tight")
plt.show()

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Combining With DPOY Data

In [None]:
season_stats = pd.read_csv('cleaned_data.csv')

In [None]:
season_stats.columns

In [None]:
# Defensive Player of the Year winners as (Year value, Player name), oldest first.
dpoy_winners = [
    (2007, 'Marcus Camby'),
    (2008, 'Kevin Garnett'),
    (2009, 'Dwight Howard'),
    (2010, 'Dwight Howard'),
    (2011, 'Dwight Howard'),
    (2012, 'Tyson Chandler'),
    (2013, 'Marc Gasol'),
    (2014, 'Joakim Noah'),
    (2015, 'Kawhi Leonard'),
    (2016, 'Kawhi Leonard'),
]

# One frame per season holding every row that matches that year and player name.
# A pair with no matching row yields an empty frame and is silently skipped by concat.
dpoys = [
    season_stats.loc[(season_stats['Year'] == year) & (season_stats['Player'] == player)]
    for year, player in dpoy_winners
]

dpoys_df = pd.concat(dpoys)
dpoys_df

In [None]:
df = season_stats
df.columns

In [None]:
dpoys_index = dpoys_df.index.values.tolist()
dpoys_index

In [None]:
# One 0/1 flag per row of df: 1 when the row's position number is among the
# winners' index labels. This relies on df still carrying the default 0..n-1 index
# from read_csv, so positions and labels coincide.
dpoy_rows = set(dpoys_index)
dpoy = [1 if i in dpoy_rows else 0 for i in range(len(df))]
            

In [None]:
len(dpoy)

In [None]:
# Number of rows flagged as DPOY winners. The flags are 0/1, so counting the 1s
# equals their sum and avoids rebinding the builtin name `sum` in the kernel.
print(dpoy.count(1))

In [None]:
df['dpoy'] = dpoy

In [None]:
df.shape

In [None]:
df.keys()

In [None]:
df.head()

In [None]:
df.loc[df['dpoy'] == 1]

# Logistic Regression/Random Forests of Player Obtaining Defensive Player of Year

In [None]:
# Keep rows with finite WS/48, FG% and YrsExperience, remove the two filler columns,
# discard rows with any missing value, and renumber rows.
finite_rows = np.isfinite(df[['WS/48', 'FG%', 'YrsExperience']]).all(axis=1)
df = (
    df[finite_rows]
    .drop(["blanl", "blank2"], axis=1)
    .dropna(axis=0, how='any')
    .reset_index(drop=True)  # RESET INDEX
)

# Features: everything except identifiers, salary, position, season, age and the label.
non_feature_cols = ["Pos", "Tm", "Player", "TrueSalary", "RoundedPosition",
                    "Year", "Age", "dpoy"]
data = df.drop(non_feature_cols, axis=1)
feature_names = data.columns
target = df['dpoy']
data.head()

In [None]:
data.shape

In [None]:
#target = target.reshape(-1,1)
target.shape

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1, stratify=target)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features with statistics learned from the training split only.
# The target is a 0/1 class label and is passed to the classifier unscaled:
# StandardScaler rejects the 1-D label Series ("Expected 2D array"), and the
# scaled labels were never used downstream.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train_scaled, y_train)

In [None]:
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
predictions = classifier.predict(X_test_scaled)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

In [None]:
act_df_1 = predict_df.loc[predict_df['Actual'] == 1]
act_df_1

In [None]:
predict_df_1 = predict_df.loc[predict_df['Prediction'] == 1]
predict_df_1

In [None]:
predictions.shape

In [None]:
data.iloc[1183]

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
# Seeded so the fitted tree and its test accuracy are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the test accuracy and the feature importances plotted below are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Bar chart of the random forest's feature importances, one bar per feature column.
x_values = np.arange(len(feature_names))
plt.figure(figsize=(10, 7))

plt.title("Feature Importance")
plt.xlabel("Feature Name")
plt.ylabel("Importance")

plt.bar(x_values, rf.feature_importances_, facecolor="darkblue",
        alpha=0.75, tick_label=feature_names)
plt.xticks(rotation='vertical')
# savefig() needs a target path; calling it with no argument raises TypeError.
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig("dpoy_rf_feature_importance", bbox_inches="tight")
plt.show()

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
predictions = rf.predict(X_test)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

In [None]:
df.iloc[2053]

In [None]:
act_df_1 = predict_df.loc[predict_df['Actual'] == 1]
act_df_1

In [None]:
predict_df_1 = predict_df.loc[predict_df['Prediction'] == 1]
predict_df_1

In [None]:
predict_df.shape

# Combining DPOY, 6th Man, ROY, and MVP Data

In [None]:
season_stats = pd.read_csv('cleaned_data.csv')

In [None]:
# Defensive Player of the Year winners as (Year value, Player name), oldest first.
dpoy_winners = [
    (2007, 'Marcus Camby'),
    (2008, 'Kevin Garnett'),
    (2009, 'Dwight Howard'),
    (2010, 'Dwight Howard'),
    (2011, 'Dwight Howard'),
    (2012, 'Tyson Chandler'),
    (2013, 'Marc Gasol'),
    (2014, 'Joakim Noah'),
    (2015, 'Kawhi Leonard'),
    (2016, 'Kawhi Leonard'),
]

# One frame per season holding every row that matches that year and player name.
# A pair with no matching row yields an empty frame and is silently skipped by concat.
dpoys = [
    season_stats.loc[(season_stats['Year'] == year) & (season_stats['Player'] == player)]
    for year, player in dpoy_winners
]

dpoys_df = pd.concat(dpoys)

In [None]:
# Sixth Man of the Year winners as (Year value, Player name), oldest first.
sixman_winners = [
    (2007, 'Leandro Barbosa'),
    (2008, 'Manu Ginobili'),
    (2009, 'Jason Terry'),
    (2010, 'Jamal Crawford'),
    (2011, 'Lamar Odom'),
    (2012, 'James Harden'),
    (2013, 'J.R. Smith'),
    (2014, 'Jamal Crawford'),
    (2015, 'Lou Williams'),
    (2016, 'Jamal Crawford'),
]

# One frame per season holding every row that matches that year and player name.
# A pair with no matching row yields an empty frame and is silently skipped by concat.
sixmen = [
    season_stats.loc[(season_stats['Year'] == year) & (season_stats['Player'] == player)]
    for year, player in sixman_winners
]

sixmen_df = pd.concat(sixmen)

In [None]:
# Rookie of the Year winners as (Year value, Player name), oldest first.
roy_winners = [
    (2007, 'Brandon Roy'),
    (2008, 'Kevin Durant'),
    (2009, 'Derrick Rose'),
    (2010, 'Tyreke Evans'),
    (2011, 'Blake Griffin'),
    (2012, 'Kyrie Irving'),
    (2013, 'Damian Lillard'),
    (2014, 'Michael Carter-Williams'),
    (2015, 'Andrew Wiggins'),
    (2016, 'Karl-Anthony Towns'),
]

# One frame per season holding every row that matches that year and player name.
# A pair with no matching row yields an empty frame and is silently skipped by concat.
rookies = [
    season_stats.loc[(season_stats['Year'] == year) & (season_stats['Player'] == player)]
    for year, player in roy_winners
]

rookies_df = pd.concat(rookies)

In [None]:
# Most Valuable Player winners as (Year value, Player name), oldest first.
mvp_winners = [
    (2007, 'Dirk Nowitzki'),
    (2008, 'Kobe Bryant'),
    (2009, 'LeBron James'),
    (2010, 'LeBron James'),
    (2011, 'Derrick Rose'),
    (2012, 'LeBron James'),
    (2013, 'LeBron James'),
    (2014, 'Kevin Durant'),
    (2015, 'Stephen Curry'),
    (2016, 'Stephen Curry'),
]

# One frame per season holding every row that matches that year and player name.
# A pair with no matching row yields an empty frame and is silently skipped by concat.
mvps = [
    season_stats.loc[(season_stats['Year'] == year) & (season_stats['Player'] == player)]
    for year, player in mvp_winners
]

mvps_df = pd.concat(mvps)

In [None]:
df = season_stats

In [None]:
dpoys_index = dpoys_df.index.values.tolist()
dpoys_index

In [None]:
sixmen_index = sixmen_df.index.values.tolist()
sixmen_index

In [None]:
rookies_index = rookies_df.index.values.tolist()
rookies_index

In [None]:
mvps_index = mvps_df.index.values.tolist()
mvps_index

In [None]:
# One 0/1 flag per row of df: 1 when the row's position number is among the DPOY
# winners' index labels. This relies on df still carrying the default 0..n-1 index
# from read_csv, so positions and labels coincide.
dpoy_rows = set(dpoys_index)
dpoy = [1 if i in dpoy_rows else 0 for i in range(len(df))]

In [None]:
# One 0/1 flag per row of df: 1 when the row's position number is among the Sixth Man
# winners' index labels. This replaces the earlier list of per-season frames
# that was also named `sixmen`.
sixmen_rows = set(sixmen_index)
sixmen = [1 if i in sixmen_rows else 0 for i in range(len(df))]

In [None]:
# One 0/1 flag per row of df: 1 when the row's position number is among the Rookie of
# the Year winners' index labels. This replaces the earlier list of per-season
# frames that was also named `rookies`.
rookie_rows = set(rookies_index)
rookies = [1 if i in rookie_rows else 0 for i in range(len(df))]

In [None]:
# One 0/1 flag per row of df: 1 when the row's position number is among the MVP
# winners' index labels. This replaces the earlier list of per-season frames
# that was also named `mvps`.
mvp_rows = set(mvps_index)
mvps = [1 if i in mvp_rows else 0 for i in range(len(df))]

In [None]:
len(dpoy)

In [None]:
len(sixmen)

In [None]:
len(rookies)

In [None]:
len(mvps)

In [None]:
# Number of rows flagged as DPOY winners. The flags are 0/1, so counting the 1s
# equals their sum and avoids rebinding the builtin name `sum` in the kernel.
print(dpoy.count(1))

In [None]:
# Number of rows flagged as Sixth Man winners. The flags are 0/1, so counting the 1s
# equals their sum and avoids rebinding the builtin name `sum` in the kernel.
print(sixmen.count(1))

In [None]:
# Number of rows flagged as Rookie of the Year winners. The flags are 0/1, so counting
# the 1s equals their sum and avoids rebinding the builtin name `sum` in the kernel.
print(rookies.count(1))

In [None]:
# Number of rows flagged as MVP winners. The flags are 0/1, so counting the 1s
# equals their sum and avoids rebinding the builtin name `sum` in the kernel.
print(mvps.count(1))

In [None]:
df['dpoy'] = dpoy
df['sixmen'] = sixmen
df['rookies'] = rookies
df['mvps'] = mvps

In [None]:
df.shape

In [None]:
df.keys()

# Logistic Regression/Random Forests 6th Man

In [None]:
# Build the Sixth Man modelling frame: keep rows with finite WS/48, FG% and
# YrsExperience, remove the two filler columns, discard rows with any missing
# value, and renumber rows.
finite_rows = np.isfinite(df[['WS/48', 'FG%', 'YrsExperience']]).all(axis=1)
sixmen_df = (
    df[finite_rows]
    .drop(["blanl", "blank2"], axis=1)
    .dropna(axis=0, how='any')
    .reset_index(drop=True)  # RESET INDEX
)

# Features: everything except identifiers, salary, position, season, age and the label.
# The other award flags (dpoy, rookies, mvps) remain in `data` as features.
non_feature_cols = ["Pos", "Tm", "Player", "TrueSalary", "RoundedPosition",
                    "Year", "Age", "sixmen"]
data = sixmen_df.drop(non_feature_cols, axis=1)
feature_names = data.columns
target = sixmen_df['sixmen']

In [None]:
data.shape

In [None]:
target.shape

In [None]:
#target = target.values.reshape(-1,1)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1, stratify=target)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features with statistics learned from the training split only.
# The target is a 0/1 class label and is passed to the classifier unscaled:
# StandardScaler rejects the 1-D label Series ("Expected 2D array"), and the
# scaled labels were never used downstream.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train_scaled, y_train)

In [None]:
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
predictions = classifier.predict(X_test_scaled)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

In [None]:
act_df_1 = predict_df.loc[predict_df['Actual'] == 1]
act_df_1

In [None]:
predict_df_1 = predict_df.loc[predict_df['Prediction'] == 1]
predict_df_1

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
# Seeded so the fitted tree and its test accuracy are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the test accuracy and the feature importances plotted below are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Bar chart of the random forest's feature importances, one bar per feature column.
x_values = np.arange(len(feature_names))
plt.figure(figsize=(10, 7))

plt.title("Feature Importance")
plt.xlabel("Feature Name")
plt.ylabel("Importance")

plt.bar(x_values, rf.feature_importances_, facecolor="darkblue",
        alpha=0.75, tick_label=feature_names)
plt.xticks(rotation='vertical')
# savefig() needs a target path; calling it with no argument raises TypeError.
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig("sixmen_rf_feature_importance", bbox_inches="tight")
plt.show()

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
predictions = rf.predict(X_test)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

# Logistic Regression/Random Forests Rookie of the Year

In [None]:
# Build the Rookie of the Year modelling frame: keep rows with finite WS/48, FG% and
# YrsExperience, remove the two filler columns, discard rows with any missing
# value, and renumber rows.
finite_rows = np.isfinite(df[['WS/48', 'FG%', 'YrsExperience']]).all(axis=1)
rookies_df = (
    df[finite_rows]
    .drop(["blanl", "blank2"], axis=1)
    .dropna(axis=0, how='any')
    .reset_index(drop=True)  # RESET INDEX
)

# Features: everything except identifiers, salary, position, season, age and the label.
# The other award flags (dpoy, sixmen, mvps) remain in `data` as features.
non_feature_cols = ["Pos", "Tm", "Player", "TrueSalary", "RoundedPosition",
                    "Year", "Age", "rookies"]
data = rookies_df.drop(non_feature_cols, axis=1)
feature_names = data.columns
# Take the label from the frame built in this cell. The original read it from
# sixmen_df, which only worked if the Sixth Man section had been run first.
target = rookies_df['rookies']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1, stratify=target)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features with statistics learned from the training split only.
# The target is a 0/1 class label and is passed to the classifier unscaled:
# StandardScaler rejects the 1-D label Series ("Expected 2D array"), and the
# scaled labels were never used downstream.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train_scaled, y_train)

In [None]:
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

In [None]:
# Seeded so the fitted tree and its test accuracy are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the test accuracy and the feature importances plotted below are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Bar chart of the random forest's feature importances, one bar per feature column.
x_values = np.arange(len(feature_names))
plt.figure(figsize=(10, 7))

plt.title("Feature Importance")
plt.xlabel("Feature Name")
plt.ylabel("Importance")

plt.bar(x_values, rf.feature_importances_, facecolor="darkblue",
        alpha=0.75, tick_label=feature_names)
plt.xticks(rotation='vertical')
# savefig() needs a target path; calling it with no argument raises TypeError.
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig("roy_rf_feature_importance", bbox_inches="tight")
plt.show()

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
predictions = rf.predict(X_test)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

In [None]:
act_df_1 = predict_df.loc[predict_df['Actual'] == 1]
act_df_1

# Logistic Regression/Random Forests MVP

In [None]:
# Build the MVP modelling frame: keep rows with finite WS/48, FG% and YrsExperience,
# remove the two filler columns, discard rows with any missing value, and renumber rows.
finite_rows = np.isfinite(df[['WS/48', 'FG%', 'YrsExperience']]).all(axis=1)
mvp_df = (
    df[finite_rows]
    .drop(["blanl", "blank2"], axis=1)
    .dropna(axis=0, how='any')
    .reset_index(drop=True)  # RESET INDEX
)

# Features: everything except identifiers, salary, position, season, age and the label.
# The other award flags (dpoy, sixmen, rookies) remain in `data` as features.
non_feature_cols = ["Pos", "Tm", "Player", "TrueSalary", "RoundedPosition",
                    "Year", "Age", "mvps"]
data = mvp_df.drop(non_feature_cols, axis=1)
feature_names = data.columns
# Take the label from the frame built in this cell. The original read it from
# sixmen_df, which only worked if the Sixth Man section had been run first.
target = mvp_df['mvps']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1, stratify=target)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features with statistics learned from the training split only.
# The target is a 0/1 class label and is passed to the classifier unscaled:
# StandardScaler rejects the 1-D label Series ("Expected 2D array"), and the
# scaled labels were never used downstream.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train_scaled, y_train)

In [None]:
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

In [None]:
from sklearn import tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

In [None]:
# Seeded so the fitted tree and its test accuracy are the same on every run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the test accuracy and the feature importances plotted below are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Bar chart of the random forest's feature importances, one bar per feature column.
x_values = np.arange(len(feature_names))
plt.figure(figsize=(10, 7))

plt.title("Feature Importance")
plt.xlabel("Feature Name")
plt.ylabel("Importance")

plt.bar(x_values, rf.feature_importances_, facecolor="darkblue",
        alpha=0.75, tick_label=feature_names)
plt.xticks(rotation='vertical')
# savefig() needs a target path; calling it with no argument raises TypeError.
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig("mvp_rf_feature_importance", bbox_inches="tight")
plt.show()

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
predictions = rf.predict(X_test)
predict_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
wrong_predict_df = predict_df.loc[predict_df['Actual'] != predict_df['Prediction']]
wrong_predict_df

In [None]:
act_df_1 = predict_df.loc[predict_df['Actual'] == 1]
act_df_1

In [None]:
mvp_df.iloc[2206]