## Dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

## Data Cleaning

In [2]:
df = pd.read_csv("Data/alldf.csv", index_col=False)

In [3]:
df = df.drop(columns= 'Unnamed: 0')

In [4]:
df = df.dropna(how="any")

In [5]:
df.loc[df["Season"]=='2017-18']

Unnamed: 0,School,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,...,TRB,AST,STL,BLK,PF,TOV,PTS,Link,DraftStatus,Season
976,Massachusetts,32,28,35.2,3.6,8.7,0.414,1.0,2.5,0.4,...,5.0,5.2,0.7,0.2,2.4,2.4,10.6,https://basketball.realgm.com/player/CJ-Anders...,Undrafted,2017-18
980,Massachusetts,32,28,35.2,3.6,8.7,0.414,1.0,2.5,0.4,...,5.0,5.2,0.7,0.2,2.4,2.4,10.6,https://basketball.realgm.com/player/CJ-Anders...,Undrafted,2017-18
981,Arkansas,35,35,31.3,6.4,13.6,0.47,2.5,5.8,0.431,...,3.9,2.5,1.0,0.3,2.5,1.9,17.9,https://basketball.realgm.com/player/Jaylen-Ba...,Undrafted,2017-18
983,Arkansas,35,35,31.3,6.4,13.6,0.47,2.5,5.8,0.431,...,3.9,2.5,1.0,0.3,2.5,1.9,17.9,https://basketball.realgm.com/player/Jaylen-Ba...,Undrafted,2017-18
984,Missouri,32,32,34.1,4.4,9.9,0.449,2.6,6.2,0.414,...,5.9,1.1,0.7,0.6,1.6,1.6,13.7,https://basketball.realgm.com/player/Jordan-Ba...,Undrafted,2017-18
988,Missouri,32,32,34.1,4.4,9.9,0.449,2.6,6.2,0.414,...,5.9,1.1,0.7,0.6,1.6,1.6,13.7,https://basketball.realgm.com/player/Jordan-Ba...,Undrafted,2017-18
989,Virginia Tech,31,30,32.8,5.0,10.3,0.48700000000000004,2.1,5.4,0.39799999999999996,...,2.5,1.5,0.6,0.2,1.5,1.2,13.3,https://basketball.realgm.com/player/Justin-Bi...,Undrafted,2017-18
993,Virginia Tech,31,30,32.8,5.0,10.3,0.48700000000000004,2.1,5.4,0.39799999999999996,...,2.5,1.5,0.6,0.2,1.5,1.2,13.3,https://basketball.realgm.com/player/Justin-Bi...,Undrafted,2017-18
994,Eastern Michigan,35,27,31.0,3.5,7.7,0.46299999999999997,0.9,2.5,0.349,...,4.9,3.5,2.4,1.1,2.1,1.8,9.2,https://basketball.realgm.com/player/Tim-Bond/...,Undrafted,2017-18
998,Eastern Michigan,35,27,31.0,3.5,7.7,0.46299999999999997,0.9,2.5,0.349,...,4.9,3.5,2.4,1.1,2.1,1.8,9.2,https://basketball.realgm.com/player/Tim-Bond/...,Undrafted,2017-18


In [6]:
df["DraftStatus"].value_counts()

Undrafted    2517
Drafted       641
Name: DraftStatus, dtype: int64

In [7]:
df_copy = df 

## Observation
### This dataset is still biased towards "Undrafted"; however there are much more records than the previous dataset that would help to train the model better

## Label Encoding and Transform

In [8]:
labelencoder  = LabelEncoder()
df = df.apply(labelencoder.fit_transform)

In [9]:
X = df[["GP", "GS", "MIN", "FGM", "FGA", "FG%", "3PM", "3PA", "3P%", "FTM", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "PF", "TOV", "PTS"]]
y = df["DraftStatus"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression

In [11]:
model = LogisticRegression()
classifier = model.fit(X_train, y_train)

In [12]:
print("Training Score : ", classifier.score(X_train,y_train))
print("Testing Score : ", classifier.score(X_test, y_test))

Training Score :  0.8598574821852731
Testing Score :  0.8591772151898734


In [13]:
logistic_predictions = classifier.predict(X_test)

In [14]:
logistic_predictions_df = pd.DataFrame({"Predictions": logistic_predictions, "Actuals": y_test})

In [15]:
logistic_predictions_df.head()

Unnamed: 0,Predictions,Actuals
3155,1,1
2783,1,1
3063,1,1
2437,1,1
158,0,0


## Random Forest

In [16]:
rf = RandomForestClassifier(n_estimators=200)
rf_classifier = rf.fit(X_train, y_train)

In [17]:
rf_perdictions = rf_classifier.predict(X_test)
print("Training Score : ", rf_classifier.score(X_train, y_train))
print("Testing Score : ", rf_classifier.score(X_test, y_test))

Training Score :  1.0
Testing Score :  0.9382911392405063


In [18]:
rf_perdictions_df = pd.DataFrame({"Predictions": rf_perdictions, "Actuals": y_test})

In [19]:
rf_perdictions_df.head()

Unnamed: 0,Predictions,Actuals
3155,1,1
2783,1,1
3063,1,1
2437,1,1
158,0,0


In [20]:
rf_perdictions_df.to_csv("Data/Output/ML.csv")

## Conclusion
#### Random Forest Model score and predictions are the best.
#### Applying that model to the 2018 data that we need to predict if players are gonna be Drafted or not
#### We will go ahead and split the data for Season 2017-2018 to predict the results for it and train the data based on the rest of seasons data

In [21]:
def draft_output(row):
    if row["DraftStatus"]  == "Drafted":
        val = 1
    else:
        val = 0
    return val;

In [22]:
df_copy = df_copy.loc[df_copy["GP"] != "-"]

In [23]:
Last_Season_df = df_copy.loc[df_copy["Season"]=="2017-18"]
Last_Season_df["Draft"] = Last_Season_df.apply(draft_output, axis=1)
Last_Season_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,School,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,...,AST,STL,BLK,PF,TOV,PTS,Link,DraftStatus,Season,Draft
976,Massachusetts,32,28,35.2,3.6,8.7,0.414,1.0,2.5,0.4,...,5.2,0.7,0.2,2.4,2.4,10.6,https://basketball.realgm.com/player/CJ-Anders...,Undrafted,2017-18,0
980,Massachusetts,32,28,35.2,3.6,8.7,0.414,1.0,2.5,0.4,...,5.2,0.7,0.2,2.4,2.4,10.6,https://basketball.realgm.com/player/CJ-Anders...,Undrafted,2017-18,0
981,Arkansas,35,35,31.3,6.4,13.6,0.47,2.5,5.8,0.431,...,2.5,1.0,0.3,2.5,1.9,17.9,https://basketball.realgm.com/player/Jaylen-Ba...,Undrafted,2017-18,0


In [24]:
season_df = df_copy.loc[df_copy["Season"] != "2017-18"]
season_df["Draft"] = season_df.apply(draft_output, axis=1)
season_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,School,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,...,AST,STL,BLK,PF,TOV,PTS,Link,DraftStatus,Season,Draft
0,Washington,25.0,25.0,35.7,8.4,17.6,0.476,2.1,5.0,0.413,...,5.9,1.6,1.2,2.5,3.2,23.2,https://www.sports-reference.com/cbb/players/m...,Drafted,2016-17,1
1,UCLA,36.0,36.0,35.1,5.3,9.5,0.551,2.2,5.4,0.412,...,7.6,1.8,0.8,1.8,2.5,14.6,https://www.sports-reference.com/cbb/players/l...,Drafted,2016-17,1
2,Duke,29.0,27.0,33.3,5.7,12.6,0.452,1.4,4.0,0.342,...,2.1,1.3,1.1,3.0,2.6,16.8,https://www.sports-reference.com/cbb/players/j...,Drafted,2016-17,1


In [25]:

X_season = season_df[["GP", "GS", "MIN", "FGM", "FGA", "FG%", "3PM", "3PA", "3P%", "FTM", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "PF", "TOV", "PTS"]]
y_season = season_df["Draft"]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_season, y_season, test_size = 0.2, random_state=42)

In [27]:
model = RandomForestClassifier(n_estimators=200)
Season_rf = model.fit(X_train, y_train)
print("Training Score : ", Season_rf.score(X_train, y_train))
print("Testing Score : ", Season_rf.score(X_test, y_test))

Training Score :  1.0
Testing Score :  0.7733990147783252


##### Still Very Good Model Score

In [34]:
X_latest_season = Last_Season_df[["GP", "GS", "MIN", "FGM", "FGA", "FG%", "3PM", "3PA", "3P%", "FTM", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "PF", "TOV", "PTS"]]
y_latest_season = Last_Season_df["Draft"]

In [35]:
predictions = Season_rf.predict(X_latest_season)

In [36]:
predictions_df = pd.DataFrame({"Predictions": predictions, "Actuals": y_latest_season})

In [37]:
predictions_df

Unnamed: 0,Predictions,Actuals
976,0,0
980,0,0
981,1,0
983,1,0
984,0,0
988,0,0
989,0,0
993,0,0
994,0,0
998,0,0
