In [1]:
# Importing necessary modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix, accuracy_score, precision_score, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

In [2]:
data = pd.read_csv('data/cleaned_data.csv')

In [3]:
data.drop(columns=['Unnamed: 0'], inplace=True)

In [4]:
#importing necessary modules
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [5]:
#training and testing data

sel_cols = ['Rec Rank', 'dbpm', 'obpm', 'bpm', 'stops', 'dporpag', 'porpag', 'ftr', 
'stl_per', 'FTA', 'ORB_per', 'yr', 'AFFILIATION', 'conf', 'Class']

df = data.loc[:, sel_cols].copy()

X = df.drop(columns='Class', axis=1)
y = df.Class

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
df.head()

Unnamed: 0,Rec Rank,dbpm,obpm,bpm,stops,dporpag,porpag,ftr,stl_per,FTA,ORB_per,yr,AFFILIATION,conf,Class
0,100.0,3.08975,3.1998,6.28955,207.722,4.47388,4.33352,38.9,2.5,156,2.3,Fr,Oklahoma State,B12,1
1,99.6,7.46638,5.53482,13.0012,256.359,5.00113,5.41065,56.6,1.4,193,9.7,Fr,Southern California,P12,1
2,99.0,3.32516,4.94571,8.27087,119.77,3.0024,2.97007,33.8,3.4,66,7.4,Fr,Florida State,ACC,1
3,98.2,4.47315,4.85317,9.32632,209.261,4.13889,3.79887,37.1,3.5,114,2.7,Fr,Gonzaga,WCC,1
4,90.2,3.22572,2.63123,5.85695,171.133,4.1209,2.56112,21.6,2.4,54,3.4,Fr,Michigan,B10,1


In [7]:
#pipelines and column transformer
from sklearn.impute import SimpleImputer

numeric_pipeline = Pipeline([('numimputer', SimpleImputer(strategy = 'mean')), ('numnorm', StandardScaler())])
nominal_pipeline = Pipeline([
    ('onehotimputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehotenc', OneHotEncoder(sparse = False, handle_unknown='ignore')), 
    ('onehotscale', StandardScaler())])

num_cols = X_train.select_dtypes(['int', 'float']).columns
ct = ColumnTransformer([('one_hot', nominal_pipeline, ['yr', 'AFFILIATION', 'conf']),
('num_transform', numeric_pipeline, num_cols)])


In [8]:
from sklearn.ensemble import GradientBoostingClassifier

pipe_gbc = Pipeline([('preprocess', ct), ('model', GradientBoostingClassifier(learning_rate=1, max_depth=9, n_estimators=300, subsample=1))])


In [9]:
pipe_gbc.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('one_hot',
                                                  Pipeline(steps=[('onehotimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotenc',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False)),
                                                                  ('onehotscale',
                                                                   StandardScaler())]),
                                                  ['yr', 'AFFILIATION',
                                                   'conf']),
                                                 ('num_transform',
                                                  Pipeline(ste

In [22]:
with open('model.pkl', 'wb') as f:
    joblib.dump(pipe_gbc, f)

In [21]:
import joblib
import numpy as np

In [23]:
def round_prediction(name, rec_rank, dbpm, obpm, bpm, stops, dporgag, porpag, ftr, stl_per, FTA, ORB_per, yr, team, conference):
    """
    Given name, rec_rank, dbpm, obpm, bpm, stops, dporgag, porpag, ftr, stl_per, FTA, ORB_per, yr, team, conference,
    predict the class of draft rounds
    """
    # Load the model from the file
    with open("model.pkl", "rb") as f:
        model = joblib.load(f)
    # Construct the 2D matrix of values that .predict is expecting
    # Get a list of predictions and select only 1st
    X = {'Rec Rank': [rec_rank], 'dbpm': [dbpm], 'obpm': [obpm], 'bpm': [bpm], 'stops': [stops], 'dporpag': [dporgag], 
    'porpag': [porpag], 'ftr': [ftr], 'stl_per': [stl_per], 'FTA': [FTA], 'ORB_per': [ORB_per], 'yr': [yr], 'AFFILIATION': [team], 'conf': [conference]}
    X_data = pd.DataFrame(data=X)

    predictions = model.predict(X_data)
    prediction = predictions[0]

    return {'Player': name, "predicted_class": prediction}

In [14]:
lst = list(X_test.columns)
lst.append('player_name')
lst.append('ROUND')

In [None]:
test = data_names.loc[:, lst]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(test, test.ROUND)

In [11]:
X_test.reset_index()

Unnamed: 0,index,Rec Rank,dbpm,obpm,bpm,stops,dporpag,porpag,ftr,stl_per,FTA,ORB_per,yr,AFFILIATION,conf
0,1317,93.4,3.272530,2.61015,5.88268,85.3578,2.02197,0.956114,45.3,2.6,39,4.7,So,Kentucky,SEC
1,1455,85.8,-2.123130,-7.40360,-9.52673,77.9117,1.03477,-0.668010,34.2,0.9,41,7.2,Sr,Wake Forest,BSth
2,439,94.4,3.034240,1.66781,4.70205,196.8530,3.74317,2.631660,28.6,1.4,89,5.2,So,Ohio State,B10
3,560,79.0,2.707350,3.86929,6.57664,131.8080,3.58036,4.063560,64.5,2.3,136,1.8,So,Xavier,BE
4,433,98.4,1.473910,6.78920,8.26311,215.9620,3.89272,5.080490,22.4,1.4,131,4.9,Jr,North Carolina,ACC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,380,95.0,0.208814,7.31952,7.52834,220.9460,3.81818,6.788200,49.1,2.1,252,2.1,So,Duke,ACC
368,1234,95.8,1.394550,6.53070,7.92525,221.4530,4.32707,6.147460,40.5,2.0,230,4.1,Sr,Duke,ACC
369,1157,,-1.650850,5.81511,4.16425,152.1720,1.76239,3.834660,29.7,2.5,95,1.4,Sr,Virginia,BSky
370,916,94.0,0.684933,2.71120,3.39614,186.7600,3.58397,3.977780,56.0,2.3,235,1.5,So,Xavier,BE


In [15]:
lst

['Rec Rank',
 'dbpm',
 'obpm',
 'bpm',
 'stops',
 'dporpag',
 'porpag',
 'ftr',
 'stl_per',
 'FTA',
 'ORB_per',
 'yr',
 'AFFILIATION',
 'conf',
 'player_name',
 'ROUND']

In [17]:
df_2022 = pd.read_csv('data/CollegeBasketballPlayers2022.csv')
df_2022

Unnamed: 0,player_name,team,conf,GP,Min_per,Ortg,usg,eFG,TS_per,ORB_per,...,dgbpm,oreb,dreb,treb,ast,stl,blk,pts,Unnamed: 64,Unnamed: 65
0,Isaiah Felder,South Carolina St.,MEAC,11,17.6,61.1,18.6,34.7,35.18,2.5,...,-4.412530,0.2727,1.4545,1.7273,0.4545,0.1818,0.0000,2.3636,Wing G,12.226400
1,Jalen Coleman-Lands,Kansas,B12,23,78.5,103.1,21.5,54.0,56.12,3.6,...,-1.019000,1.0435,2.8696,3.9130,1.1739,0.8261,0.0870,14.3043,Wing G,11.095900
2,K.J. Walton,Akron,MAC,20,63.0,108.6,26.5,54.3,58.88,9.1,...,-0.335789,2.3500,3.5500,5.9000,1.1500,1.2500,0.2000,16.3500,Wing G,0.284581
3,Jeriah Horne,Tulsa,Amer,32,61.3,116.0,20.8,55.2,58.61,5.8,...,1.721580,1.1250,4.7188,5.8438,0.9062,0.5938,0.0625,10.8125,Stretch 4,10.140700
4,Eric Curry,Minnesota,B10,29,39.2,95.1,14.7,46.3,48.72,7.0,...,2.746770,1.1034,1.9655,3.0690,0.8276,0.8276,0.2414,3.6552,Wing F,2.562430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3973,Aquan Smart,FIU,CUSA,21,11.7,69.4,17.7,32.1,36.08,1.8,...,,,,,,,,,,
3974,Chase Barrs,Florida A&M,SWAC,14,11.0,85.1,8.8,42.9,40.61,5.4,...,,,,,,,,,,
3975,Hugo Clarkin,Sacramento St.,BSky,16,22.8,99.7,7.8,70.6,68.67,2.9,...,,,,,,,,,,
3976,James Graham III,Maryland,B10,7,1.8,9.4,17.1,0.0,0.00,0.0,...,,,,,,,,,,


In [None]:
players_2022 = {'Player': [], 'predicted_class': []}

for i in range(3978):
    rec_rank = df_2022.iloc[i]['Rec Rank']
    dbpm = df_2022.iloc[i]['dbpm']
    obpm = df_2022.iloc[i]['obpm']
    bpm = df_2022.iloc[i]['bpm']
    stops = df_2022.iloc[i]['stops']
    dporpag = df_2022.iloc[i]['dporpag']
    porpag = df_2022.iloc[i]['porpag']
    ftr = df_2022.iloc[i]['ftr']
    stl_per = df_2022.iloc[i]['stl_per']
    fta = df_2022.iloc[i]['FTA']
    orb_per = df_2022.iloc[i]['ORB_per']
    yr = df_2022.iloc[i]['yr']
    team = df_2022.iloc[i]['team']
    conference = df_2022.iloc[i]['conf']
    name = df_2022.iloc[i]['player_name']

    dct = round_prediction(name, rec_rank, dbpm, obpm, bpm, stops, dporpag, porpag, ftr, stl_per, fta, orb_per, yr, team, conference)
    players_2022['Player'].append(dct['Player'])
    players_2022['predicted_class'].append(dct['predicted_class'])

In [67]:
df_predict = pd.DataFrame(players_2022)

In [106]:
df_predict[df_predict.Player == 'Jalen Duren']

Unnamed: 0,Player,predicted_class


In [103]:
df_predict.player['Johnny Davis']

AttributeError: 'DataFrame' object has no attribute 'player'

In [None]:
Ochai Agbaji
Bennedict Mathurin
