In [3]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [4]:
# Load the pre-cleaned stats table that every later cell works from.
df = pd.read_csv(filepath_or_buffer='Cleaned_Stats.csv')

In [5]:
# Features: every column except the target ('pick') and the 'Drafted' / 'pid' columns.
X = df.drop(columns=['pick', 'Drafted', 'pid'])

# Target: the 'pick' column.
y = df['pick']

In [6]:
# (rows, feature columns) of the feature matrix.
X.shape

(25745, 54)

In [7]:
# Length of the target vector; should match the row count of X.
y.shape

(25745,)

In [8]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# values and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Candidate regressors. Each one is a two-step pipeline: a StandardScaler
# followed by the model. Step names are looked up later via `named_steps`,
# so they are kept exactly as they are.

Lin_Reg_Pipeline = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('LinearRegression', LinearRegression()),
])

Dec_Tree_Pipeline = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('decision_tree', DecisionTreeRegressor(max_leaf_nodes=100, random_state=42)),
])

RF_Pipeline = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('random_forest', RandomForestRegressor(n_estimators=100, random_state=42)),
])

Xgboost_Pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgboost', XGBRegressor(learning_rate=0.01, n_estimators=400, max_depth=4)),
])

svr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svr', SVR(kernel='rbf', C=1.0, epsilon=0.2)),
])

In [12]:
def fit_and_print(pipeline, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit ``pipeline`` on the training split and print its test-set MSE and R^2.

    Parameters
    ----------
    pipeline
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so it remains fitted after the call.
    X_train, y_train, X_test, y_test : optional
        Data splits. Any argument left as ``None`` falls back to the
        notebook-level variable of the same name, looked up when the function
        is *called*. Defaults written as ``X_train=X_train`` would be frozen
        when this cell runs, so re-running the split cell afterwards would
        leave the function scoring models on the old split.

    Returns
    -------
    None
        The two metrics are printed, not returned.

    Raises
    ------
    KeyError
        If a split is omitted and no notebook-level variable of that name exists.
    """
    # Resolve omitted splits from the notebook namespace at call time.
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars['X_train']
    if y_train is None:
        y_train = notebook_vars['y_train']
    if X_test is None:
        X_test = notebook_vars['X_test']
    if y_test is None:
        y_test = notebook_vars['y_test']

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
    print(f'R^2: {r2_score(y_test, y_pred)}')

In [13]:
# Baseline: linear regression.
fit_and_print(pipeline=Lin_Reg_Pipeline)

Mean Squared Error: 24.05164675878812
R^2: 0.22619083471038703


In [14]:
# Single decision tree.
fit_and_print(pipeline=Dec_Tree_Pipeline)


Mean Squared Error: 24.53218956202976
R^2: 0.21073042032830414


In [15]:
# Random forest.
fit_and_print(pipeline=RF_Pipeline)

Mean Squared Error: 13.9671781510973
R^2: 0.5506365707536071


In [16]:
# Gradient-boosted trees (XGBoost).
fit_and_print(pipeline=Xgboost_Pipeline)

Mean Squared Error: 13.5994030372708
R^2: 0.5624689312027057


In [17]:
# Support vector regression with an RBF kernel.
fit_and_print(pipeline=svr_pipeline)

Mean Squared Error: 29.1914031541854
R^2: 0.06083040654506344


In [44]:
## XGBoost scored best above (lowest MSE, highest R^2), so it is the model carried forward for hyperparameter tuning and feature selection.

In [98]:
# Fit the XGBoost pipeline on the training split so the fitted model can be inspected.
Xgboost_Pipeline.fit(
    X_train,
    y_train,
)

In [99]:
# Pull the fitted regressor out of the pipeline by its step name ...
XG = Xgboost_Pipeline.named_steps["xgboost"]

# ... and read its per-feature importance scores.
importance = XG.feature_importances_

In [100]:
# Raw importance scores, in the same order as the columns of X.
importance

array([0.00952195, 0.00411121, 0.0088977 , 0.00523848, 0.009619  ,
       0.00769347, 0.01099353, 0.01422757, 0.01195717, 0.01960264,
       0.01020367, 0.0046319 , 0.01355782, 0.01250881, 0.01567176,
       0.01618967, 0.00652964, 0.0151896 , 0.01733231, 0.01849015,
       0.01568711, 0.02119331, 0.07195165, 0.01846991, 0.0270884 ,
       0.01097432, 0.14628421, 0.01521099, 0.01865307, 0.04415694,
       0.02625873, 0.02269402, 0.01511508, 0.01958564, 0.02322108,
       0.05665876, 0.00801191, 0.02894732, 0.01425205, 0.01176086,
       0.01125188, 0.01477728, 0.01555101, 0.01194226, 0.01874851,
       0.02747335, 0.        , 0.0116513 , 0.00975585, 0.        ,
       0.00794222, 0.00821151, 0.        , 0.01435144], dtype=float32)

In [57]:
# Number of importance scores; expected to equal the number of feature columns in X.
# NOTE(review): the recorded output for this cell shows (56,), but X has 54 columns
# and the array printed above has 54 entries. The output looks stale (the cell
# counter is out of sequence) -- re-run after Restart & Run All to confirm.
importance.shape

(56,)

In [101]:
# Column labels of the feature matrix, in column order.
feature_names = list(X.columns)

In [102]:
# Pair each feature name with its importance score (both are in column order).
f_importance = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': importance,
    }
)

In [115]:
# Rank features from most to least important and renumber the rows 0..n-1.
(
    f_importance
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,Feature,Importance
0,Rec Rank,0.146284
1,porpag,0.071952
2,gbpm,0.056659
3,adrtg,0.044157
4,ogbpm,0.028947
5,pts,0.027473
6,pfr,0.027088
7,dporpag,0.026259
8,dbpm,0.023221
9,stops,0.022694
