In [1]:
# import the libraries
import pandas as pd
import numpy as np

In [2]:
# import the dataset
df = pd.read_csv('data/stats.csv')
df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,MVP,DPOY
0,Kareem Abdul-Jabbar,C,32,LAL,82,,38.3,10.2,16.9,0.604,...,9.5,5.3,14.8,0.227,4.8,2.4,7.2,7.3,1,0
1,Tom Abernethy,PF,25,GSW,67,,18.2,2.3,4.7,0.481,...,1.2,0.8,2.0,0.08,-1.0,-0.2,-1.2,0.2,0,0
2,Alvan Adams,C,25,PHO,75,,28.9,6.2,11.7,0.531,...,3.1,3.9,7.0,0.155,1.7,1.9,3.6,3.1,0,0
3,Tiny Archibald,PG,31,BOS,80,80.0,35.8,4.8,9.9,0.482,...,5.9,2.9,8.9,0.148,1.4,-0.3,1.1,2.3,0,0
4,Dennis Awtrey,C,31,CHI,26,,21.5,1.0,2.3,0.45,...,0.1,0.5,0.6,0.053,-2.3,0.9,-1.4,0.1,0,0


In [3]:
# split data to training df and df we want to predict (current season)
df_train = df[df['Year'] != 2022]
df_predict = df[df['Year'] == 2022]

In [4]:
# create X and y 
X = df_train.drop(['Player','Pos','Tm','MVP','DPOY'], axis=1)
y = df_train['MVP']

In [5]:
# now we scale the data.. i decided to use the MinMaxScaler on each year as we wat to see how the mvp compares to the other players
from sklearn.preprocessing import MinMaxScaler

years = [*range(1980,2022)]

# create empty df
X_scaled = pd.DataFrame(columns=X.columns)
X_scaled.drop(columns='Year', inplace=True)

# iterate over years and apply the MinMaxScaler
for year in years:
    data = X[X['Year'] == year].copy()
    data.drop(columns='Year', inplace=True)
    columns = data.columns

    scaler = MinMaxScaler()
    data = scaler.fit_transform(data)
    data_scaled = pd.DataFrame(data, columns=columns)
    X_scaled = pd.concat([X_scaled,data_scaled])

In [6]:
# there are some missing data, which I do not want to omit of course... using avg of the whole df does not make sense,
# so I decided to use KNNImputer to impute averages of nearest neighbors found in the df
from sklearn.impute import KNNImputer

imputer = KNNImputer()
X_scaled = imputer.fit_transform(X_scaled)

In [7]:
# now we split the df in to training and test dfs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y.values, test_size=0.2, random_state=42)

In [8]:
# now we create classifier object, I decided to use RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# fitting the data, training the model
clf = RandomForestClassifier(criterion='entropy')
clf.fit(X_train, y_train)

In [9]:
# making a prediction on the training df
y_pred = clf.predict(X_test)

In [10]:
# let's see the accuracy score
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9986146386515816

In [11]:
# accuracy score was very high, not surprising given the nature of the data
# let's predict on the current years data (it's just the start of the season, but we should see if reasonable of completely wrong)
X_predict = df_predict.drop(['Player','Pos','Tm','MVP','Year','DPOY'], axis=1)

# scaling the data
scaler = MinMaxScaler()
X_predict = scaler.fit_transform(X_predict)

In [12]:
# imputing missing data
X_predict = imputer.fit_transform(X_predict)

In [13]:
# predicting
y_pred_2022 = clf.predict_proba(X_predict)

In [14]:
# little cleaning just to see a nicer table
df_proba = pd.DataFrame(y_pred_2022, columns=['proba_0','Probability'])
df_proba['Player'] = df_predict['Player'].reset_index(drop=True)
df_proba = df_proba[['Player', 'Probability']].copy()

In [15]:
# and let's see
df_proba.sort_values(by='Probability', ascending=False).head(10).reset_index(drop=True)

Unnamed: 0,Player,Probability
0,Nikola Jokić,0.63
1,Giannis Antetokounmpo,0.19
2,Joel Embiid,0.09
3,Jayson Tatum,0.07
4,Luka Dončić,0.06
5,Trae Young,0.05
6,LeBron James,0.02
7,DeMar DeRozan,0.01
8,James Harden,0.01
9,James Harden,0.01


It's predicting players which definitely have a strong case to be the MVP. Model seems to be working nicely!