In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import scipy.stats as stats
from sklearn import metrics

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [4]:
import warnings
warnings.filterwarnings('ignore')

#### The target feature (number of tournmanet wins) is highly imbalanced, as is expected. For every team that wins the championship (6 wins), there are 32 teams that lose in the first round (0 wins).

#### The data shows that the number of wins of a class is approximately half of the number of wins of the previous class. 

####    -If the data was 100% complete. The relative amount of entries in each class would be EXACTLY half of its preceeding.
####    - It is noted that the exception to the above statement is between the 5 wins and 6 wins classes. This is expected since there is no 7 wins class. While two teams earn 5 wins. One of these teams will also earn a 6th win. So with 100% complete data. The 5 wins class and 6 wins class would have EQUAL number of entries. 

##### Oversample the data to compensate for the imbalanced data. Use SMOTE and class_weights

In [5]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [6]:
from sklearn.utils import class_weight

In [7]:
final_df = pd.read_csv('final_df.csv')

In [8]:
final_df.drop(['Unnamed: 0', 'TeamName','TeamID'], axis=1, inplace=True)

In [9]:
final_df

Unnamed: 0,Season,Rank_WLK,Rank_DOL,Rank_COL,Rank_SAG,Rank_MOR,Rank_POM,Seed,Wins,Losses,...,Stl_pg,Stl_Ag_pg,Blk,Blk_Ag,Blk_pg,Blk_Ag_pg,PF,PF_Ag,PF_pg,PF_Ag_pg
0,2003,8,6,3,8,6,9,1,24,6,...,6.933,5.167,113.0,80.0,3.767,2.667,558.0,526.0,18.600,17.533
1,2003,9,4,8,9,20,12,2,24,5,...,6.414,7.379,127.0,94.0,4.379,3.241,536.0,639.0,18.483,22.034
2,2003,14,5,9,12,21,15,3,24,5,...,8.310,6.621,211.0,91.0,7.276,3.138,481.0,572.0,16.586,19.724
3,2003,6,16,6,6,14,14,4,24,6,...,7.200,5.767,142.0,73.0,4.733,2.433,680.0,678.0,22.667,22.600
4,2003,17,19,25,16,8,11,5,21,9,...,8.733,7.733,112.0,90.0,3.733,3.000,477.0,515.0,15.900,17.167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,2021,83,49,52,87,50,91,12,23,1,...,7.750,6.000,57.0,80.0,2.375,3.333,84.0,98.0,3.500,4.083
1193,2021,78,88,85,84,94,71,13,16,9,...,6.840,6.480,75.0,74.0,3.000,2.960,138.0,126.0,5.520,5.040
1194,2021,10,16,14,67,41,84,14,14,1,...,7.200,4.867,51.0,42.0,3.400,2.800,68.0,85.0,4.533,5.667
1195,2021,151,150,181,152,130,151,15,13,10,...,6.217,5.565,94.0,71.0,4.087,3.087,184.0,177.0,8.000,7.696


##### Normalize and Scale the data

In [10]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

##### Calculate the weight of each class and convert to a dictionary.

In [11]:
cw = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(final_df['T_Wins']),y=final_df['T_Wins'])


In [12]:
categories=list(final_df['T_Wins'].sort_values().unique())
cw_list=list(cw)
weights=dict(zip(categories, cw_list))


In [13]:
#DEFINE Train set and Test set
X = final_df
X = scaler.fit_transform(X)
X = pd.DataFrame(X)
y = final_df['T_Wins']
#X,y = smote.fit_resample(X,y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35,random_state=101)
X_train, y_train = smote.fit_resample(X_train,y_train)

##### Define Classificaiton models to consider.

In [14]:
models = [RandomForestClassifier(n_estimators=100, class_weight=weights), KNeighborsClassifier(n_neighbors=17), svm.SVC(class_weight=weights), LogisticRegression(class_weight=weights)]


#### Run different metrics on each model to determine the most suitable. 
#### Logisitic Regression performs the best but raises concerns of overfitted data.
#### Random Forest model yields the strongest results with reaosnable accuracy.

In [15]:
model_selection_rank = pd.DataFrame()
for model in models:
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X_train, y_train)
    model_selection_rank[model.__str__()] = scores
    model_selection_rank   
    y_predict=model.predict(X_test)
    ac=accuracy_score(y_predict,y_test)
    cm=confusion_matrix(y_test, y_predict)
    cr=classification_report(y_test, y_predict)
    print(model)
    print(f'Train Score: {model.score(X_train, y_train)}')
    print(f'Test Score: {model.score(X_test, y_test)}')
    print(f'Accuracy: {ac}')
    print(f'Confusion Matrix: \n {cm}')
    print(f'Classification Report: \n {cr}')

RandomForestClassifier(class_weight={0.0: 0.2923076923076923,
                                     1.0: 0.5294117647058824, 2.0: 1.1875,
                                     3.0: 2.28, 4.0: 5.181818181818182,
                                     5.0: 9.0, 6.0: 9.5})
Train Score: 1.0
Test Score: 0.7565632458233891
Accuracy: 0.7565632458233891
Confusion Matrix: 
 [[198  17   2   0   0   0   0]
 [ 25  66   8   5   0   0   0]
 [  3  18  29   1   0   0   0]
 [  2   2   5  11   0   0   0]
 [  0   1   0   6   5   0   0]
 [  0   0   1   2   2   3   0]
 [  0   0   1   0   0   1   5]]
Classification Report: 
               precision    recall  f1-score   support

         0.0       0.87      0.91      0.89       217
         1.0       0.63      0.63      0.63       104
         2.0       0.63      0.57      0.60        51
         3.0       0.44      0.55      0.49        20
         4.0       0.71      0.42      0.53        12
         5.0       0.75      0.38      0.50         8
         6.0  

In [16]:
#CHOSEN MODEL
model = RandomForestClassifier(n_estimators=100, class_weight=weights)

In [17]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.711217183770883

In [18]:
result = pd.DataFrame({'actual':y_test, 'predicted':y_pred})

In [19]:
result['actual points earned'] = round(2**(result['actual']-1)-.01)
result['predicted points earned'] = round(2**(result['predicted']-1)-.01)

In [20]:
result['difference in score'] = abs(result['actual points earned'] - result['predicted points earned'])

In [21]:
nums = [0,1,2,3,4,5,6]
dictionary_result={}
pred_list=[]
act_list=[]
pred_correct_list=[]
for num in nums:
   # result_pred = result[(result['difference in score'] == 0) & (result['actual'] == num)]['actual'].count()
    result_pred = result[result['predicted'] == num]['predicted'].count()
    result_act = result[result['actual'] == num]['actual'].count()
    result_pred_correct = result[(result['difference in score'] == 0) & (result['actual'] == num)]['actual'].count()
    pred_list.append(result_pred)
    act_list.append(result_act)
    pred_correct_list.append(result_pred_correct)
    #dict.update({f'pred{num}' : result_pred, f'act{num}' : result_act})
dictionary_result={'predicted': pred_list, 'actual': act_list, 'predicted correct': pred_correct_list}
pd.DataFrame.from_dict(dictionary_result)

Unnamed: 0,predicted,actual,predicted correct
0,228,217,193
1,110,104,62
2,42,51,23
3,25,20,10
4,6,12,4
5,3,8,1
6,5,7,5


In [31]:
result

Unnamed: 0,actual,predicted,actual points earned,predicted points earned,difference in score
980,2.0,2.0,2.0,2.0,0.0
99,2.0,2.0,2.0,2.0,0.0
740,4.0,2.0,8.0,2.0,6.0
1138,0.0,0.0,0.0,0.0,0.0
595,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...
1181,6.0,6.0,32.0,32.0,0.0
533,0.0,0.0,0.0,0.0,0.0
416,0.0,1.0,0.0,1.0,1.0
186,1.0,1.0,1.0,1.0,0.0


In [23]:
model_correct=[]
model_incorrect=[]
for num in list(result['actual'].sort_values().unique()):
    model_correct.append(result[(result['actual']>=num) & (result['predicted']>=num)]['actual'].count())
    model_incorrect.append(result[(result['actual']>=num) & (result['predicted']<num)]['actual'].count())

model_ac = pd.DataFrame()
model_ac['Round'] = [0,1,2,3,4,5,6]
model_ac['Model Correct'] = model_correct
model_ac['Model Incorrect'] = model_incorrect
model_ac.drop(0, inplace=True)
model_ac['Model Accuracy'] = round(model_ac['Model Correct']/(model_ac['Model Correct']+model_ac['Model Incorrect']),3)
model_ac


Unnamed: 0,Round,Model Correct,Model Incorrect,Model Accuracy
1,1,167,35,0.827
2,2,64,34,0.653
3,3,32,15,0.681
4,4,14,13,0.519
5,5,8,7,0.533
6,6,5,2,0.714


In [32]:
best_seed_correct = [23.1, 10.4, 4.2, 1.3, 0.6, 0.2]
best_seed_incorrect = [32-23.1, 16-10.4, 8-4.2, 4-1.3, 2-0.6, 1-0.2]
model_ac['Best-Seed Correct']=best_seed_correct
model_ac['Best-Seed Incorrect']=best_seed_incorrect
model_ac['Best-Seed Accuracy'] = round(model_ac['Best-Seed Correct']/(model_ac['Best-Seed Correct']+model_ac['Best-Seed Incorrect']),3)
model_ac


Unnamed: 0,Round,Model Correct,Model Incorrect,Model Accuracy,Best-Seed Correct,Best-Seed Incorrect,Best-Seed Accuracy
1,1,167,35,0.827,23.1,8.9,0.722
2,2,64,34,0.653,10.4,5.6,0.65
3,3,32,15,0.681,4.2,3.8,0.525
4,4,14,13,0.519,1.3,2.7,0.325
5,5,8,7,0.533,0.6,1.4,0.3
6,6,5,2,0.714,0.2,0.8,0.2


In [33]:
model_ac['Model Points'] = round(model_ac['Model Accuracy']*32,1)
model_ac['Best-Seed Points'] = round(model_ac['Best-Seed Accuracy']*32,1)

In [34]:
model_ac

Unnamed: 0,Round,Model Correct,Model Incorrect,Model Accuracy,Best-Seed Correct,Best-Seed Incorrect,Best-Seed Accuracy,Model Points,Best-Seed Points
1,1,167,35,0.827,23.1,8.9,0.722,26.5,23.1
2,2,64,34,0.653,10.4,5.6,0.65,20.9,20.8
3,3,32,15,0.681,4.2,3.8,0.525,21.8,16.8
4,4,14,13,0.519,1.3,2.7,0.325,16.6,10.4
5,5,8,7,0.533,0.6,1.4,0.3,17.1,9.6
6,6,5,2,0.714,0.2,0.8,0.2,22.8,6.4


In [35]:
model_ac['Model Points'].sum()

125.7

In [36]:
model_ac['Best-Seed Points'].sum()

87.10000000000001