In [1]:
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams["font.size"] = 16

from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.tree import DecisionTreeClassifier

In [6]:
mp_deal_df = pd.read_csv('Monopoly_Deal_Stats.csv', index_col=0)

In [7]:
mp_deal_df

Unnamed: 0_level_0,A_win,A_wild,A_db,A_refuse,B_wild,B_db,B_refuse
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,2,2,0,0,0,1
2,1,0,1,0,0,0,1
3,0,0,1,1,1,0,2
4,0,0,0,0,1,1,0
5,1,1,2,0,1,0,1
6,0,1,1,0,1,1,0
7,1,0,1,0,1,0,0
8,1,1,1,0,1,0,0
9,0,1,0,0,0,0,1
10,0,0,2,0,1,0,2


In [8]:
train_df, test_df = train_test_split(mp_deal_df, test_size=0.2, random_state=321)

In [9]:
mp_deal_summary = train_df.describe()

In [10]:
mp_deal_summary

Unnamed: 0,A_win,A_wild,A_db,A_refuse,B_wild,B_db,B_refuse
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,0.357143,0.428571,0.928571,0.428571,0.428571,0.357143,0.642857
std,0.497245,0.513553,0.730046,0.646206,0.513553,0.497245,0.744946
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.25,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,0.5
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,2.0,2.0,1.0,1.0,2.0


In [11]:
# These steps are not required
mp_deal_max = pd.Series(mp_deal_summary.loc['max'], name='mp_deal_max')
mp_deal_min = pd.Series(mp_deal_summary.loc['min'], name='mp_deal_min')
range_df = pd.concat([mp_deal_max, mp_deal_min], axis=1)
range_df.insert(2, 'feature_range', range_df['mp_deal_max'] - range_df['mp_deal_min'])
smallest_range_feature = range_df['feature_range'].idxmin()
print(smallest_range_feature)

A_win


### Seperating target value 
A_win is the target value our model will be used to predict.

In [24]:
X_train = train_df.drop(columns=['A_win'])
y_train = train_df['A_win']
X_test = test_df.drop(columns=['A_win'])
y_test = test_df['A_win']

### Using DummyClassifier to get a baseline for the model
Our model should perform better than DummyClassifier to be significant.

In [25]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(random_state=123)
cv_scores = cross_validate(dummy, X_train, y_train)
dummy_score = cv_scores['test_score'].mean()
print(dummy_score)

0.6333333333333333


### Creating a simple model with default params for DecisionTreeClassifier

In [26]:
mp_deal_tree = DecisionTreeClassifier(random_state=123)
mp_deal_tree.fit(X_train, y_train)

In [27]:
pred = mp_deal_tree.predict(X_train.iloc[[0]])[0]
print(pred)
print(y_train.iloc[0])

0
0


### CV scores differ significantly based on the number of folds 
This is because our data size is so small.

In [23]:
mean_cv_score = cross_val_score(mp_deal_tree, X_train, y_train, cv=4).mean()
print(mean_cv_score)

0.7083333333333334


In [31]:
# store the cv scores in a table
scores_df = cross_validate(mp_deal_tree, X_train, y_train, cv=4, return_train_score=True)
scores_df = pd.DataFrame.from_dict(scores_df)
print(scores_df)
scores_df['test_score'].max()

   fit_time  score_time  test_score  train_score
0  0.003401    0.001809    1.000000          1.0
1  0.004645    0.002055    0.500000          1.0
2  0.002803    0.001830    1.000000          1.0
3  0.008123    0.002780    0.333333          1.0


1.0

### Observations

The model is overfitting because the train score is consistently a value of 1.0 while the test_score varies significantly, sometimes as low as 0.33333.

I will experiment with different classifiers below in an effort to improve the accuracy of the model.

In [47]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

models = {
    "decision tree": DecisionTreeClassifier(random_state=123),
    "kNN": KNeighborsClassifier(),
    "RBF SVM": SVC(random_state=123),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=123),
    # "LightGBM": LGBMClassifier(),
}

In [48]:
# Function to help with calculating and displaying cross_val_scores

def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores.iloc[i], std_scores.iloc[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [49]:
from sklearn.model_selection import cross_val_score

res_dic = {}

for key, value in models.items():
    res_dic[key] = mean_std_cross_val_scores(
        value, X_train, y_train, cv=4, return_train_score=True
    )
income_pred_results_df = pd.DataFrame(res_dic).T
income_pred_results_df

Traceback (most recent call last):
  File "/Users/paigeingram/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/paigeingram/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
  File "/Users/paigeingram/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/base.py", line 705, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/Users/paigeingram/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/neighbors/_classification.py", line 246, in predict
    if self._fit_method == "brute" and ArgKminClassMode.is_usable_for(
  File "/Users/paigeingram/miniconda3/envs/cpsc330/lib/python3.10/site-packages/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py", line 471, in is_usable_for
    ArgKmin.is_usable_for(X, 

Unnamed: 0,fit_time,score_time,test_score,train_score
decision tree,0.003 (+/- 0.000),0.002 (+/- 0.000),0.708 (+/- 0.344),1.000 (+/- 0.000)
kNN,0.002 (+/- 0.000),0.001 (+/- 0.001),nan (+/- nan),nan (+/- nan)
RBF SVM,0.002 (+/- 0.000),0.002 (+/- 0.000),0.708 (+/- 0.048),0.952 (+/- 0.055)
Logistic Regression,0.003 (+/- 0.000),0.001 (+/- 0.000),0.771 (+/- 0.158),0.927 (+/- 0.049)
