In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, precision_score

In [2]:
data = pd.read_csv('../data/catanstats.csv')
data

Unnamed: 0,gameNum,player,points,me,2,3,4,5,6,7,...,settlement2_resource3,production,tradeGain,robberCardsGain,totalGain,tradeLoss,robberCardsLoss,tribute,totalLoss,totalAvailable
0,1,1,5,,1,3,5,8,7,10,...,O,38,5,2,45,10,2,4,16,29
1,1,2,9,1.0,1,3,5,8,7,10,...,O,48,8,6,62,11,1,8,20,42
2,1,3,10,,1,3,5,8,7,10,...,C,44,14,9,67,24,4,0,28,39
3,1,4,5,,1,3,5,8,7,10,...,S,42,12,0,54,24,6,0,30,24
4,2,1,10,,1,6,3,9,10,8,...,2L,60,15,16,91,28,10,0,38,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,49,4,5,,0,5,3,6,13,13,...,S,41,8,11,60,22,4,0,26,34
196,50,1,5,,1,7,4,5,7,17,...,C,41,6,7,54,14,6,4,24,30
197,50,2,10,1.0,1,7,4,5,7,17,...,C,64,5,9,78,15,6,10,31,47
198,50,3,4,,1,7,4,5,7,17,...,S,44,7,4,55,22,1,8,31,24


In [3]:
def rf(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)
    model = RandomForestRegressor()
    fitted = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    residual = y_test - y_pred
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'Accuracy: {accuracy_(y_test, y_pred)}')
    print(f'R2 Score: {r2_score(y_test, y_pred)}')

In [4]:
def ft_imp(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)
    model = RandomForestRegressor()
    fitted = model.fit(X_train, y_train)
    importance = model.feature_importances_
    names = X.columns
    result = pd.DataFrame(importance, index = names).sort_values(by = 0, ascending = False)
    return result

In [28]:
def accuracy_(y_test, prediction):
    mape = np.mean(np.abs((y_test-prediction)/np.abs(y_test)))
    return round(100*(1-mape),2)

In [5]:
win_data = data.drop(['player','me','gameNum'], axis = 1)
win_data = pd.get_dummies(win_data, columns = ['settlement1_resource1','settlement1_resource2','settlement1_resource3','settlement2_resource1','settlement2_resource2','settlement2_resource3'])

win_data = win_data[win_data['points'] >=10]
win_data.head()

Unnamed: 0,points,2,3,4,5,6,7,8,9,10,...,settlement2_resource3_2S,settlement2_resource3_2W,settlement2_resource3_3G,settlement2_resource3_B,settlement2_resource3_C,settlement2_resource3_D,settlement2_resource3_L,settlement2_resource3_O,settlement2_resource3_S,settlement2_resource3_W
2,10,1,3,5,8,7,10,6,7,3,...,0,0,0,0,1,0,0,0,0,0
4,10,1,6,3,9,10,8,14,9,3,...,0,0,0,0,0,0,0,0,0,0
9,10,0,3,3,10,10,4,5,5,6,...,0,0,0,0,0,0,0,0,1,0
13,10,1,6,5,12,14,20,12,11,4,...,0,0,0,0,1,0,0,0,0,0
17,10,1,3,7,9,12,14,9,9,3,...,0,0,0,0,0,0,1,0,0,0


In [8]:
y_win = win_data['points']
X_win = win_data.drop('points', axis = 1)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_win, y_win)
model = RandomForestRegressor()
fit = model.fit(X_train, y_train)
win_pred = np.rint(model.predict(X_test))


In [15]:
ft_imp(X_win,y_win)

Unnamed: 0,0
settlement1_resource3_D,0.161541
settlement2_num2,0.131545
settlement1_resource3_O,0.098229
settlement2_resource3_2L,0.079612
9,0.073283
...,...
settlement1_resource2_S,0.000000
settlement1_resource1_S,0.000000
settlement1_resource1_L,0.000000
robberCardsLoss,0.000000


In [16]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 10)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [20]:

rf_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   20.4s finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 111, 122, 133,
                                                         144, 155, 166, 177,
                                                         188, 200]},
                   random_state=42, verbose=2)

In [25]:
rf_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_win, y_win)
rfmodel = RandomForestRegressor(n_estimators = 100, min_samples_split = 5, min_samples_leaf = 4, max_features = 'auto', max_depth = 10, bootstrap = True)
fit = rfmodel.fit(X_train, y_train)
win_pred2 = np.rint(model.predict(X_test))
print(f'MSE: {mean_squared_error(y_test, win_pred2)}')
print(f'Accuracy: {accuracy_(y_test, win_pred2)}')
print(f'R2 Score: {r2_score(y_test, win_pred2)}')


MSE: 0.15384615384615385
Accuracy: 98.66
R2 Score: 0.5357142857142857


In [34]:
importance = rfmodel.feature_importances_
names = X_win.columns
important_feat = pd.DataFrame(importance, index = names).sort_values(by = 0, ascending = False)
important_feat.head(20)

Unnamed: 0,0
3,0.192672
totalAvailable,0.159089
settlement2_num2,0.094118
settlement2_resource3_S,0.058436
settlement1_num2,0.054387
robberCardsLoss,0.047059
totalLoss,0.047059
settlement2_num3,0.045502
11,0.035294
2,0.035294
