In [12]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

import keras_tuner
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.optimizers import Adam

# --- Experiment configuration -------------------------------------------------
RANDOM_SEED = 42
NUM_ROUNDS = 10
# Number of leading DataFrame columns used as model features
# (presumably one p1_* and one p2_* column per round -- confirm against the CSV).
NUM_COLUMNS = NUM_ROUNDS * 2
# Training settings forwarded to the Keras tuner search.
NUM_EPOCHS = 10
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2

DATA_DIR = "data"
MODEL_DIR = "models"

np.set_printoptions(precision=3, suppress=True) # Make numpy values easier to read.

# Seed every random number generator the notebook relies on. Seeding NumPy alone
# leaves Keras weight initialisation and training shuffles unseeded, so the neural
# network results would change from run to run.
np.random.seed(RANDOM_SEED)  # ensure random outputs are consistent
tf.keras.utils.set_random_seed(RANDOM_SEED)  # Python `random`, NumPy and TensorFlow

print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.18.0


# Ingest Data

In [None]:
# Load the recorded TitForTatPlayer games from DATA_DIR and preview the first rows.
csv_path = f"{DATA_DIR}/TitForTatPlayer.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,p1_1,p1_2,p1_3,p1_4,p1_5,p1_6,p1_7,p1_8,p1_9,p1_10,...,p2_3,p2_4,p2_5,p2_6,p2_7,p2_8,p2_9,p2_10,move,opponent
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,COOPERATE,CooperativePlayer
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,COOPERATE,CooperativePlayer
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,COOPERATE,CooperativePlayer
3,1,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,COOPERATE,CooperativePlayer
4,1,1,1,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,COOPERATE,CooperativePlayer


In [3]:
# Replace the string labels in the "move" column with integer class ids.
# NOTE: this overwrites df["move"] in place, so re-running the cell refits the
# encoder on the already-encoded integers instead of the original labels.
encoder = LabelEncoder()
df["move"] = encoder.fit_transform(df["move"])

df.head()

Unnamed: 0,p1_1,p1_2,p1_3,p1_4,p1_5,p1_6,p1_7,p1_8,p1_9,p1_10,...,p2_3,p2_4,p2_5,p2_6,p2_7,p2_8,p2_9,p2_10,move,opponent
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,CooperativePlayer
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,CooperativePlayer
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,CooperativePlayer
3,1,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,CooperativePlayer
4,1,1,1,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,CooperativePlayer


# Split data into training and testing

In [4]:
# Hold out every row played against RandomPlayer as the test set; train on the rest.
held_out = df['opponent'] == 'RandomPlayer'
df_train = df.loc[~held_out]
df_test = df.loc[held_out]

# Features are the first NUM_COLUMNS columns (selected by position);
# the target is the "move" column.
X_train, y_train = df_train.iloc[:, :NUM_COLUMNS].to_numpy(), df_train['move'].values
X_test, y_test = df_test.iloc[:, :NUM_COLUMNS].to_numpy(), df_test['move'].values

print(f"""
Training:
    X: {X_train.shape}
    y: {y_train.shape}

Testing:
    X: {X_test.shape}
    y: {y_test.shape}
""")


Training:
    X: (110, 20)
    y: (110,)

Testing:
    X: (10, 20)
    y: (10,)



In [5]:
# Xs = df.iloc[:, :NUM_COLUMNS].to_numpy()  # get the first 20 columns and convert to numpy array
# ys = df["move"].values
# # ys = df["move"].values.reshape(-1, 1)

In [6]:
# # 64 / 16 / 20 split (train / validation / test)
# # The ML model only sees the training data

# X_train, X_test, y_train, y_test = train_test_split(
#     Xs, ys, test_size=0.2, random_state=RANDOM_SEED
# )

# X_train, X_validate, y_train, y_validate = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=RANDOM_SEED
# )

# print(f"""
# Training:
#     X: {X_train.shape}
#     y: {y_train.shape}

# Testing:
#     X: {X_test.shape}
#     y: {y_test.shape}

# Validation:
#     X: {X_validate.shape}
#     y: {y_validate.shape}
# """)

# RandomForestClassifier

In [7]:
# def evaluate(model, X_test, y_test):
#     predictions = model.predict(X_test)
#     errors = abs(predictions - y_test)
#     mape = 100 * np.mean(errors / y_test)
#     accuracy = 100 - mape
#     print("Model Performance")
#     print("Average Error: {:0.4f} degrees.".format(np.mean(errors)))
#     print("Accuracy = {:0.2f}%.".format(accuracy))

#     return accuracy

def evaluate(model, X_test, y_test):
    """Print the classification accuracy of ``model`` on a held-out set.

    Args:
        model: Object exposing ``predict(X)``; it is called once on ``X_test``.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True labels that the predictions are compared against.

    Returns:
        None. The score is only printed, as a percentage with two decimals.
    """
    predicted_labels = model.predict(X_test)
    accuracy_pct = accuracy_score(y_test, predicted_labels) * 100
    print("Accuracy = {:0.2f}%.".format(accuracy_pct))

In [8]:
# Baseline: a random forest with library-default hyperparameters (only the seed is set).
# fit() returns the estimator itself, so construction and training can be chained.
base_rfc = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)
print(base_rfc.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


## RandomizedSearchCV

In [59]:
# Candidate values sampled by the randomized hyperparameter search.
rand_param_grid = {
    "bootstrap": [True, False],
    "max_depth": list(range(10, 111, 10)),  # 10, 20, ..., 110
    "max_features": [1.0, None, "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 4],
    "min_samples_split": [2, 5, 10],
    "n_estimators": list(range(200, 2001, 200)),  # 200, 400, ..., 2000
}

In [60]:
# Base classifier for the randomized search. It is seeded so that each candidate
# forest is built deterministically; without it the CV scores, and therefore
# best_params_, can differ between runs even though the search itself is seeded.
rfc = RandomForestClassifier(random_state=RANDOM_SEED)

# Sample 100 parameter combinations from rand_param_grid and score each one
# with 3-fold cross-validation, using all available cores.
rand_search = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=rand_param_grid,
    cv=3,
    n_jobs=-1,
    n_iter=100,
    verbose=2,
    random_state=RANDOM_SEED,
)

# Fit the randomized search to the training data
rand_search.fit(X_train, y_train)
print(rand_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


6250.56s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
6250.74s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   2.1s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   2.0s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   2.0s
[CV] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   3.0s
[CV] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   2.9s
[CV] END bootstrap=False, max_depth=100, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   3.1s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1600; total time=   3.3s
[CV] END bootstrap=True, max_de

In [61]:
# Random forest for the randomized-search configuration.
# NOTE(review): these values are hard-coded; presumably transcribed from
# rand_search.best_params_ -- verify them after re-running the search.
rand_rfc_params = dict(
    bootstrap=True,
    max_depth=80,
    max_features=1.0,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=400,
    n_jobs=-1,
    random_state=RANDOM_SEED,
)
rand_rfc = RandomForestClassifier(**rand_rfc_params)

# Train on the training split
rand_rfc.fit(X_train, y_train)

## Grid Search

In [62]:
# Exhaustive grid evaluated by the grid search (1 * 4 * 3 * 3 * 3 * 4 = 432 candidates).
# NOTE(review): min_samples_leaf / min_samples_split here cover 3-5 and 8-12, which
# excludes the values 1 and 2 hard-coded for the randomized-search model -- confirm
# this range is intended.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[1.0, "sqrt", "log2"],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

In [63]:
# Base classifier for the grid search. It is seeded so that every candidate forest
# is built deterministically; an unseeded forest makes the CV scores, and therefore
# best_params_, vary from run to run.
rfc = RandomForestClassifier(random_state=RANDOM_SEED)

# Evaluate every combination in param_grid with 3-fold cross-validation,
# using all available cores.
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits
[CV] END bootstrap=True, max_depth=80, max_features=1.0, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=80, max_features=1.0, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=80, max_features=1.0, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=80, max_features=1.0, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=80, max_features=1.0, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=80, max_features=1.0, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=80, max_features=1.0, min_samples_leaf=3, min_samples_split=8, n_estimators=300; total time

In [64]:
# Random forest for the grid-search configuration.
# NOTE(review): these values are hard-coded; presumably transcribed from
# grid_search.best_params_ -- verify them after re-running the search.
grid_rfc_params = dict(
    bootstrap=True,
    max_depth=80,
    max_features="log2",
    min_samples_leaf=5,
    min_samples_split=12,
    n_estimators=100,
    n_jobs=-1,
    random_state=RANDOM_SEED,
)
grid_rfc = RandomForestClassifier(**grid_rfc_params)

# Train on the training split
grid_rfc.fit(X_train, y_train)

## Evaluate Models

In [10]:
# Score each fitted model on the held-out test set.
# The tuned forests are currently disabled; re-enable their entries once the
# corresponding search cells have been run.
models_to_score = [
    ("Base Model", base_rfc),
    # ("Randomized Search", rand_rfc),
    # ("Grid Search", grid_rfc),
]

for label, fitted_model in models_to_score:
    print(label)
    evaluate(fitted_model, X_test, y_test)

Base Model
Accuracy = 80.00%.


# Save Models

In [14]:
# Create the output directory if it is missing. exist_ok=True replaces the separate
# "exists?" check, so there is no window between checking and creating.
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the fitted baseline model; the tuned models are currently disabled.
joblib.dump(base_rfc, f"{MODEL_DIR}/base_rfc.joblib")
# joblib.dump(rand_rfc, f"{MODEL_DIR}/rand_rfc.joblib")
# joblib.dump(grid_rfc, f"{MODEL_DIR}/grid_rfc.joblib")

['models/base_rfc.joblib']

# TensorFlow Model with Hyperparameter Tuning

In [68]:
def build_model(hp):
    """Build and compile a tunable dense binary classifier for keras_tuner.

    The network takes ``NUM_COLUMNS`` input features, passes them through eight
    ReLU hidden layers whose widths are tuner choices, and ends in a single
    sigmoid unit. The Adam learning rate is tuned on a log scale.

    Args:
        hp: Hyperparameter container supplied by the tuner; must provide
            ``Choice(name, values)`` and ``Float(name, ...)``.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    # (hyperparameter name, candidate widths) for each hidden layer, in network order.
    hidden_layers = [
        ("dense1", [32, 64, 128]),
        ("dense2", [64, 128, 256]),
        ("dense3", [64, 128, 256]),
        ("dense4", [64, 128, 256]),
        ("dense5", [64, 128, 256]),
        ("dense6", [64, 128, 256]),
        ("dense7", [32, 64, 128]),
        ("dense8", [16, 32, 64]),
    ]

    model = Sequential()
    # Declare the input shape with an explicit Input layer: passing `input_shape=`
    # to the first Dense layer is discouraged in Keras 3 and emits a warning.
    model.add(Input(shape=(NUM_COLUMNS,)))
    for hp_name, widths in hidden_layers:
        model.add(Dense(hp.Choice(hp_name, widths), activation="relu"))
    model.add(Dense(1, activation="sigmoid"))  # For binary classification

    lr = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    opt = Adam(learning_rate=lr)

    model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

    return model


# Bayesian search over build_model's hyperparameters, minimising validation loss.
# `seed` makes the sequence of sampled trials reproducible; `max_trials=10` spells
# out the trial budget explicitly (the captured run stopped after trial 10).
tuner = keras_tuner.BayesianOptimization(
    build_model,
    objective="val_loss",
    max_trials=10,
    seed=RANDOM_SEED,
)

# NOTE(review): Keras documents `validation_split` as holding out the trailing rows
# of the data before shuffling -- confirm that validating on the last rows of
# X_train is intended.
tuner.search(
    X_train,
    y_train,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

# Keep the best-scoring model found by the search.
best_model = tuner.get_best_models()[0]

Trial 10 Complete [00h 00m 05s]
val_loss: 0.5757136344909668

Best val_loss So Far: 0.5204418897628784
Total elapsed time: 00h 00m 42s


  saveable.load_own_variables(weights_store.get(inner_path))


In [71]:
# Evaluate the tuned network on the held-out test set. The model is compiled with
# metrics=["accuracy"], so evaluate() yields the loss followed by the accuracy.
loss, accuracy = best_model.evaluate(X_test, y_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5000 - loss: 0.7538

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.5000 - loss: 0.7538


In [72]:
# Show the architecture summary of the best model found by the tuner.
best_model.summary()

In [73]:
# Render the architecture of the best model to model.png.
# plot_model needs graphviz installed; without it only an install hint is printed.
tf.keras.utils.plot_model(
    best_model,
    to_file="model.png",
    show_shapes=True,
    show_layer_activations=True,
)

You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.
