In [13]:
import os
import pickle
import random
from fractions import Fraction

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the processed winner and loser tables, discarding rows with no expected_dmg value.
# NOTE(review): rows are dropped from each table independently and then re-indexed, while
# later cells pair winners and losers by position -- confirm both files lose the same rows.
dfWin = pd.read_csv("./DataFrames/WinnersProcessed.csv", index_col=0)
dfWin = dfWin.dropna(subset=["expected_dmg"]).reset_index(drop=True)
dfLose = pd.read_csv("./DataFrames/LosersProcessed.csv", index_col=0)
dfLose = dfLose.dropna(subset=["expected_dmg"]).reset_index(drop=True)
len(dfWin)

16405

I forgot to record which side the winner and the loser were on, so I need to randomize which side the winning creature and the losing creature are placed on.
This prevents the model from always predicting that the creature on a specific side wins.

In [3]:
import random
# Extracts the stat columns of a dataframe as a 2d array
def getStatsValues(df: pd.DataFrame):
    """Return the stat columns of ``df`` as a 2D array.

    Columns are taken in the fixed order cr, hp, str, dex, con, int, wis,
    cha, ac, spd, expected_dmg; rows keep the order they have in ``df``.
    """
    statColumns = [
        "cr", "hp", "str", "dex", "con", "int",
        "wis", "cha", "ac", "spd", "expected_dmg",
    ]
    selected = df.loc[:, statColumns]
    return selected.values

def convertToFloat(val):
    """Convert ``val`` to a float.

    Strings (including ``str`` subclasses) are parsed with ``Fraction`` so that
    fractional notation such as "1/4" is accepted; any other value is passed
    straight to ``float``.
    """
    # isinstance rather than an exact type comparison, so str subclasses are
    # also routed through Fraction instead of failing in float("1/4").
    if isinstance(val, str):
        return float(Fraction(val))
    return float(val)

# Convert the "cr" column to floats; convertToFloat parses string entries as fractions.
# A column-wise map avoids building a Series per row just to read one value.
dfWin["cr"] = dfWin["cr"].map(convertToFloat)
dfLose["cr"] = dfLose["cr"].map(convertToFloat)
# Get Input values
dfWinStats = getStatsValues(dfWin)
dfLoseStats = getStatsValues(dfLose)
# Append input values together
dfStats = np.concatenate([dfWinStats, dfLoseStats], axis=1)

# Generate a shuffled 0/1 label per matchup: a 1 places the loser's stats first and the
# winner's stats second (winner on the right side); a 0 places the winner's stats first.
numRows = dfStats.shape[0]
# Make the right side outputs equal 1
rightSide = np.ones(numRows // 2)
# Make left side be zero; it takes the remainder so an odd row count still yields
# exactly one label per row (previously the last matchup had to be dropped)
leftSide = np.zeros(numRows - numRows // 2)
# Append all values to targets
targets = np.append(leftSide, rightSide)
# Shuffle results with a fixed seed so the side assignment is reproducible
np.random.default_rng(47).shuffle(targets)

data = []

for i in range(numRows):
    if targets[i]:
        res = np.concatenate([dfLoseStats[i], dfWinStats[i]])
    else:
        res = np.concatenate([dfWinStats[i], dfLoseStats[i]])
    data.append(res)

In [4]:
def convertToFloat(val):
    """Convert ``val`` to a float.

    Strings (including ``str`` subclasses) are parsed with ``Fraction`` so that
    fractional notation such as "1/4" is accepted; any other value is passed
    straight to ``float``.
    """
    # isinstance rather than an exact type comparison, so str subclasses are
    # also routed through Fraction instead of failing in float("1/4").
    if isinstance(val, str):
        return float(Fraction(val))
    return float(val)

def getDifData(df1: pd.DataFrame, df2: pd.DataFrame, targets: list, stats=['ac', 'cr', 'spd', 'hp', 'str', 'dex', 'con', 'int', 'wis', 'cha', 'expected_dmg'], outlierThreshold=900):
    """Build per-stat difference features for row-aligned pairs of creatures.

    Row ``i`` of the result holds, for every name in ``stats``,
    ``df1 - df2`` when ``targets[i]`` is truthy and ``df2 - df1`` otherwise.
    Rows are paired by position, so the frames' index labels are irrelevant.

    Parameters:
        df1, df2: frames with the same number of rows containing the ``stats`` columns.
        targets: one truthy/falsy flag per row selecting the subtraction order.
        stats: column names to difference, in output column order.
        outlierThreshold: a value at or above this is treated as unbelievable and
            replaced by the opponent's value, which makes that difference zero.

    Returns:
        A float array of shape ``(len(df1), len(stats))``.

    Raises:
        ValueError: if ``df1`` and ``df2`` have different row counts.
        IndexError: if ``targets`` has fewer entries than there are rows.
    """
    numRows = df1.shape[0]
    if df2.shape[0] != numRows:
        raise ValueError(
            f"df1 and df2 must have the same number of rows, got {numRows} and {df2.shape[0]}"
        )

    flags = np.asarray(targets)
    if flags.shape[0] < numRows:
        raise IndexError(
            f"targets has {flags.shape[0]} entries but {numRows} rows were given"
        )
    firstMinusSecond = flags[:numRows].astype(bool)

    columns = list(stats)
    # Work on array copies so neither input frame is ever written to
    first = df1[columns].to_numpy(dtype=float)
    second = df2[columns].to_numpy(dtype=float)

    # If a stat is greater than a believable amount make the stat equal to its opponent
    # This allows for better generalization of the data
    # The first side is patched before the second so the replacement order is preserved
    first = np.where(first >= outlierThreshold, second, first)
    second = np.where(second >= outlierThreshold, first, second)

    return np.where(firstMinusSecond[:, None], first - second, second - first)


# Build one 0/1 target per matchup, split as evenly as possible between the two values
rightSize = dfWin.shape[0]//2
leftSize = dfWin.shape[0] - rightSize
# Make the right side outputs equal 1
rightSide = np.ones(rightSize)
# Make left side be zero
leftSide = np.zeros(leftSize)
# Append all values to targets
targets = np.append(leftSide, rightSide)
# Shuffle results with a fixed seed so the labels (and every metric computed from
# them) are reproducible after a kernel restart
np.random.default_rng(47).shuffle(targets)


# Make sure the "cr" column holds floats in both tables before differencing
dfWin["cr"] = dfWin["cr"].map(convertToFloat)
dfLose["cr"] = dfLose["cr"].map(convertToFloat)

data = getDifData(dfWin, dfLose, targets)

# Rescale every feature column with a min-max scaler
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# held-out rows influence the scaling -- consider fitting on the training split only.
scaler = MinMaxScaler()
data = scaler.fit_transform(data)
data

array([[0.5862069 , 0.5625    , 0.35      , ..., 0.51785714, 0.38181818,
        0.3561021 ],
       [0.48275862, 0.4375    , 0.5       , ..., 0.55357143, 0.50909091,
        0.4216353 ],
       [0.68965517, 0.5       , 0.45      , ..., 0.5       , 0.43636364,
        0.40455855],
       ...,
       [0.48275862, 0.5       , 0.5       , ..., 0.58928571, 0.49090909,
        0.41263391],
       [0.62068966, 0.5625    , 0.75      , ..., 0.51785714, 0.47272727,
        0.41293179],
       [0.62068966, 0.5       , 0.4       , ..., 0.51785714, 0.54545455,
        0.4228873 ]])

In [5]:
# TODO: find a better way to normalize the results; the approach below does not currently work.

# def normalize(point):
#     if not point:
#         return 0
#     else: return 1/point

# for i in range(data.shape[0]):
#     for j in range(data.shape[1]):
#         data[i,j] = normalize(data[i,j])
# data

In [6]:
# Count how many targets are truthy (ones) and how many are not (zeros)

one = sum(1 for val in targets if val)
zero = len(targets) - one

print(f"One: {one}")
print(f"Zero: {zero}")

One: 8202
Zero: 8203


In [39]:
# Fraction of the rows kept for training; the remainder is held out for testing
splitSize = 0.70

# A fixed random_state keeps the partition identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    data,
    targets,
    test_size=(1-splitSize),
    random_state=47,
)


In [8]:
# Sanity check: print the length of the first test sample, i.e. its number of features
print(len(X_test[0]))

11


In [46]:
# Fit a 1000-tree random forest on the training split and score it on the test split.
# random_state pins the bootstrap/feature sampling so the reported metrics are reproducible.
clf = RandomForestClassifier(n_estimators=1000, random_state=47)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# params = { 'hidden_layer_sizes': [(200, 100, 90), (2000, 500, 80), (2000, 1500, 80), (3000, 1000, 80)]
#             'solver': ['lbfgs']
# }

# Candidate learning rates and layer layouts kept for the (currently disabled) search below
lrs = [1e-4, 0.5e-4, 1e-5, 1.5e-5, 2e-5, 1e-6, 1.5e-6, 2e-6 ]
hiddenLayers = [(2500, 2000, 500), (2000, 1000, 500), ()]
history = []
# for lr in lrs:
#     # TODO: Implement RandomSearch framework
#     clf = MLPClassifier(hidden_layer_sizes=(5000), max_iter=10000, solver='lbfgs', activation='tanh', learning_rate='adaptive', learning_rate_init=lr, early_stopping=True, verbose=True)
#     clf.fit(X_train, y_train)
#     predictions = clf.predict(X_test)
#     history.append(accuracy_score(y_test, predictions))

# With solver='lbfgs' scikit-learn ignores learning_rate, learning_rate_init and
# early_stopping (they only apply to 'sgd'/'adam'), so those no-op arguments are omitted
# rather than suggesting the run is tuned by them. random_state fixes the weight
# initialisation so the fit is reproducible.
clf = MLPClassifier(hidden_layer_sizes=(5000, 2000, 1000), max_iter=500, solver='lbfgs', activation='tanh', verbose=True, random_state=47)
clf.fit(X_train, y_train)
#predictions = clf.predict(X_test)
# gs = GridSearchCV(clf)
# print(history)

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =     12064001     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93644D-01    |proj g|=  1.33468D-02

At iterate    1    f=  6.93008D-01    |proj g|=  9.70227D-03

At iterate    2    f=  6.92878D-01    |proj g|=  5.05398D-03

At iterate    3    f=  6.92662D-01    |proj g|=  1.05719D-03

At iterate    4    f=  6.91985D-01    |proj g|=  8.66735D-03

At iterate    5    f=  6.90407D-01    |proj g|=  2.13345D-02

At iterate    6    f=  6.86431D-01    |proj g|=  3.81023D-02

At iterate    7    f=  6.73700D-01    |proj g|=  4.05586D-02

At iterate    8    f=  6.64872D-01    |proj g|=  2.06024D-02

At iterate    9    f=  6.64644D-01    |proj g|=  5.75320D-03

At iterate   10    f=  6.64138D-01    |proj g|=  3.33057D-03

At iterate   11    f=  6.63953D-01    |proj g|=  6.22719D-03

At iterate   12    f=  6.63522D-01    |proj g|=  7.31038D-03

At iterate   13    f=  6.6

              precision    recall  f1-score   support

         0.0       0.65      0.64      0.64      2484
         1.0       0.64      0.64      0.64      2438

    accuracy                           0.64      4922
   macro avg       0.64      0.64      0.64      4922
weighted avg       0.64      0.64      0.64      4922



0.6412027631044291

In [28]:
# Confusion matrix of the test labels against the current predictions
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[1080,  566],
       [ 591, 1044]])

In [None]:


# save the classifier to a file
# Create the target folder first so the save does not fail when ./Models is missing
os.makedirs("./Models", exist_ok=True)
with open("./Models/classifier_64perc.pkl", "wb") as file:
    pickle.dump(clf, file)

In [None]:
# Restore the classifier from the pickle file.
# NOTE: pickle.load can execute arbitrary code from the file -- only load files you trust.
with open("./Models/classifier_64perc.pkl", mode="rb") as file:
    clf = pickle.load(file)

# Score the restored model on the test split, then display the estimator itself
predictions = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))
clf

              precision    recall  f1-score   support

         0.0       0.60      0.59      0.59      1051
         1.0       0.63      0.64      0.63      1131

    accuracy                           0.61      2182
   macro avg       0.61      0.61      0.61      2182
weighted avg       0.61      0.61      0.61      2182



In [None]:
# Report the same metrics on the training split, for comparison with the test-split report
preds = clf.predict(X_train)
print(classification_report(y_true=y_train, y_pred=preds))

              precision    recall  f1-score   support

         0.0       0.65      0.65      0.65      4403
         1.0       0.64      0.65      0.65      4323

    accuracy                           0.65      8726
   macro avg       0.65      0.65      0.65      8726
weighted avg       0.65      0.65      0.65      8726



In [None]:
# Display the current classifier object as the cell output
clf