In [1]:
import gzip
import json
import os
import io
import csv #to read in and split up all the content from the dataset input file
import numpy as np #import numpy using np as an alias
import pandas as pd #import pandas using pd as an alias
import matplotlib.pyplot as plt #import matplotlib.pyplot using plt as an alias
import seaborn as sb #import seaborn using sns as an alias
import sys
import codecs
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder #import the encoder classes

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [7]:
def combineDF(dfExtension, df, dataDir=r'D:\STS Master Deck - OneHot Encoded'):
    """Read one chunk CSV and append it to the frame accumulated so far.

    The chunk is read from ``<dataDir>/DF<dfExtension>.csv``.

    Parameters
    ----------
    dfExtension : int
        Numeric suffix of the chunk file name. ``0`` marks the first chunk.
    df : pandas.DataFrame or placeholder
        Frame built from the chunks read so far. It is ignored for the first
        chunk, so callers may pass any placeholder (e.g. ``0``) there.
    dataDir : str, optional
        Directory that holds the chunk files. The default is the location the
        notebook was originally written against; it is a raw string so the
        backslashes are never interpreted as escape sequences.

    Returns
    -------
    pandas.DataFrame
        The chunk itself when there is nothing to append to, otherwise ``df``
        followed by the chunk. Row indices are kept as read (not renumbered).
    """
    chunkPath = os.path.join(dataDir, 'DF' + str(dfExtension) + '.csv')
    chunk = pd.read_csv(chunkPath, delimiter=',', index_col=False)

    # First chunk, or the caller only has a placeholder: nothing to concatenate with.
    if dfExtension == 0 or not isinstance(df, pd.DataFrame):
        return chunk

    return pd.concat([df, chunk])

In [11]:
# Stitch the chunk files DF0.csv, DF10000.csv, ..., DF170000.csv into one frame.
# The initial 0 is only a placeholder: combineDF does not use it for the first chunk.
df = 0
firstSuffix, suffixLimit, suffixStep = 0, 180000, 10000
for dfExtension in range(firstSuffix, suffixLimit, suffixStep):
    df = combineDF(dfExtension, df)

# Remove three columns from the combined frame before it is turned into an array.
df = df.drop(columns=['ascension_level', 'max_hp_per_floor', 'current_hp_per_floor'])
# Optional: persist the combined frame (left disabled).
#df.to_csv('D:\STS Master Deck - OneHot Encoded\MasterDf.csv', index=False)

In [12]:
# Take an independent NumPy copy of the frame, then hold out 20% of the rows
# as a test split; the fixed random_state keeps the split repeatable.
playsArray = df.to_numpy(copy=True)
trainSet, testSet = train_test_split(
    playsArray,
    test_size=.2,
    random_state=22,
)

In [13]:
# Show the training split. Column 0 is later extracted as the label and the
# remaining columns as features (see the slicing cell that follows).
trainSet

array([[False, 0, 0, ..., 0, 1, 0],
       [False, 0, 0, ..., 1, 0, 0],
       [True, 0, 0, ..., 0, 1, 0],
       ...,
       [True, 2, 0, ..., 0, 1, 0],
       [True, 0, 0, ..., 0, 0, 0],
       [False, 1, 0, ..., 0, 0, 0]], dtype=object)

In [14]:
# Arrays are indexed array[row][column]. Column 0 of each split is the label
# (cast to int); every other column is kept as a feature.
labelColumn = 0

trainLabels = trainSet[:, labelColumn].astype(int)
trainData = np.delete(trainSet, labelColumn, 1)  # axis 1 -> remove a column

testLabels = testSet[:, labelColumn].astype(int)
testData = np.delete(testSet, labelColumn, 1)

In [75]:
# Share of training labels equal to 0 (reported as "failed runs").
# The printed value is a fraction in [0, 1], not a percentage.
totalLabels = len(trainLabels)
totalFailedRuns = sum(1 for label in trainLabels if label == 0)
print(f"Total Failed Runs Percent: {totalFailedRuns / totalLabels}")

Total Failed Runs Percent: 0.596815669428653


In [15]:
# All three candidate classifiers, each paired with a display name.
modelsList = [
    ("forestClassifier", RandomForestClassifier(random_state=42)),
    ("extraTreesClassifier", ExtraTreesClassifier(random_state=42)),
    ("linearSVC", LinearSVC(random_state=42)),
]

# The linear SVC on its own, so it can be cross-validated in a separate cell.
testList = [
    ("linearSVC", LinearSVC(random_state=42)),
]

# The two tree-based classifiers, cross-validated together in another cell.
otherTwoModels = [
    ("forestClassifier", RandomForestClassifier(random_state=42)),
    ("extraTreesClassifier", ExtraTreesClassifier(random_state=42)),
]

In [16]:
def getCrossValScores(modelsList, trainData, trainLabels, cv=3, scoring="accuracy"):
    """Cross-validate each model and return a plain-text report of the scores.

    Parameters
    ----------
    modelsList : iterable of (str, estimator)
        Display name and estimator pairs to evaluate.
    trainData, trainLabels : array-like
        Features and labels handed unchanged to ``cross_val_score``.
    cv : optional
        Forwarded to ``cross_val_score``; defaults to 3 folds, the value that
        was previously hard-coded.
    scoring : str, optional
        Forwarded to ``cross_val_score``; defaults to ``"accuracy"``.

    Returns
    -------
    str
        One paragraph per model holding the per-fold scores and their mean.
        An empty ``modelsList`` yields an empty string.
    """
    reportParts = []
    for key, model in modelsList:
        foldScores = cross_val_score(model, trainData, trainLabels,
                                     scoring=scoring, cv=cv)
        # Take the fold count from the scores actually returned so the text
        # stays correct when a non-default cv is supplied.
        reportParts.append(
            f"Model: {key}\nScore: {foldScores}\n"
            f"Score Mean with {len(foldScores)} Folds: {foldScores.mean()}\n\n"
        )
    return "".join(reportParts)

In [17]:
# Cross-validate the linear SVC by itself and show the report.
linearSvcReport = getCrossValScores(testList, trainData, trainLabels)
print(linearSvcReport)



Model: linearSVC
Score: [0.69701179 0.69546583 0.69874675]
Score Mean with 3 Folds: 0.6970747910085794




In [18]:
# Cross-validate the two tree-based classifiers and show the report.
treeModelsReport = getCrossValScores(otherTwoModels, trainData, trainLabels)
print(treeModelsReport)

Model: forestClassifier
Score: [0.68069636 0.68707141 0.68290376]
Score Mean with 3 Folds: 0.6835571776586118

Model: extraTreesClassifier
Score: [0.68025301 0.68494325 0.68299243]
Score Mean with 3 Folds: 0.6827295632043882




In [19]:
# Ensemble members: only the two tree-based classifiers.
# NOTE(review): an earlier note here said these two "significantly outperformed"
# the SVC, but the recorded 3-fold means are ~0.697 (linearSVC) versus ~0.684
# (forest) and ~0.683 (extra-trees), i.e. the SVC scored slightly higher.
# LinearSVC provides no predict_proba, which soft voting needs -- presumably
# the actual reason it is left out of the ensemble; confirm.
modelsForEnsemble = [
    ("forestClassifier", RandomForestClassifier(random_state=42)),
    ("extraTreesClassifier", ExtraTreesClassifier(random_state=42)),
]

In [20]:
def getEnsembleAccuracy(classifierModelsList, trainData, trainLabels, testData, testLabels):
    """Fit one voting ensemble and report its test accuracy for both voting modes.

    The ensemble is built from ``classifierModelsList`` without an explicit
    voting argument and fitted once on the training data. Its ``voting``
    attribute is then switched to "soft" and afterwards to "hard", scoring the
    same fitted ensemble on the test data each time.

    Returns a two-line string: soft-voting accuracy first, hard-voting second.
    """
    ensemble = VotingClassifier(classifierModelsList)
    ensemble.fit(trainData, trainLabels)

    # Score with soft voting first ...
    ensemble.voting = "soft"
    softAccuracy = ensemble.score(testData, testLabels)

    # ... then with hard voting, reusing the already-fitted members.
    ensemble.voting = "hard"
    hardAccuracy = ensemble.score(testData, testLabels)

    return (f"Soft Voting Accuracy: {softAccuracy}\n"
            f"Hard Voting Accuracy: {hardAccuracy}\n")

In [24]:
# Fit the two-model ensemble and show its soft- and hard-voting test accuracy.
ensembleReport = getEnsembleAccuracy(modelsForEnsemble, trainData, trainLabels, testData, testLabels)
print(ensembleReport)

Soft Voting Accuracy: 0.6881182266009852
Hard Voting Accuracy: 0.6797635467980295



## Evaluating The Model

### Check percentage of labels with 0 and compare against model's accuracy.
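#### - Percentage of labels with 0 in training set: about 60% (0.597)
#### - Model's accuracy: about 69% (soft voting, test set)
#### - Conclusion: the model's accuracy is higher than the roughly 60% that always predicting the majority label (0) would achieve, so it is not explained by the occurrence rate of the majority label alone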

In [97]:
# Majority-label check: what fraction of the training labels is 0?
# This is the figure the model's accuracy is compared against.
totalLabels = len(trainLabels)
totalFailedRuns = int(np.count_nonzero(trainLabels == 0))
print(f"Total Failed Runs Percent: {totalFailedRuns / totalLabels}")

Total Failed Runs Percent: 0.596815669428653
