# Imports

In [215]:
import numpy as np
import os
import math
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import csv
import json
from scipy.stats import wasserstein_distance
import statistics
from tqdm import tqdm
import shutil
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn import svm

# Data Loading Functions

In [274]:
def read_stats_data(name, mechanics, folder):
    """Load per-playtrace mechanic frequencies from JSON result files.

    Every file in ``folder`` whose file name contains ``name`` is parsed. A
    file holds either a list of playtraces or an object that stores that list
    under the ``"results"`` key. Each playtrace contributes one row.

    Args:
        name: Substring a file name must contain for the file to be read.
        mechanics: Mechanic names; each one becomes a column.
        folder: Directory that holds the JSON files.

    Returns:
        pandas.DataFrame with one column per mechanic (0.0 when a playtrace
        has no frequency entry for it) plus an ``"endTurn"`` column taken from
        ``playtrace["levelReport"]["turnsTaken"]``.
    """
    data = {mech: [] for mech in mechanics}
    data["endTurn"] = []
    # os.listdir yields entries in arbitrary order; sorting keeps the row
    # order (and everything derived from it) reproducible between runs.
    file_names = sorted(f for f in os.listdir(folder) if name in f)
    for fn in file_names:
        with open(os.path.join(folder, fn)) as f:
            json_data = json.load(f)
        # Some files wrap the playtrace list in a {"results": [...]} object.
        if "results" in json_data:
            json_data = json_data["results"]
        for playtrace in json_data:
            frequencies = playtrace["frequencies"]
            for mech in mechanics:
                data[mech].append(frequencies.get(mech, 0.0))
            data["endTurn"].append(playtrace["levelReport"]["turnsTaken"])
    return pd.DataFrame(data)

def build_users_data(folder, mechanics, users):
    """Collect the playtrace statistics of several users into one frame.

    Args:
        folder: Directory that holds the users' JSON files.
        mechanics: Mechanic names forwarded to ``read_stats_data``.
        users: User ids; each id is used as the file-name filter.

    Returns:
        pandas.DataFrame indexed by ``'id'`` with one row per playtrace, in
        the order of ``users``. When ``users`` is empty, an empty frame with
        the expected columns and index name is returned.
    """
    frames = []
    for user in users:
        user_frame = read_stats_data(user, mechanics, folder)
        user_frame['id'] = user
        frames.append(user_frame)
    if not frames:
        # pd.concat rejects an empty list, so build the empty result directly.
        return pd.DataFrame(
            columns=[*mechanics, "endTurn"],
            index=pd.Index([], name='id'),
        )
    # Concatenate once instead of re-copying the accumulated frame per user.
    return pd.concat(frames).set_index('id')

def read_questionaire(users, classes, scores, folder):
    """Compute each user's mean questionnaire score per class.

    For every user the file ``<folder>/<user>.json`` is read. If it contains a
    ``"Q1"`` entry, the answers to the questions of each class are mapped
    through ``scores`` and averaged. Users without ``"Q1"`` get 0.0 for every
    class, so the result always has exactly one row per user.

    Args:
        users: User ids (file names without the ``.json`` extension).
        classes: Mapping of class name to the list of question keys it uses.
        scores: Mapping of answer text to its numeric score.
        folder: Directory that holds the user files.

    Returns:
        pandas.DataFrame indexed by ``'id'`` with one column per class.

    Raises:
        KeyError: If an answered questionnaire lacks a question or holds an
            answer that is not a key of ``scores``.
    """
    data = {key: [] for key in classes}
    data['id'] = []
    for user in users:
        with open(os.path.join(folder, user + ".json")) as f:
            json_data = json.load(f)
        data['id'].append(user)
        answered = "Q1" in json_data
        for key, questions in classes.items():
            if answered and questions:
                total = sum((scores[json_data[q]] for q in questions), 0.0)
                data[key].append(total / len(questions))
            else:
                # Keep every column the same length as 'id'; otherwise
                # DataFrame construction fails for unanswered users.
                data[key].append(0.0)
    return pd.DataFrame(data).set_index('id')

# Helper Functions

In [210]:
def get_mechanics(folders):
    """Return every mechanic name that appears in the given result folders.

    Each file whose name contains ``".json"`` is parsed. A file holds either a
    list of playtraces or an object with that list under ``"results"``. The
    keys of every playtrace's ``"frequencies"`` mapping are collected.

    Args:
        folders: Directories to scan.

    Returns:
        Alphabetically sorted list of unique mechanic names. Sorting keeps the
        feature column order stable between runs; a plain ``list(set)`` order
        changes with string hash randomization.
    """
    uniques = set()
    for folder in folders:
        for fn in os.listdir(folder):
            if ".json" not in fn:
                continue
            with open(os.path.join(folder, fn)) as f:
                json_data = json.load(f)
            if "results" in json_data:
                json_data = json_data["results"]
            for playtrace in json_data:
                uniques.update(playtrace["frequencies"].keys())
    return sorted(uniques)

def get_users(folders):
    """Return the unique user ids found in the given folders.

    A user id is the part of a file name before the first ``"."``; only file
    names containing ``".json"`` are considered.

    Args:
        folders: Directories to scan.

    Returns:
        Alphabetically sorted list of unique ids. Sorting makes the user order
        reproducible, unlike the arbitrary order of ``list(set)``.
    """
    uniques = {
        fn.split(".")[0]
        for folder in folders
        for fn in os.listdir(folder)
        if ".json" in fn
    }
    return sorted(uniques)

def get_majority_voting(clf, data, group_size=3):
    """Predict one label per group of consecutive samples by majority vote.

    ``clf.predict(data)`` is split into consecutive groups of ``group_size``
    predictions and the most frequent label of each group is returned. On a
    tie the smallest label wins, because ``np.unique`` returns sorted values
    and ``np.argmax`` picks the first maximum.

    Args:
        clf: Object with a ``predict(data)`` method returning one label per row.
        data: Samples to classify; rows of one group must be adjacent.
        group_size: Number of consecutive predictions per vote (default 3,
            the previously hard-coded value).

    Returns:
        numpy.ndarray with one winning label per group.

    Raises:
        ValueError: If ``group_size`` is less than 1, or if the number of
            predictions is not a multiple of ``group_size`` (from reshape).
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")
    labels = np.asarray(clf.predict(data)).reshape((-1, group_size))
    winners = []
    for row in labels:
        vals, counts = np.unique(row, return_counts=True)
        winners.append(vals[np.argmax(counts)])
    return np.array(winners)

# Building Data

In [305]:
mechanics = get_mechanics(["results", "results_study"])
users = get_users(["results_study"])
# Questionnaire question keys grouped under the three class labels.
classes = {
    "MK": ["Q5", "Q6", "Q10"],
    "TC": ["Q3", "Q4", "Q8"],
    "R": ["Q2", "Q7", "Q9"]
}
# Numeric score assigned to each questionnaire answer.
scores = {
    "Never": 0, 
    "Rarely": 1, 
    "Sometimes": 2, 
    "Often": 3, 
    "Always": 4
}

# Read all the data files
mk_data = read_stats_data("MK", mechanics, "results")
tc_data = read_stats_data("TC", mechanics, "results")
r_data = read_stats_data("R", mechanics, "results")
user_data = build_users_data("results_study", mechanics, users)
# Scale the per-class mean scores to [0, 1]. The divisor is derived from
# `scores` so the normalization stays correct if the answer scale changes.
ques_data = read_questionaire(users, classes, scores, "results_study") / max(scores.values())

# Remove users who did not answer any of the questionnaire.
# NOTE(review): a user who answered every question with "Never" also sums to 0
# and is dropped here - confirm this is intended.
user_filter = ques_data['MK'] + ques_data['TC'] + ques_data['R'] > 0
user_indeces = ques_data.index[user_filter]

user_data = user_data.loc[user_indeces]
ques_data = ques_data.loc[user_indeces]

# Building Data Set

In [265]:
# Training set: every playtrace from the three result groups, labelled 0/1/2
# in the order MK, TC, R.
input_data = pd.concat([mk_data, tc_data, r_data], ignore_index=True).to_numpy()
output_data = pd.Series([0]*len(mk_data) + [1]*len(tc_data) + [2]*len(r_data)).to_numpy()
# Two views of the user data: one mean feature vector per user, and every
# individual playtrace row (used for majority voting).
avg_test_set = user_data.groupby('id').mean().to_numpy()
maj_test_set = user_data.to_numpy()

# Per-feature maxima are taken over training and user data together so that
# all three sets share the same scale.
total_data = np.concatenate((input_data, avg_test_set, maj_test_set)) 
max_values = total_data.max(axis=0)
# A feature whose maximum is 0 would produce 0/0 = NaN below and make the
# classifiers reject the data; divide such columns by 1 instead.
max_values[max_values == 0] = 1

n_input_data = input_data / max_values
n_avg_test_set = avg_test_set / max_values
n_maj_test_set = maj_test_set / max_values

# Training Classifier

In [266]:
# The decision tree permutes features randomly at each split, so ties between
# equally good splits can resolve differently between runs. A fixed
# random_state makes the fitted tree (and its predictions) reproducible.
dt = tree.DecisionTreeClassifier(max_depth=9, random_state=0)
dt = dt.fit(n_input_data, output_data)
print(f"Decision Tree Training Score: {dt.score(n_input_data, output_data)}")

knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(n_input_data, output_data)
print(f"K Nearest Neighbor Training Score: {knn.score(n_input_data, output_data)}")

svc = svm.SVC(decision_function_shape='ovo')
svc = svc.fit(n_input_data, output_data)
print(f"Support Vector Machine Training Score: {svc.score(n_input_data, output_data)}")

Decision Tree Training Score: 0.9066666666666666
K Nearest Neighbor Training Score: 0.9156666666666666
Support Vector Machine Training Score: 0.8506666666666667


# Predicting Users

In [267]:
# Pairwise agreement between classifiers on the per-user averaged features.
dt_avg_labels = dt.predict(n_avg_test_set)
knn_avg_labels = knn.predict(n_avg_test_set)
svc_avg_labels = svc.predict(n_avg_test_set)
print(f"Average DT vs KNN: {(dt_avg_labels == knn_avg_labels).mean()}")
print(f"Average DT vs SVC: {(dt_avg_labels == svc_avg_labels).mean()}")
print(f"Average KNN vs SVC: {(knn_avg_labels == svc_avg_labels).mean()}")
# Pairwise agreement on the majority vote over each group of playtrace rows.
# The fraction is taken over the voted labels themselves; their count can
# differ from len(avg_test_set) when a user does not have exactly one full
# voting group of rows.
dt_maj_labels = get_majority_voting(dt, n_maj_test_set)
knn_maj_labels = get_majority_voting(knn, n_maj_test_set)
svc_maj_labels = get_majority_voting(svc, n_maj_test_set)
print(f"Majority DT vs KNN: {(dt_maj_labels == knn_maj_labels).mean()}")
print(f"Majority DT vs SVC: {(dt_maj_labels == svc_maj_labels).mean()}")
print(f"Majority KNN vs SVC: {(knn_maj_labels == svc_maj_labels).mean()}")

Average DT vs KNN: 0.6190476190476191
Average DT vs SVC: 0.6523809523809524
Average KNN vs SVC: 0.7476190476190476
Majority DT vs KNN: 0.6238095238095238
Majority DT vs SVC: 0.638095238095238
Majority KNN vs SVC: 0.8476190476190476
