# Research Question 2: Which mechanical taxonomy is a better predictor of subdomains 

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import pymc as pm


from scipy.stats import shapiro, anderson, jarque_bera, spearmanr, probplot
import statsmodels.api as sm
import statsmodels.formula.api as smf

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, average_precision_score
from sklearn.utils import class_weight
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier

from tqdm import tqdm

from stargazer.stargazer import Stargazer
from IPython.core.display import HTML



## Loading dataframes

In [2]:
cat_17 = pd.read_csv(f'data/2017/categories_2017.csv', index_col='Unnamed: 0', encoding='utf-8') 
mech_17 = pd.read_csv(f'data/2017/mechanics_2017.csv', index_col='Unnamed: 0', encoding='utf-8')
sub_17 = pd.read_csv(f'data/2017/subdomains_2017.csv')
rank_id_17 = pd.read_csv('data/2017/rank_id.csv')

cat_23 = pd.read_csv(f'data/2023/categories_2023.csv', index_col='Unnamed: 0', encoding='utf-8') 
mech_23 = pd.read_csv(f'data/2023/mechanics_2023.csv', index_col='Unnamed: 0', encoding='utf-8')
sub_23 = pd.read_csv(f'data/2023/subdomains_2023.csv')
rank_id_23 = pd.read_csv('data/2023/rank_id.csv')

df_list = [cat_17, mech_17, cat_23, mech_23]

## Building Ontologies

## Functions

In [3]:
def test_forest(df):
    X = df.drop(columns=['rank', 'percentile', 'game_id'])
    y = df[:]['percentile']

    clf = RandomForestClassifier(n_estimators=100, random_state = 54)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=54)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Model Accuracy
    print("Accuracy:", accuracy_score(y_test, y_pred))

    # Classification report
    print(classification_report(y_test, y_pred))
    
    return clf

In [4]:
def add_percentile_column(df, rank_id):
    df = rank_id.join(df.set_index('game_id'), on='game_id', rsuffix='_mech')
    df['percentile'] = pd.qcut(df['rank'], q = 10, labels=False)
    df['percentile'] = df['percentile'] + 1

    percentile_col = df.columns[-1]
    last_column = df.pop(percentile_col)
    df.insert(1, percentile_col, last_column)
    return df

## Training

In [5]:
# adding percentiles to dataframes
for i in range(len(df_list)):
    if 'percentile' not in df_list[i].columns:
        if i < 2:
            df_list[i] = add_percentile_column(df_list[i], rank_id_17)
        else:
            df_list[i] = add_percentile_column(df_list[i], rank_id_23)
    else:
        pass
for i in range(len(df_list)):
    print(f'printed df: {df_list[i].head()}')


printed df:    rank  percentile  geek_rating  game_id  Abstract Strategy  \
0     1           1      8.48904   161936                  0   
1     2           1      8.30744   182028                  0   
2     3           1      8.22021    12333                  0   
3     4           1      8.15458   120677                  0   
4     5           1      8.15151   174430                  0   

   Action / Dexterity  Adventure  Age of Reason  American Civil War  \
0                   0          0              0                   0   
1                   0          0              0                   0   
2                   0          0              0                   0   
3                   0          0              0                   0   
4                   0          1              0                   0   

   American Indian Wars  ...  Transportation  Travel  Trivia  \
0                     0  ...               0       0       0   
1                     0  ...               0    

  df['percentile'] = pd.qcut(df['rank'], q = 10, labels=False)
  df.insert(1, percentile_col, last_column)


In [15]:
df_17 = pd.merge(mech_17, sub_17, on='game_id')
df_23 = pd.merge(mech_23, sub_23, on='game_id')
df_list = [df_17, df_23]

In [17]:
for df in df_list:
    year = 2017 if df.equals(df_17) else 2023
    print(f'Random Forest Classifier for {year}:')
    subdomains = ['Abstract Games', 'Children\'s Games', 'Customizable Games', 'Family Games', 'Party Games', 'Strategy Games', 'Thematic Games', 'Wargames']

    X = df.drop(columns=['game_id', 'Abstract Games', 'Children\'s Games', 'Customizable Games', 'Family Games', 'Party Games', 'Strategy Games', 'Thematic Games', 'Wargames'])
    y = df[subdomains]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=54)


    param_grid = {
        'n_estimators': [500],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }


    clf = RandomForestClassifier(class_weight='balanced', random_state = 42)

    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
    grid_search.fit(X, y)

    best_clf = grid_search.best_estimator_
    multi_clf = MultiOutputClassifier(best_clf, n_jobs=-1)

    multi_clf.fit(X_train, y_train)
    y_pred = multi_clf.predict(X_test)

    # Model Accuracy
    print("Accuracy:", accuracy_score(y_test, y_pred))

    # Classification report
    print(classification_report(y_test, y_pred,target_names=subdomains))
    
    # Confusion Matrix Metrics
    for i in range(len(subdomains)):
        cmatrix = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])
        classes = [f'{subdomains[i].rstrip(" Games")}', f'Not {subdomains[i].rstrip(" Games")}']

        TN = cmatrix[0][0]
        FP = cmatrix[0][1]
        FN = cmatrix[1][0]
        TP = cmatrix[1][1]

        sensitivity = TP/(TP+FN)
        specificity = TN/(TN+FP)

        precision = TP/(TP+FP)
        neg_pred = TN/(TN+FN)

        accuracy = (TP+TN)/(TP+TN+FP+FN)

        print(f'Metrics for {subdomains[i]} Confusion Matrix',
              f'\n\tSensitivity: {sensitivity:0.2f}',
              f'\n\tSpecificity: {specificity:0.2f}',
              f'\n\tPrecision: {precision:0.2f}',
              f'\n\tNegative Predictive Value: {neg_pred:0.2f}',
              f'\n\tAccuracy: {accuracy:0.2f}')

Random Forest Classifier for 2017:
Accuracy: 0.3
                    precision    recall  f1-score   support

    Abstract Games       0.30      0.72      0.43        76
  Children's Games       0.16      0.72      0.27        32
Customizable Games       0.18      0.55      0.27        33
      Family Games       0.46      0.77      0.58       254
       Party Games       0.34      0.73      0.46        81
    Strategy Games       0.58      0.74      0.65       303
    Thematic Games       0.43      0.66      0.52       163
          Wargames       0.74      0.82      0.78       194

         micro avg       0.45      0.74      0.56      1136
         macro avg       0.40      0.71      0.49      1136
      weighted avg       0.50      0.74      0.59      1136
       samples avg       0.54      0.72      0.59      1136

Metrics for Abstract Games Confusion Matrix 
	Sensitivity: 0.72 
	Specificity: 0.86 
	Precision: 0.30 
	Negative Predictive Value: 0.97 
	Accuracy: 0.85
Metrics for Chi

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5075
                    precision    recall  f1-score   support

    Abstract Games       0.47      0.56      0.51        16
  Children's Games       0.17      0.14      0.15         7
Customizable Games       0.55      0.35      0.43        17
      Family Games       0.72      0.58      0.64       128
       Party Games       0.75      0.60      0.67        30
    Strategy Games       0.77      0.80      0.78       179
    Thematic Games       0.85      0.67      0.75        86
          Wargames       0.82      0.71      0.76        38

         micro avg       0.75      0.67      0.71       501
         macro avg       0.64      0.55      0.59       501
      weighted avg       0.75      0.67      0.70       501
       samples avg       0.73      0.71      0.70       501

Metrics for Abstract Games Confusion Matrix 
	Sensitivity: 0.56 
	Specificity: 0.97 
	Precision: 0.47 
	Negative Predictive Value: 0.98 
	Accuracy: 0.96
Metrics for Children's Games Confusion Matrix 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## confusion matrix stats

In [10]:
# for i in range(len(subdomains)):
#     cmatrix = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])
#     classes = [f'{subdomains[i].rstrip(" Games")}', f'Not {subdomains[i].rstrip(" Games")}']
    
#     TN = cmatrix[0][0]
#     FP = cmatrix[0][1]
#     FN = cmatrix[1][0]
#     TP = cmatrix[1][1]
    
#     sensitivity = TP/(TP+FN)
#     specificity = TN/(TN+FP)
    
#     precision = TP/(TP+FP)
#     neg_pred = TN/(TN+FN)
    
#     accuracy = (TP+TN)/(TP+TN+FP+FN)
    
#     print(f'Metrics for {subdomains[i]} Confusion Matrix',
#           f'\n\tSensitivity: {sensitivity:0.2f}',
#           f'\n\tSpecificity: {specificity:0.2f}',
#           f'\n\tPrecision: {precision:0.2f}',
#           f'\n\tNegative Predictive Value: {neg_pred:0.2f}',
#           f'\n\tAccuracy: {accuracy:0.2f}')

Metrics for Abstract Games Confusion Matrix 
	Sensitivity: 0.56 
	Specificity: 0.97 
	Precision: 0.47 
	Negative Predictive Value: 0.98 
	Accuracy: 0.96
Metrics for Children's Games Confusion Matrix 
	Sensitivity: 0.14 
	Specificity: 0.99 
	Precision: 0.17 
	Negative Predictive Value: 0.98 
	Accuracy: 0.97
Metrics for Customizable Games Confusion Matrix 
	Sensitivity: 0.35 
	Specificity: 0.99 
	Precision: 0.55 
	Negative Predictive Value: 0.97 
	Accuracy: 0.96
Metrics for Family Games Confusion Matrix 
	Sensitivity: 0.58 
	Specificity: 0.89 
	Precision: 0.72 
	Negative Predictive Value: 0.82 
	Accuracy: 0.79
Metrics for Party Games Confusion Matrix 
	Sensitivity: 0.60 
	Specificity: 0.98 
	Precision: 0.75 
	Negative Predictive Value: 0.97 
	Accuracy: 0.95
Metrics for Strategy Games Confusion Matrix 
	Sensitivity: 0.80 
	Specificity: 0.81 
	Precision: 0.77 
	Negative Predictive Value: 0.83 
	Accuracy: 0.80
Metrics for Thematic Games Confusion Matrix 
	Sensitivity: 0.67 
	Specificity: 0.

## Figures

In [9]:
plt.figure(figsize=(10, 8))

y_score = multi_clf.predict_proba(X_test)

# For each subdomain
for i in range(len(subdomains)):
    precision, recall, _ = precision_recall_curve(y_test.iloc[:, i], y_score[i][:, 1])
    average_precision = average_precision_score(y_test.iloc[:, i], y_score[i][:, 1])

    plt.plot(recall, precision, lw=2, label='Precision-Recall curve of class {0} (area = {1:0.2f})'
                                           ''.format(subdomains[i], average_precision))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves of 2017 Classifier')
plt.legend(loc="lower right")
plt.savefig(f'figures/2017/Precision Recall Curves 2017.png')
plt.show()

NameError: name 'plt' is not defined

In [None]:
for i in range(len(subdomains)):
    matrix = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])
    classes = [f'{subdomains[i].rstrip(' Games')}', f'Not {subdomains[i].rstrip(' Games')}']
    plt.matshow(matrix)
    plt.title(f'2017 Confusion Matrix of {subdomains[i]}')
    plt.colorbar()
    plt.ylabel('Ground Truth')
    plt.xlabel('Prediction')
    plt.xticks(range(len(classes)), classes, rotation=45)
    plt.yticks(range(len(classes)), classes)
    
    for j in range(len(classes)):
        for k in range(len(classes)):
            plt.text(k, j, matrix[j, k], ha='center', va='center')
    plt.savefig(f'figures/2017/confusion_matrix/{subdomains[i]} Confusion Matrix_2017.pdf')
    plt.show()


In [None]:
fig, axs = plt.subplots(2, 4, figsize=(15, 10))
fig.suptitle('Confusion Matrices of 2017')

for i, ax in enumerate(axs.flatten()):
    matrix = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])
    classes = [f'{subdomains[i]}', f'Not {subdomains[i]}']
    cax = ax.matshow(matrix, cmap=plt.cm.Blues)
    fig.colorbar(cax, ax=ax)
    ax.set_title(f'{subdomains[i]}')
    ax.set_ylabel('Ground Truth')
    ax.set_xlabel('Prediction')
    ax.set_xticks(range(len(classes)))
    ax.set_yticks(range(len(classes)))
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticklabels(classes)
    
    for j in range(len(classes)):
        for k in range(len(classes)):
            ax.text(k, j, matrix[j, k], ha='center', va='center')

plt.tight_layout(rect=[0, 0, 1, 0.96]) 
plt.savefig('figures/2017/Confusion Matrices_2017.png')
plt.show()

In [None]:
# Initialize a DataFrame to hold the feature importances
feature_importances = pd.DataFrame()
feature_importances['feature'] = X.columns

# For each classifier in the multi output classifier
for i, classifier in enumerate(multi_clf.estimators_):
    # Get the feature importances
    importances = classifier.feature_importances_
    
    # Add these importances to the DataFrame
    feature_importances[f'importance_{subdomains[i]}'] = importances

# Print the DataFrame
print(feature_importances)

In [None]:
# For each class
plt.figure(figsize=(10, 8))
for i in range(len(subdomains)):
    # Initialize figure for this class

    plt.plot(recall[i], precision[i], label='Precision-Recall curve of class {0} (area = {1:0.2f})'
                                           ''.format(subdomains[i], average_precision[i]))

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall curve for {subdomains[i]}')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
import numpy as np

fig, axs = plt.subplots(2, 4, figsize=(15, 10))
fig.suptitle('Confusion Matrices of 2017')

# Initialize the aggregate confusion matrix
aggregate_matrix = np.zeros((2, 2))

for i, ax in enumerate(axs.flatten()):
    matrix = confusion_matrix(y_test.iloc[:, i], y_pred[:, i])
    
    # Add the current matrix to the aggregate matrix
    aggregate_matrix += matrix

    classes = [f'{subdomains[i]}', f'Not {subdomains[i]}']
    cax = ax.matshow(matrix, cmap=plt.cm.Blues)
    fig.colorbar(cax, ax=ax)
    ax.set_title(f'{subdomains[i]}')
    ax.set_ylabel('Ground Truth')
    ax.set_xlabel('Prediction')
    ax.set_xticks(range(len(classes)))
    ax.set_yticks(range(len(classes)))
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticklabels(classes)
    
    for j in range(len(classes)):
        for k in range(len(classes)):
            ax.text(k, j, matrix[j, k], ha='center', va='center')

# Print or plot the aggregate confusion matrix as required
print(aggregate_matrix)

plt.tight_layout(rect=[0, 0, 1, 0.96]) 
plt.savefig('figures/2017/Confusion Matrices_2017.png')
plt.show()