In [1]:
import os
import json
from tqdm import tqdm
import pickle
import numpy as np
import pandas as pd
from functools import reduce
from collections import Counter

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk
# Fetch the NLTK resources used further down (the saved output shows them being downloaded on a fresh runtime).
nltk.download('stopwords')  # stop-word lists, read below via stopwords.words('english')
nltk.download('punkt')  # presumably the tokenizer data behind word_tokenize -- confirm for the installed NLTK version
nltk.download('wordnet')  # presumably the corpus WordNetLemmatizer looks words up in
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [5]:
# Colab-only: mount Google Drive at /content/gdrive so the pickles and the GloVe file can be read.
# Later cells open paths starting with 'gdrive/', i.e. relative paths, so they rely on the
# working directory being /content.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:


# Load the pickled game records; later cells read each record's 'description' and 'genres' keys.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('gdrive/My Drive/Fall_2020/ISE540/ise540project/RAWG/data/data10m.pkl','rb') as f:
    data_list = pickle.load(f)

Mounted at /content/gdrive


CREATE THE DATA SETS USED FOR BASELINE

In [None]:
# Description text of every game record, in record order.
description_list = [game['description'] for game in data_list]
# Genre names of every game record: each 'genres' entry is reduced to its 'name'.
genre_list = [[genre['name'] for genre in game['genres']] for game in data_list]

In [None]:
# One row per game: the raw description ('text') and its list of genre names ('labels').
df = pd.DataFrame({'text':description_list, 'labels':genre_list})
df  # echo the frame; the saved output shows pandas truncating the display

Unnamed: 0,text,labels
0,"The year is 2021, and Genoq has become a leadi...","[Adventure, Puzzle]"
1,<strong>Extreme Exorcism</strong> is a paranor...,"[Action, Adventure, Casual, Indie]"
2,"<ul><li>""Toto Temple Deluxe is near perfect an...","[Action, Casual, Indie]"
3,"Penarium is set in a sinister circus arena, wh...","[Action, Arcade, Casual, Indie]"
4,A labyrinth fades into view; a space you contr...,"[Action, Indie]"
...,...,...
86551,"<p>""Party game"" to be played with 2 or 4 playe...",[]
86552,"<p>At ease, pilot!<br/></p><p>Your goal is to ...",[Action]
86553,<p>It is Secret of Mana meets Zelda in all it'...,[Adventure]
86554,<p>Simucities is a citybuilder in the vanes of...,[]


In [None]:
# Keep only games that carry at least one genre, then renumber the rows from 0.
has_labels = df['labels'].apply(lambda labels: labels != [])
df = df[has_labels].reset_index(drop=True)

# Expand the genre lists into one 0/1 indicator column per genre (columns follow mlb.classes_).
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(df['labels'])
df2 = pd.DataFrame(label_matrix, columns=mlb.classes_, index=df.index)

# Carry the description text along and make the wide frame the working `df`.
df2['text'] = df['text']
df = df2

In [None]:
table = str.maketrans('','',string.punctuation) # Translation table that deletes every character in string.punctuation
stop_words = set(stopwords.words('english')) # English stop words to drop; text_clean tests membership against this set
lemmatizer = WordNetLemmatizer()  # used by text_clean to lemmatize each surviving word
steamer = PorterStemmer()  # NOTE(review): presumably meant "stemmer"; not referenced anywhere else in the visible notebook

def text_clean(l):
    """Turn a raw (possibly HTML-formatted) description into a list of cleaned words.

    Steps, in order: replace HTML tags with a space, tokenize with ``word_tokenize``,
    lowercase, delete punctuation via the module-level ``table``, keep purely
    alphabetic tokens, drop ``stop_words`` and lemmatize what is left.

    Args:
        l: The description text.

    Returns:
        list[str]: The cleaned, lemmatized words in their original order.
    """
    # Raw string: '\<' and '\/' are invalid escape sequences in a normal string literal.
    # The optional "(?:\s[^<>]*)?" group also strips tags that carry whitespace or
    # attributes (<br />, <a href="...">), which the old pattern left in the text so that
    # fragments such as "href" ended up among the words.
    l = re.sub(r'</?\w+(?:\s[^<>]*)?/?>', ' ', l)
    tokens = [w.lower() for w in word_tokenize(l)]
    stripped = [w.translate(table) for w in tokens]
    # Drop anything that is not purely alphabetic (numbers, empty strings left by punctuation).
    words = [w for w in stripped if w.isalpha()]
    words = [w for w in words if w not in stop_words]
    return [lemmatizer.lemmatize(w) for w in words]

In [None]:
def read_glove_vecs(glove_file):
    """Read a whitespace-separated embedding file of the form ``token v1 v2 ... vn``.

    Args:
        glove_file: Path to a UTF-8 text file with one token and its vector per line.

    Returns:
        tuple: ``(words_to_index, index_to_words, word_to_vec_map)`` where

        * ``words_to_index`` maps each token to its 1-based rank in sorted token order,
        * ``index_to_words`` is the inverse mapping,
        * ``word_to_vec_map`` maps each token to a ``float64`` numpy vector
          (if a token occurs twice, the last line wins).
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank lines carry no token; indexing fields[0] would raise IndexError.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # Indices start at 1 and follow the sorted token order.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [None]:
def title_to_vec(title):
    """Average the embedding vectors of the words in ``title``.

    Words missing from the global ``word_to_vec_map`` are ignored.

    Args:
        title: Iterable of words (e.g. the output of ``text_clean``).

    Returns:
        The element-wise mean of the known words' vectors, or the int ``0`` when none
        of the words has a vector. The notebook later filters rows on that int sentinel.
    """
    known_vectors = [word_to_vec_map[token] for token in title if token in word_to_vec_map]
    if not known_vectors:
        return 0
    return sum(known_vectors) / len(known_vectors)

In [None]:
# Keep only the third return value (token -> vector dict); title_to_vec reads it as a global.
word_to_vec_map = read_glove_vecs('gdrive/My Drive/Fall_2020/ISE540/ise540project/RAWG/glove.6B.50d.txt')[2]

In [None]:
# Tokenize every description, then average the word vectors of its tokens.
df['word'] = df['text'].apply(text_clean)
df['vec'] = df['word'].apply(title_to_vec)

# title_to_vec returns the int 0 when no token had a vector; drop those rows.
has_embedding = df['vec'].apply(lambda vec: type(vec) is not int)
df = df[has_embedding]

In [None]:
# Flatten the per-game genre lists into one list of genre names.
# A single comprehension is linear in the number of names, whereas
# reduce(lambda a, b: a + b, ...) copied the growing list once per game and
# raised TypeError when genre_list was empty.
genres = [name for names in genre_list for name in names]

In [None]:
# Tally how often each entry of `genres` occurs.
Counter(genres)

Counter({'Action': 25447,
         'Adventure': 18549,
         'Arcade': 11217,
         'Board Games': 2558,
         'Card': 1504,
         'Casual': 13402,
         'Educational': 614,
         'Family': 1897,
         'Fighting': 1065,
         'Indie': 19508,
         'Massively Multiplayer': 1490,
         'Platformer': 2141,
         'Puzzle': 7388,
         'RPG': 10380,
         'Racing': 5605,
         'Shooter': 3248,
         'Simulation': 11652,
         'Sports': 6324,
         'Strategy': 11581})

In [None]:
# Rebind `genres` to the column labels that remain after dropping the text/feature columns.
genres = df.drop(['text','word','vec'], axis=1).columns

In [None]:
# Hold out 33% of the rows; random_state pins the shuffle.
train_df, test_df = train_test_split(df, test_size=0.33, random_state=42, shuffle=True)

# Features: stack the per-game vectors from the 'vec' column into one array per split.
X_train = np.stack(list(train_df['vec']))
X_test = np.stack(list(test_df['vec']))

# Targets: every column that is not text or a derived feature.
non_label_columns = ['text', 'word', 'vec']
y_train = train_df.drop(columns=non_label_columns).to_numpy()
y_test = test_df.drop(columns=non_label_columns).to_numpy()

In [None]:
# Fit a one-vs-rest wrapper around a default SVC on the training features and label matrix.
clf = OneVsRestClassifier(SVC()).fit(X_train, y_train)

In [None]:
# ROC AUC needs a continuous score per label to rank the examples. Hard 0/1 output from
# predict() collapses each ROC curve to a single operating point, so use the signed SVC
# margins instead. (SVC() is built without probability=True, so predict_proba is unavailable.)
y_prob = clf.decision_function(X_test)

In [None]:
# Score the SVC outputs currently held in y_prob against the held-out label matrix.
roc_auc_score(y_test, y_prob)

In [None]:
# Same one-vs-rest setup with a default GradientBoostingClassifier; n_jobs=-1 requests parallel fitting.
clf2 = OneVsRestClassifier(GradientBoostingClassifier(), n_jobs=-1).fit(X_train, y_train)

In [None]:
# Probabilities from the boosted models; this rebinds y_prob, replacing the SVC outputs.
y_prob = clf2.predict_proba(X_test)

In [None]:
# Score whatever y_prob currently holds (the gradient-boosting probabilities when cells run in order).
roc_auc_score(y_test, y_prob)

In [None]:
# Spot-check one held-out game. NOTE(review): 11121 is a hard-coded index value and must be present in test_df.
test_df.text[11121]

In [None]:
# reshape(1, -1) turns the single vector into a one-row batch; [0] takes that row's predicted labels back out.
prediction = clf2.predict(test_df.vec[11121].reshape(1,-1))[0]

In [None]:
# Boolean-mask the genre names with the positions where the prediction equals 1.
genres.to_numpy()[prediction == 1]

In [None]:
train_df  # evaluated but not displayed: a notebook cell only echoes its last expression
y_train

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0]])

In [None]:
# Frequency of predictions

# Get the frequency of a game belonging to 1, 2, 3, ..., n genres

# Get the frequency of each genre

array([ 0.15160542,  0.20533319,  0.31467445, -0.01337747,  0.11468971,
       -0.13322361, -0.49542268, -0.01646927,  0.20100949, -0.11494901,
       -0.01399378, -0.11547573, -0.31393773,  0.17709266,  0.19530624,
        0.13424335,  0.14792551,  0.44214869, -0.33574066, -0.80327155,
        0.11706932,  0.03973944, -0.24929335,  0.31746158,  0.21727958,
       -1.05926116, -0.0065889 ,  0.34615203,  0.64063323, -0.166693  ,
        2.64129677,  0.20043426, -0.18011447, -0.1939086 ,  0.31854043,
        0.22969133,  0.29839996,  0.03903874, -0.07484023, -0.08952137,
       -0.10788284, -0.11727216, -0.03233715,  0.19911806, -0.06501656,
       -0.10786626,  0.1786842 , -0.16799011,  0.20486274,  0.02876316])

In [None]:
# Persist the training label matrix; the random-baseline cells reload this file as response_training_baseline.
with open('gdrive/My Drive/Fall_2020/ISE540/ise540project/RAWG/data/response_training_baseline.pkl','wb') as f:
    pickle.dump(y_train, f)

In [None]:
# Persist the training frame; the random-baseline cells reload this file as training_data_baseline.
with open('gdrive/My Drive/Fall_2020/ISE540/ise540project/RAWG/data/training_data_baseline.pkl','wb') as f:
    pickle.dump(train_df, f)

#RANDOM BASELINE BEGINNING



In [6]:
# Reload the artifacts written by the pickle.dump cells above, so the baseline section
# can start from here without redoing the embedding work.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('gdrive/My Drive/Fall_2020/ISE540/ise540project/RAWG/data/response_training_baseline.pkl','rb') as f:
    response_training_baseline = pickle.load(f)

# NOTE(review): training_data_baseline is not referenced again in the visible cells.
with open('gdrive/My Drive/Fall_2020/ISE540/ise540project/RAWG/data/training_data_baseline.pkl','rb') as f:
    training_data_baseline = pickle.load(f)

In [7]:
# Column sums of the label matrix, i.e. one total per label column.
# NOTE(review): not referenced again in the visible cells (p_each_genre recomputes the same sums).
frequency_of_each_genre = pd.DataFrame(response_training_baseline).sum(axis=0)

In [8]:
# Row sums of the label matrix, i.e. one total per row, wrapped in a single-column frame.
number_categories = pd.DataFrame(pd.DataFrame(response_training_baseline).sum(axis=1))

In [9]:
# Distribution of the per-row totals: for each observed total, the share of rows that have it.
genres_per_game = number_categories.iloc[:, 0]
genre_count_freq = genres_per_game.value_counts().sort_index().rename('freq')
count_num_genre = genre_count_freq / genre_count_freq.sum()

count_num_genre

1     0.352817
2     0.388903
3     0.169756
4     0.061007
5     0.019360
6     0.005719
7     0.001376
8     0.000354
9     0.000590
10    0.000079
11    0.000020
19    0.000020
Name: freq, dtype: float64

In [10]:
# Each label column's total divided by the grand total over all columns, so the values sum to 1
# (the share of all genre assignments, not the fraction of games with that genre).
genre_totals = pd.DataFrame(response_training_baseline).sum(axis=0).rename('freq')
p_each_genre = genre_totals / genre_totals.sum()

p_each_genre

0     0.164072
1     0.119762
2     0.072302
3     0.016010
4     0.009479
5     0.085858
6     0.003932
7     0.012348
8     0.006773
9     0.125665
10    0.009314
11    0.014203
12    0.047943
13    0.065664
14    0.035972
15    0.021083
16    0.074814
17    0.040378
18    0.074427
Name: freq, dtype: float64

In [11]:
# Demo draw: sample one genre count, using the values of count_num_genre as the sampling weights.

np.random.choice(count_num_genre.index.tolist(), p=count_num_genre.values.tolist())

2

In [12]:
# Demo draw: sample one genre index, using the values of p_each_genre as the sampling weights.

np.random.choice(p_each_genre.index.tolist(), p=p_each_genre.values.tolist())

13

In [14]:
#10,000 random obs
# Synthetic "truth" labels: for each observation draw how many genres it has, then draw
# that many distinct genres weighted by p_each_genre.
#
# * A dedicated, seeded generator makes the sample reproducible. The seed must differ from
#   the one used for the predicted sample, otherwise both samples would be identical.
# * choice(..., replace=False, p=...) draws distinct genres directly. It follows the same
#   distribution as re-drawing until an unseen genre appears, but raises ValueError instead
#   of looping forever when more genres are requested than have non-zero probability.
# * The index/probability arrays never change, so they are built once, not on every draw.
truth_rng = np.random.default_rng(0)
count_values = count_num_genre.index.to_numpy()
count_probs = count_num_genre.to_numpy()
genre_ids = p_each_genre.index.to_numpy()
genre_probs = p_each_genre.to_numpy()

truth_obs = []
for _obs in range(10000):
    num_categories_truth = truth_rng.choice(count_values, p=count_probs)
    list_categories_truth = truth_rng.choice(
        genre_ids, size=num_categories_truth, replace=False, p=genre_probs
    ).tolist()
    truth_obs.append(list_categories_truth)


In [15]:
# Random "predictions": for each observation draw how many genres it has, then draw that
# many distinct genres weighted by p_each_genre.
#
# * A dedicated, seeded generator makes the sample reproducible. The seed must differ from
#   any seed used for the truth sample, otherwise predictions and truth would be identical.
# * choice(..., replace=False, p=...) draws distinct genres directly. It follows the same
#   distribution as re-drawing until an unseen genre appears, but raises ValueError instead
#   of looping forever when more genres are requested than have non-zero probability.
# * The index/probability arrays never change, so they are built once, not on every draw.
predicted_rng = np.random.default_rng(1)
count_values = count_num_genre.index.to_numpy()
count_probs = count_num_genre.to_numpy()
genre_ids = p_each_genre.index.to_numpy()
genre_probs = p_each_genre.to_numpy()

predicted_obs = []
for _obs in range(10000):
    num_categories_predicted = predicted_rng.choice(count_values, p=count_probs)
    list_categories_predicted = predicted_rng.choice(
        genre_ids, size=num_categories_predicted, replace=False, p=genre_probs
    ).tolist()
    predicted_obs.append(list_categories_predicted)

In [16]:
# Pair the i-th random prediction with the i-th random truth, one row per observation.
random_predictions = pd.DataFrame({"predicted": predicted_obs,"truth":truth_obs}, columns=['predicted','truth'])
random_predictions

Unnamed: 0,predicted,truth
0,[0],[1]
1,"[13, 8]","[1, 18]"
2,[6],[17]
3,"[0, 12, 15, 18]","[2, 9]"
4,"[18, 17]","[9, 0, 18, 2]"
...,...,...
9995,"[16, 13]",[0]
9996,[13],"[1, 12, 5]"
9997,[16],"[12, 15]"
9998,[16],"[9, 0]"


In [17]:
# One zeroed confusion-count record per genre index, keyed by the index as a string
# ("0" through "18"). Every genre gets its own inner dict so counts are not shared.
results_dict = {
    str(genre_index): {"true_pos": 0, "true_neg": 0, "false_pos": 0, "false_neg": 0}
    for genre_index in range(19)
}

In [18]:
# Accumulate per-genre confusion counts over every (predicted, truth) pair.
# NOTE: this adds to results_dict in place, so re-running the cell without first
# re-creating results_dict counts every observation twice.
for predicted, truth in zip(random_predictions['predicted'], random_predictions['truth']):
    # Sets give constant-time membership tests; each list holds distinct genres by construction.
    predicted_set = set(predicted)
    truth_set = set(truth)

    for i in predicted_set:
        outcome = 'true_pos' if i in truth_set else 'false_pos'
        results_dict[str(i)][outcome] += 1

    for i in truth_set - predicted_set:
        results_dict[str(i)]['false_neg'] += 1

    # Genres in neither list are true negatives; previously 'true_neg' was never updated
    # and stayed at 0 for every genre.
    involved_keys = {str(i) for i in predicted_set | truth_set}
    for key in results_dict.keys() - involved_keys:
        results_dict[key]['true_neg'] += 1

In [19]:
# Unpack the dict into a list of its keys (same keys as results_dict.keys(), shown as a plain list).
[*results_dict]

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18']

#MICRO

In [20]:
# Micro-averaging pools the raw counts over all genres before any ratio is taken.
sum_true_positive = sum(counts['true_pos'] for counts in results_dict.values())
sum_false_positive = sum(counts['false_pos'] for counts in results_dict.values())
sum_false_negative = sum(counts['false_neg'] for counts in results_dict.values())

In [21]:
# Micro-averaged metrics from the pooled counts, expressed as percentages.
micro_average_precision = 100*sum_true_positive/(sum_true_positive+sum_false_positive)
micro_average_recall = 100*sum_true_positive/(sum_true_positive+sum_false_negative)
# Harmonic mean of the two percentages, so the result is a percentage as well.
micro_average_f1 = (2*micro_average_precision*micro_average_recall)/(micro_average_recall+micro_average_precision)

# The labels previously read "miro"; the spelling is corrected to "micro".
print("randomized - micro average precision: ", micro_average_precision,'%')
print("randomized - micro average recall: ",micro_average_recall,'%')
print("randomized - micro average f1-score: ",micro_average_f1,'%')


randomized - miro average precision:  17.995869800373686 %
randomized - miro average recall:  17.99763965381589 %
randomized - miro average f1-score:  17.99675468358165 %


#MACRO 

In [23]:
# Per-genre precision, recall and F1 (as fractions); the next cell averages them.
precision_lst = []
recall_lst = []
f1_score_lst = []

for counts in results_dict.values():
    tp, fp, fn = counts['true_pos'], counts['false_pos'], counts['false_neg']
    # A genre that was never predicted (tp + fp == 0) or never true (tp + fn == 0) has an
    # undefined ratio; score it 0.0 instead of crashing with ZeroDivisionError.
    precision_lst.append(tp / (tp + fp) if tp + fp else 0.0)
    recall_lst.append(tp / (tp + fn) if tp + fn else 0.0)
    # F1 = 2*tp / (2*tp + fp + fn), written with the factor of 2 divided out.
    f1_score_lst.append(tp / (tp + 0.5 * (fn + fp)) if tp + fp + fn else 0.0)


In [28]:
# Macro average = unweighted mean of the per-genre scores; multiplied by 100 to print percentages.
print("randomized - macro average precision: ", np.average(precision_lst)*100,'%')
print("randomized - macro average recall: ",np.average(recall_lst)*100,'%')
print("randomized - macro average f1-score: ",np.average(f1_score_lst)*100,'%')


randomized - macro average precision:  10.615265471527827 %
randomized - macro average recall:  10.594693916469625 %
randomized - macro average f1-score:  10.599896361033592 %
