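# Finalize datasets

Builds the final train/test CSV files for the EPOSTS and MAFIA datasets from the raw splits: maps labels to numeric ids, writes the label maps, and produces answer, censored and random-baseline versions of the test sets.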

In [1]:
import os
import numpy as np 
import pandas as pd


In [2]:
# Absolute path of the `output` folder, resolved one level above the current working directory.
output = os.path.abspath(
    os.path.join('..', 'output')
)

-----

## EPOSTS dataset

In [3]:
# Working folder of the EPOSTS dataset (raw inputs are read from its `raw` subfolder).
ds_eposts = os.path.join(
    output, 'dataset', 'ds_eposts'
)

In [4]:
# Make sure both split folders exist; folders that are already present are left untouched.
for _split in ('train', 'test'):
    os.makedirs(os.path.join(ds_eposts, _split), exist_ok=True)

### Set labelling scheme

In [5]:
# Column names used throughout the EPOSTS section.
epost_lbl = 'thread_label'        # textual label of a thread
epost_outcol = 'thread_label_id'  # numeric label id derived from the label map
epost_usage = 'Usage'             # marks a test row as 'Public' or 'Private'

In [6]:
# Every thread label of the EPOSTS task; a label's position in this list is its numeric id.
possible_labels_epost = [
    'bastard',
    'beginners-mafia',
    'byor',
    'classic',
    'closed-setup',
    'cybrid',
    'kotm',
    'non-mafia-game',
    'other',
    'paranormal',
    'supernatural',
    'vanilla',
    'vengeful',
]

# Label name -> numeric id lookup. Index and values are named so the exported CSV
# carries a `type_name,type_id` header.
labels_epost = pd.Series(
    range(len(possible_labels_epost)),
    index=pd.Index(possible_labels_epost, name='type_name'),
    name='type_id',
)
labels_epost

type_name
bastard             0
beginners-mafia     1
byor                2
classic             3
closed-setup        4
cybrid              5
kotm                6
non-mafia-game      7
other               8
paranormal          9
supernatural       10
vanilla            11
vengeful           12
Name: type_id, dtype: int64

In [7]:
# Persist the label-name -> id mapping in the EPOSTS dataset folder.
labels_epost.to_csv(
    os.path.join(ds_eposts, 'label_map.csv'),
    encoding='utf-8',
    index=True,
    header=True,
)

### Load labels and save into datasets

In [8]:
# Read the raw, fully labelled thread tables for the train / public / private splits.
# The split name is formatted into the file name *before* the path is joined, so a
# literal '%' anywhere in the output directory path cannot break the string formatting.
thread_train, thread_public, thread_private = [
    pd.read_csv(
        os.path.join(ds_eposts, 'raw', '%s_thread_full.csv' % z),
        header=0, encoding='utf-8'
    )
    for z in ['train', 'public', 'private']
]

# Tag every test row with the split it came from before the two parts are merged.
thread_public[epost_usage] = 'Public'
thread_private[epost_usage] = 'Private'

# ignore_index gives the merged test set a unique 0..n-1 index instead of repeating
# the row labels of the public and private frames.
thread_test = pd.concat(
    [thread_public, thread_private], axis='index', ignore_index=True
)

In [9]:
# Encode the textual thread label as its numeric id from the label map.
# NOTE(review): a label missing from the map becomes NaN here and is not filtered
# out afterwards -- confirm that every raw label is covered by the map.
for _frame in (thread_train, thread_test):
    _frame[epost_outcol] = _frame[epost_lbl].map(labels_epost)

# Censored copy of the test set: label, label id and usage columns removed.
_hidden_cols = [epost_lbl, epost_outcol, epost_usage]
thread_test_censored = thread_test.drop(columns=_hidden_cols)

In [10]:
# Random baseline: every censored test thread gets a label id drawn at random from
# the label map. The generator is seeded so the file is identical on every run.
epost_rng = np.random.RandomState(42)
thread_test_random = thread_test_censored.assign(**{
    epost_outcol: epost_rng.choice(labels_epost.values, size=len(thread_test_censored)),
})

In [11]:
# Preview the first rows of the random baseline.
thread_test_random.head(n=5)

Unnamed: 0,thread_num,thread_name,thread_replies,thread_label_id
0,126856,"Mafia Tools and Utilities (lurkertracker, etc)...",38,6
1,132415,Mafia Theory,211,3
2,134482,"Iron Diadem, Night One: Things Said Behind Bar...",474,12
3,133728,Beginner's Mafia XLIV: The Court of Colors | R...,563,10
4,134270,Mod Use #2,10,7


In [12]:
# Keyword arguments shared by every CSV export: UTF-8, header row, no index column.
opts_out = dict(encoding='utf-8', header=True, index=False)

In [13]:
# Export the EPOSTS thread tables:
#   train/thread.csv          training threads including label and label id
#   test/thread_answers.csv   thread number, label id and usage of the test threads
#   test/thread.csv           test threads with the label columns removed
#   test/thread_RANDOM.csv    test threads with randomly drawn label ids
_answer_cols = ['thread_num', epost_outcol, epost_usage]
for _frame, _split, _fname in [
    (thread_train, 'train', 'thread.csv'),
    (thread_test[_answer_cols], 'test', 'thread_answers.csv'),
    (thread_test_censored, 'test', 'thread.csv'),
    (thread_test_random, 'test', 'thread_RANDOM.csv'),
]:
    _frame.to_csv(os.path.join(ds_eposts, _split, _fname), **opts_out)

### Load and save posts dataframe

In [14]:
# Read the raw post tables of the EPOSTS dataset for the train / public / private splits.
# The split name is formatted into the file name *before* the path is joined, so a
# literal '%' anywhere in the output directory path cannot break the string formatting.
post_train, post_public, post_private = [
    pd.read_csv(
        os.path.join(ds_eposts, 'raw', '%s_post.csv' % z),
        header=0, encoding='utf-8'
    )
    for z in ['train', 'public', 'private']
]

# Unlike the thread table, posts are not tagged with a usage column; the public and
# private posts are simply stacked. ignore_index gives the result a unique 0..n-1 index.
post_test = pd.concat(
    [post_public, post_private], axis='index', ignore_index=True
)

In [15]:
# Export the EPOSTS posts, one file per split folder.
for _frame, _split in [(post_train, 'train'), (post_test, 'test')]:
    _frame.to_csv(os.path.join(ds_eposts, _split, 'post.csv'), **opts_out)

-----

## MAFIA dataset: Remove role labels

In [16]:
# Working folder of the MAFIA dataset (raw inputs are read from its `raw` subfolder).
ds_mafia = os.path.join(
    output, 'dataset', 'ds_mafia'
)

In [17]:
# Make sure both split folders exist; folders that are already present are left untouched.
for _split in ('train', 'test'):
    os.makedirs(os.path.join(ds_mafia, _split), exist_ok=True)

### Set labelling scheme

In [18]:
# Column names used throughout the MAFIA section.
mafia_lbl = 'role'        # textual role of a player
mafia_outcol = 'role_id'  # numeric role id derived from the label map
mafia_usage = 'Usage'     # marks a test row as 'Public' or 'Private'

In [19]:
# Roles kept as classes; a role's position in this list is its numeric id.
# An earlier, disabled version of this list also contained
# 'observer', 'ic', 'game-master' and 'unknown'.
possible_labels_mafia = [
    'town',
    'mafia',
]

# Role name -> numeric id lookup. Index and values are named so the exported CSV
# carries a `type_name,type_id` header.
labels_mafia = pd.Series(
    range(len(possible_labels_mafia)),
    index=pd.Index(possible_labels_mafia, name='type_name'),
    name='type_id',
)
labels_mafia

type_name
town     0
mafia    1
Name: type_id, dtype: int64

In [20]:
# Persist the role-name -> id mapping in the MAFIA dataset folder.
labels_mafia.to_csv(
    os.path.join(ds_mafia, 'label_map.csv'),
    encoding='utf-8',
    index=True,
    header=True,
)

### Load labels and save into datasets

In [21]:
# Read the raw, fully labelled role tables for the train / public / private splits.
# The split name is formatted into the file name *before* the path is joined, so a
# literal '%' anywhere in the output directory path cannot break the string formatting.
role_train, role_public, role_private = [
    pd.read_csv(
        os.path.join(ds_mafia, 'raw', '%s_role_full.csv' % z),
        header=0, encoding='utf-8'
    )
    for z in ['train', 'public', 'private']
]

# Tag every test row with the split it came from before the two parts are merged.
role_public[mafia_usage] = 'Public'
role_private[mafia_usage] = 'Private'

# ignore_index gives the merged test set a unique 0..n-1 index instead of repeating
# the row labels of the public and private frames.
role_test = pd.concat(
    [role_public, role_private], axis='index', ignore_index=True
)

In [22]:

# Set labels: encode the textual role as its numeric id.
# Roles that are missing from the label map become NaN.
role_train[mafia_outcol] = role_train[mafia_lbl].map(labels_mafia)
role_test[mafia_outcol] = role_test[mafia_lbl].map(labels_mafia)

# Drop labels we aren't considering, i.e. rows whose role is not in the label map.
# dropna is restricted to the id column so rows that merely have a missing value in
# an unrelated column are kept. With the NaNs gone, the ids are cast back to integers
# (the NaNs had forced the column to float, which would export as 0.0 / 1.0).
role_train = role_train.dropna(subset=[mafia_outcol]).astype({mafia_outcol: 'int64'})
role_test = role_test.dropna(subset=[mafia_outcol]).astype({mafia_outcol: 'int64'})


# Set unique ids
def get_id(row):
    """Build the id of a role row as ``<thread_num>_<user>``, spaces replaced by underscores.

    Parameters
    ----------
    row : object with ``thread_num`` and ``user`` attributes (here: one DataFrame row).

    Returns
    -------
    str
        The id string, e.g. thread 142461 and user "Comrade Shamrock" give
        "142461_Comrade_Shamrock".
    """
    s = "%s_%s" % (row.thread_num, row.user)
    return s.replace(' ', '_')


role_train['uid'] = role_train.apply(get_id, axis='columns')
role_test['uid'] = role_test.apply(get_id, axis='columns')

# Censor labels: copy of the test set without role, role id and usage columns.
role_test_censored = role_test.drop(columns=[mafia_lbl, mafia_outcol, mafia_usage])

# Create randomly seeded output: every censored test row gets a value drawn uniformly
# from [0, 1) in the role id column; the fixed seed makes the file reproducible.
mafia_rng = np.random.RandomState(seed=42)
role_test_random = role_test_censored.copy()
role_test_random[mafia_outcol] = mafia_rng.uniform(0, 1, len(role_test_random))

In [23]:
# Preview the first rows of the random baseline.
role_test_random.head(n=5)

Unnamed: 0,thread_num,user,final_player,uid,role_id
0,142461,4maskwolf,4maskwolf,142461_4maskwolf,0.37454
1,142461,AlleeCat,tn5421,142461_AlleeCat,0.950714
2,142461,Comrade Shamrock,McDonald,142461_Comrade_Shamrock,0.731994
3,142461,Krevsin,Krevsin,142461_Krevsin,0.598658
4,142461,McDonald,McDonald,142461_McDonald,0.156019


In [24]:
# Keyword arguments shared by every CSV export of this section:
# UTF-8, header row, no index column.
opts_out = dict(encoding='utf-8', header=True, index=False)

In [25]:
# Export the MAFIA role tables:
#   train/role.csv          training rows including role and role id
#   test/role_answers.csv   uid, role id and usage of the test rows
#   test/role.csv           test rows with the label columns removed
#   test/role_RANDOM.csv    test rows with random values in the role id column
_answer_cols = ['uid', mafia_outcol, mafia_usage]
for _frame, _split, _fname in [
    (role_train, 'train', 'role.csv'),
    (role_test[_answer_cols], 'test', 'role_answers.csv'),
    (role_test_censored, 'test', 'role.csv'),
    (role_test_random, 'test', 'role_RANDOM.csv'),
]:
    _frame.to_csv(os.path.join(ds_mafia, _split, _fname), **opts_out)

### Load and save posts dataframe

In [26]:
# Read the raw post tables of the MAFIA dataset for the train / public / private splits.
# The split name is formatted into the file name *before* the path is joined, so a
# literal '%' anywhere in the output directory path cannot break the string formatting.
# NOTE: these names are also used for the EPOSTS posts and are rebound here.
post_train, post_public, post_private = [
    pd.read_csv(
        os.path.join(ds_mafia, 'raw', '%s_post.csv' % z),
        header=0, encoding='utf-8'
    )
    for z in ['train', 'public', 'private']
]

# Unlike the role table, posts are not tagged with a usage column; the public and
# private posts are simply stacked. ignore_index gives the result a unique 0..n-1 index.
post_test = pd.concat(
    [post_public, post_private], axis='index', ignore_index=True
)

In [27]:
# Export the MAFIA posts, one file per split folder.
for _frame, _split in [(post_train, 'train'), (post_test, 'test')]:
    _frame.to_csv(os.path.join(ds_mafia, _split, 'post.csv'), **opts_out)