# Create the Mafia problem datasets (train, public test and private test splits)

In [1]:
import os  
import logging 
import numpy as np
import pandas as pd

In [2]:
#from bay12_scraper.thread import ForumThread
from bay12_scraper.prep.mafia_ds import load_or_create_posts, fix_roles_df, split_ds

In [3]:
# Root folder holding the scraper outputs: the 'output' directory that sits
# next to the current working directory, resolved to an absolute path.
_output_rel = os.path.join('..', 'output')
output = os.path.abspath(_output_rel)

## Preparing full posts

In [4]:
# Read the thread index, then keep only the threads whose label is one of the
# two game types used for this dataset.
threads_csv = os.path.join(output, 'threads.csv')
wanted_labels = ['beginners-mafia', 'vanilla']
threads = pd.read_csv(threads_csv, header=0, encoding='utf-8')
# Rebind the same name so the unfiltered frame is not kept around.
threads = threads[threads['thread_label'].isin(wanted_labels)]

In [5]:
# Read the raw role table and pass it through fix_roles_df
# (imported from bay12_scraper.prep.mafia_ds).
# NOTE(review): the SettingWithCopyWarning printed under this cell quotes a
# line (df_fixed['final_player'][~tbr] = ...) that is not in this notebook, so
# it presumably originates inside fix_roles_df -- confirm and fix it there.
roles_csv = os.path.join(output, 'roles.csv')
roles_raw = pd.read_csv(roles_csv, header=0, encoding='utf-8')
roles = fix_roles_df(roles_raw)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_fixed['final_player'][~tbr] = df_fixed['user'][~tbr]


In [6]:
# Load the posts table from posts.csv, or build it from the roles and threads
# frames when needed. Creating it from scratch can take a long time.
posts_csv = os.path.join(output, 'posts.csv')
posts = load_or_create_posts(posts_csv, roles, threads)

In [7]:
# Quick schema check: display the column index of both frames as a tuple.
posts.columns, roles.columns

(Index(['thread_num', 'user', 'text', 'quotes'], dtype='object'),
 Index(['thread_num', 'user', 'role', 'num_posts', 'replaced_by',
        'replacement_depth', 'final_player'],
       dtype='object'))

In [8]:
# Left-join every post with the role record of its author (matched on thread
# and user) to obtain the "full" dataframe. It is only built for inspection;
# we won't be using it directly.
full_cols = ['thread_num', 'user', 'text', 'role', 'num_posts', 'quotes', 'final_player']
pp = posts.merge(roles, how='left', on=['thread_num', 'user'])[full_cols]
# Preview the first rows, ordered by thread and then by user.
pp.sort_values(by=['thread_num', 'user']).head()

Unnamed: 0,thread_num,user,text,role,num_posts,quotes,final_player
16,39457,Alexhans,I'll co-mod this one if meph doesn't mind... H...,observer,25,[],Alexhans
18,39457,Alexhans,I suggest randomly choosing between this 4 set...,observer,25,[],Alexhans
31,39457,Alexhans,Meph... this game needs to be non pm-free... \...,observer,25,[],Alexhans
57,39457,Alexhans,not really...\r\r\n\r\r\nit says that it can b...,observer,25,[],Alexhans
59,39457,Alexhans,"yes, that's why I strongly suggested the mod t...",observer,25,[],Alexhans


In [9]:
# Preview the first five role records, ordered by thread and then by user.
roles.sort_values(by=['thread_num', 'user']).head(5)

Unnamed: 0,thread_num,user,role,num_posts,replaced_by,replacement_depth,final_player
745,39457,Alexhans,observer,25,,0,Alexhans
746,39457,BloodBeard,town,34,,0,BloodBeard
747,39457,Dariush,observer,2,,0,Dariush
748,39457,Eduren,town,79,,0,Eduren
749,39457,Free Beer,mafia,54,,0,Free Beer


## Split into train/test by threads

In [10]:
# Split the roles and posts frames into train / public / private subsets.
# NOTE(review): the two 0.2 arguments are presumably the fractions held out
# for the public and private test sets (the summary printed by the next cell
# shows 46/15/15 threads) -- confirm against split_ds.
# NOTE(review): no random seed is set in the visible part of this notebook; if
# split_ds shuffles, the split may differ between runs -- confirm.
ds = split_ds(roles, posts, 0.2, 0.2)
# ds maps each split name to a [posts, roles] pair,
# i.e. {'train': [posts_train, roles_train], etc. }

In [11]:
# Print one summary line per split: the number of distinct threads and the
# total number of posts in that split's posts frame.
summary_lines = []
for split_name in ['train', 'public', 'private']:
    split_posts = ds[split_name][0]
    n_threads = split_posts['thread_num'].nunique()
    summary_lines.append("%s threads, %s posts" % (n_threads, len(split_posts)))
print("\n".join(summary_lines))

46 threads, 19889 posts
15 threads, 6619 posts
15 threads, 6479 posts


## Export to disk

In [12]:
# Destination folder for the exported CSV files: <output>/dataset/ds_mafia.
dataset = os.path.join(output, 'dataset', 'ds_mafia') 
# exist_ok=True: no error if the folder already exists, so the cell can be re-run.
os.makedirs(dataset, exist_ok=True)

In [13]:
# Keyword arguments shared by every to_csv call in the export cell:
# UTF-8 text, a header row, and no index column.
opts_out = dict(encoding='utf-8', header=True, index=False)
# Columns written to the role files.
cols_role = ['thread_num', 'user', 'final_player', 'role']
# Columns written to the post files: every column of `posts`.
cols_post = posts.columns.tolist()

In [14]:
# Write two CSV files per split into `dataset`:
# <split>_post.csv (posts frame) and <split>_role.csv (roles frame).
for split_name in ['train', 'public', 'private']:
    post_path = os.path.join(dataset, '%s_post.csv' % split_name)
    split_posts = ds[split_name][0]
    split_posts[cols_post].to_csv(post_path, **opts_out)

    role_path = os.path.join(dataset, '%s_role.csv' % split_name)
    split_roles = ds[split_name][1]
    split_roles[cols_role].to_csv(role_path, **opts_out)