# Create the Gametype problem datasets (train, public test, and private test)

In [1]:
import os  
import logging 
import numpy as np
import pandas as pd

In [2]:
from bay12_scraper.prep.eposts_ds import load_or_create_extended_posts, split_extended_ds

In [3]:
# Absolute path of the `output` directory one level above the current
# working directory; every file read or written below lives under it.
output = os.path.abspath(
    os.path.join('..', 'output')
)

## Preparing full posts

In [4]:
# Read the thread index from threads.csv in the output directory.
threads = pd.read_csv(
    os.path.join(output, 'threads.csv'),
    header=0,  # the first row of the file holds the column names
    encoding='utf-8',
)

In [5]:
# Sanity check: (rows, columns) of the thread index, then its first two rows.
print(threads.shape)
threads.head(n=2)

(594, 5)


Unnamed: 0,url,thread_num,thread_name,thread_label,thread_replies
0,http://www.bay12forums.com/smf/index.php?topic...,134925,Mafia Setup Discussion and Review,other,598
1,http://www.bay12forums.com/smf/index.php?topic...,45016,Games Threshold Discussion and List [Vote for ...,other,5703


In [6]:
# Load the extended posts for all threads, backed by extended_posts.csv in
# the output directory. Note that the initial creation will take a long time.
# NOTE(review): the meaning of incremental / timeout / limit is defined by
# bay12_scraper.prep.eposts_ds.load_or_create_extended_posts - confirm there.
eposts = load_or_create_extended_posts(
    fname=os.path.join(output, 'extended_posts.csv'),
    threads=threads,
    incremental=True,
    timeout=10,
    limit=None,
)

TOTAL THREADS: 594
Saved threads: 39
Saved threads: 59
Saved threads: 79
Saved threads: 99
Saved threads: 119
Saved threads: 139
Saved threads: 159
Saved threads: 179
Saved threads: 199
Saved threads: 219
Saved threads: 239
Saved threads: 259
Saved threads: 279
Saved threads: 299
Saved threads: 319
Saved threads: 339
Saved threads: 359
Saved threads: 379
Saved threads: 399
Saved threads: 419
Saved threads: 439
Saved threads: 459
Saved threads: 479
Saved threads: 499
Saved threads: 519
Saved threads: 539
Saved threads: 559
Saved threads: 579


In [7]:
# Sanity check: (rows, columns) of the posts table, then its first two rows.
print(eposts.shape)
eposts.head(n=2)

(196466, 4)


Unnamed: 0,thread_num,user,text,quotes
0,134925,Leafsnail,This thread is for:\r\n- Discussing proposed o...,[]
1,134925,Leafsnail,(from the previous thread)\r\n QUOTED_SECTION ...,"[<blockquote class=""bbc_standard_quote"">Quantu..."


## Split into train/test by threads

In [8]:
# The two fractions are passed positionally, in this order, to split_extended_ds.
# NOTE(review): presumably the public and private test shares - confirm the
# order against split_extended_ds. No random seed is set in this notebook, so
# if the helper shuffles, the split may not be reproducible - verify.
test_fractions = (0.2, 0.2)
ds = split_extended_ds(threads, eposts, *test_fractions)
# Result layout: {'train': [posts_train, threads_train], ...} with the same
# [posts, threads] pair under the 'public' and 'private' keys.

In [9]:
# One line per split: how many distinct threads its posts belong to,
# and how many posts it holds.
print("\n".join(
    "%s threads, %s posts" % (
        ds[name][0]['thread_num'].nunique(),
        len(ds[name][0]),
    )
    for name in ['train', 'public', 'private']
))

358 threads, 97539 posts
118 threads, 44778 posts
118 threads, 54149 posts


## Export to disk

In [10]:
# Destination folder for the exported CSV files: <output>/dataset/ds_eposts.
dataset = os.path.join(output, 'dataset', 'ds_eposts')
# Create the folder (and missing parents); an existing folder is not an error.
os.makedirs(dataset, exist_ok=True)

In [11]:
# Keyword arguments shared by every to_csv call in the export step:
# UTF-8 text, a header row, and no index column.
opts_out = dict(encoding='utf-8', header=True, index=False)

# Thread columns that are exported; the `url` column of `threads` is left out.
cols_thread = [
    'thread_num',
    'thread_name',
    'thread_label',
    'thread_replies',
]

# Post files keep every column of the extended posts table.
cols_post = eposts.columns.tolist()

In [12]:
# Write two CSV files per split into `dataset`:
#   <split>_post.csv   - the posts   (element 0 of the split's pair)
#   <split>_thread.csv - the threads (element 1 of the split's pair)
for split_name in ['train', 'public', 'private']:
    split_parts = ds[split_name]
    post_path = os.path.join(dataset, '%s_post.csv' % split_name)
    thread_path = os.path.join(dataset, '%s_thread.csv' % split_name)
    split_parts[0][cols_post].to_csv(post_path, **opts_out)
    split_parts[1][cols_thread].to_csv(thread_path, **opts_out)