# Preprocess "ROC Stories" for Story Completion

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import glob
import pandas as pd

# Directory that holds the ROCStories CSV releases read in the next cell;
# replace this placeholder with the local path before running the notebook.
DATAPATH = '/path/to/ROCStories'

In [3]:
# Load both ROCStories releases (spring 2016 and winter 2017) from DATAPATH.
spring2016_csv = os.path.join(DATAPATH, "ROCStories__spring2016 - ROCStories_spring2016.csv")
winter2017_csv = os.path.join(DATAPATH, "ROCStories_winter2017 - ROCStories_winter2017.csv")
ROCstory_spring2016 = pd.read_csv(spring2016_csv)
ROCstory_winter2017 = pd.read_csv(winter2017_csv)

In [4]:
# Stack the two releases into a single pool of stories. ignore_index=True
# renumbers the rows 0..N-1; without it each frame keeps its own 0-based labels
# and the combined index contains duplicates, which makes label-based lookups
# on the result ambiguous.
ROCstory_train = pd.concat([ROCstory_spring2016, ROCstory_winter2017], ignore_index=True)

In [5]:
# Number of distinct story ids across both releases.
unique_story_ids = ROCstory_train["storyid"].unique()
unique_story_ids.shape[0]

98161

In [6]:
# Keep only the sentence columns (label slice sentence1..sentence5) and expose
# them as a 2-D array with one row per story.
sentence_columns = ROCstory_train.loc[:, "sentence1":"sentence5"]
stories = sentence_columns.values

## Train, Dev, Test

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 10% of all stories as the test split. random_state pins the shuffle:
# np.random.seed is only called further down the notebook, so without it this
# split (and every file derived from it) would differ on each fresh kernel run.
train_and_dev, test_stories = train_test_split(stories, test_size=0.1, random_state=1234)

In [9]:
# Carve the dev split out of the remaining 90%: 1/9 of it equals 10% of the
# full corpus, giving an 80/10/10 train/dev/test division. random_state pins
# the shuffle so the same stories land in each split on every run.
train_stories, dev_stories = train_test_split(train_and_dev, test_size=1/9, random_state=1234)

In [10]:
# Sizes of the train / dev / test splits, in that order.
tuple(len(split) for split in (train_stories, dev_stories, test_stories))

(78528, 9816, 9817)

### dev

In [11]:
import numpy as np
# Seed NumPy's global RNG so the np.random draws made after this point
# (the positions of the removed sentences) are repeatable.
np.random.seed(1234)

In [12]:
# For every dev story, draw the position (0-4) of the sentence to remove.
n_dev_stories = len(dev_stories)
dev_missing_indexes = np.random.randint(0, 5, size=n_dev_stories)

In [13]:
# Turn each dev story into a row of
# [four remaining sentences in their original order, removed position, removed sentence].
dev_stories_with_missing = [
    list(np.delete(story, gap)) + [gap, story[gap]]
    for story, gap in zip(dev_stories, dev_missing_indexes)
]

In [14]:
# Column layout: the four kept sentences, then the position and the text of
# the removed sentence.
dev_columns = [
    'stories_with_missing_sentence1',
    'stories_with_missing_sentence2',
    'stories_with_missing_sentence3',
    'stories_with_missing_sentence4',
    'missing_id',
    'missing_sentence',
]
dev_df = pd.DataFrame(data=dev_stories_with_missing, columns=dev_columns)

In [15]:
# DataFrame.to_csv does not create missing directories, so make sure ./data
# exists before this first write; the later exports in the notebook target the
# same directory. index=False keeps the row index out of the file.
os.makedirs("./data", exist_ok=True)
dev_df.to_csv("./data/rocstories_completion_dev.csv", index=False)

### test

In [16]:
# For every test story, draw the position (0-4) of the sentence to remove.
n_test_stories = len(test_stories)
test_missing_indexes = np.random.randint(0, 5, size=n_test_stories)

In [17]:
# Turn each test story into a row of
# [four remaining sentences in their original order, removed position, removed sentence].
test_stories_with_missing = [
    list(np.delete(story, gap)) + [gap, story[gap]]
    for story, gap in zip(test_stories, test_missing_indexes)
]

In [18]:
# Column layout: the four kept sentences, then the position and the text of
# the removed sentence.
test_columns = [
    'stories_with_missing_sentence1',
    'stories_with_missing_sentence2',
    'stories_with_missing_sentence3',
    'stories_with_missing_sentence4',
    'missing_id',
    'missing_sentence',
]
test_df = pd.DataFrame(data=test_stories_with_missing, columns=test_columns)

In [19]:
# Persist the test split; index=False keeps the row index out of the file.
test_df.to_csv(path_or_buf="./data/rocstories_completion_test.csv", index=False)

### train

In [20]:
# Training stories stay complete: one column per sentence, in story order.
train_columns = ['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']
train_df = pd.DataFrame(data=train_stories, columns=train_columns)

In [21]:
# Persist the training split; index=False keeps the row index out of the file.
train_df.to_csv(path_or_buf="./data/rocstories_completion_train.csv", index=False)

## load saved data

In [22]:
# Read the exported training file back in to check that it loads.
train_df2 = pd.read_csv(filepath_or_buffer="./data/rocstories_completion_train.csv")

In [23]:
# train_df2.head()

In [24]:
# Read the exported dev file back in to check that it loads.
dev_df2 = pd.read_csv(filepath_or_buffer="./data/rocstories_completion_dev.csv")

In [25]:
# dev_df2.head()

In [26]:
# Read the exported test file back in to check that it loads.
test_df2 = pd.read_csv(filepath_or_buffer="./data/rocstories_completion_test.csv")

In [27]:
# test_df2.head()

In [28]:
# How often each sentence position was removed in the reloaded dev file.
dev_df2["missing_id"].value_counts()

3    1997
4    1983
2    1963
0    1945
1    1928
Name: missing_id, dtype: int64

In [29]:
# How often each sentence position was removed in the reloaded test file.
test_df2["missing_id"].value_counts()

0    2020
1    1988
3    1953
2    1943
4    1913
Name: missing_id, dtype: int64

### mini size dataset

In [30]:
# Keep a 10% sample of the training stories as the mini training set.
# random_state makes the sample independent of how many np.random draws were
# made earlier in the session, so re-running this cell selects the same rows.
train_mini, train_else = train_test_split(train_df, test_size=0.9, random_state=1234)

In [31]:
# Row count of the mini training set.
train_mini.shape[0]

7852

In [32]:
# Persist the mini training set; index=False keeps the row index out of the file.
train_mini.to_csv(path_or_buf="./data/rocstories_completion_train_mini.csv", index=False)

In [33]:
# Keep a 10% sample of the dev rows as the mini dev set. random_state makes
# the sample independent of how many np.random draws were made earlier in the
# session, so re-running this cell selects the same rows.
dev_mini, dev_else = train_test_split(dev_df, test_size=0.9, random_state=1234)

In [34]:
# Row count of the mini dev set.
dev_mini.shape[0]

981

In [35]:
# Persist the mini dev set; index=False keeps the row index out of the file.
dev_mini.to_csv(path_or_buf="./data/rocstories_completion_dev_mini.csv", index=False)