# Werewolf and vampire

In [1]:
import ast

import pandas as pd

## Loading data

### Metadata

In [2]:
def _parse_story_tags(raw):
    """Turn the text of one ``story_tags`` cell back into a Python object.

    Uses ``ast.literal_eval`` so only Python literals are accepted; unlike
    ``eval``, text stored in the CSV cannot run arbitrary code.

    Args:
        raw: Cell text handed to the converter by ``pd.read_csv``.

    Returns:
        The parsed literal (presumably a set of tag strings -- confirm
        against the code that writes metadata.csv).

    Raises:
        ValueError, SyntaxError: If the cell text is not a valid literal.
    """
    # repr(set()) is 'set()', which literal_eval only accepts on Python 3.9+;
    # handle it explicitly so older interpreters parse empty tag sets too.
    if raw.strip() == 'set()':
        return set()
    return ast.literal_eval(raw)


metadata_df = pd.read_csv('data/out/metadata.csv', converters={'story_tags': _parse_story_tags})

In [3]:
# Preview the first rows of the metadata table.
metadata_df.head()

Unnamed: 0,story_id,story_description,category_id,story_tags,category
0,5881,Stuff that is not really important. Contains s...,10,"{damon, melody, alone, chris, rabbit, lucky, c...",Classics
1,14620,18 year old Grace has way more responsibilitie...,10,"{action, forest, henry, angel, phillip, hastin...",Classics
2,15577,Can i Have This Dance? Nicki Alab whom lives ...,10,"{principle, eat, disgust, trashed, cassie, fun...",Classics
3,24019,Giselle knew what she wanted. She wanted the b...,19,"{brother, bestfriend, axel, summer, giselle, b...",Random
4,31450,From the popular anime show Yu Yu Hakusho come...,19,"{world, spirit, yuyugang, myruki, youko, yuyuh...",Random


In [4]:
# (rows, columns) of the metadata table.
metadata_df.shape

(245851, 5)

### Sample

In [5]:
# Load the sample CSV using read_csv's default parsing options.
sample_df = pd.read_csv('data/out/sample.csv')

In [6]:
# Preview the first rows of the sample table.
sample_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,35,183814,0,We caught up on a lot of things that happened ...
1,35,184177,1,They split up to enclose around me. Crap. Wh...
2,35,184771,2,Where's Bing? Why aren't any of your roommate...
3,35,185259,3,Question Vote (it really helps I know) Copyrig...
4,35,186546,4,"I want to marry Miss, I need your blessing. Y..."


In [7]:
# (rows, columns) of the sample table.
sample_df.shape

(1000, 4)

### Stories

In [8]:
# Passed as `chunksize` when opening the stories TSV, so the file is read
# in pieces of this many rows instead of all at once.
CHUNK_SIZE = 1000

In [9]:
%%time

# Open the stories TSV for chunked reading. Because `chunksize` is set,
# read_csv returns a chunk reader (not a DataFrame) despite the `_df` name;
# rows are parsed as the reader is iterated.
story_columns = ['story_id', 'chapter_id', 'chapter_index', 'chapter_text']
stories_df = pd.read_csv('data/src/stories.tsv', sep='\t', names=story_columns, chunksize=CHUNK_SIZE)

CPU times: user 1.65 ms, sys: 0 ns, total: 1.65 ms
Wall time: 1.18 ms


In [10]:
# Check what read_csv returned when called with `chunksize`.
type(stories_df)

pandas.io.parsers.TextFileReader

## Selecting data

In [11]:
# Keep the metadata rows whose category_id is 18 or 22 (presumably the
# werewolf and vampire categories -- confirm against the category column),
# then collect the distinct story ids among them.
in_target_categories = metadata_df['category_id'].isin([18, 22])
werewolf_vampire_metadata_df = metadata_df[in_target_categories]
werewolf_vampire_story_ids = werewolf_vampire_metadata_df['story_id'].unique()

In [12]:
# Number of distinct story ids in the selected categories.
len(werewolf_vampire_story_ids)

5869

In [13]:
%%time

# Stream the stories file chunk by chunk, keeping only the rows whose
# story_id is among the selected ids, then stitch the pieces together.
# NOTE: iterating consumes the chunk reader, so the cell that creates
# `stories_df` must be re-run before this one can be run again.
werewolf_vampire_list = [
    stories_chunk[stories_chunk['story_id'].isin(werewolf_vampire_story_ids)]
    for stories_chunk in stories_df
]

werewolf_vampire_stories_df = pd.concat(werewolf_vampire_list, ignore_index=True)

CPU times: user 9min 39s, sys: 40.7 s, total: 10min 20s
Wall time: 11min 48s


## Saving data

In [14]:
# Write the selected chapters to CSV; index=False leaves the row index out of the file.
werewolf_vampire_stories_df.to_csv('data/out/werewolf_vampire_stories.csv', index=False)