# Wattpad

In [1]:
from string import punctuation

import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
from IPython.display import display

## Loading data

### Metadata

In [2]:
# Load the per-story metadata from the tab-separated file; the column names
# are supplied explicitly through ``names``.
metadata_columns = ['story_id', 'story_description', 'category_id', 'story_tags']
metadata_df = pd.read_csv('data/metadata.tsv', sep='\t', names=metadata_columns)

In [3]:
metadata_df.head()

Unnamed: 0,story_id,story_description,category_id,story_tags
0,5881,Stuff that is not really important. Contains s...,10,corazon stephen dodge-ball volleyball butterfl...
1,14620,18 year old Grace has way more responsibilitie...,10,hilton karen underwear forest molest death run...
2,15577,Can i Have This Dance? Nicki Alab whom lives ...,10,touching newgirl holding joking revolted close...
3,24019,Giselle knew what she wanted. She wanted the b...,19,axel giselle liv dream boyfriend braydon seth ...
4,31450,From the popular anime show Yu Yu Hakusho come...,19,myruki yuyugang kurama spirit youko yuyuhakush...


In [4]:
metadata_df.shape

(245851, 4)

In [43]:
# Number of distinct story ids; compare it with the row count reported by
# ``metadata_df.shape`` to see whether any story_id appears more than once.
metadata_df.story_id.nunique()

245704

### Sample

In [5]:
# Load the chapter sample from the tab-separated file; the column names are
# supplied explicitly through ``names``.
sample_df = pd.read_csv(
    'data/sample.tsv',
    sep='\t',
    names=['story_id', 'chapter_id', 'chapter_index', 'chapter_text'],
)

In [6]:
sample_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,35,183814,0,We caught up on a lot of things that happened ...
1,35,184177,1,They split up to enclose around me. Crap. Wh...
2,35,184771,2,Where's Bing? Why aren't any of your roommate...
3,35,185259,3,Question Vote (it really helps I know) Copyrig...
4,35,186546,4,"I want to marry Miss, I need your blessing. Y..."


In [7]:
sample_df.shape

(1000, 4)

### Stories

In [None]:
# Number of rows per chunk; passed as ``chunksize`` when reading
# data/stories.tsv.
CHUNK_SIZE = 1000

In [41]:
%%time
# Because ``chunksize`` is given, read_csv does not load the file here: it
# returns a TextFileReader that hands out DataFrames of CHUNK_SIZE rows when
# iterated (which is why this cell finishes in milliseconds).
# NOTE(review): the reader is presumably exhausted after one full pass --
# re-run this cell before iterating over ``stories_df`` a second time.
chapter_columns = ['story_id', 'chapter_id', 'chapter_index', 'chapter_text']
stories_df = pd.read_csv(
    'data/stories.tsv', sep='\t', names=chapter_columns, chunksize=CHUNK_SIZE
)

CPU times: user 2.56 ms, sys: 0 ns, total: 2.56 ms
Wall time: 1.87 ms


In [9]:
type(stories_df)

pandas.io.parsers.TextFileReader

In [10]:
# %%time

# for i, chunk in enumerate(stories_df):
#     pass

## Modifying data

In [11]:
# Lookup table from the numeric ``category_id`` found in the metadata to a
# human-readable category name.
# Note: ids 15 and 20 have no entry in this table.
category_dict = {
    0: 'Unknown',
    1: 'Teen Fiction',
    2: 'Poetry',
    3: 'Fantasy',
    4: 'Romance',
    5: 'Science Fiction',
    6: 'Fanfiction',
    7: 'Humor',
    8: 'Mystery / Thriller',
    9: 'Horror',
    10: 'Classics',
    11: 'Adventure',
    12: 'Paranormal',
    13: 'Spiritual',
    14: 'Action',
    16: 'Non-Fiction',
    17: 'Short Story',
    18: 'Vampire',
    19: 'Random',
    21: 'General Fiction',
    22: 'Werewolf',
    23: 'Historical Fiction',
    24: 'ChickLit',
}

In [12]:
# Translate each numeric category_id into its display name. ``Series.map``
# with a dict does the lookup without a Python-level lambda per row and yields
# NaN for ids that have no entry, so every unmapped id can be reported at once
# instead of stopping at the first one.
category_names = metadata_df.category_id.map(category_dict)
unmapped_ids = metadata_df.category_id[category_names.isnull()].unique()
if len(unmapped_ids) > 0:
    # Same exception type a failed ``category_dict[x]`` lookup would raise.
    raise KeyError(
        'category ids missing from category_dict: {}'.format(sorted(unmapped_ids))
    )
metadata_df['category'] = category_names

In [13]:
def _parse_tags(raw_tags):
    """Return the tags of one story as a set of strings.

    Accepts the raw whitespace-separated string read from the file, a value
    that is already a set (so the cell can be re-run safely), or a missing
    value, which becomes an empty set.
    """
    if isinstance(raw_tags, (set, frozenset)):
        # Already parsed by an earlier run of this cell.
        return set(raw_tags)
    if isinstance(raw_tags, str):
        return set(raw_tags.split())
    # Anything else (e.g. the float NaN pandas uses for an empty field) has
    # no tags to split.
    return set()


# Replace the raw tag string of every story with a set of individual tags.
metadata_df['story_tags'] = metadata_df.story_tags.apply(_parse_tags)

In [14]:
metadata_df.head()

Unnamed: 0,story_id,story_description,category_id,story_tags,category
0,5881,Stuff that is not really important. Contains s...,10,"{kitchen, alone, song, jovenes, chunky, fake-d...",Classics
1,14620,18 year old Grace has way more responsibilitie...,10,"{police, birth, angel, supernatural, daemons, ...",Classics
2,15577,Can i Have This Dance? Nicki Alab whom lives ...,10,"{police, jealously, disgust, blackmail, sleep,...",Classics
3,24019,Giselle knew what she wanted. She wanted the b...,19,"{liv, braydon, boyfriend, axel, summer, gisell...",Random
4,31450,From the popular anime show Yu Yu Hakusho come...,19,"{yuyugang, kurama, spirit, youko, world, yuyuh...",Random


## Processing data

In [15]:
def clean_text(text):
    """Return ``text`` lower-cased, with every ``string.punctuation`` character removed.

    Punctuation is deleted, not replaced by a space, so ``"hand-holding"``
    becomes ``"handholding"``.
    """
    kept_chars = (ch for ch in text if ch not in punctuation)
    return ''.join(kept_chars).lower()

In [16]:
def chunk_generator(seq, size, step):
    """Return a generator of sliding windows over ``seq``.

    A window of up to ``size`` items starts at every ``step``-th position.

    * The first window is always produced for a non-empty ``seq``, even when
      ``seq`` is shorter than ``size`` (the window is then the whole sequence).
    * A later window is produced only if the previous window stopped short of
      the end of ``seq``; trailing windows that would add no new items are
      skipped.
    * The last window may contain fewer than ``size`` items.

    :param seq: sliceable sequence (e.g. a list of tokens).
    :param size: maximum number of items per window.
    :param step: distance between the starts of consecutive windows; must be
        non-zero (``range`` raises ``ValueError`` otherwise).
    :return: generator yielding slices of ``seq``.
    """
    total = len(seq)
    return (
        seq[start:start + size]
        for start in range(0, total, step)
        # The previous window covered seq[start - step:start - step + size];
        # emit this one only if that window did not already reach the end.
        if start == 0 or start - step + size < total
    )

In [17]:
example = """
Hands Across Hawthorne was a rally held at the Hawthorne Bridge in the American West Coast city of Portland, Oregon, on May 29, 2011. The demonstration was in response to an attack, one week earlier, on Brad Forkner and Christopher Rosevear, a gay male couple who had been holding hands while walking across the bridge. According to the couple and the Portland Police Bureau, a group of five men followed Forkner and Rosevear along the bridge before physically assaulting them. The assault was condemned by Portland's mayor, Sam Adams, and its police chief, Mike Reese, and news of the attack spread throughout the Pacific Northwest and the United States. The attack prompted volunteers from the Q Center, a nonprofit organization that supports the LGBT community, to form street patrols as a means of monitoring Portland's downtown area.
Several LGBT and human rights organizations sponsored Hands Across Hawthorne in response to the attack, with the purpose of linking hands across the entire span of the Hawthorne Bridge to show solidarity. More than 4,000 people attended the rally, which had been publicized on a single Facebook page 72 hours previously. Forkner, Rosevear, Mayor Adams, and other community leaders spoke at the rally. The event received attention throughout the United States. On June 5, residents of Spokane, Washington, held a similar hand-holding rally called "Hands Across Monroe", crossing the Monroe Street Bridge in Riverfront Park.
"""

def extract_information(text, window_size=50, step=25):
    """Split ``text`` into windows of cleaned, stop-word-free tokens.

    The text is normalised with ``clean_text``, tokenised with NLTK's
    ``word_tokenize``, stripped of English stop words and then cut into
    windows by ``chunk_generator``.

    :param text: raw text to process.
    :param window_size: number of tokens per window, passed to
        ``chunk_generator`` as ``size``.
    :param step: offset between consecutive windows, passed to
        ``chunk_generator`` as ``step``.
    :return: list of token lists.
    """
    # Build the stop-word set once: calling stopwords.words() inside the
    # comprehension repeats the call and a linear list scan for every token.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(clean_text(text))
    words = [w for w in tokens if w not in english_stopwords]
    return list(chunk_generator(words, window_size, step))
   
extract_information(example, 10, 5)

[['hands',
  'across',
  'hawthorne',
  'rally',
  'held',
  'hawthorne',
  'bridge',
  'american',
  'west',
  'coast'],
 ['hawthorne',
  'bridge',
  'american',
  'west',
  'coast',
  'city',
  'portland',
  'oregon',
  'may',
  '29'],
 ['city',
  'portland',
  'oregon',
  'may',
  '29',
  '2011',
  'demonstration',
  'response',
  'attack',
  'one'],
 ['2011',
  'demonstration',
  'response',
  'attack',
  'one',
  'week',
  'earlier',
  'brad',
  'forkner',
  'christopher'],
 ['week',
  'earlier',
  'brad',
  'forkner',
  'christopher',
  'rosevear',
  'gay',
  'male',
  'couple',
  'holding'],
 ['rosevear',
  'gay',
  'male',
  'couple',
  'holding',
  'hands',
  'walking',
  'across',
  'bridge',
  'according'],
 ['hands',
  'walking',
  'across',
  'bridge',
  'according',
  'couple',
  'portland',
  'police',
  'bureau',
  'group'],
 ['couple',
  'portland',
  'police',
  'bureau',
  'group',
  'five',
  'men',
  'followed',
  'forkner',
  'rosevear'],
 ['five',
  'men',
  'fol

In [18]:
# c=0
# for chunk in stories_df:
#     if c > 0:
#         break
# #     display(chunk)
# #     print(chunk.ix[1700].chapter_text)
#     c+=1

In [42]:
%%time

# Stream the stories file chunk by chunk, counting rows (``c``) and collecting
# the distinct story ids (``ids``).
# NOTE(review): ``stories_df`` is a chunked reader that is presumably exhausted
# after one pass -- re-create it before running this cell again.
c = 0
ids = set()
for chunk in stories_df:
    # Progress report: rows processed so far and distinct ids seen so far.
    if c % 100000 == 0:
        print(c, len(ids))
    # Update in place; ``ids.union(...)`` would build a fresh copy of the whole
    # accumulated set for every chunk.
    ids.update(chunk.story_id.unique())
    c += len(chunk)

ParserError: Error tokenizing data. C error: out of memory

In [34]:
len(chunk)

1000

In [27]:
c

11

In [40]:
len(ids)

984493

In [19]:
# c=0
# for chunk in stories_df:
#     for i, x in chunk.iterrows():
#         c+=1

In [21]:
# c
# 3947664

3947664

In [20]:
# c=0
# for chunk in stories_df:
#     if c > 0:
#         break
#     display(chunk)
#     c+=1

In [None]:
# TODO: check what happens when the text is shorter than window_size.