# Wattpad

In [26]:
from string import punctuation

import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
from IPython.display import display

## Loading data

### Metadata

In [9]:
# Story-level metadata, one row per story. Column names are supplied
# explicitly, so every line of the tab-separated file is treated as data.
metadata_df = pd.read_csv('data/metadata.tsv', sep='\t', names=[
    'story_id',
    'story_description',
    'category_id',
    'story_tags',
])

In [10]:
metadata_df.head()

Unnamed: 0,story_id,story_description,category_id,story_tags
0,5881,Stuff that is not really important. Contains s...,10,corazon stephen dodge-ball volleyball butterfl...
1,14620,18 year old Grace has way more responsibilitie...,10,hilton karen underwear forest molest death run...
2,15577,Can i Have This Dance? Nicki Alab whom lives ...,10,touching newgirl holding joking revolted close...
3,24019,Giselle knew what she wanted. She wanted the b...,19,axel giselle liv dream boyfriend braydon seth ...
4,31450,From the popular anime show Yu Yu Hakusho come...,19,myruki yuyugang kurama spirit youko yuyuhakush...


In [11]:
metadata_df.shape

(245851, 4)

### Sample

In [2]:
# Chapter-level sample, one row per chapter. Column names are supplied
# explicitly, so every line of the tab-separated file is treated as data.
sample_df = pd.read_csv(
    'data/sample.tsv',
    sep='\t',
    names=['story_id', 'chapter_id', 'chapter_index', 'chapter_text'],
)

In [3]:
sample_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,35,183814,0,We caught up on a lot of things that happened ...
1,35,184177,1,They split up to enclose around me. Crap. Wh...
2,35,184771,2,Where's Bing? Why aren't any of your roommate...
3,35,185259,3,Question Vote (it really helps I know) Copyrig...
4,35,186546,4,"I want to marry Miss, I need your blessing. Y..."


In [4]:
sample_df.shape

(1000, 4)

### Stories

In [5]:
%%time
# Because `chunksize` is set, read_csv returns a TextFileReader rather than a
# DataFrame: rows are only parsed, in chunks of up to 1000, when the reader is
# iterated, which is why this cell finishes in a couple of milliseconds.
# NOTE(review): despite the `_df` suffix this is an iterator that advances as
# it is consumed, so every cell that loops over it moves the read position;
# re-run this cell to start again from the first rows.
stories_df = pd.read_csv(
    'data/000000_0',
    sep='\t',
    names=['story_id', 'chapter_id', 'chapter_index', 'chapter_text'],
    chunksize=1000,
)

CPU times: user 1.66 ms, sys: 560 µs, total: 2.22 ms
Wall time: 1.65 ms


In [6]:
type(stories_df)

pandas.io.parsers.TextFileReader

In [8]:
# %%time

# for i, chunk in enumerate(stories_df):
#     pass

## Modifying data

In [62]:
# Lookup table from numeric `category_id` to a human-readable category name.
# Note that ids 15 and 20 have no entry, so a plain `category_dict[id]` lookup
# raises KeyError for them.
category_dict = {
    0: 'Unknown',
    1: 'Teen Fiction',
    2: 'Poetry',
    3: 'Fantasy',
    4: 'Romance',
    5: 'Science Fiction',
    6: 'Fanfiction',
    7: 'Humor',
    8: 'Mystery / Thriller',
    9: 'Horror',
    10: 'Classics',
    11: 'Adventure',
    12: 'Paranormal',
    13: 'Spiritual',
    14: 'Action',
    16: 'Non-Fiction',
    17: 'Short Story',
    18: 'Vampire',
    19: 'Random',
    21: 'General Fiction',
    22: 'Werewolf',
    23: 'Historical Fiction',
    24: 'ChickLit',
}

In [63]:
# Translate category ids to names with a vectorised lookup. Ids that have no
# entry in category_dict (15 and 20 are absent from the table) fall back to
# the table's own 'Unknown' label instead of aborting the cell with a KeyError.
metadata_df['category'] = (
    metadata_df.category_id
    .map(category_dict)
    .fillna(category_dict[0])
)

In [64]:
def _as_tag_set(raw_tags):
    """Return one story's tags as a set of strings.

    Args:
        raw_tags: Either the whitespace-separated tag string read from the
            TSV, an already converted set, or a missing value.

    Returns:
        A set of tags; empty when the value is missing.
    """
    # The column is overwritten in place, so on a second run of this cell the
    # values are already sets; pass them through instead of calling .split().
    if isinstance(raw_tags, (set, frozenset)):
        return set(raw_tags)
    if isinstance(raw_tags, str):
        return set(raw_tags.split())
    # Anything else (e.g. NaN for an empty field) means "no tags".
    return set()


metadata_df['story_tags'] = metadata_df.story_tags.apply(_as_tag_set)

In [65]:
metadata_df.head()

Unnamed: 0,story_id,story_description,category_id,story_tags,category
0,5881,Stuff that is not really important. Contains s...,10,"{badboysteen, kitchen, nice, cookies, bottle, ...",Classics
1,14620,18 year old Grace has way more responsibilitie...,10,"{daemons, romance, pain, shop, home, healing, ...",Classics
2,15577,Can i Have This Dance? Nicki Alab whom lives ...,10,"{jokes, playing, eddie, day, blackmail, family...",Classics
3,24019,Giselle knew what she wanted. She wanted the b...,19,"{boyfriend, giselle, axel, brother, summer, br...",Random
4,31450,From the popular anime show Yu Yu Hakusho come...,19,"{midnitepurplerose, world, yuyuhakusho, kurama...",Random


## Processing data

In [23]:
def clean_text(text):
    """Delete ASCII punctuation from ``text`` and lower-case what remains.

    Every character in ``string.punctuation`` is removed outright (it is not
    replaced by a space), so tokens joined by punctuation are fused together.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased string without punctuation characters.
    """
    strip_table = str.maketrans('', '', punctuation)
    return text.translate(strip_table).lower()

In [84]:
def chunk_generator(seq, size, step):
    """Lazily slice ``seq`` into windows of ``size`` items taken every ``step`` items.

    Windows are produced until the end of ``seq`` has been covered. The last
    window may hold fewer than ``size`` items, and a window is skipped when
    the previous one already reached the end of ``seq`` (it would only repeat
    items that were just emitted).

    Args:
        seq: A sliceable sequence with a length (e.g. a list of words).
        size: Maximum number of items per window.
        step: Distance between the starts of consecutive windows.

    Returns:
        A generator of slices of ``seq``.

    Raises:
        ValueError: If ``step`` is zero (raised by ``range``).
    """
    total = len(seq)
    return (
        seq[pos:pos + size]
        for pos in range(0, total, step)
        # Always keep the first window so short inputs are not lost; after
        # that, keep a window only if the previous one stopped before the end.
        if pos == 0 or pos - step + size < total
    )

In [85]:
example = """
Hands Across Hawthorne was a rally held at the Hawthorne Bridge in the American West Coast city of Portland, Oregon, on May 29, 2011. The demonstration was in response to an attack, one week earlier, on Brad Forkner and Christopher Rosevear, a gay male couple who had been holding hands while walking across the bridge. According to the couple and the Portland Police Bureau, a group of five men followed Forkner and Rosevear along the bridge before physically assaulting them. The assault was condemned by Portland's mayor, Sam Adams, and its police chief, Mike Reese, and news of the attack spread throughout the Pacific Northwest and the United States. The attack prompted volunteers from the Q Center, a nonprofit organization that supports the LGBT community, to form street patrols as a means of monitoring Portland's downtown area.
Several LGBT and human rights organizations sponsored Hands Across Hawthorne in response to the attack, with the purpose of linking hands across the entire span of the Hawthorne Bridge to show solidarity. More than 4,000 people attended the rally, which had been publicized on a single Facebook page 72 hours previously. Forkner, Rosevear, Mayor Adams, and other community leaders spoke at the rally. The event received attention throughout the United States. On June 5, residents of Spokane, Washington, held a similar hand-holding rally called "Hands Across Monroe", crossing the Monroe Street Bridge in Riverfront Park.
"""

def extract_information(text, window_size=50, step=25):
    """Turn a raw text into a list of overlapping windows of content words.

    The text is normalised with ``clean_text``, tokenised with NLTK's
    ``word_tokenize``, filtered against NLTK's English stop-word list and
    then cut into windows by ``chunk_generator``.

    Args:
        text: Raw text to process.
        window_size: Maximum number of words per window.
        step: Offset, in words, between the starts of consecutive windows.

    Returns:
        A list of windows, each a list of word tokens.
    """
    # Build the stop-word set once per call: evaluating stopwords.words()
    # inside the comprehension would rebuild the list for every single token
    # and then scan it linearly.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(clean_text(text))
    words = [w for w in tokens if w not in english_stopwords]
    return list(chunk_generator(words, window_size, step))
   
extract_information(example, 10, 5)

[['hands',
  'across',
  'hawthorne',
  'rally',
  'held',
  'hawthorne',
  'bridge',
  'american',
  'west',
  'coast'],
 ['hawthorne',
  'bridge',
  'american',
  'west',
  'coast',
  'city',
  'portland',
  'oregon',
  'may',
  '29'],
 ['city',
  'portland',
  'oregon',
  'may',
  '29',
  '2011',
  'demonstration',
  'response',
  'attack',
  'one'],
 ['2011',
  'demonstration',
  'response',
  'attack',
  'one',
  'week',
  'earlier',
  'brad',
  'forkner',
  'christopher'],
 ['week',
  'earlier',
  'brad',
  'forkner',
  'christopher',
  'rosevear',
  'gay',
  'male',
  'couple',
  'holding'],
 ['rosevear',
  'gay',
  'male',
  'couple',
  'holding',
  'hands',
  'walking',
  'across',
  'bridge',
  'according'],
 ['hands',
  'walking',
  'across',
  'bridge',
  'according',
  'couple',
  'portland',
  'police',
  'bureau',
  'group'],
 ['couple',
  'portland',
  'police',
  'bureau',
  'group',
  'five',
  'men',
  'followed',
  'forkner',
  'rosevear'],
 ['five',
  'men',
  'fol

In [86]:
# Pull exactly one chunk from the reader. A counter loop that breaks on its
# second pass fetches two chunks and leaves `chunk` bound to the second one,
# silently skipping the first.
# Raises StopIteration once the reader has no chunks left.
chunk = next(stories_df)

In [91]:
# Print the index label and chapter text of the first row of the chunk only
# (nothing is printed when the chunk is empty).
for i, x in chunk.head(1).iterrows():
    print(i, x.chapter_text)

20000 Thank you, "You look amazing," He said. His eyes scanned over my body. I, once again, crossed my arms self conciously. "Unfold your arms, you look gorgeous." Maddie laughed, skipping into the kitchen. "Shut up, Mad." "I agree with her," Mason smiled. "Thank you," I blushed. "You two have fun, and remember, you need to be safe." I grabbed a pillow from the couch and threw it at Maddie. She caught it mid-air and tossed it back onto the couch. I pulled the door closed, and right before it clicked, I heard Maddi scream, "Heard that!" "Thank you," I smiled, "Ready? Let's get away from Maddi." "Yupp, you did. Smooth one," Maddie laughed. "Did I just say that outloud?" He asked slowly. I spun around and smiled at him. "You'd stillbe beautiful without the makeup, Sophie." Mason informed me. "Don't touch it." I growled. "Watch it, Soph. I know where you keep your makeup!" Thank you for everyone's support! Don't forget to VOTE, COMMMMMENT!, and TELL YOUR FRIENDS! xoxo, p.s., that was a pro

In [13]:
# Fetch exactly one chunk from the reader and show it. A counter loop that
# breaks on its second pass would read one chunk more than it displays and
# leave `chunk` bound to that undisplayed chunk.
# Raises StopIteration once the reader has no chunks left.
chunk = next(stories_df)
display(chunk)

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
15000,115865,1002791,0,Mary Alice Whitlock Cullen ***all recognizab...
15001,115865,1002809,1,"Jasper, you don't have to make yourself uncomf..."
15002,115865,1027044,2,As the days passed I got more and more anxious...
15003,115865,1045476,3,"I grabbed my wallet, purse and keys and made s..."
15004,115865,1065010,4,I was about to say something when Jasper slowl...
15005,115944,1002977,0,Copyright@2011 Agateophobia are really sorry i...
15006,115944,1009251,1,Copyright@2011Agateophobia case as Aiden grabs...
15007,115944,1012003,2,have at least have a couple of classes togethe...
15008,115944,1015264,3,"about you both take the bed."" Vincent replied...."
15009,115944,1016731,4,"favorites too."" Aiden said sitting next to me...."
