# Wattpad

In [1]:
from string import punctuation

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Loading data

### Metadata

In [2]:
metadata_df = pd.read_csv('data/out/metadata.csv', converters={'story_tags': eval})

In [3]:
metadata_df.head()

Unnamed: 0,story_id,story_description,category_id,story_tags,category_name
0,5881,Stuff that is not really important. Contains s...,10,"{idea, jovenes, look, sam, badboy, really, sca...",Classics
1,14620,18 year old Grace has way more responsibilitie...,10,"{mother, white, book, sisters, arcanxo, ryan, ...",Classics
2,15577,Can i Have This Dance? Nicki Alab whom lives ...,10,"{cute, scared, anger, day, knockedover, suspen...",Classics
3,24019,Giselle knew what she wanted. She wanted the b...,19,"{liv, giselle, braydon, axel, dream, brother, ...",Random
4,31450,From the popular anime show Yu Yu Hakusho come...,19,"{myruki, world, youko, yuyuhakusho, spirit, yu...",Random


In [4]:
metadata_df.shape

(245851, 5)

### Sample

In [5]:
sample_df = pd.read_csv('data/out/sample.csv')

In [6]:
sample_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,35,183814,0,We caught up on a lot of things that happened ...
1,35,184177,1,They split up to enclose around me. Crap. Wh...
2,35,184771,2,Where's Bing? Why aren't any of your roommate...
3,35,185259,3,Question Vote (it really helps I know) Copyrig...
4,35,186546,4,"I want to marry Miss, I need your blessing. Y..."


In [7]:
sample_df.shape

(1000, 4)

### Stories

In [9]:
CHUNK_SIZE = 1000

In [10]:
%%time

stories_df = pd.read_csv(
    'data/src/stories.tsv',
    sep='\t',
    names=['story_id', 'chapter_id', 'chapter_index', 'chapter_text'],
    chunksize=CHUNK_SIZE,
)

CPU times: user 2.11 ms, sys: 0 ns, total: 2.11 ms
Wall time: 1.02 ms


In [11]:
type(stories_df)

pandas.io.parsers.TextFileReader

## Processing data

### WordNet

In [89]:
needs_df = pd.read_excel('data/src/maslow.xlsx')
needs_df.fillna('', inplace=True)

In [123]:
needs_df.head()

Unnamed: 0,PHYSIOLOGICAL,SAFETY,LOVE,ESTEEM,SELF_ACTUALIZATION,SELF_TRANSCENDENCE
0,food,freedom,love,esteem,write,volunteering
1,water,philosophy,community,evaluation,poetry,help
2,salt,safety,parents,respect,history,morality
3,sugar,stable,family,capacity,philosophy,ethics
4,protein,sunniness,sex,achievement,culture,volunteering


In [92]:
for need in needs_df:
    print(need)
    words = needs_df[need].unique()

PHYSIOLOGICAL
SAFETY
LOVE
ESTEEM
SELF_ACTUALIZATION
SELF_TRANSCENDENCE


In [124]:
need_sss = wn.synsets('volunteering')

In [125]:
need_sss

[Synset('volunteer.v.01'), Synset('volunteer.v.02'), Synset('volunteer.v.03')]

In [126]:
for i in range(len(need_sss)):
    print(i, need_sss[i].definition())

0 tell voluntarily
1 agree freely
2 do volunteer work


In [127]:
need_sss[2].hyponyms()

[]

In [128]:
need_sss = wn.synsets('work')

In [129]:
need_sss

[Synset('work.n.01'),
 Synset('work.n.02'),
 Synset('employment.n.02'),
 Synset('study.n.02'),
 Synset('work.n.05'),
 Synset('workplace.n.01'),
 Synset('oeuvre.n.01'),
 Synset('work.v.01'),
 Synset('work.v.02'),
 Synset('work.v.03'),
 Synset('function.v.01'),
 Synset('work.v.05'),
 Synset('exercise.v.03'),
 Synset('make.v.36'),
 Synset('work.v.08'),
 Synset('work.v.09'),
 Synset('work.v.10'),
 Synset('bring.v.03'),
 Synset('work.v.12'),
 Synset('cultivate.v.02'),
 Synset('work.v.14'),
 Synset('influence.v.01'),
 Synset('work.v.16'),
 Synset('work.v.17'),
 Synset('work.v.18'),
 Synset('work.v.19'),
 Synset('shape.v.02'),
 Synset('work.v.21'),
 Synset('knead.v.01'),
 Synset('exploit.v.01'),
 Synset('solve.v.01'),
 Synset('ferment.v.03'),
 Synset('sour.v.01'),
 Synset('work.v.27')]

In [130]:
for i in range(len(need_sss)):
    print(i, need_sss[i].definition())

0 activity directed toward making or doing something
1 a product produced or accomplished through the effort or activity or agency of a person or thing
2 the occupation for which you are paid
3 applying the mind to learning and understanding a subject (especially by reading)
4 (physics) a manifestation of energy; the transfer of energy from one physical system to another expressed as the product of a force and the distance through which it moves a body in the direction of that force
5 a place where work is done
6 the total output of a writer or artist (or a substantial part of it)
7 exert oneself by doing mental or physical work for a purpose or out of necessity
8 be employed
9 have an effect or outcome; often the one desired or expected
10 perform as expected when applied
11 shape, form, or improve a material
12 give a workout to
13 proceed along a path
14 operate in a certain place, area, or specialty
15 proceed towards a goal or along a path or through an activity
16 move in an agit

In [127]:
need_sss[2].hyponyms()

[]

In [97]:
need_sss[5].hyponyms()

[Synset('branch_water.n.01'),
 Synset('drinking_water.n.01'),
 Synset('spring_water.n.02')]

In [100]:
need_sss[0].hyponyms()[2].lemma_name

AttributeError: 'Synset' object has no attribute 'lemma_name'

In [101]:
from nltk.corpus import wordnet as wn
vehicle = wn.synset('vehicle.n.01')
typesOfVehicles = list(set([w for s in vehicle.closure(lambda s:s.hyponyms()) for w in s.lemma_names]))

TypeError: 'method' object is not iterable

In [104]:
for s in vehicle.closure(lambda s:s.hyponyms()):
    print(s)

Synset('bumper_car.n.01')
Synset('craft.n.02')
Synset('military_vehicle.n.01')
Synset('rocket.n.01')
Synset('skibob.n.01')
Synset('sled.n.01')
Synset('steamroller.n.02')
Synset('wheeled_vehicle.n.01')
Synset('aircraft.n.01')
Synset('hovercraft.n.01')
Synset('landing_craft.n.01')
Synset('spacecraft.n.01')
Synset('vessel.n.02')
Synset('caisson.n.02')
Synset('half_track.n.01')
Synset('humvee.n.01')
Synset('personnel_carrier.n.01')
Synset('picket.n.04')
Synset('reconnaissance_vehicle.n.01')
Synset('tank.n.01')
Synset('technical.n.01')
Synset('troop_carrier.n.01')
Synset('warplane.n.01')
Synset('warship.n.01')
Synset('weapons_carrier.n.01')
Synset('missile.n.01')
Synset('multistage_rocket.n.01')
Synset('test_rocket.n.01')
Synset('bobsled.n.01')
Synset('bobsled.n.02')
Synset('dogsled.n.01')
Synset('luge.n.01')
Synset('pung.n.01')
Synset('toboggan.n.01')
Synset('baby_buggy.n.01')
Synset('bicycle.n.01')
Synset('boneshaker.n.01')
Synset('car.n.02')
Synset('handcart.n.01')
Synset('horse-drawn_ve

In [96]:
need_sss = wn.synsets('need')

In [70]:
need_sss

[Synset('need.n.01'),
 Synset('need.n.02'),
 Synset('motivation.n.01'),
 Synset('indigence.n.01'),
 Synset('necessitate.v.01'),
 Synset('want.v.02'),
 Synset('need.v.03')]

In [77]:
for i in range(len(need_sss)):
    print(i, need_sss[i].definition())

0 a condition requiring relief
1 anything that is necessary but lacking
2 the psychological feature that arouses an organism to action toward a desired goal; the reason for the action; that which gives purpose and direction to behavior
3 a state of extreme poverty or destitution
4 require as useful, just, or proper
5 have need of
6 have or feel a need for


In [74]:
need_ss = need_sss[2]

In [87]:
need_lemmas = need_ss.lemma_names()

In [88]:
need_lemmas

['motivation', 'motive', 'need']

In [89]:
need_hyponyms = need_ss.hyponyms()

In [90]:
need_hyponyms

[Synset('ethical_motive.n.01'),
 Synset('irrational_motive.n.01'),
 Synset('life.n.13'),
 Synset('psychic_energy.n.01'),
 Synset('rational_motive.n.01'),
 Synset('urge.n.01')]

In [94]:
need_hyponyms[0].hyponyms()

[Synset('conscience.n.01'),
 Synset('hedonism.n.01'),
 Synset('inner_light.n.01')]

In [92]:
need_hyponyms_all = list(need_ss.closure(lambda x: x.hyponyms()))

In [119]:
for x in wn.synsets('eat'):
    print(x, x.definition())

Synset('eat.v.01') take in solid food
Synset('eat.v.02') eat a meal; take a meal
Synset('feed.v.06') take in food; used of animals only
Synset('eat.v.04') worry or cause anxiety in a persistent way
Synset('consume.v.05') use up (resources or materials)
Synset('corrode.v.01') cause to deteriorate due to the action of water, air, or an acid


In [128]:
for x in wn.synsets('feed'):
    hypers = list(x.closure(lambda x: x.hypernyms(), depth=-1))
    print(hypers)
    if need_ss in hypers:
        print(True)

[Synset('food.n.01'), Synset('substance.n.07'), Synset('matter.n.03'), Synset('physical_entity.n.01'), Synset('entity.n.01')]
[Synset('provide.v.02'), Synset('give.v.03'), Synset('transfer.v.05')]
[Synset('provide.v.02'), Synset('give.v.03'), Synset('transfer.v.05')]
[Synset('supply.v.01'), Synset('give.v.03'), Synset('transfer.v.05')]
[Synset('insert.v.02'), Synset('put.v.01'), Synset('move.v.02')]
[Synset('promote.v.01'), Synset('support.v.01')]
[Synset('consume.v.02')]
[Synset('provide.v.02'), Synset('give.v.03'), Synset('transfer.v.05')]
[Synset('move.v.03')]
[Synset('exploit.v.01'), Synset('use.v.01')]
[Synset('regale.v.01'), Synset('provide.v.02'), Synset('give.v.03'), Synset('transfer.v.05')]
[Synset('enrich.v.01'), Synset('better.v.02'), Synset('change.v.01')]
[Synset('give.v.08')]


In [127]:
for x in wn.synsets('eat'):
    hypers = list(x.closure(lambda x: x.hypernyms(), depth=1))
    print(hypers)
    if need_ss in hypers:
        print(True)

[Synset('consume.v.02'), Synset('eat.v.02')]
[Synset('consume.v.02')]
[Synset('consume.v.02')]
[Synset('worry.v.03')]
[Synset('spend.v.02')]
[Synset('damage.v.01')]


In [139]:
need_ss.

[]

In [93]:
need_hyponyms_all

[Synset('ethical_motive.n.01'),
 Synset('irrational_motive.n.01'),
 Synset('life.n.13'),
 Synset('psychic_energy.n.01'),
 Synset('rational_motive.n.01'),
 Synset('urge.n.01'),
 Synset('conscience.n.01'),
 Synset('hedonism.n.01'),
 Synset('inner_light.n.01'),
 Synset('compulsion.n.02'),
 Synset('irrational_impulse.n.01'),
 Synset('mania.n.01'),
 Synset('incitement.n.03'),
 Synset('libidinal_energy.n.01'),
 Synset('disincentive.n.01'),
 Synset('incentive.n.01'),
 Synset('reason.n.01'),
 Synset('abience.n.01'),
 Synset('adience.n.01'),
 Synset('death_instinct.n.01'),
 Synset('wanderlust.n.01'),
 Synset('sense_of_shame.n.01'),
 Synset('superego.n.01'),
 Synset('wee_small_voice.n.01'),
 Synset('onomatomania.n.01'),
 Synset('compulsion.n.01'),
 Synset('agromania.n.01'),
 Synset('dipsomania.n.01'),
 Synset('egomania.n.01'),
 Synset('kleptomania.n.01'),
 Synset('logorrhea.n.01'),
 Synset('monomania.n.01'),
 Synset('necrophilia.n.01'),
 Synset('phaneromania.n.01'),
 Synset('pyromania.n.01'),
 S

### Wattpad

In [2]:
def clean_text(text):
    for p in punctuation:
        text = text.replace(p, '')
    return text.lower()

In [82]:
def window_generator(seq, size, step):
    return (seq[pos:pos + size] for pos in range(0, len(seq), step) if pos + step < len(seq))

In [83]:
example = """
Hands Across Hawthorne was a rally held at the Hawthorne Bridge in the American West Coast city of Portland, Oregon, on May 29, 2011. The demonstration was in response to an attack, one week earlier, on Brad Forkner and Christopher Rosevear, a gay male couple who had been holding hands while walking across the bridge. According to the couple and the Portland Police Bureau, a group of five men followed Forkner and Rosevear along the bridge before physically assaulting them. The assault was condemned by Portland's mayor, Sam Adams, and its police chief, Mike Reese, and news of the attack spread throughout the Pacific Northwest and the United States. The attack prompted volunteers from the Q Center, a nonprofit organization that supports the LGBT community, to form street patrols as a means of monitoring Portland's downtown area.
Several LGBT and human rights organizations sponsored Hands Across Hawthorne in response to the attack, with the purpose of linking hands across the entire span of the Hawthorne Bridge to show solidarity. More than 4,000 people attended the rally, which had been publicized on a single Facebook page 72 hours previously. Forkner, Rosevear, Mayor Adams, and other community leaders spoke at the rally. The event received attention throughout the United States. On June 5, residents of Spokane, Washington, held a similar hand-holding rally called "Hands Across Monroe", crossing the Monroe Street Bridge in Riverfront Park.
"""

def extract_information(text, window_size=50, step=25):
    info = {
        'needs': None, # LDA diciendole lo que tiene que encontrar
        'behaviours': None, 
    }
    text = clean_text(text)
    words = word_tokenize(text)
    words = [w for w in words if w not in stopwords.words('english')]
    windows = window_generator(words, window_size, step)
#     for window in windows:
        
    return list(windows)
   
extract_information(example[:50], 10, 5)

[['hands', 'across', 'hawthorne', 'rally', 'held', 'ha']]

In [79]:
example="El aliento de mi gato huele a comida de gato"
extract_information(example, 10, 5)

[['el', 'aliento', 'de', 'mi', 'gato', 'huele', 'comida', 'de', 'gato']]

In [61]:
ex = ' '.join(example.split()[:14])
print(ex)
extract_information(ex, 10, 5)

Hands Across Hawthorne was a rally held at the Hawthorne Bridge in the American
8


[['hands',
  'across',
  'hawthorne',
  'rally',
  'held',
  'hawthorne',
  'bridge',
  'american'],
 ['hawthorne', 'bridge', 'american']]

In [None]:
# LDA topic extraction, guiding the process with our specific keywords
# Result: topic list, list of words and frequencies
needs = None

In [None]:
# Graph
behaviors = None

In [42]:
%%time

for chunk in stories_df:
    for text in chunk.chapter_text:
        text = clean_text(text)
        words = word_tokenize(text)
        words = [w for w in words if w not in stopwords.words('english')]
        windows = window_generator(words, window_size, step)
        for window in windows:

ParserError: Error tokenizing data. C error: out of memory

In [34]:
len(chunk)

1000

In [27]:
c

11

In [40]:
len(ids)

984493

In [19]:
# c=0
# for chunk in stories_df:
#     for i, x in chunk.iterrows():
#         c+=1

In [21]:
# c
# 3947664

3947664

In [20]:
# c=0
# for chunk in stories_df:
#     if c > 0:
#         break
#     display(chunk)
#     c+=1

In [None]:
Comprobar que pasa si el texto es menor que el window_size