In [1]:
from collections import Counter
import pickle

import nltk
from nltk.corpus import stopwords
import pandas as pd

In [2]:
# Load the story collection from its pickle file into `stories`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this against a pickle file from a trusted source.
with open('data/out/clean_stories.pickle', 'rb') as f:
    stories = pickle.load(f)

In [3]:
# Number of stories loaded from the pickle.
len(stories)

98

In [4]:
# How many of the most frequent words to list for each corpus.
N = 25

In [5]:
# Flatten the text of every story into one list of whitespace-separated tokens.
all_words = [
    token
    for record in stories
    for token in record['text'].split()
]

In [6]:
# Total number of tokens across all stories.
len(all_words)

1681069

In [7]:
# Occurrence count of each distinct token over the whole collection.
all_counter = Counter(all_words)

In [8]:
# The N most frequent tokens across all stories, with their counts.
all_counter.most_common(N)

[('ojos', 20383),
 ('cabeza', 10363),
 ('mano', 10299),
 ('puerta', 9683),
 ('manos', 9043),
 ('mirada', 8763),
 ('cuerpo', 8184),
 ('casa', 8039),
 ('voz', 7857),
 ('tiempo', 7383),
 ('sangre', 7352),
 ('habitacion', 7132),
 ('sonrisa', 6881),
 ('rostro', 6775),
 ('vida', 5945),
 ('frente', 5899),
 ('padre', 5755),
 ('labios', 5737),
 ('brazos', 5716),
 ('cama', 5368),
 ('madre', 5092),
 ('gracias', 4750),
 ('vampiro', 4515),
 ('suelo', 4513),
 ('cuello', 4363)]

In [9]:
# Tokens from the vampire stories only. The category test sits before the
# inner loop so it runs once per story (not once per word) and the text of
# non-matching stories is never split.
# NOTE: this label is spelled 'ESPANOL' (no 'Ñ'), unlike the werewolf label
# used elsewhere in this notebook; it must match the category strings stored
# in the data exactly, so do not "correct" it here.
v_words = [
    w
    for story in stories
    if story['category'] == 'VAMPIROS (HISTORIAS EN ESPANOL)'
    for w in story['text'].split()
]

In [10]:
# Number of tokens in the vampire stories.
len(v_words)

1005091

In [11]:
# Occurrence count of each distinct token within the vampire stories.
v_counter = Counter(v_words)

In [12]:
# The N most frequent tokens in the vampire stories, with their counts.
v_counter.most_common(N)

[('ojos', 12339),
 ('cabeza', 6300),
 ('mano', 6268),
 ('puerta', 6091),
 ('sangre', 5809),
 ('mirada', 5677),
 ('manos', 5509),
 ('cuerpo', 4938),
 ('tiempo', 4858),
 ('voz', 4807),
 ('casa', 4667),
 ('rostro', 4510),
 ('habitacion', 4351),
 ('sonrisa', 4257),
 ('vampiro', 4112),
 ('vida', 3675),
 ('frente', 3604),
 ('labios', 3320),
 ('padre', 3291),
 ('vampiros', 3289),
 ('cama', 3278),
 ('brazos', 3056),
 ('madre', 2962),
 ('gracias', 2916),
 ('cuello', 2755)]

In [13]:
# Tokens from the werewolf stories only. The category test sits before the
# inner loop so it runs once per story (not once per word) and the text of
# non-matching stories is never split.
# NOTE: this label is spelled 'ESPAÑOL' (with 'Ñ'), unlike the vampire label
# used elsewhere in this notebook; it must match the category strings stored
# in the data exactly.
ww_words = [
    w
    for story in stories
    if story['category'] == 'HOMBRES LOBO (HISTORIAS EN ESPAÑOL)'
    for w in story['text'].split()
]

In [14]:
# Number of tokens in the werewolf stories.
len(ww_words)

675978

In [15]:
# Occurrence count of each distinct token within the werewolf stories.
ww_counter = Counter(ww_words)

In [16]:
# The N most frequent tokens in the werewolf stories, with their counts.
ww_counter.most_common(N)

[('ojos', 8044),
 ('cabeza', 4063),
 ('mano', 4031),
 ('puerta', 3592),
 ('manos', 3534),
 ('casa', 3372),
 ('cuerpo', 3246),
 ('mirada', 3086),
 ('voz', 3050),
 ('habitacion', 2781),
 ('brazos', 2660),
 ('sonrisa', 2624),
 ('tiempo', 2525),
 ('lobo', 2502),
 ('padre', 2464),
 ('labios', 2417),
 ('manada', 2371),
 ('luna', 2331),
 ('frente', 2295),
 ('vida', 2270),
 ('rostro', 2265),
 ('madre', 2130),
 ('cama', 2090),
 ('suelo', 1984),
 ('chicos', 1865)]

In [17]:
# Every word that ranks in the top N of at least one of the two corpora
# (vampire stories or werewolf stories).
union_top_words = (
    {word for word, _ in v_counter.most_common(N)}
    | {word for word, _ in ww_counter.most_common(N)}
)

In [18]:
# Frequency table: one row per top word (alphabetical order), one column per
# corpus count (whole collection, vampire stories, werewolf stories).
top_df = pd.DataFrame(
    [
        [counter[word] for counter in (all_counter, v_counter, ww_counter)]
        for word in sorted(union_top_words)
    ],
    columns=['Freq. total', 'Freq. vampires', 'Freq. werewolves'],
    index=sorted(union_top_words),
)

In [19]:
# (rows, columns): one row per word in the union of both top-N lists.
top_df.shape

(30, 3)

In [20]:
# Rank the top words by their frequency over the whole collection.
top_df.sort_values('Freq. total', ascending=False)

Unnamed: 0,Freq. total,Freq. vampires,Freq. werewolves
ojos,20383,12339,8044
cabeza,10363,6300,4063
mano,10299,6268,4031
puerta,9683,6091,3592
manos,9043,5509,3534
mirada,8763,5677,3086
cuerpo,8184,4938,3246
casa,8039,4667,3372
voz,7857,4807,3050
tiempo,7383,4858,2525


In [21]:
# Rank the top words by their frequency in the vampire stories.
top_df.sort_values('Freq. vampires', ascending=False)

Unnamed: 0,Freq. total,Freq. vampires,Freq. werewolves
ojos,20383,12339,8044
cabeza,10363,6300,4063
mano,10299,6268,4031
puerta,9683,6091,3592
sangre,7352,5809,1543
mirada,8763,5677,3086
manos,9043,5509,3534
cuerpo,8184,4938,3246
tiempo,7383,4858,2525
voz,7857,4807,3050


In [22]:
# Rank the top words by their frequency in the werewolf stories.
top_df.sort_values('Freq. werewolves', ascending=False)

Unnamed: 0,Freq. total,Freq. vampires,Freq. werewolves
ojos,20383,12339,8044
cabeza,10363,6300,4063
mano,10299,6268,4031
puerta,9683,6091,3592
manos,9043,5509,3534
casa,8039,4667,3372
cuerpo,8184,4938,3246
mirada,8763,5677,3086
voz,7857,4807,3050
habitacion,7132,4351,2781


“Topics are usually characterized in the form of noun phrases” [1].

**References**
<br>
[1] Xie, Z., P. C. Nelson, W. Xiao, and T. M. Tirpak. 2004. “Using Noun Phrase Centrality to Identify Topics for Extraction Based Summaries.” In *Proceedings of the IASTED International Conference on Knowledge Sharing and Collaborative Engineering*. US Virgin Islands: ACTA Press. http://www.actapress.com/Abstract.aspx?paperId=17241.