# Histogram


In [1]:
import pickle
from collections import Counter
from string import punctuation

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Loading data

### Stories

In [2]:
stories_df = pd.read_csv('data/out/werewolf_vampire_stories.csv')

In [3]:
stories_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,157305,1100896,0,'I packed whilst you were at school' she said ...
1,157305,1108308,1,"'Ladies and Gentlemen, this is your captain sp..."
2,157305,1131027,2,My father was still frowning but he seemed to ...
3,157305,1157067,3,"Bloody girl, why did she have to turn up...she..."
4,157305,1160601,4,Sam x I blushed and looked down... “Umm...well...


In [4]:
stories_df.shape

(23978, 4)

In [5]:
stories_df.story_id.nunique()

5869

## Processing data

### Data

In [6]:
data = stories_df.chapter_text.values

In [7]:
len(data)

23978

In [8]:
data = [d for d in data if d is not np.nan]

In [9]:
len(data)

23972

In [10]:
data = [d for d in data if d.strip()]

In [11]:
len(data)

23962

### Auxiliar functions

In [12]:
stop_words = stopwords.words('english')

def text2words(text):
    text = text.lower()
    for p in punctuation:
        text = text.replace(p, '')
    words = word_tokenize(text)
    words = [w for w in words if w not in stop_words]
    return words

### Histogram

In [13]:
# %%time

# counter = Counter()
# for d in data:
#     counter += Counter(text2words(d))

# CPU times: user 16min 59s, sys: 335 ms, total: 16min 59s
# Wall time: 16min 58s

In [14]:
# with open('data/out/histogram.pickle', 'wb') as f:
#     pickle.dump(counter, f)

In [15]:
with open('data/out/histogram.pickle', 'rb') as f:
    counter = pickle.load(f)

In [16]:
len(counter)

491000

In [17]:
counter.most_common(100)

[('said', 128755),
 ('like', 95848),
 ('’', 88338),
 ('back', 87217),
 ('im', 79575),
 ('“', 76365),
 ('”', 75411),
 ('know', 71827),
 ('one', 67982),
 ('get', 67427),
 ('eyes', 63998),
 ('dont', 62304),
 ('go', 55821),
 ('looked', 54917),
 ('would', 54179),
 ('could', 51107),
 ('see', 49677),
 ('around', 48069),
 ('time', 47169),
 ('got', 47068),
 ('going', 47065),
 ('asked', 46158),
 ('didnt', 45679),
 ('khocircng', 44453),
 ('head', 42748),
 ('want', 40971),
 ('look', 40584),
 ('face', 39960),
 ('well', 39852),
 ('room', 39698),
 ('really', 38601),
 ('even', 37462),
 ('say', 36954),
 ('lagrave', 36678),
 ('door', 36365),
 ('một', 36161),
 ('away', 35266),
 ('way', 34834),
 ('right', 34479),
 ('think', 33349),
 ('hand', 33172),
 ('still', 32024),
 ('something', 31993),
 ('us', 31684),
 ('walked', 31391),
 ('ta', 30725),
 ('không', 29602),
 ('little', 29544),
 ('coacute', 28611),
 ('cũng', 28326),
 ('come', 27680),
 ('đi', 27656),
 ('went', 27419),
 ('love', 27314),
 ('good', 27163),


### Topics

In [18]:
SIMILARITY_THRESHOLD = 0.5
INTERSECTION_THRESHOLD = 5

In [19]:
lda_unique_topics_df = pd.read_csv('data/out/lda_unique_topics.csv')

In [20]:
lda_unique_topics_df['METHOD'] = 'LDA'

In [21]:
nmf_unique_topics_df = pd.read_csv('data/out/nmf_unique_topics.csv')

In [22]:
nmf_unique_topics_df['METHOD'] = 'NMF'

In [23]:
all_topics_df = pd.concat([lda_unique_topics_df, nmf_unique_topics_df], ignore_index=True)

In [24]:
all_topics_df = all_topics_df.sort_values('SIMILARITY', ascending=False)

In [25]:
len(all_topics_df)

104

In [26]:
%%time

indexes = []

for i, row in all_topics_df.iterrows():
    add = True
    words = set(row.TOPIC.split())
    previous_df = all_topics_df.loc[indexes]
    for previous_topic in previous_df.TOPIC:
        previous_words = set(previous_topic.split())
        intersect = words.intersection(previous_words)
        if len(intersect) >= INTERSECTION_THRESHOLD:
            add = False
            break
    if add:
        indexes.append(i)

CPU times: user 114 ms, sys: 71 µs, total: 114 ms
Wall time: 110 ms


In [27]:
unique_topics_df = all_topics_df.loc[indexes]

In [28]:
len(unique_topics_df)

68

In [29]:
topic_words = set()
for topic in unique_topics_df.TOPIC:
    topic_words = topic_words.union(topic.split())

In [30]:
topic_counter = Counter()
for word in topic_words:
    topic_counter[word] = counter[word]

In [31]:
topic_counter.most_common(100)

[('said', 128755),
 ('like', 95848),
 ('know', 71827),
 ('eyes', 63998),
 ('looked', 54917),
 ('time', 47169),
 ('got', 47068),
 ('going', 47065),
 ('asked', 46158),
 ('head', 42748),
 ('want', 40971),
 ('look', 40584),
 ('face', 39960),
 ('room', 39698),
 ('say', 36954),
 ('door', 36365),
 ('away', 35266),
 ('way', 34834),
 ('right', 34479),
 ('think', 33349),
 ('hand', 33172),
 ('walked', 31391),
 ('little', 29544),
 ('come', 27680),
 ('went', 27419),
 ('love', 27314),
 ('good', 27163),
 ('started', 26255),
 ('looking', 26170),
 ('oh', 26061),
 ('let', 25545),
 ('tell', 25184),
 ('make', 25099),
 ('hair', 25023),
 ('girl', 24643),
 ('thought', 24539),
 ('took', 24100),
 ('turned', 23774),
 ('saw', 22979),
 ('felt', 22684),
 ('smile', 22194),
 ('people', 21756),
 ('school', 21572),
 ('need', 21557),
 ('day', 21261),
 ('says', 21033),
 ('told', 20981),
 ('came', 20941),
 ('long', 20888),
 ('knew', 20784),
 ('sure', 20713),
 ('left', 20592),
 ('voice', 20178),
 ('bed', 19701),
 ('feel',

In [32]:
def count_words(x):
    return sum([topic_counter[w] for w in x.split()])

unique_topics_df['FREQUENCY'] = unique_topics_df.TOPIC.apply(count_words)

In [33]:
unique_topics_df = unique_topics_df.sort_values('FREQUENCY', ascending=False)

In [34]:
unique_topics_df.head()

Unnamed: 0,SIMILARITY,TOPIC,N_TOPICS,METHOD,FREQUENCY
97,0.511988,said asked looking yelled smiling know walkin...,74,NMF,694378
70,0.574911,looked saw looking felt turned look ran heard...,153,NMF,539242
6,0.633045,eyes face lips smile hands hand like body voi...,87,LDA,521487
22,0.564767,school face eyes girl black turned desk don w...,167,LDA,488727
89,0.52384,time day going way work good make long things...,97,NMF,484774


In [35]:
for i, (j, row) in enumerate(unique_topics_df.iterrows()):
    print(i + 1, '-', row.FREQUENCY, '-', row.SIMILARITY, '-', row.METHOD)
    print(row.TOPIC.strip())
    print()

1 - 694378 - 0.5119875519994187 - NMF
said asked looking yelled smiling know walking laughing did walked like want went got saw just heard let come going

2 - 539242 - 0.5749114842845493 - NMF
looked saw looking felt turned look ran heard sat stopped stood opened came face confused said took noticed gave thought

3 - 521487 - 0.6330450241102841 - LDA
eyes face lips smile hands hand like body voice head felt mouth hair feel feeling fingers way moment small words

4 - 488727 - 0.5647672524187922 - LDA
school face eyes girl black turned desk don way know want day chair saw life year high head thing arm

5 - 484774 - 0.5238401699327233 - NMF
time day going way work good make long things sure thought little think home place days come right night thing

6 - 440095 - 0.5156750438651204 - NMF
did knew wanted thought care know look happened like mean felt hurt want anymore liked tried needed tell answer loved

7 - 417644 - 0.53984274411519 - NMF
asked told answered said nodded explained yes sta

## Saving data