# Werewolf and vampire


In [1]:
from functools import lru_cache

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Loading data

### Stories

In [2]:
# Load the stories table; the preview below shows one chapter per row.
stories_df = pd.read_csv('data/out/werewolf_vampire_stories.csv')

In [3]:
# Preview the first rows to check the columns that were read.
stories_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,157305,1100896,0,'I packed whilst you were at school' she said ...
1,157305,1108308,1,"'Ladies and Gentlemen, this is your captain sp..."
2,157305,1131027,2,My father was still frowning but he seemed to ...
3,157305,1157067,3,"Bloody girl, why did she have to turn up...she..."
4,157305,1160601,4,Sam x I blushed and looked down... “Umm...well...


In [4]:
# (rows, columns) of the loaded table.
stories_df.shape

(23978, 4)

In [5]:
# Number of distinct stories; several chapter rows can share one story_id.
stories_df.story_id.nunique()

5869

## Processing data

### Data

In [6]:
# Raw chapter texts as an array; missing chapters are still present here and
# are dropped in a later cell.
data = stories_df.chapter_text.values

In [7]:
# Number of chapter texts before any filtering.
len(data)

23978

In [8]:
# Drop missing chapter texts. `pd.notna` is used instead of `d is not np.nan`
# because an identity check only matches the one `np.nan` singleton and would let
# any other NaN/None object through to the tokenizer.
data = [d for d in data if pd.notna(d)]

In [9]:
# Number of chapter texts left after dropping missing values.
len(data)

23972

In [10]:
%%time

# Cache the WordNet synsets of every distinct token in the corpus.
# Only tokens that have at least one synset are stored in the dict.
wordnet_word_synsets = {}
# Every token already queried, including those WordNet does not know, so that
# frequent out-of-vocabulary tokens are not looked up again on each occurrence.
looked_up_tokens = set()
for d in data:
    for w in word_tokenize(d):
        if w in looked_up_tokens:
            continue
        looked_up_tokens.add(w)
        synsets = wn.synsets(w)
        if synsets:
            wordnet_word_synsets[w] = synsets

CPU times: user 11min 43s, sys: 718 ms, total: 11min 44s
Wall time: 11min 44s


In [11]:
# Number of distinct tokens that have at least one WordNet synset.
len(wordnet_word_synsets)

72321

In [12]:
%%time

# Keep only the tokens found in the WordNet cache and re-join them into one
# space-separated string per chapter.
data = [
    ' '.join(token for token in word_tokenize(chapter) if token in wordnet_word_synsets)
    for chapter in data
]

CPU times: user 6min 32s, sys: 152 ms, total: 6min 33s
Wall time: 6min 32s


In [13]:
# Discard chapters that are empty (or whitespace only) after the WordNet filter.
data = [chapter for chapter in data if chapter.strip()]

In [14]:
# Number of chapters that still contain text after the WordNet filter.
len(data)

23904

### Auxiliary functions

In [15]:
def get_topic(topic, feature_names, n_top_words):
    """Return the names of the `n_top_words` heaviest features of one topic.

    `topic` is an array of per-feature weights; `feature_names` maps a feature
    index to its name. The result is ordered from highest to lowest weight.
    """
    heaviest_first = topic.argsort()[::-1]
    return [feature_names[idx] for idx in heaviest_first[:n_top_words]]

def get_topics(model, feature_names, n_top_words):
    """Return the top words of every topic in a fitted model.

    Iterates over `model.components_` and applies `get_topic` to each row, so
    the result is a list with one word list per topic.
    """
    return [
        get_topic(component, feature_names, n_top_words)
        for component in model.components_
    ]

In [16]:
# def print_topics(topics):
#     for topic_idx, topic in enumerate(topics):
#         print("Topic #%d:" % (topic_idx + 1))
#         print(" ".join(topic))
#         print()

def print_topics(topics):
    """Print scored topics, numbered from 1.

    Each item of `topics` is a `(similarity, words, n_topics)` tuple, where
    `words` is a list of strings that is printed space-separated.
    """
    for number, (similarity, words, n_topics) in enumerate(topics, 1):
        print("Topic #%d:" % number)
        print('SIMILARITY:', similarity, '. Discovered topics:', n_topics)
        print(" ".join(words))
        print()

In [17]:
@lru_cache(maxsize=None)
def two_words_similarity(w1, w2):
    """Return the best Wu-Palmer similarity between any synsets of two words.

    Synsets come from the `wordnet_word_synsets` cache. A word missing from the
    cache (e.g. a lower-cased vocabulary term whose original token had another
    casing) is looked up in WordNet on demand instead of raising KeyError.
    Returns 0 when either word has no synsets or no pair is comparable.

    Results are memoised per `(w1, w2)` pair: the same word pairs recur across
    many topics and the nested synset loop is the expensive part.
    """
    synsets1 = wordnet_word_synsets.get(w1) or wn.synsets(w1)
    synsets2 = wordnet_word_synsets.get(w2) or wn.synsets(w2)
    # Start from 0 so that max() is defined even when there is nothing to compare.
    similarities = [0]
    for syns1 in synsets1:
        for syns2 in synsets2:
            # wup_similarity may return None when no common ancestor is found.
            similarities.append(wn.wup_similarity(syns1, syns2) or 0)
    return max(similarities)

def word_list_similarity(ws):
    """Return the mean pairwise similarity between the distinct words of `ws`.

    Every ordered pair `(w1, w2)` with `w1 != w2` is scored with
    `two_words_similarity` and the scores are averaged.

    Returns 0.0 when there is no such pair (empty list, a single word, or only
    repeats of one word). `np.mean([])` would otherwise yield NaN with a
    RuntimeWarning, and NaN scores make the later `sorted(...)` ranking
    unreliable.
    """
    similarities = [
        two_words_similarity(w1, w2)
        for w1 in ws
        for w2 in ws
        if w1 != w2
    ]
    if not similarities:
        return 0.0
    return np.mean(similarities)

# def topic_list_similarity(ts):
#     similarities = []
#     for t in ts:
#         similarity = word_list_similarity(t)
#         similarities.append(similarity)
#     return np.mean(similarities)

### Parameters

In [26]:
# Vocabulary size cap, passed as `max_features` to the vectorizers.
N_FEATURES = 1000
# Number of highest-weighted words kept for each topic.
N_TOP_WORDS = 20
# Inclusive range of topic counts to try; one model is fitted per value.
MAX_TOPICS = 200
MIN_TOPICS = 1

### Topic extraction

In [27]:
# Alias only: no copy or sub-sampling happens here, all cleaned chapters are used.
data_samples = data

#### LDA

In [28]:
%%time

# Bag-of-words term counts over the cleaned chapters, with the vocabulary
# capped at N_FEATURES terms and English stop words removed.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=N_FEATURES,
    stop_words='english',
)
tf = tf_vectorizer.fit_transform(data_samples)
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# returns an ndarray, converted here to a plain list of str.
tf_feature_names = tf_vectorizer.get_feature_names_out().tolist()

# Fit one LDA model per candidate topic count and score every topic it finds.
lda_similarities = []
for n_topics in range(MIN_TOPICS, MAX_TOPICS + 1):
    lda = LatentDirichletAllocation(
        # The old `n_topics` keyword was renamed to `n_components`.
        n_components=n_topics,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
        n_jobs=4,
    )
    lda.fit(tf)
    topics = get_topics(lda, tf_feature_names, N_TOP_WORDS)
    # Each entry: (mean pairwise WordNet similarity, topic words, topics in that run).
    similarities = [(word_list_similarity(topic), topic, len(topics)) for topic in topics]
    lda_similarities.extend(similarities)
# Highest similarity first; ties fall back to the ordering of the remaining tuple fields.
lda_winners = sorted(lda_similarities, reverse=True)

CPU times: user 21h 31min, sys: 34min 42s, total: 22h 5min 42s
Wall time: 3d 17h 45min 28s


In [38]:
# Dump every scored LDA topic, in ranked order, as "similarity, words, n_topics" lines.
with open('data/out/lda_winners.csv', 'w') as out_file:
    for topic in lda_winners:
        similarity, words, n_topics = topic
        out_file.write('{}, {}, {}\n'.format(similarity, ' '.join(words), n_topics))

In [36]:
# NOTE(review): scratch cell. `topic` is not assigned here; it relies on a
# variable left over from another cell (presumably a loop over lda_winners),
# so the displayed value depends on execution order.
'{}, {}, {}'.format(topic[0], ' '.join(topic[1]), topic[2])

'0.6722436207494797, car drive seat road park driving bag drove home ride house window door inside hand bags head street taking pick, 191'

In [39]:
# Sum of the integers 1..200 — presumably a sanity check of how many topics the
# sweep over 1..200 topic counts produces in total.
c = sum(range(1, 201))

In [40]:
# Display the sum computed in the previous cell.
c

20100

In [35]:
# NOTE(review): scratch cell. It displays whatever `topic` was last bound to by
# another cell, so the output depends on execution order.
topic

(0.67224362074947974,
 ['car',
  'drive',
  'seat',
  'road',
  'park',
  'driving',
  'bag',
  'drove',
  'home',
  'ride',
  'house',
  'window',
  'door',
  'inside',
  'hand',
  'bags',
  'head',
  'street',
  'taking',
  'pick'],
 191)

In [29]:
# Show the ten best-ranked topics across all LDA runs.
print_topics(lda_winners[:10])

Topic #1:
SIMILARITY: 0.672243620749 . Discovered topics: 191
car drive seat road park driving bag drove home ride house window door inside hand bags head street taking pick

Topic #2:
SIMILARITY: 0.664257033619 . Discovered topics: 182
car drive house seat road driving drove home ride window door inside street way lot park pick bag look pull

Topic #3:
SIMILARITY: 0.653545993766 . Discovered topics: 105
car house door home drive seat road driving drove ride window inside way street stairs bags outside open walk bag

Topic #4:
SIMILARITY: 0.646932983514 . Discovered topics: 160
team ball park dream game play line hit won running playing run ready fast air throw shot right pass set

Topic #5:
SIMILARITY: 0.641080358938 . Discovered topics: 157
game play ball playing hit played beat won shot line ready air face throw middle fun threw like set high

Topic #6:
SIMILARITY: 0.640201534732 . Discovered topics: 80
box picture bags stairs closet open bag shoes pack till pull door doors walk ste

#### NMF

In [None]:
from datetime import datetime

In [None]:
%%time

# TF-IDF features over the cleaned chapters, with the same vocabulary settings
# as the count vectorizer used for LDA.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=N_FEATURES,
    stop_words='english',
)
tfidf = tfidf_vectorizer.fit_transform(data_samples)
# NOTE(review): `get_feature_names()` was removed in scikit-learn 1.2 in favour
# of `get_feature_names_out()` — confirm against the installed version.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Fit one NMF model per candidate topic count and score every topic it finds.
nmf_similarities = []
for N_TOPICS in range(MIN_TOPICS, MAX_TOPICS + 1):
    # NOTE(review): the `alpha` keyword was removed in scikit-learn 1.2 and
    # replaced by `alpha_W` / `alpha_H`, whose regularization is scaled
    # differently, so 0.1 cannot simply be carried over — confirm and retune.
    nmf = NMF(
        n_components=N_TOPICS,
        random_state=1,
        alpha=.1,
        l1_ratio=.5,
    )
    nmf.fit(tfidf)
    topics = get_topics(nmf, tfidf_feature_names, N_TOP_WORDS)
    # Each entry: (mean pairwise WordNet similarity, topic words, topics in that run).
    similarities = [(word_list_similarity(topic), topic, len(topics)) for topic in topics]
    nmf_similarities.extend(similarities)
    
    # Progress log: fires only when the topic count is a multiple of 100.
    if N_TOPICS % 100 == 0:
        print(N_TOPICS, datetime.now())
    
# Highest similarity first.
nmf_winners = sorted(nmf_similarities, reverse=True)

In [None]:
# Dump every scored NMF topic, in ranked order, as "similarity, words, n_topics" lines.
with open('data/out/nmf_winners.txt', 'w') as out_file:
    for topic in nmf_winners:
        similarity, words, n_topics = topic
        out_file.write('{}, {}, {}\n'.format(similarity, ' '.join(words), n_topics))

In [None]:
# Show the ten best-ranked topics across all NMF runs.
print_topics(nmf_winners[:10])

## Saving data

In [None]:
# stories_df.to_csv('data/out/werewolf_vampire_stories.csv', index=False)