# Werewolf and vampire

In [1]:
from time import time

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Loading data

### Stories

In [2]:
# Load the chapter-level story table produced by an earlier step.
stories_df = pd.read_csv(filepath_or_buffer='data/out/werewolf_vampire_stories.csv')

In [3]:
stories_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,157305,1100896,0,'I packed whilst you were at school' she said ...
1,157305,1108308,1,"'Ladies and Gentlemen, this is your captain sp..."
2,157305,1131027,2,My father was still frowning but he seemed to ...
3,157305,1157067,3,"Bloody girl, why did she have to turn up...she..."
4,157305,1160601,4,Sam x I blushed and looked down... “Umm...well...


In [4]:
stories_df.shape

(23978, 4)

In [5]:
stories_df.story_id.nunique()

5869

## Processing data

### Data

In [6]:
# Raw chapter texts as a NumPy array, one entry per row of stories_df.
data = stories_df['chapter_text'].values

In [7]:
len(data)

23978

In [8]:
# Keep only real text. An identity test against np.nan only removes that exact
# singleton object; any other NaN float (or other non-string value) would slip
# through and later break word_tokenize, which needs a string.
data = [d for d in data if isinstance(d, str)]

In [None]:
len(data)

23972

In [None]:
%%time

# Map every distinct token that WordNet knows to its list of synsets.
wordnet_word_synsets = {}
# Tokens already looked up that have no synsets. Without this, frequent
# non-WordNet tokens (punctuation, function words, ...) would trigger a fresh
# wn.synsets() call on every single occurrence in the corpus.
_tokens_without_synsets = set()
for d in data:
    for w in word_tokenize(d):
        if w in wordnet_word_synsets or w in _tokens_without_synsets:
            continue
        synsets = wn.synsets(w)
        if synsets:
            wordnet_word_synsets[w] = synsets
        else:
            _tokens_without_synsets.add(w)

In [None]:
len(wordnet_word_synsets)

In [None]:
%%time

# Rebuild each chapter from only those tokens that have WordNet synsets,
# joined back together with single spaces.
data = [
    ' '.join(
        token for token in word_tokenize(chapter) if token in wordnet_word_synsets
    )
    for chapter in data
]

In [None]:
# Discard chapters that are empty or whitespace-only.
data = [chapter for chapter in data if chapter.strip()]

In [None]:
len(data)

### Auxiliary functions

In [None]:
def get_topic(topic, feature_names, n_top_words):
    """Return the names of the highest-weighted features of one topic.

    Args:
        topic: Array of per-feature weights; must provide ``argsort()``.
        feature_names: Sequence mapping a feature index to its name.
        n_top_words: How many of the top-weighted features to return.

    Returns:
        List of feature names ordered from highest to lowest weight.
    """
    descending = topic.argsort()[::-1]
    return [feature_names[idx] for idx in descending[:n_top_words]]

def get_topics(model, feature_names, n_top_words):
    """Return the top words of every topic of a fitted model.

    Args:
        model: Object exposing ``components_``, iterated one topic at a time.
        feature_names: Sequence mapping a feature index to its name.
        n_top_words: How many words to keep per topic.

    Returns:
        List with one list of words per entry of ``model.components_``.
    """
    return [
        get_topic(weights, feature_names, n_top_words)
        for weights in model.components_
    ]

In [None]:
def print_top_words(topics):
    """Print each topic as a 1-based header, its space-joined words, and a blank line.

    Args:
        topics: Iterable of word lists, one per topic.
    """
    for number, words in enumerate(topics, start=1):
        print("Topic #%d:" % number)
        print(" ".join(words))
        print()

In [None]:
def two_words_similarity(w1, w2):
    """Return the best Wu-Palmer similarity between any synsets of two words.

    Synsets come from the ``wordnet_word_synsets`` cache when the word is
    present there. Otherwise WordNet is queried directly, so a word missing
    from the cache no longer raises ``KeyError``. This matters because
    vectorizer feature names are lower-cased, while the cache is keyed by the
    original, case-sensitive tokens.

    Args:
        w1: First word.
        w2: Second word.

    Returns:
        The maximum similarity over all synset pairs, or 0 when either word
        has no synsets or no pair yields a similarity.
    """
    synsets1 = wordnet_word_synsets.get(w1) or wn.synsets(w1)
    synsets2 = wordnet_word_synsets.get(w2) or wn.synsets(w2)
    best = 0
    for syns1 in synsets1:
        for syns2 in synsets2:
            # wup_similarity returns None when no path connects the synsets.
            best = max(best, wn.wup_similarity(syns1, syns2) or 0)
    return best

def word_list_similarity(ws):
    """Return the mean pairwise similarity over all ordered pairs of distinct words.

    Every ordered pair ``(a, b)`` with ``a != b`` is scored, so each unordered
    pair contributes twice.

    Args:
        ws: Collection of words; it is iterated in a nested fashion.

    Returns:
        ``np.mean`` of the pair scores.
    """
    pair_scores = [
        two_words_similarity(first, second)
        for first in ws
        for second in ws
        if first != second
    ]
    return np.mean(pair_scores)

def topic_list_similarity(ts):
    """Return the mean of the per-topic word-list similarities.

    Args:
        ts: Iterable of topics, each a collection of words.

    Returns:
        ``np.mean`` of ``word_list_similarity`` applied to every topic.
    """
    per_topic_scores = [word_list_similarity(topic_words) for topic_words in ts]
    return np.mean(per_topic_scores)

### Parameters

In [None]:
# Vocabulary size cap, passed as max_features to both vectorizers.
N_FEATURES = 1000
# Number of highest-weighted words kept per topic.
N_TOP_WORDS = 20
# Inclusive upper bound on the number of topics tried in the sweeps below.
MAX_TOPICS = 50
# Lower bound on the number of topics tried in the LDA sweep.
MIN_TOPICS = 5

### Topic extraction

In [None]:
# Documents fed to the vectorizers. The full corpus is used; the slice below
# is a smaller alternative for a quick trial run.
# data_samples = data[:2000]
data_samples = data

#### LDA

In [None]:
%%time

# Sweep the number of LDA topics and keep the run whose topics have the
# highest mean WordNet similarity.

# The bag-of-words matrix and its vocabulary do not depend on the topic count,
# so they are built once instead of being re-fitted on every iteration.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=N_FEATURES,
    stop_words='english',
)
tf = tf_vectorizer.fit_transform(data_samples)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); confirm against the installed version.
tf_feature_names = tf_vectorizer.get_feature_names()

lda_similarities = []
for N_TOPICS in range(MIN_TOPICS, MAX_TOPICS + 1):
    lda = LatentDirichletAllocation(
        # `n_components` is the current name of the former `n_topics`
        # argument and matches the NMF call in this notebook.
        n_components=N_TOPICS,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
    )
    lda.fit(tf)
    topics = get_topics(lda, tf_feature_names, N_TOP_WORDS)
    similarity = topic_list_similarity(topics)
    lda_similarities.append((similarity, topics, len(topics)))
# Tuples compare element-wise, so the highest similarity decides the winner.
lda_winner = max(lda_similarities)

In [None]:
print_top_words(lda_winner[1])

#### NMF

In [None]:
%%time

# Sweep the number of NMF topics and keep the run whose topics have the
# highest mean WordNet similarity.

# The TF-IDF matrix and its vocabulary do not depend on the topic count, so
# they are built once instead of being re-fitted on every iteration.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=N_FEATURES,
    stop_words='english',
)
tfidf = tfidf_vectorizer.fit_transform(data_samples)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); confirm against the installed version.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

nmf_similarities = []
# Start at MIN_TOPICS, like the LDA sweep, instead of a hard-coded 1 so both
# models are compared over the same range of topic counts.
for N_TOPICS in range(MIN_TOPICS, MAX_TOPICS + 1):
    nmf = NMF(
        n_components=N_TOPICS,
        random_state=1,
        alpha=.1,
        l1_ratio=.5,
    )
    nmf.fit(tfidf)
    topics = get_topics(nmf, tfidf_feature_names, N_TOP_WORDS)
    similarity = topic_list_similarity(topics)
    nmf_similarities.append((similarity, topics, len(topics)))
# Tuples compare element-wise, so the highest similarity decides the winner.
nmf_winner = max(nmf_similarities)

In [None]:
print_top_words(nmf_winner[1])

## Saving data

In [None]:
# stories_df.to_csv('data/out/werewolf_vampire_stories.csv', index=False)