# Werewolf and vampire


In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## Loading data

### Stories

In [2]:
stories_df = pd.read_csv('data/out/werewolf_vampire_stories.csv')

In [3]:
stories_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,157305,1100896,0,'I packed whilst you were at school' she said ...
1,157305,1108308,1,"'Ladies and Gentlemen, this is your captain sp..."
2,157305,1131027,2,My father was still frowning but he seemed to ...
3,157305,1157067,3,"Bloody girl, why did she have to turn up...she..."
4,157305,1160601,4,Sam x I blushed and looked down... “Umm...well...


In [4]:
stories_df.shape

(23978, 4)

In [5]:
stories_df.story_id.nunique()

5869

## Processing data

### Data

In [6]:
data = stories_df.chapter_text.values

In [7]:
len(data)

23978

In [8]:
data = [d for d in data if d is not np.nan]

In [9]:
len(data)

23972

In [10]:
# DELETE CELL
data=data[:10]

In [11]:
%%time

wordnet_word_synsets = {}
for d in data:
    for w in word_tokenize(d):
        if w not in wordnet_word_synsets:
            synsets = wn.synsets(w)
            if synsets:
                wordnet_word_synsets[w] = synsets

CPU times: user 4.07 s, sys: 120 ms, total: 4.19 s
Wall time: 4.19 s


In [12]:
len(wordnet_word_synsets)

2460

In [14]:
%%time

data = [' '.join([w for w in word_tokenize(d) if w in wordnet_word_synsets]) for d in data]

CPU times: user 200 ms, sys: 8 ms, total: 208 ms
Wall time: 205 ms


In [14]:
data = [d for d in data if d.strip()]

In [15]:
len(data)

10

In [15]:
for d in data:
    for w in word_tokenize(d):
        if w not in wordnet_word_synsets:
            print(w)

.


### Auxiliar functions

In [16]:
def get_topic(topic, feature_names, n_top_words):
    return [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]

def get_topics(model, feature_names, n_top_words):
    topics = []
    for topic in model.components_:
        topics.append(get_topic(topic, feature_names, n_top_words))
    return topics

In [17]:
# def print_topics(topics):
#     for topic_idx, topic in enumerate(topics):
#         print("Topic #%d:" % (topic_idx + 1))
#         print(" ".join(topic))
#         print()

def print_topics(topics):
    for topic_idx, (similariy, topic, n_topics) in enumerate(topics):
        print("Topic #%d:" % (topic_idx + 1))
        print('SIMILARITY:', similariy, '. Discovered topics:', n_topics)
        print(" ".join(topic))
        print()

In [18]:
def two_words_similarity(w1, w2):
    similarities = [0]
    for syns1 in wordnet_word_synsets[w1]:
        for syns2 in wordnet_word_synsets[w2]:
            similarity = wn.wup_similarity(syns1, syns2) or 0
            similarities.append(similarity)
    return max(similarities)

def word_list_similarity(ws):
    similarities = []
    for w1 in ws:
        for w2 in ws:
            if w1 != w2:
                similarity = two_words_similarity(w1, w2)
                similarities.append(similarity)
    return np.mean(similarities)

# def topic_list_similarity(ts):
#     similarities = []
#     for t in ts:
#         similarity = word_list_similarity(t)
#         similarities.append(similarity)
#     return np.mean(similarities)

### Parameters

In [19]:
# parameters = [{
#         'vect__max_df': (0.95, 1.0),
#         'vect__min_df': (2, 1),
#         'vect__max_features': (10, 100, 1000),
#         'vect__stop_words': ('english',),
#         'topext__n_topics': list(range(1, 201)) + [300, 400, 500],
#         'topext__learning_method': ('online', 'batch'),
#         'topext__learning_offset': (10., 20., 30., 40., 50.),
#     }, {
#         'vect__max_df': (0.95, 1.0),
#         'vect__min_df': (2, 1),
#         'vect__max_features': (10, 100, 1000),
#         'vect__stop_words': ('english',),
#         'topext__n_components': list(range(1, 201)) + [300, 400, 500],
#         'topext__alpha': (0, .1, .5, 1.),
#         'topext__l1_ratio': (0, .5, 1.),
#     }
# ]
# topic_extractors = [LDA, NMF]
# vectorizers = [CountVectorizer, TfidfVectorizer]
# pipeline_parameters = list(zip(vectorizers, topic_extractors, parameters))

In [20]:
parameters = {
    'vect__max_df': (0.95, 1.0),
    'vect__min_df': (2, 1),
    'vect__max_features': (10, 100, 1000),
    'vect__stop_words': ('english',),
    'topext__n_topics': list(range(1, 201)) + [300, 400, 500],
    'topext__learning_method': ('online', 'batch'),
    'topext__learning_offset': (10., 20., 30., 40., 50.),
}

In [21]:
# DELETE CELL
parameters = {
    'vect__max_features': (100, 1000),
    'vect__stop_words': ('english',),
    'topext__n_topics': [100, 200],
    'topext__learning_offset': (40., 50.),
}

### Topic extraction

In [22]:
import warnings
warnings.filterwarnings('ignore') # ("error", "ignore", "always", "default", "module", "once")

In [23]:
N_TOP_WORDS = 20

In [26]:
data_samples = data
len(data_samples)

10

In [30]:
# %%time

# best_estimators = []
# for vectorizer, topic_extractor, parameters in pipeline_parameters:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('topext', LDA()),
])
grid_search_cv = GridSearchCV(pipeline, parameters, n_jobs=1)
grid_search_cv.fit(data_samples)

feature_names = grid_search_cv.best_estimator_.steps[0][1].get_feature_names()
topext = grid_search_cv.best_estimator_.steps[1][1]

topics = get_topics(topext, feature_names, N_TOP_WORDS)
similarities = [(word_list_similarity(topic), topic, len(topics)) for topic in topics]
sorted(similarities, reverse=True)

KeyError: 'frank'

In [32]:
for d in data:
    if 'frank' in d:
        print('OK')

In [33]:
wordnet_word_synsets

{'looked': [Synset('look.v.01'),
  Synset('look.v.02'),
  Synset('look.v.03'),
  Synset('search.v.02'),
  Synset('front.v.01'),
  Synset('attend.v.02'),
  Synset('look.v.07'),
  Synset('expect.v.03'),
  Synset('look.v.09'),
  Synset('count.v.08')],
 'ipod': [Synset('ipod.n.01')],
 'tried': [Synset('try.v.01'),
  Synset('test.v.01'),
  Synset('judge.v.05'),
  Synset('sample.v.01'),
  Synset('hear.v.03'),
  Synset('try.v.06'),
  Synset('try.v.07'),
  Synset('try.v.08'),
  Synset('try_on.v.01'),
  Synset('tested.s.01'),
  Synset('tested.s.02')],
 'trees': [Synset('tree.n.01'),
  Synset('tree.n.02'),
  Synset('tree.n.03'),
  Synset('corner.v.02'),
  Synset('tree.v.02'),
  Synset('tree.v.03'),
  Synset('tree.v.04')],
 'small': [Synset('small.n.01'),
  Synset('small.n.02'),
  Synset('small.a.01'),
  Synset('minor.s.10'),
  Synset('little.s.03'),
  Synset('small.s.04'),
  Synset('humble.s.01'),
  Synset('little.s.07'),
  Synset('little.s.05'),
  Synset('small.s.08'),
  Synset('modest.s.02'),


In [64]:
# %%time

# lda_similarities = []

# for N_FEATURES in N_FEATURES_LIST:
# tf_vectorizer = CountVectorizer(
#     max_features=N_FEATURES,
#     stop_words='english',
# )
# tf = tf_vectorizer.fit_transform(data_samples)
# tf_feature_names = tf_vectorizer.get_feature_names()

# for N_TOPICS in range(MIN_TOPICS, MAX_TOPICS + 1):
#     lda = LatentDirichletAllocation(
#         n_topics=N_TOPICS,
#         learning_method='online',
#         learning_offset=50.,
#         random_state=0,
#         n_jobs=4,
#     )
#     lda.fit(tf)
#     topics = get_topics(lda, tf_feature_names, N_TOP_WORDS)
#     similarities = [(word_list_similarity(topic), topic, len(topics)) for topic in topics]
#     lda_similarities.extend(similarities)

# lda_winners = sorted(lda_similarities, reverse=True)

In [38]:
with open('data/out/lda_winners.csv', 'w') as f:
    for topic in lda_winners:
        f.write('{}, {}, {}\n'.format(topic[0], ' '.join(topic[1]), topic[2]))

In [36]:
'{}, {}, {}'.format(topic[0], ' '.join(topic[1]), topic[2])

'0.6722436207494797, car drive seat road park driving bag drove home ride house window door inside hand bags head street taking pick, 191'

In [39]:
c=0
for i in range(1, 201):
    c+=i

In [40]:
c

20100

In [35]:
topic

(0.67224362074947974,
 ['car',
  'drive',
  'seat',
  'road',
  'park',
  'driving',
  'bag',
  'drove',
  'home',
  'ride',
  'house',
  'window',
  'door',
  'inside',
  'hand',
  'bags',
  'head',
  'street',
  'taking',
  'pick'],
 191)

In [29]:
print_topics(lda_winners[:10])

Topic #1:
SIMILARITY: 0.672243620749 . Discovered topics: 191
car drive seat road park driving bag drove home ride house window door inside hand bags head street taking pick

Topic #2:
SIMILARITY: 0.664257033619 . Discovered topics: 182
car drive house seat road driving drove home ride window door inside street way lot park pick bag look pull

Topic #3:
SIMILARITY: 0.653545993766 . Discovered topics: 105
car house door home drive seat road driving drove ride window inside way street stairs bags outside open walk bag

Topic #4:
SIMILARITY: 0.646932983514 . Discovered topics: 160
team ball park dream game play line hit won running playing run ready fast air throw shot right pass set

Topic #5:
SIMILARITY: 0.641080358938 . Discovered topics: 157
game play ball playing hit played beat won shot line ready air face throw middle fun threw like set high

Topic #6:
SIMILARITY: 0.640201534732 . Discovered topics: 80
box picture bags stairs closet open bag shoes pack till pull door doors walk ste

#### NMF

In [None]:
from datetime import datetime

In [48]:
%%time

tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=N_FEATURES,
    stop_words='english',
)
tfidf = tfidf_vectorizer.fit_transform(data_samples)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

nmf_similarities = []
for N_TOPICS in range(MIN_TOPICS, MAX_TOPICS + 1):
    nmf = NMF(
        n_components=N_TOPICS,
        random_state=1,
        alpha=.1,
        l1_ratio=.5,
    )
    nmf.fit(tfidf)
    topics = get_topics(nmf, tfidf_feature_names, N_TOP_WORDS)
    similarities = [(word_list_similarity(topic), topic, len(topics)) for topic in topics]
    nmf_similarities.extend(similarities)
    
    if N_TOPICS % 100 == 0:
        print(N_TOPICS, datetime.now())
    
nmf_winners = sorted(nmf_similarities, reverse=True)

100 2017-06-12 18:35:34.841702
200 2017-06-13 16:37:50.501902
CPU times: user 1d 6h 24min 54s, sys: 9h 53s, total: 1d 15h 25min 48s
Wall time: 1d 4h 36min 37s


In [49]:
with open('data/out/nmf_winners.csv', 'w') as f:
    for topic in nmf_winners:
        f.write('{}, {}, {}\n'.format(topic[0], ' '.join(topic[1]), topic[2]))

In [50]:
print_topics(nmf_winners[:10])

Topic #1:
SIMILARITY: 0.648428122943 . Discovered topics: 100
look walk looks asks smile head smiles walks turn nod ask start hear say face hand nods starts grab turns

Topic #2:
SIMILARITY: 0.645495403787 . Discovered topics: 136
look walk looks smile head asks smiles walks turn nod face ask start hear nods turns starts eyes grab looking

Topic #3:
SIMILARITY: 0.642188814187 . Discovered topics: 134
black hair white blue red brown dark green shirt color wearing dress jeans pink light long purple grey blonde wear

Topic #4:
SIMILARITY: 0.640778642366 . Discovered topics: 104
asks ask smiles nods nod laughs looks takes turns smile walks tell says pulls questions laugh question answer reply gets

Topic #5:
SIMILARITY: 0.640778642366 . Discovered topics: 172
asks ask smiles looks nods nod laughs smile walks turns takes laugh pulls says tell question answer reply questions gets

Topic #6:
SIMILARITY: 0.639088471421 . Discovered topics: 186
asks ask smiles nods looks nod laughs turns takes 

## Saving data

In [None]:
# stories_df.to_csv('data/out/werewolf_vampire_stories.csv', index=False)