# Werewolf and vampire

In [1]:
from time import time

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Loading data

### Stories

In [2]:
stories_df = pd.read_csv('data/out/werewolf_vampire_stories.csv')

In [3]:
stories_df.head()

Unnamed: 0,story_id,chapter_id,chapter_index,chapter_text
0,157305,1100896,0,'I packed whilst you were at school' she said ...
1,157305,1108308,1,"'Ladies and Gentlemen, this is your captain sp..."
2,157305,1131027,2,My father was still frowning but he seemed to ...
3,157305,1157067,3,"Bloody girl, why did she have to turn up...she..."
4,157305,1160601,4,Sam x I blushed and looked down... “Umm...well...


In [4]:
stories_df.shape

(23978, 4)

## Processing data

In [26]:
data = stories_df.chapter_text.values

In [37]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % (topic_idx + 1))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

In [28]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [29]:
data_samples = data[:n_samples]

### LDA

In [30]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')

In [33]:
%%time

tf = tf_vectorizer.fit_transform(data_samples)

CPU times: user 2.64 s, sys: 37 µs, total: 2.64 s
Wall time: 2.64 s


In [32]:
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [34]:
%%time

lda.fit(tf)

CPU times: user 13.4 s, sys: 2.95 ms, total: 13.4 s
Wall time: 13.4 s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [38]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #1:
emma mike rsquo smiled ya dance room said like eyes know family grabbed looked head felt pulled going ll come

Topic #2:
sam mom dean just know jacob yeah said dad don hey justin going like oh jake got ok beau look

Topic #3:
na sa ko la ako que ang el ng kyle ba lo di ya gt oh lt ah love ai

Topic #4:
harry niall louis liam zayn says just said say like don know boys eyes love ron going room look really

Topic #5:
agrave ng nh aacute oacute ocirc acirc kh ecirc uacute igrave tr th ch hưu iacute cũng atilde một đi

Topic #6:
rsquo rdquo ldquo damon elena stefan eyes katherine know head like looked didn just don face going smiled room said

Topic #7:
gt alex said ian like eyes asked jason girl looked just man face hair lt hand chi little don looking

Topic #8:
eyes like just didn head felt face away know time looked hand man way did knew door body thought don

Topic #9:
said looked just asked like eyes walked smiled got head turned room face door know don hair didn look went

T

### NMF

In [44]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

In [46]:
%%time

tfidf = tfidf_vectorizer.fit_transform(data_samples)

CPU times: user 2.56 s, sys: 3.8 ms, total: 2.57 s
Wall time: 2.56 s


In [47]:
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5)

In [49]:
%%time

nmf.fit(tfidf)

CPU times: user 1.95 s, sys: 2.45 s, total: 4.39 s
Wall time: 1.75 s


NMF(alpha=0.1, beta=1, eta=0.1, init=None, l1_ratio=0.5, max_iter=200,
  n_components=10, nls_max_iter=2000, random_state=1, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [50]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topic #1:
eyes looked like head face felt didn hand away man door room just turned hair way look knew body hands

Topic #2:
niall liam zayn louis boys pov im guys direction emma yeah phone li date room hey kiss just couch ok

Topic #3:
said asked walked looked got smiled went started hey nodded laughed mr oh yeah replied turned okay saw door guys

Topic #4:
just like know don really love didn want time school going think ve ll oh tell people right say day

Topic #5:
harry louis ron hermione zayn fred boys george uncle love direction room don hand el smiled door eyes hug mum

Topic #6:
mom dad mother father house parents room school home car phone bed baby family told mum daughter sister going went

Topic #7:
says say asks ask walk look looks fred eyes door don laugh turn run george head room hear pov grab

Topic #8:
alex liam hermione mum girl red nodded ron mate lt date world asked school wolf asks ryan year moment parents

Topic #9:
jacob bella jake vampire mr don aunt car just mrs c

## Saving data

In [None]:
# stories_df.to_csv('data/out/werewolf_vampire_stories.csv', index=False)

In [16]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
# dataset = fetch_20newsgroups(shuffle=True, random_state=1,
#                              remove=('headers', 'footers', 'quotes'))
# data_samples = dataset.data[:n_samples]
# data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 0.000s.
Extracting tf-idf features for NMF...
done in 2.713s.
Extracting tf features for LDA...
done in 2.644s.
Fitting the NMF model with tf-idf features, n_samples=2000 and n_features=1000...
done in 1.612s.

Topics in NMF model:
Topic #0:
eyes looked like head face felt didn hand away man door room just turned hair way look knew body hands
Topic #1:
niall liam zayn louis boys pov im guys direction emma yeah phone li date room hey kiss just couch ok
Topic #2:
said asked walked looked got smiled went started hey nodded laughed mr oh yeah replied turned okay saw door guys
Topic #3:
just like know don really love didn want time school going think ve ll oh tell people right say day
Topic #4:
harry louis ron hermione zayn fred boys george uncle love direction room don hand el smiled door eyes hug mum
Topic #5:
mom dad mother father house parents room school home car phone bed baby family told mum daughter sister going went
Topic #6:
says say asks ask walk look l

In [5]:
dataset.keys()

dict_keys(['target', 'filenames', 'target_names', 'DESCR', 'data', 'description'])

In [4]:
dataset['target']

array([17,  0, 17, ...,  9,  4,  9])

In [6]:
dataset['filenames']

array([ '/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/talk.politics.mideast/76141',
       '/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53281',
       '/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/talk.politics.mideast/76350',
       ...,
       '/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.baseball/105105',
       '/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51575',
       '/home/antonio/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.baseball/104908'], 
      dtype='<U94')

In [7]:
dataset['target_names']

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
dataset['DESCR']

In [9]:
dataset['description']

'the 20 newsgroups by date dataset'

In [11]:
len(dataset['data'])

11314

In [12]:
dataset['data'][0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"