# "No Regrets"
## A word vector analysis of Game of Thrones
Following Siraj's example, which in turn follows Yuriy Guts' example.

### Generating and analyzing word vectors from novels

---

In [8]:
########## IMPORT DEPENDENCIES ##########
# allows to use both Python 2 and 3 syntax
from __future__ import absolute_import, division, print_function
# for reading the book files with an explicit text encoding
import codecs
# and of course for any NLP we'll need regex:
import re
# for finding the book files by filename pattern
import glob
# concurrency, in order to process the data more efficiently
import multiprocessing
# dealing with the OS, like reading a file
import os
# for niceness
from pprint import pprint
# for any sorts of NLP analysis
import nltk
# AAAAAND: WORD 2 VEC!! (the star of the show)
# an algorithm published by Google researchers that trains a small NN to learn word vectors;
# here we use gensim's implementation to train vectors on our own corpus
import gensim.models.word2vec as w2v
# for dimensionality reduction for our puny brains
# bc the wv are gonna be 300+ dimensions
import sklearn.manifold
# for math
import numpy as np
import pandas as pd
# and plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
# render matplotlib figures inline in the notebook.
# NOTE: %pylab also populates the interactive namespace with numpy and
# matplotlib names (see the message printed below this cell)
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [11]:
########## CLEAN DATA ##########
from nltk.tokenize import punkt
from nltk.corpus import stopwords

In [16]:
# collect every plain-text book in the data folder, in alphabetical order
book_filenames = glob.glob(r'data/*.txt')
book_filenames.sort()
print(book_filenames)

['data/got1.txt', 'data/got2.txt', 'data/got3.txt', 'data/got4.txt', 'data/got5.txt']


### Making one large opus (that's what it is anyways)

In [21]:
# accumulate every book into one unicode string (files are decoded as UTF-8)
corpus_raw = u""
# tongue-in-cheek countdown of plot characters: starts at 300 and
# drops by 50 per book, because GoT and there are so many deaths. ;)
fun_count = 300
progress_msg = "Read {0} characters of text, found ~{1} plot characters..."
for book_filename in book_filenames:
    print("Leisurely skimming '{0}'...".format(book_filename))
    with codecs.open(book_filename, mode="r", encoding="utf-8") as book:
        corpus_raw = corpus_raw + book.read()
    fun_count = fun_count - 50
    # the character count is cumulative over all books read so far
    print(progress_msg.format(len(corpus_raw), fun_count))
    print()
print("Whoa!... Okay peepz. Spoiler Alert! ...")

Leisurely skimming 'data/got1.txt'...
Read 1770659 characters of text, found ~250 plot characters...

Leisurely skimming 'data/got2.txt'...
Read 4071041 characters of text, found ~200 plot characters...

Leisurely skimming 'data/got3.txt'...
Read 6391405 characters of text, found ~150 plot characters...

Leisurely skimming 'data/got4.txt'...
Read 8107945 characters of text, found ~100 plot characters...

Leisurely skimming 'data/got5.txt'...
Read 9719485 characters of text, found ~50 plot characters...

Whoa!... Okay peepz. Spoiler Alert! ...


### Tokenize the corpus

In [25]:
# load NLTK's pickled Punkt tokenizer for English
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [27]:
# split the whole corpus into a sequence of raw sentence strings
raw_sents = tokenizer.tokenize(corpus_raw)

In [28]:
def sentence_to_wordlist(raw):
    """Return the alphabetic words found in ``raw`` as a list of strings.

    Every character that is not an ASCII letter (a-z, A-Z) is turned into a
    space, then the text is split on whitespace. Digits and punctuation are
    therefore dropped, and a word like "don't" becomes "don" and "t".
    """
    # shell equivalent: tr -sc 'a-zA-Z' ' ' < GoT_corpus.txt
    return re.sub(r"[^a-zA-Z]", " ", raw).split()

In [29]:
# tokenize each raw sentence into a list of words, skipping empty sentences.
# the filter must test the individual sentence `s`: testing `raw_sents`
# (the whole list) is always true while iterating and filters nothing
sentences = [sentence_to_wordlist(s) for s in raw_sents if len(s) > 0]

In [40]:
# show one raw sentence next to its tokenized form as a sanity check
sent_num = 11
sample_sentence = raw_sents[sent_num]

print(sample_sentence)
print(sentence_to_wordlist(sample_sentence))

Their droppings speckled the gargoyles that rose twelve feet tall on either side of him, a hellhound and a wyvern, two of the thousand that brooded over the walls of the ancient fortress.
['Their', 'droppings', 'speckled', 'the', 'gargoyles', 'that', 'rose', 'twelve', 'feet', 'tall', 'on', 'either', 'side', 'of', 'him', 'a', 'hellhound', 'and', 'a', 'wyvern', 'two', 'of', 'the', 'thousand', 'that', 'brooded', 'over', 'the', 'walls', 'of', 'the', 'ancient', 'fortress']


In [41]:
# total number of word tokens: the sum of all sentence lengths
token_count = sum(map(len, sentences))

In [42]:
# report the total token count, formatted with thousands separators
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 1,818,103 tokens


Once we have vectors, we have three main tasks we want to perform:
- DISTANCE
- SIMILARITY
- RANKING

(e.g. scan all scientific papers and rank them by how much information they contain about climate change! Exciting!)

In [43]:
# dimensionality of the word vectors (passed to Word2Vec as `size`).
# more dimensions make the vectors more computationally expensive to train,
# but let them capture finer distinctions between words
num_features = 300

# minimum number of times a word must occur in the corpus to get a vector
# (passed as `min_count`); per the gensim docs, rarer words are ignored
min_word_count = 3

# number of threads to run in parallel: one per CPU core
# the more workers we have, the faster we can train
num_workers = multiprocessing.cpu_count()

# context window size: how many neighbouring words are looked at
# around each word (passed as `window`)
context_window = 7

# downsampling threshold for frequent words (passed as `sample`):
# the more frequent a word is, the more often it is randomly skipped
# during training, so very common words do not dominate.
# the gensim docs give (0, 1e-5) as the useful range
downsampling = 1e-3

# seed for the random number generator (RNG), to make the results
# reproducible - which is useful for debugging.
# NOTE(review): per the gensim docs, fully deterministic runs also need a
# single worker thread - confirm if exact reproducibility matters here
seed = 1

In [44]:
# set up an (untrained) word2vec model with gensim from the settings above
# https://radimrehurek.com/gensim/models/word2vec.html
thrones2vec = w2v.Word2Vec(
    sg=1,  # per the gensim docs: 1 selects skip-gram, otherwise CBOW
    size=num_features,
    window=context_window,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
    seed=seed,
)

In [45]:
# build the model's vocabulary from the tokenized sentences
# (no training happens yet)
thrones2vec.build_vocab(sentences)

In [50]:
# report the size of the vocabulary the model has built
print("Word2Vec vocabulary length: {}".format(len(thrones2vec.vocab)))

Word2Vec vocabulary length: 17277


### Train the model

In [51]:
# train the word vectors on the tokenized sentences.
# NOTE(review): the number displayed below is the call's return value,
# presumably gensim's count of words processed during training - confirm
thrones2vec.train(sentences)

7023185

### Saving it to a file for later use

In [52]:
# create the output folder for the trained model, but only if it is missing.
# the existence check must use the same name that is created and saved to:
# checking a different folder name makes os.makedirs() raise on every re-run,
# because the target folder already exists by then
model_dir = "trained"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# save the trained model to a file
thrones2vec.save(os.path.join(model_dir, "thrones2vec.w2v"))

## Exploration time!

In [54]:
# reload the saved model from disk, so exploration can start from the file
# instead of training again
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

These are 300-dimensional word vectors, so we'll need to apply some dimensionality reduction in order to plot them in 2-dimensional space.

### Compressing into 2D space for visualization

In [56]:
# using t-SNE (t-distributed Stochastic Neighbor Embedding)
# for squashing the dimensions of our data in order to be able to visualize it
# here we choose a 2D representation; random_state is fixed so that
# repeated runs use the same random initialization
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

In [57]:
# assigning a variable to the big matrix holding all the word vectors
# 'syn0' is the feature matrix generated by gensim
# (presumably one row per vocabulary word, one column per feature - TODO confirm)
all_word_vec_matrix = thrones2vec.syn0

### Training t-SNE
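TODO: find out why this step is called "training" (t-SNE has no separate model to reuse afterwards; `fit_transform` iteratively optimizes the 2D positions for exactly these vectors).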

In [None]:
# fit t-SNE on the word-vector matrix and get back its 2D coordinates
all_word_vec_matrix_2D = tsne.fit_transform(all_word_vec_matrix)