<script>
    /**
     * Walk up the parent chain of `el` and return the closest ancestor whose
     * lower-cased node name equals `name`. `el` itself is never a candidate.
     * When no ancestor matches, the falsy end of the parent chain is returned.
     */
    function findAncestor (el, name) {
        for (el = el.parentElement; el; el = el.parentElement) {
            if (el.nodeName.toLowerCase() === name) {
                break;
            }
        }
        return el;
    }
    /**
     * Set the inline text color of `el` to `textColor`, then do the same,
     * recursively, for every element below it.
     */
    function colorAll(el, textColor) {
        el.style.color = textColor;
        for (const child of Array.from(el.children)) {
            colorAll(child, textColor);
        }
    }
    /**
     * Locate the <section> enclosing the currently executing <script> and set
     * its `data-background-image` attribute to `src`. When `textColor` is
     * truthy, the section and all of its descendants are recolored as well.
     * Does nothing if the script is not inside a <section>.
     * NOTE(review): the attribute is presumably read by the slide framework
     * rendering this notebook - confirm.
     */
    function setBackgroundImage(src, textColor) {
        var section = findAncestor(document.currentScript, "section");
        if (!section) {
            return;
        }
        section.setAttribute("data-background-image", src);
        if (textColor) {
            colorAll(section, textColor);
        }
    }
</script>


# Star Wars exploration with LDA

In [19]:
%matplotlib inline
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stopwords = stopwords.words('english')
from spacy.lang.en import English
parser = English()
import pyLDAvis.gensim
from nltk.corpus import wordnet as wn
from gensim import corpora
import pickle
import gensim

In [4]:
# Load the three episode scripts and stack them into a single frame with one
# row per spoken line and the columns `character` and `dialogue`.
episodes = ["SW_EpisodeIV.txt", "SW_EpisodeV.txt", "SW_EpisodeVI.txt"]
SW = []
for episode in episodes:
    # Fields are delimited by the three-character sequence `" "`, which is
    # parsed with the python engine.
    star_1 = pd.read_csv(episode, sep='" "', engine='python')
    # Remove the double-quote characters still present in the text columns.
    star_1 = star_1.loc[:, star_1.dtypes == object].apply(
        lambda column: column.str.replace('"', "", regex=False)
    )
    # Discard the index read_csv produced and number the rows 0..n-1.
    star_1 = star_1.reset_index(drop=True)
    star_1.columns = ['character', 'dialogue']
    SW.append(star_1)
# ignore_index=True renumbers the combined rows 0..n-1 in one step.
star_all = pd.concat(SW, ignore_index=True)
star_all.shape

(2523, 2)

In [5]:
# Count the lines spoken by each character and keep only the rows belonging to
# characters with at least 40 lines.
classes = star_all['character'].value_counts()
classes_big = classes[classes >= 40]
has_enough_lines = star_all['character'].isin(classes_big.index)
sufficient = star_all.loc[has_enough_lines, ['character', 'dialogue']]

In [6]:
# Renumber the filtered rows 0..n-1 (the previous index labels are dropped),
# then display the resulting (rows, columns).
sufficient=sufficient.reset_index(drop=True)
sufficient.shape

(1930, 2)

In [7]:
# Add a `tokens` column: each dialogue string split into tokens by NLTK's word_tokenize.
sufficient["tokens"]=sufficient['dialogue'].apply(word_tokenize) 

In [8]:
def no_punct(tokens):
    """Return the purely alphabetic tokens of ``tokens``, in their original order.

    Tokens containing any non-letter character (punctuation, digits,
    apostrophes) and empty strings are dropped.
    """
    kept = []
    for token in tokens:
        if token.isalpha():
            kept.append(token)
    return kept
def to_lower(tokens):
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [9]:
# Normalise every token list: lower-case first, then keep alphabetic tokens only.
sufficient["tokens"] = sufficient["tokens"].apply(to_lower).apply(no_punct)

# LDA

In [10]:
# Preview the first rows to check the dialogue text against its cleaned token list.
sufficient.head()

Unnamed: 0,character,dialogue,tokens
0,THREEPIO,Did you hear that? They've shut down the main...,"[did, you, hear, that, they, shut, down, the, ..."
1,THREEPIO,We're doomed!,"[we, doomed]"
2,THREEPIO,There'll be no escape for the Princess this time.,"[there, be, no, escape, for, the, princess, th..."
3,THREEPIO,What's that?,"[what, that]"
4,THREEPIO,I should have known better than to trust the l...,"[i, should, have, known, better, than, to, tru..."


In [11]:
# One entry per character: the list of that character's raw dialogue strings.
suff_grouped=sufficient.groupby('character')['dialogue'].apply(list)

In [12]:
# One entry per character: a list of token lists (one inner list per spoken line).
suff_grouped2=sufficient.groupby('character')['tokens'].apply(list)
suff_grouped2

character
BEN         [[hello, there, come, here, my, little, friend...
EMPEROR     [[there, is, a, great, disturbance, in, the, f...
HAN         [[han, solo, i, captain, of, the, millennium, ...
LANDO       [[why, you, slimy, swindler, you, got, a, lot,...
LEIA        [[lord, vader, i, should, have, known, only, y...
LUKE        [[hurry, up, come, with, me, what, are, you, w...
THREEPIO    [[did, you, hear, that, they, shut, down, the,...
VADER       [[where, are, those, transmissions, you, inter...
YODA        [[hmmm, much, anger, in, him, like, his, fathe...
Name: tokens, dtype: object

In [13]:
from itertools import chain

In [14]:
# Build one flat token list per character, i.e. one document per character.
# The Series values are iterated directly: `suff_grouped2[i]` performed a
# positional integer lookup on a Series labelled by character name, which
# pandas has deprecated.
superlist = [list(chain.from_iterable(lines)) for lines in suff_grouped2]

In [15]:
parser = English()


def tokenize(text):
    """Tokenize ``text`` with the spaCy ``parser`` and normalise each token.

    Whitespace-only tokens are skipped, tokens spaCy flags as URL-like are
    replaced by ``'URL'``, tokens starting with ``'@'`` are replaced by
    ``'SCREEN_NAME'``, and every other token is emitted in lower case.
    """
    lda_tokens = []
    for token in parser(text):
        surface = token.orth_
        if surface.isspace():
            continue
        if token.like_url:
            normalised = 'URL'
        elif surface.startswith('@'):
            normalised = 'SCREEN_NAME'
        else:
            normalised = token.lower_
        lda_tokens.append(normalised)
    return lda_tokens

In [16]:
def get_lemma(word):
    """Return the base form WordNet's ``morphy`` finds for ``word``.

    When ``morphy`` returns ``None``, ``word`` is returned unchanged.
    """
    lemma = wn.morphy(word)
    return word if lemma is None else lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a newly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [20]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop words as a set, used for membership tests when filtering tokens.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nataliespeiser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
def prepare_text_for_lda(text, min_length=5):
    """Turn raw ``text`` into a list of lemmatised tokens.

    The text is tokenized with ``tokenize``. Tokens shorter than
    ``min_length`` characters and tokens found in ``en_stop`` are discarded,
    and each remaining token is mapped through ``get_lemma``.

    Parameters
    ----------
    text : str
        Raw text to prepare.
    min_length : int, optional
        Minimum number of characters a token needs to be kept. The default
        of 5 keeps exactly the tokens longer than four characters.

    Returns
    -------
    list of str
        The prepared tokens, in their original order.
    """
    prepared = []
    for token in tokenize(text):
        if len(token) < min_length or token in en_stop:
            continue
        prepared.append(get_lemma(token))
    return prepared

In [22]:
# Concatenate every character's tokens into one flat list.
flattened_list = [
    token
    for character_tokens in superlist
    for token in character_tokens
]

In [23]:
# Flatten the per-character dialogue lists into one list of dialogue strings.
# The Series values are iterated directly: `suff_grouped[i]` performed a
# positional integer lookup on a Series labelled by character name, which
# pandas has deprecated.
suff_grouped_list = [
    dialogue
    for character_dialogues in suff_grouped
    for dialogue in character_dialogues
]

In [24]:
# Drop the English stop words from each character's token list.
# New lists are built instead of calling list.remove() inside the loop:
# removing from a list while iterating over it skips the element that follows
# each removal, so stop words were left behind, and it also altered the lists
# stored in `superlist`.
superlist_new = [
    [token for token in characterlist if token not in en_stop]
    for characterlist in superlist
]

In [25]:
# For every character, keep only the tokens that are not English stop words.
superlist_new = []
characterlist_new = []
for characterlist in superlist:
    characterlist_new = list(
        filter(lambda token: token not in en_stop, characterlist)
    )
    superlist_new.append(characterlist_new)

In [26]:
# Map each distinct token to an integer id, then encode every character's
# document as a bag of words.
dictionary = corpora.Dictionary(superlist_new)
corpus = [dictionary.doc2bow(text) for text in superlist_new]
# Persist both artefacts. The context manager guarantees the pickle file is
# flushed and closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [27]:
import logging
# Raise the root logger's threshold to CRITICAL so lower-severity log records are suppressed.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [28]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

In [29]:
# Fit an LDA model with one topic per character document.
NUM_TOPICS = 9
# LDA training is stochastic; a fixed seed makes the topics reproducible
# across kernel restarts.
RANDOM_STATE = 42
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    iterations=200,
    random_state=RANDOM_STATE,
)
ldamodel.save('model5.gensim')
# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.001*"chewie" + 0.001*"get" + 0.001*"us" + 0.001*"well"')
(1, '0.016*"luke" + 0.015*"know" + 0.013*"get" + 0.009*"help"')
(2, '0.018*"get" + 0.016*"right" + 0.016*"going" + 0.013*"come"')
(3, '0.000*"get" + 0.000*"right" + 0.000*"like" + 0.000*"come"')
(4, '0.024*"force" + 0.020*"yes" + 0.019*"must" + 0.015*"jedi"')
(5, '0.000*"oh" + 0.000*"get" + 0.000*"going" + 0.000*"take"')
(6, '0.038*"oh" + 0.029*"sir" + 0.028*"artoo" + 0.014*"master"')
(7, '0.021*"luke" + 0.014*"father" + 0.014*"force" + 0.011*"jedi"')
(8, '0.010*"ship" + 0.010*"skywalker" + 0.009*"master" + 0.009*"yes"')


In [30]:
# Score an unseen sentence against the trained topics.
new_doc = 'let us go kid we got to be in the ship an save the princess and keep an eye on chewie'
# NOTE(review): prepare_text_for_lda tokenizes with spaCy, drops tokens of four
# characters or fewer and lemmatizes, whereas the training documents were built
# with word_tokenize + lower-casing + isalpha filtering + stop-word removal only.
# The recorded output contains just two token ids - confirm this preprocessing
# mismatch is intended.
new_doc = prepare_text_for_lda(new_doc)
# Encode the prepared tokens with the dictionary built from the training documents.
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
# Topic distribution of the new document as (topic id, probability) pairs.
print(ldamodel.get_document_topics(new_doc_bow))

[(313, 1), (664, 1)]
[(0, 0.03703704), (1, 0.037044436), (2, 0.7036361), (3, 0.03703704), (4, 0.03703704), (5, 0.03703704), (6, 0.03708007), (7, 0.037042286), (8, 0.03704896)]


In [31]:
# Reload the artefacts saved earlier in this notebook and render the
# interactive pyLDAvis topic view.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# corpus.pkl is written by this notebook; never point this at an untrusted
# file, because unpickling can execute arbitrary code. The context manager
# closes the file handle after loading.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
# sort_topics=False keeps the model's own topic order in the display.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)


# What is the meaning of this super cool visualization?
There are only 6 larger bubbles, although we have 9 characters and 9 topics. Is it valid to conclude that the characters are hard to distinguish from one another based on their dialogue?