# Topic Modeler
Discovering abstract topics in documents (Flipgrid Topics). Using Latent Dirichlet Allocation (LDA)we will look for relevant collections of words, or topics, in the corpus.

#### Packages
* spaCy (NLP)
* Gensim (Topic modeling library)
* Numpy (Math stuff)
* Pandas (Data analysis)

In [29]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

from pprint import pprint

## Cleaning the data

In [34]:

nlp = spacy.load("en_core_web_sm")

def lemmatizer(doc):
    # This takes in a doc of tokens from the NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

doc = nlp('''The US government has no evidence that any foreign power has tampered with the vote count, the Department of Homeland Security's top cyber-official has said.

Christopher Krebs said in a statement that there was "no evidence any foreign adversary was capable of preventing Americans from voting or changing vote tallies".

US intelligence agencies concluded that in 2016 Russia was behind an effort to tip the scale of the US election against Hillary Clinton.

"We will remain vigilant for any attempts by foreign actors to target or disrupt the ongoing vote counting and final certification of results," Mr Krebs said.

"The American people are the last line of defence against foreign influence efforts and we encourage continued patience in the coming days and weeks."

President Trump's team has filed legal action in several states. Hours after the polls closed on Tuesday evening, he alleged there was "a fraud on the American public" but has not offered any evidence.''')

In [35]:
doc_list = []
doc_list.append(doc)
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [36]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [37]:
topics = lda_model.print_topics(num_words=10)
pprint(topics)

[(0,
  '0.014*"evidence" + 0.014*"vote" + 0.014*"\n'
  '\n'
  ' " + 0.014*"foreign" + 0.014*"line" + 0.014*"american" + '
  '0.014*"certification" + 0.014*"encourage" + 0.014*"adversary" + '
  '0.014*"ongoing"'),
 (1,
  '0.015*"\n'
  '\n'
  ' " + 0.015*"evidence" + 0.015*"vote" + 0.014*"foreign" + 0.014*"american" + '
  '0.014*"Krebs" + 0.014*"effort" + 0.014*"line" + 0.014*"ongoing" + '
  '0.014*"election"'),
 (2,
  '0.014*"foreign" + 0.014*"\n'
  '\n'
  ' " + 0.014*"evidence" + 0.014*"vote" + 0.014*"american" + 0.014*"encourage" '
  '+ 0.014*"official" + 0.014*"effort" + 0.014*"Krebs" + 0.014*"prevent"'),
 (3,
  '0.054*"\n'
  '\n'
  ' " + 0.044*"foreign" + 0.044*"vote" + 0.033*"evidence" + 0.022*"Krebs" + '
  '0.022*"effort" + 0.022*"american" + 0.012*"scale" + 0.012*"team" + '
  '0.012*"Homeland"'),
 (4,
  '0.014*"vote" + 0.014*"\n'
  '\n'
  ' " + 0.014*"foreign" + 0.014*"evening" + 0.014*"ongoing" + 0.014*"american" '
  '+ 0.014*"certification" + 0.014*"actor" + 0.014*"tally" + 0.0