# Topic Modeler
Discovering abstract topics in documents (Flipgrid Topics). Using Latent Dirichlet Allocation (LDA), we will look for relevant collections of words, or topics, in the corpus.

#### Packages
* spaCy (NLP)
* Gensim (Topic modeling library)
* Numpy (Math stuff)
* Pandas (Data analysis)

In [1]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

from pprint import pprint

## Cleaning the data

In [8]:

# Load the "en_core_web_sm" spaCy pipeline. This shared `nlp` object is used by
# lemmatizer() (via nlp.make_doc) and is extended with custom components below.
nlp = spacy.load("en_core_web_sm")

def lemmatizer(doc):
    """Pipeline component that swaps every token for its lemma.

    Pronouns (like "I" and "you") are lemmatized to the placeholder
    '-PRON-', so tokens with that lemma are dropped. The remaining lemmas
    are joined with single spaces and the resulting text is passed through
    ``nlp.make_doc``, whose result is returned.
    """
    lemmas = []
    for token in doc:
        lemma = token.lemma_
        if lemma == '-PRON-':
            # Placeholder lemma for pronouns; carries no topical meaning.
            continue
        lemmas.append(lemma)
    return nlp.make_doc(u' '.join(lemmas))
    
def remove_stopwords(doc):
    """Return the text of every token that is neither a stopword nor punctuation.

    Plain strings (``token.text``) are returned instead of token objects
    because the result is handed to Gensim, which needs strings.

    Args:
        doc: Iterable of tokens exposing ``text``, ``is_stop`` and ``is_punct``.

    Returns:
        list: The texts of the surviving tokens, in their original order.
    """
    return [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]

# The add_pipe function appends our functions to the default pipeline.
# The lemmatizer is inserted right after the 'ner' component. The stopword
# filter is registered last; it returns a plain list of strings rather than a
# Doc, so presumably no other component could run after it.
# NOTE(review): passing a bare function to add_pipe looks like the spaCy 2.x
# API - confirm the installed spaCy version before upgrading.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

# Run the full pipeline on a small sample text. `doc` is used below as one
# tokenized document (it is fed straight into corpora.Dictionary).
doc = nlp('''Mike Tyson was a great boxer. He was a boxer who won many belts.''')

In [9]:
# The corpus is a list of tokenized documents; here it holds only `doc`.
doc_list = [doc]

# Build the Gensim dictionary (token <-> integer id) from the corpus.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
bow = [words.doc2bow(tokens) for tokens in doc_list]

In [10]:
# Fit an LDA model on the bag-of-words corpus; `words` maps token ids back to
# their strings. random_state is fixed, presumably so reruns give the same topics.
# NOTE(review): `bow` contains a single short document (six distinct tokens in
# the printed output) while num_topics=10 is requested; almost every topic
# printed below is the uniform 0.167 distribution. Consider training on more
# documents and/or asking for fewer topics.
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [11]:
# Ask for up to 10 words per topic and pretty-print the result: a list of
# (topic_id, "weight*word + ...") tuples. Only six words appear per topic in
# the output because the vocabulary has just six tokens.
topics = lda_model.print_topics(num_words=10)
pprint(topics)

[(0,
  '0.167*"boxer" + 0.167*"win" + 0.167*"Mike" + 0.167*"Tyson" + 0.167*"belt" + '
  '0.167*"great"'),
 (1,
  '0.167*"boxer" + 0.167*"win" + 0.167*"Mike" + 0.167*"belt" + 0.167*"Tyson" + '
  '0.167*"great"'),
 (2,
  '0.167*"win" + 0.167*"boxer" + 0.167*"Mike" + 0.167*"belt" + 0.167*"great" + '
  '0.167*"Tyson"'),
 (3,
  '0.167*"win" + 0.167*"boxer" + 0.167*"Mike" + 0.167*"Tyson" + 0.167*"belt" + '
  '0.167*"great"'),
 (4,
  '0.167*"win" + 0.167*"belt" + 0.167*"boxer" + 0.167*"Mike" + 0.167*"great" + '
  '0.167*"Tyson"'),
 (5,
  '0.167*"boxer" + 0.167*"Mike" + 0.167*"belt" + 0.167*"win" + 0.167*"great" + '
  '0.167*"Tyson"'),
 (6,
  '0.276*"boxer" + 0.145*"Mike" + 0.145*"Tyson" + 0.145*"great" + 0.145*"belt" '
  '+ 0.145*"win"'),
 (7,
  '0.167*"boxer" + 0.167*"win" + 0.167*"Mike" + 0.167*"belt" + 0.167*"Tyson" + '
  '0.167*"great"'),
 (8,
  '0.167*"boxer" + 0.167*"Mike" + 0.167*"Tyson" + 0.167*"great" + 0.167*"belt" '
  '+ 0.167*"win"'),
 (9,
  '0.167*"belt" + 0.167*"boxer" + 0.167*"