<a href="https://colab.research.google.com/github/mkane968/Extracted-Features/blob/master/Topic_Modeling_with_SciFi_Corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modeling the SciFi Corpus Using Gensim and pyLDAvis

Adapted from: https://github.com/hawc2/text-analysis-with-python/blob/master/Topic_Modeling.ipynb 

# Mount Drive

In [None]:
from google.colab import drive

# Attach Google Drive to this runtime's filesystem under /content/drive.
drive.mount(mountpoint='/content/drive')

# Upload Files

In [None]:
from google.colab import files

# `uploaded` is later indexed by file name and its values are decoded as
# UTF-8 bytes when the CSV is read.
uploaded = files.upload()

# Convert CSV to Data Frame

In [None]:
import numpy as np
import pandas as pd
import io

In [None]:
# File name the CSV is expected to be uploaded under.
CSV_NAME = 'output (2).csv'

if CSV_NAME in uploaded:
    csv_bytes = uploaded[CSV_NAME]
elif len(uploaded) == 1:
    # The file may have been saved under a different name (e.g. another
    # "(n)" suffix). With exactly one upload there is no ambiguity about
    # which file was meant, so use it instead of failing.
    csv_bytes = next(iter(uploaded.values()))
else:
    raise KeyError(
        f"Expected an uploaded file named {CSV_NAME!r}; "
        f"got: {sorted(uploaded)}"
    )

# The upload arrives as raw bytes; decode to text and parse it as CSV.
df = pd.read_csv(io.StringIO(csv_bytes.decode('utf-8')))
df

In [None]:
# One list entry per row of the "Text" column; this is the raw corpus.
data = df["Text"].values.tolist()

### View Dataframe

In [None]:
# Plain-text rendering of the data frame.
print(df)

In [None]:
# Load Colab's data_table extension, then display the frame as the cell output.
%load_ext google.colab.data_table 
df

# Clean Texts

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then load the English list.
# `stop_words` is a plain list so extra words can be appended later.
nltk.download('stopwords')
stop_words = stopwords.words('english')

In [None]:
# A simple way to add further stop words
#stop_words.append('movie')

In [None]:
# Install spaCy and download the en_core_web_lg model that is loaded in the next cell.
!pip3 install spacy
!python -m spacy download en_core_web_lg

In [None]:
import spacy
import en_core_web_lg

# Only token.lemma_ and token.pos_ are read from the parsed documents in this
# notebook, so the dependency parser and named-entity recognizer are switched
# off. This leaves lemmas and POS tags unchanged while avoiding the cost of
# running two unused components on every document.
nlp = en_core_web_lg.load(disable=['parser', 'ner'])

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

In [None]:
import re

In [None]:
# Patterns are raw strings: '\S' and '\s' are not valid Python string escapes,
# so the non-raw form triggers an invalid-escape-sequence warning.

# Remove e-mail-like tokens (anything containing '@') and one trailing whitespace character.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Collapse every run of whitespace (including newlines) into a single space.
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Drop apostrophes so contractions stay one token (e.g. "don't" -> "dont").
data = [re.sub(r"'", "", sent) for sent in data]

In [None]:
def sent_to_words(sentences):
    """Yield one token list per document in ``sentences``.

    Each item is converted with ``str()`` and passed through gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    for raw_doc in sentences:
        # deacc=True strips accent marks; punctuation is discarded by the
        # tokenizer itself, not by this flag.
        yield gensim.utils.simple_preprocess(str(raw_doc), deacc=True)


data_words = list(sent_to_words(data))

In [None]:
# Preview the token lists of the first ten documents.
print(data_words[:10])

In [None]:
# Train the phrase detectors: bigrams on the raw token lists, trigrams on the
# bigram-transformed token lists.
bigram = gensim.models.Phrases(sentences=data_words, min_count=1, threshold=100)
trigram = gensim.models.Phrases(sentences=bigram[data_words], threshold=100)

# Wrap each trained model in a Phraser for applying it to documents.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model)
    for phrase_model in (bigram, trigram)
)

In [None]:
def remove_stopwords(texts):
    """Return each document's tokens with the words in ``stop_words`` removed.

    Every document is converted with ``str()`` and re-tokenized by
    ``simple_preprocess`` before filtering, so the result is always a list
    of token lists, one per input document.
    """
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered_docs.append([tok for tok in tokens if tok not in stop_words])
    return filtered_docs

def make_bigrams(texts):
    """Apply the trained ``bigram_mod`` to every token list in ``texts``.

    Returns a list with one transformed token list per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

#def make_trigrams(texts):
#   return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            processed by the global ``nlp`` pipeline.
        allowed_postags: POS tags (``token.pos_`` values) to keep.

    Returns:
        A list with one list of lemmas per input document.
    """
    def _lemmas_of(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas_of(tokens) for tokens in texts]

In [None]:
# Cleaning pipeline, in order: drop stop words, merge bigrams, lemmatize.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [None]:
# Preview the lemmatized token lists of the first four documents.
print(data_lemmatized[:4])

# Building Dictionary and Corpus

In [None]:
# Map every distinct lemma to an integer id.
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
# Bag-of-words form: each document becomes a list of (token_id, count) pairs.
corpus = [id2word.doc2bow(text) for text in texts]
# Preview only the first document; printing the whole corpus floods the
# cell output and bloats the saved notebook.
print(corpus[:1])

# Create Topic Model - 20 Topics

In [None]:
# Train the 20-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,  # fixed seed
    update_every=2,
    chunksize=100,
    passes=20,
    alpha='auto',
    per_word_topics=True,
)

# Create Visualization (Save HTML)

The easiest way to view the visualization is to display it inline in the Google Colab notebook and also save it as an HTML file that you can open in your browser.

In [None]:
# Install pyLDAvis, then import it together with its gensim_models module
# (aliased as gensimvis).
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [None]:
# Prepare the pyLDAvis data for the 20-topic model from the model, the
# bag-of-words corpus and the dictionary.
vis = gensimvis.prepare(lda_model, corpus, id2word)

# Alternative call passing mds='mmds':
#vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds')

In [None]:
# Write the 20-topic visualization to /content/LDAviz.html.
pyLDAvis.save_html(vis, '/content/LDAviz.html')

In [None]:
# Show the 20-topic visualization as this cell's output.
pyLDAvis.display(vis)

# Create Topic Model - 60 Topics

In [None]:
# Train the 60-topic LDA model on the same corpus and dictionary.
lda_model60 = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=60,
    random_state=100,  # fixed seed
    update_every=2,
    chunksize=100,
    passes=20,
    iterations=200,
    alpha='auto',
    per_word_topics=True,
)

# Create Visualization (Save HTML)

The easiest way to view the visualization is to display it inline in the Google Colab notebook and also save it as an HTML file that you can open in your browser.

In [None]:
# Prepare the pyLDAvis data for the 60-topic model. This must go through the
# `gensimvis` alias (pyLDAvis.gensim_models) that the notebook imports:
# `pyLDAvis.gensim` is never imported here, so calling it raises AttributeError.
vis60 = gensimvis.prepare(lda_model60, corpus, id2word)
# Alternative call passing mds='mmds':
#vis60 = gensimvis.prepare(lda_model60, corpus, id2word, mds='mmds')

In [None]:
# Write the 60-topic visualization to /content/LDAviz60.html.
pyLDAvis.save_html(vis60, '/content/LDAviz60.html')

In [None]:
# Show the 60-topic visualization as this cell's output.
pyLDAvis.display(vis60)

# Serve Visualization in Browser

You can also serve the visualization locally in your browser using the code cell below. Be aware that browser caching and extensions such as ad blockers can interfere, so some debugging may be needed to get this working on your machine.

In [None]:
#pyLDAvis.enable_notebook()
#pyLDAvis.show(vis)