In [1]:
import os
import re
import numpy as np
import pandas as pd
import gensim
import collections
from nltk.corpus import stopwords
import nltk

from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists, drop_database
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

  "is going to be overriden.".format(identifier))


In [None]:
# using function from "Keeping API Keys Secret.ipynb"
# by https://github.com/dylburger
def get_file_contents(filename):
    """
    Read ``filename`` and return its text with leading and trailing
    whitespace stripped.

    The file is assumed to hold a single line containing a key. When the
    file does not exist, a message is printed and ``None`` is returned.
    """
    try:
        handle = open(filename, 'r')
    except FileNotFoundError:
        print("'%s' file not found" % filename)
        return None
    with handle:
        return handle.read().strip()

In [None]:
# Define the database name (this notebook stores its videos table in insight_db)
# Set your postgres username
dbname = 'insight_db'
username = 'postgres' # change this to your username
password = get_file_contents('../keys/psql_key')

In [None]:
# 'engine' is a connection to a database
engine = create_engine('postgres://%s:%s@localhost/%s'%(username,password,dbname))
# load sql_magic so we can write SQL in Jupyter Notebooks
%load_ext sql_magic

# setup SQL connection to the postgreSQL engine we created
%config SQL.conn_name = 'engine'
print(engine.url)

In [None]:
# Always start from an empty PostgreSQL database: drop the existing one with
# this name (if any), then create it afresh.
# WARNING: this deletes everything already stored in that database.
if database_exists(engine.url):
    drop_database(engine.url)
create_database(engine.url)
print(database_exists(engine.url))

In [None]:
# connect to make queries using psycopg2
con = psycopg2.connect(database = dbname, host = 'localhost', user = username, password = password)
con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

In [None]:
sql_create_table = """
CREATE TABLE videos (
    id SERIAL PRIMARY KEY, 
    video_id VARCHAR(255) NOT NULL, 
    uploader VARCHAR(255), 
    title VARCHAR(255), 
    upload_date DATE, 
    duration INT, 
    view_count INT, 
    likes INT, 
    dislikes INT, 
    video TEXT, 
    subtitles TEXT, 
    thumbnail TEXT, 
    primary_category VARCHAR(255), 
    description TEXT);
"""

In [None]:
# create a new cursor object
cur = con.cursor()
# execute the create table statement
cur.execute(sql_create_table)

In [None]:
#nltk.download('stopwords')

In [None]:
# function to clean vtt and description text files
def clean_vtt(path_list):
    """
    Read each text file in ``path_list`` and flatten it to one cleaned string.

    For every file: each line is stripped; blank lines and lines whose first
    space-separated token parses as a date/time (e.g. VTT cue timings) are
    dropped; anything between ``<`` and ``>`` is removed; duplicate lines are
    removed keeping first-seen order; and the survivors are concatenated with
    a single space after each line. Finally the
    ``WEBVTT Kind: captions Language: en `` header and every single quote are
    removed, and every ``NA`` is replaced with ``0``.

    Parameters
    ----------
    path_list : list of str
        Paths of UTF-8 text files to clean.

    Returns
    -------
    list of str
        One cleaned string per input path, in input order; an empty list
        when ``path_list`` is empty.
    """
    cleaned_line_strings = []
    for path in path_list:
        # the context manager closes the file even if reading fails
        with open(path, 'r', encoding='UTF-8') as f:
            raw_lines = f.readlines()
        lines = []
        for line in raw_lines:
            line = line.strip()
            if not line:
                # blank separator lines carry no text
                continue
            try:
                # NOTE(review): this also drops any line whose first token
                # happens to parse as a date, not only cue timing lines.
                pd.to_datetime(line.split(' ')[0])
                continue
            except Exception:
                # not a timestamp, so this line is text to keep
                pass
            lines.append(re.sub('<.*?>', '', line))
        # dict.fromkeys de-duplicates while preserving first-seen order
        full_string = ''.join(line + ' ' for line in dict.fromkeys(lines))
        # clean each file once, instead of re-cleaning the whole list per file
        cleaned_line_strings.append(
            full_string.replace('WEBVTT Kind: captions Language: en ', '')
                       .replace("'", "")
                       .replace("NA", '0'))
    return cleaned_line_strings

In [None]:
# function to strip the subtitle extension from vtt paths
def strip_path(vtt_path_list):
    """
    Drop the last seven characters from every path in ``vtt_path_list``.

    Seven characters is the length of a subtitle suffix such as ``.en.vtt``
    (presumably the naming used by the downloader -- TODO confirm), which
    leaves the extension-less path of the corresponding video.

    Returns a new list; the input list is left untouched.
    """
    return [vtt_path[:-7] for vtt_path in vtt_path_list]

In [None]:
# parse paths to get variables for insight_db
def get_variables(vtt_path_list):
    """
    Parse every subtitle path into a flat list of values.

    The path is split on ``/``. From the file name, everything after the last
    underscore is discarded; the next five underscore-separated fields are
    then peeled off from the right (a field equal to ``'NA'`` becomes the
    integer ``0``), followed by whatever remains of the name. The three
    directory names closest to the file are appended last, innermost first.

    Returns a list containing one nine-element list per input path.
    Raises ``IndexError`` when a path has fewer than three directories.
    """
    variables = []
    for vtt_path in vtt_path_list:
        folders = vtt_path.split(sep='/')
        file_name = folders.pop()
        # discard the trailing "_<suffix>" part of the file name
        cut = file_name.rfind('_')
        stem = file_name[:cut]
        record = []
        for _ in range(5):
            cut = stem.rfind('_')
            field = stem[cut + 1:]
            record.append(0 if field == 'NA' else field)
            stem = stem[:cut]
        # what is left after removing the five right-most fields
        record.append(stem[:cut])
        # nearest three directories, innermost first
        record.extend(folders.pop() for _ in range(3))
        variables.append(record)
    return variables

In [None]:
# Walk the "data" tree and collect the paths of the files that accompany
# each video, grouped by file suffix.
vtt_path_list = []  # subtitles
img_path_list = []  # thumbnails
des_path_list = []  # descriptions

suffix_targets = (
    (".vtt", vtt_path_list),
    (".jpg", img_path_list),
    (".description", des_path_list),
)

for root, dirs, files in os.walk("data"):
    for file in files:
        for suffix, target in suffix_targets:
            if file.endswith(suffix):
                target.append(os.path.join(root, file))

In [None]:
vtt_path_list

In [None]:
# get media paths
video_paths = strip_path(vtt_path_list)
#video_paths[:5]

In [None]:
vtt_items = clean_vtt(vtt_path_list)
#vtt_items[:5]

In [None]:
descriptions = clean_vtt(des_path_list)
#descriptions[:5]

In [None]:
variables = get_variables(vtt_path_list)
#variables[:5]

In [None]:
# Parameterised INSERT: psycopg2 quotes and escapes every value itself, so
# apostrophes or other SQL metacharacters in scraped titles, subtitles or
# descriptions can neither break nor alter the statement.
sql_insert_into_videos = """
    INSERT INTO videos (video_id, uploader, title, upload_date, duration, 
                        view_count, likes, dislikes, video, subtitles, 
                        thumbnail, primary_category, description) 
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
# one cursor serves the whole batch
cur = con.cursor()
# NOTE(review): rows are assembled purely by list position, so this assumes
# vtt_items, img_path_list and descriptions line up one-to-one with
# video_paths -- confirm every video has a thumbnail and a description file.
for i, video_path in enumerate(video_paths):
    row = (variables[i][5], variables[i][7], variables[i][6], variables[i][4],
           variables[i][3], variables[i][2], variables[i][1], variables[i][0],
           video_path, vtt_items[i], img_path_list[i],
           variables[i][8], descriptions[i])
    # str() keeps every value a text literal, as the former '{}' formatting did
    cur.execute(sql_insert_into_videos, tuple(str(value) for value in row))

In [None]:
%%read_sql

-- Select all records from the videos table
SELECT * FROM videos;

In [None]:
SQL_Query = pd.read_sql_query('SELECT video_id FROM videos', con);

In [None]:
SQL_Query.to_csv(r'data/comments/list_of_video_ids.csv', index = None, header=False)

In [None]:
import pandas as pd

data_df = pd.read_sql('SELECT * from videos', con=con)

In [None]:
data_df['title'] = data_df['title'].str.replace('_',' ')

## START HERE

In [None]:
from_csv_filename = 'data/csv_files/summarized_data.csv'
data_df = pd.read_csv(from_csv_filename, sep=',')

In [None]:
data_text = data_df[['title']]

In [None]:
# assign() builds a new frame, so the 'index' column is added without writing
# into a slice of data_df (which raises SettingWithCopyWarning).
data_text = data_text.assign(index=data_text.index)
documents = data_text

In [None]:
print(len(documents))

In [None]:
print(documents[:5])

In [None]:
# Perform Tokenization to split the text into sentences and the sentences into words. 
# Lowercase the words and remove punctuation.

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer, StemmerI
from nltk.stem.porter import *
import numpy as np
np.random.seed(2019)
import nltk
nltk.download('wordnet')

stemmer = PorterStemmer()

In [None]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma with the notebook-level ``stemmer``."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
def preprocess(text):
    """
    Tokenize ``text`` with gensim's ``simple_preprocess`` and return the
    lemmatized, stemmed form of every token that is longer than three
    characters and is not in gensim's ``STOPWORDS``.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [None]:
doc_sample = documents[documents['index'] == 1000].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
processed_docs = documents['title'].map(preprocess)
processed_docs[:10]

In [None]:
import pandas as pd
import os

os.chdir('..')

data_df = pd.read_sql('SELECT * from videos', con=con)
data_df.head()

In [None]:
# remove the columns
data_df = data_df.drop(columns=['id', 'video_id', 'uploader', 'thumbnail', 'video', 'duration'], axis=1)

In [None]:
data_df.head()

In [None]:
# Load the regular expression library
import re
# Replace punctuation and underscores with spaces
# (raw string: '\.' inside a plain literal is an invalid escape sequence)
data_df['title_processed'] = data_df['title'].map(lambda x: re.sub(r'[,\.!_?]', ' ', x))
# Convert the titles to lowercase
data_df['title_processed'] = data_df['title_processed'].map(lambda x: x.lower())
# Show the first rows of the processed titles
data_df['title_processed'].head()

In [None]:
# Import the wordcloud library
from wordcloud import WordCloud
# Join the different processed titles together.
long_string = ','.join(list(data_df['title_processed'].values))
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()

In [None]:
## START OF LDA ANALYSIS

In [None]:
%matplotlib inline

# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    """
    Draw a bar chart of the ten most frequent vocabulary terms.

    Parameters
    ----------
    count_data : sparse document-term matrix
        Result of ``count_vectorizer.fit_transform``; one column per
        vocabulary term.
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary, in the column order of ``count_data``.
    """
    import matplotlib.pyplot as plt
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this version offers.
    feature_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = count_vectorizer.get_feature_names
    words = feature_names()
    # one column-wise sum instead of densifying every row separately
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    # ten (word, count) pairs with the highest counts
    count_dict = sorted(zip(words, total_counts), key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(12, 12/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # keyword arguments: newer seaborn no longer accepts x and y positionally
    sns.barplot(x=x_pos, y=counts, palette='Paired')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()
    
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(data_df['title_processed'])

# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

In [None]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """
    Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_`` (one row of word weights per topic)
    count_vectorizer : fitted vectorizer supplying the vocabulary
    n_top_words : int
        Number of words to print for each topic.
    """
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this version offers.
    feature_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = count_vectorizer.get_feature_names
    words = feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk it backwards to list the heaviest words first
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(words[i] for i in top_indices))
        
# Tweak the two parameters below
number_topics = 15
number_words = 5
# Create and fit the LDA model
lda = LDA(n_components=number_topics)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

In [None]:
%%time
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

# #LDAvis_data_filepath = os.path.join('ldavis_prepared_'+ str(number_topics))
# # # this is a bit time consuming - make the if statement True
# # # if you want to execute visualization prep yourself
# if 1 == 1:
#     LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
# #with open(LDAvis_data_filepath, 'w') as f:
#         pickle.dump(LDAvis_prepared, f)
        
# # load the pre-prepared pyLDAvis data from disk
# with open(LDAvis_data_filepath) as f:
#     LDAvis_prepared = pickle.load(f)
# pyLDAvis.save_html(LDAvis_prepared, 'ldavis_prepared_'+ str(number_topics) +'.html')

# lda and count_vectorizer are scikit-learn objects, so they need pyLDAvis's
# sklearn adapter (imported above as sklearn_lda), not the gensim one.
vis = sklearn_lda.prepare(lda, count_data, count_vectorizer)

In [2]:
%matplotlib inline

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
#stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [4]:
from_csv_filename = 'data/csv_files/summarized_data.csv'
data_df = pd.read_csv(from_csv_filename, sep=',')

In [5]:
# data_df.drop_duplicates(subset='title', keep="last", inplace=True)

In [6]:
# data_df.reset_index(inplace=True)

In [7]:
drywall_titles = data_df[data_df.primary_category=='drywall_repair'].title.values.tolist()
carpet_titles = data_df[data_df.primary_category=='carpet_flooring'].title.values.tolist()
laminate_titles = data_df[data_df.primary_category=='laminate_flooring'].title.values.tolist()
fence_titles = data_df[data_df.primary_category=='repair_wood_fence'].title.values.tolist()
deck_titles = data_df[data_df.primary_category=='build_deck'].title.values.tolist()

In [8]:
all_titles = data_df.title.values.tolist()

In [9]:
# Replace punctuation and underscores with spaces
# (raw string: '\.' inside a plain literal is an invalid escape sequence)
titles = [re.sub(r'[,\.!_?]', ' ', s) for s in all_titles]

print(titles[:10])

['How to remove popcorn stipple ceiling', 'How to install carpet tiles', 'Fix small nail holes in walls fast and make them disappear', 'Hog wire deck rail installation', 'Replacing a section of drywall after a pipe leak', 'Vinyl plank flooring over tile should i do this', 'Large plaster repairs skim coating house painting', 'Diy ceiling repair - skim coat over a painted popcorn ceiling part 4', 'How to skim coat a wall', 'How to make a four season room from a porch']


In [10]:
def sent_to_words(sentences):
    """Yield, for each sentence, its token list from gensim's ``simple_preprocess``."""
    tokenize = gensim.utils.simple_preprocess
    for raw_sentence in sentences:
        # deacc=True strips accent marks from the tokens
        yield tokenize(str(raw_sentence), deacc=True)

title_words = list(sent_to_words(titles))

print(title_words[:10])

[['how', 'to', 'remove', 'popcorn', 'stipple', 'ceiling'], ['how', 'to', 'install', 'carpet', 'tiles'], ['fix', 'small', 'nail', 'holes', 'in', 'walls', 'fast', 'and', 'make', 'them', 'disappear'], ['hog', 'wire', 'deck', 'rail', 'installation'], ['replacing', 'section', 'of', 'drywall', 'after', 'pipe', 'leak'], ['vinyl', 'plank', 'flooring', 'over', 'tile', 'should', 'do', 'this'], ['large', 'plaster', 'repairs', 'skim', 'coating', 'house', 'painting'], ['diy', 'ceiling', 'repair', 'skim', 'coat', 'over', 'painted', 'popcorn', 'ceiling', 'part'], ['how', 'to', 'skim', 'coat', 'wall'], ['how', 'to', 'make', 'four', 'season', 'room', 'from', 'porch']]


In [11]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(title_words, min_count=3, threshold=20) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[title_words], threshold=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [12]:
# See trigram example
print(trigram_mod[bigram_mod[title_words[19]]])

['diy', 'deck', 'time', 'lapse', 'building', 'ground', 'level', 'deck']


In [13]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """
    Re-tokenize every document in ``texts`` with ``simple_preprocess`` and
    drop the tokens listed in the notebook-level ``stop_words``.

    Returns a list holding one token list per input document.
    """
    # build the lookup set once: membership tests against the stop_words
    # list are linear in its length and would be repeated for every token
    stop_set = frozenset(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Run every document in ``texts`` through the notebook-level ``bigram_mod`` and return the results as a list."""
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs

def make_bigrams_subs(texts):
    """
    Run every document in ``texts`` through ``bigram_mod_subs`` and return
    the results as a list.

    NOTE(review): ``bigram_mod_subs`` is not defined in the visible part of
    this notebook -- confirm it exists before calling.
    """
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod_subs[doc])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``; returns a list."""
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def make_trigrams_subs(texts):
    """
    Apply ``bigram_mod_subs`` and then ``trigram_mod_subs`` to every document
    in ``texts``; returns a list.

    NOTE(review): neither ``bigram_mod_subs`` nor ``trigram_mod_subs`` is
    defined in the visible part of this notebook -- confirm they exist
    before calling.
    """
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod_subs[doc]
        merged_docs.append(trigram_mod_subs[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list in ``texts`` is joined with spaces and run through the
    notebook-level spaCy pipeline ``nlp``; the lemma of every resulting token
    whose ``pos_`` is in ``allowed_postags`` is kept.

    Tag names: https://spacy.io/api/annotation

    Returns one list of lemmas per input document.
    """
    def _kept_lemmas(sent):
        # lemmas of the tokens whose part of speech is allowed
        return [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in texts]

In [14]:
# Remove Stop Words
title_words_nostops = remove_stopwords(title_words)

# Form Bigrams
title_words_bigrams = make_bigrams(title_words_nostops)


# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
titles_lemmatized = lemmatization(title_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [15]:
print(titles_lemmatized[:10])

[['remove', 'popcorn', 'stipple', 'ceiling'], ['install', 'carpet', 'tile'], ['fix', 'small', 'nail', 'hole', 'wall', 'fast', 'make', 'disappear'], ['wire', 'deck', 'rail', 'installation'], ['replace', 'section', 'drywall', 'pipe', 'leak'], ['vinyl_plank', 'flooring', 'tile'], ['large', 'plaster', 'repair', 'skim_coate', 'house', 'painting'], ['diy', 'ceiling', 'repair', 'skim_coat', 'painted_popcorn', 'ceiling', 'part'], ['skim_coat', 'wall'], ['make', 'season', 'room', 'porch']]


In [16]:
# Create Dictionary
id2word = corpora.Dictionary(titles_lemmatized)

# Create Corpus
texts = titles_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [17]:
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


In [18]:
# Human readable view of the first corpus entry: (word, frequency) pairs
[[(id2word[term_id], term_freq) for term_id, term_freq in bow] for bow in corpus[:1]]

[[('ceiling', 1), ('popcorn', 1), ('remove', 1), ('stipple', 1)]]

In [19]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=2,
                                           chunksize=10,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
# Print the Keyword in the 10 topics
lda_model.print_topics()

[(0,
  '0.188*"floor" + 0.137*"laminate" + 0.083*"instal" + 0.041*"wooden" + 0.024*"guide" + 0.023*"panel" + 0.019*"tool" + 0.015*"shop" + 0.014*"beginner" + 0.013*"decor"'),
 (1,
  '0.114*"part" + 0.098*"drywall" + 0.088*"repair" + 0.059*"diy" + 0.036*"door" + 0.032*"tape" + 0.027*"texture" + 0.020*"wall_preparation" + 0.017*"rail" + 0.016*"trick"'),
 (2,
  '0.102*"easy" + 0.056*"transition" + 0.055*"gate" + 0.053*"stair" + 0.032*"strip" + 0.029*"cut" + 0.026*"home_mender" + 0.021*"float" + 0.020*"lie" + 0.018*"tack"'),
 (3,
  '0.043*"use" + 0.042*"way" + 0.040*"project" + 0.036*"new" + 0.028*"patio" + 0.021*"sheet" + 0.020*"screw" + 0.019*"quick" + 0.016*"idea" + 0.015*"tapcon"'),
 (4,
  '0.116*"drywall" + 0.104*"flooring" + 0.096*"repair" + 0.086*"wall" + 0.070*"fix" + 0.053*"patch" + 0.051*"ceiling" + 0.039*"hole" + 0.033*"crack" + 0.022*"vinyl_plank"'),
 (5,
  '0.133*"make" + 0.057*"outdoor" + 0.037*"seam" + 0.036*"cheap" + 0.029*"porch" + 0.028*"room" + 0.020*"height" + 0.017*"ou

In [21]:
doc_lda = lda_model[corpus]

In [22]:
# Compute Perplexity
# log_perplexity() returns gensim's per-word likelihood bound; the gensim
# documentation gives perplexity as 2^(-bound), so a bound closer to zero
# indicates a better fit.
perplexity_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', perplexity_bound)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=titles_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.447591284217403

Coherence Score:  0.5792319687952285


In [23]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word) #sort=True not included

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [24]:
vis

In [25]:
!pwd

/home/clutch/repos/insight_repo/youtube_test


In [26]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = 'wrapper/mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=15, id2word=id2word)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [27]:
# Show Topics
print(ldamallet.show_topics(formatted=False))

[(13, [('fence', 0.2928759894459103), ('post', 0.1424802110817942), ('gate', 0.09762532981530343), ('wooden', 0.055408970976253295), ('porch', 0.0395778364116095), ('panel', 0.0316622691292876), ('lay', 0.023746701846965697), ('chain_link', 0.021108179419525065), ('sag', 0.018469656992084433), ('wire', 0.0158311345646438)]), (9, [('diy', 0.2542857142857143), ('outdoor', 0.07714285714285714), ('home_ideas', 0.054285714285714284), ('patio', 0.054285714285714284), ('project', 0.045714285714285714), ('ep', 0.03428571428571429), ('outdoor_great', 0.03142857142857143), ('modern', 0.02857142857142857), ('renovation', 0.022857142857142857), ('learn', 0.02)]), (1, [('diy', 0.23177083333333334), ('part', 0.22395833333333334), ('ceiling', 0.15885416666666666), ('skim_coat', 0.044270833333333336), ('water_damage', 0.020833333333333332), ('small', 0.018229166666666668), ('painted_popcorn', 0.015625), ('ground', 0.013020833333333334), ('painting', 0.010416666666666666), ('dry', 0.010416666666666666)

In [28]:
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=titles_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.6450984796096291


In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for every value in
    ``range(start, limit, step)``, using the notebook-level ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # use the dictionary that was passed in, not the notebook-level
        # id2word, so the model and the coherence scorer share one vocabulary
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=titles_lemmatized, start=2, limit=30, step=6)

In [None]:
# Show graph
# NOTE: start/limit/step must match the values passed to
# compute_coherence_values so that x lines up with coherence_values.
limit=30; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# the trailing comma makes this a one-element tuple; a bare parenthesised
# string is iterated character by character, labelling the line just "c"
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)

optimal_model.print_topics(num_words=5)

In [None]:
# STEP 18 NEXT!!!