# Generate Topic Models
Generates the topic models of forum posts with LDA (Latent Dirichlet Allocation)

## Data Sources
- corpus (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)
- dictionary (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)
- lemmatized_text (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)

## Changes
- 2020-09-16: Created
- 2020-09-17: Found topic model with highest coherence and generated dominant topics
- 2020-12-19: Added new data

## TODO
- Tutorial
  - https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
  - https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python


## Imports

In [None]:
# LDA
from gensim import corpora, models
import pyLDAvis.gensim
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric
# Managing data
import pandas as pd
import re
# DB connection
from scraping import create_connection
# Files & I/O
import pickle
import csv
import os
from pathlib import Path
from io import FileIO
# For logging
import logging
# Plotting
import matplotlib.pyplot as plt

In [None]:
# Silence DeprecationWarning messages for the rest of the notebook session
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Functions

For formatting LDA

In [None]:
def format_topics_sentences(ldamodel, corpus):
    """Build a table holding the dominant topic of every document in ``corpus``.

    Args:
        ldamodel: Trained topic model. ``ldamodel[corpus]`` must yield, for
            each document, an iterable of ``(topic_id, probability)`` pairs,
            and ``ldamodel.show_topic(topic_id)`` must return
            ``(word, weight)`` pairs.
        corpus: Corpus handed to ``ldamodel[...]``.

    Returns:
        pandas.DataFrame with the columns ``Dominant_Topic`` (int),
        ``Perc_Contribution`` (probability rounded to 4 decimals) and
        ``Topic_Keywords`` (comma-separated words of the dominant topic).
        Documents for which the model returns no topic at all produce no row.
        An empty corpus yields an empty frame that still has the three columns.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    # The keyword string depends only on the topic, so build it once per topic
    keywords_by_topic = {}
    rows = []

    for doc_topics in ldamodel[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            continue
        # max() returns the first pair with the highest probability, i.e. the
        # same pair a stable descending sort would place first
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        rows.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call
    return pd.DataFrame(rows, columns=columns)

In [None]:
def write_list(fn, results):
    """Append ``results`` as one CSV row to the file ``fn``.

    Args:
        fn: Path of the CSV file; it is created if it does not exist.
        results: Iterable of field values forming a single row.
    """
    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line appears after every row on Windows
    with open(fn, 'a', newline='') as f:
        csv.writer(f).writerow(results)

In [None]:
def flatten_topics(topics):
    """Return a list with the element at index 1 of every item in ``topics``."""
    flattened = []
    for entry in topics:
        flattened.append(entry[1])
    return flattened

Load data

In [None]:
def load_data(forum="all", group="all", id_type="family_id"):
    """Load the lemmatized text, corpus and dictionary for one data slice.

    The three arguments are substituted, in order, into the module-level
    templates ``path_lemma_pkl``, ``path_corpus_pkl`` and
    ``path_dictionary_gensim``.

    Args:
        forum: First value substituted into the file-name templates.
        group: Second value substituted into the file-name templates.
        id_type: Third value substituted into the file-name templates.

    Returns:
        Tuple ``(lemmatized_text, corpus, dictionary)``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were produced by this project.
    # Context managers close the file handles that the previous inline
    # open() calls left dangling.
    with open(path_lemma_pkl.format(forum, group, id_type), 'rb') as f:
        lemmatized_text = pickle.load(f)
    with open(path_corpus_pkl.format(forum, group, id_type), 'rb') as f:
        corpus = pickle.load(f)
    dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group, id_type))
    return lemmatized_text, corpus, dictionary

In [None]:
def find_doc_convergence(log, i):
    """Extract the ``converged/total`` document counts from a training log.

    Every occurrence of ``:<digits>/<digits>`` in the file is collected, in
    file order.

    Args:
        log: Path of the log file to scan.
        i: Currently unused. It was meant to restrict the scan to the part of
            the log between the "Start of model: {i} iterations" and
            "End of model: {i} iterations" markers; that slicing is disabled,
            so the whole file is parsed. Kept for interface compatibility.

    Returns:
        Tuple ``(matches_num, matches_den)`` of two equally long lists of
        floats: the numerators and the denominators of the matched fractions.
    """
    # One raw-string pattern captures both sides of the fraction in one pass
    fraction = re.compile(r":(\d+)/(\d+)")
    matches_num = []
    matches_den = []
    with open(log) as log_file:
        for line in log_file:
            for numerator, denominator in fraction.findall(line):
                matches_num.append(float(numerator))
                matches_den.append(float(denominator))
    return matches_num, matches_den

## File Locations

In [None]:
# Current working directory and the directory directly above it; every data
# path below is built relative to path_parent
p = Path.cwd()
path_parent = p.parents[0]

In [None]:
# Directories (relative to path_parent) holding the database and the data files
_db_dir = path_parent / "database"
_clean_dir = path_parent / "clean_data"

# database
path_db = str(_db_dir / "youbemom-merged.db")

# data to load -- the placeholders are filled by load_data(forum, group, id_type)
path_lemma_pkl = str(_clean_dir / "lemmatized_text_{0}_{1}_{2}.pkl")
path_corpus_pkl = str(_clean_dir / "corpus_{0}_{1}_{2}.pkl")
path_dictionary_gensim = str(_clean_dir / "dictionary_{0}_{1}_{2}.gensim")

# model saving
path_tune_models = str(_clean_dir / "lda_tune_{0}_{1}_{2}_{3}_{4}.gensim")
path_ntopic_models = str(_clean_dir / "lda_ntopics_{0}_{1}_{2}_{3}.gensim")
# path_coherence = str(path_parent / "clean_data" / "coherence_{}.csv")

# logging
path_log = str(_clean_dir / "logging_{0}_{1}_{2}_{3}.log")
# NOTE(review): identical to path_log -- confirm whether a separate file name
# was intended here
path_log_iterations = str(_clean_dir / "logging_{0}_{1}_{2}_{3}.log")

# dominant topic
path_dom_topic = str(_clean_dir / "dominant_topic_{0}_{1}_{2}_{3}.csv")

## LDA Model for convergence
Train an LDA model for each selected subforum, using all posts grouped on family_id

### Parameters

In [None]:
# Subforums to model; each entry is passed to load_data() and used in file names
subforum = ['toddler'] # ['special-needs','tween-teen','preschool','elementary','new-york-city','toddler']
# Second and third values substituted into the data / model file names
group = 'all'
id_type = 'family_id'
# Number of words shown per topic when topics are exported
n_words = 10
# Passed to LdaModel as `passes`
n_passes = 30
# Each value is passed to LdaModel as `iterations`; one model is trained per value
n_iterations = [200] # add more to list to test
# Passed to LdaModel as `eval_every`
eval_every = 20
# Number of topics for the tuning model (LdaModel `num_topics`)
n = 10

### Model 10 topics to evaluate number of passes and iterations

In [None]:
# For every subforum: route logging to a dedicated file, then train and save
# one model per entry in n_iterations while the callbacks record metrics.
for sf in subforum:
    lemmatized_text, corpus, dictionary = load_data(sf, group, id_type)
    # Detach and close every existing root handler so that basicConfig()
    # below installs a fresh file handler. Iterate over a copy:
    # removeHandler() mutates logging.root.handlers, and looping over the
    # live list skips every other handler.
    for handler in list(logging.root.handlers):
        logging.root.removeHandler(handler)
        handler.close()
    logging.basicConfig(filename=path_log.format(sf, group, id_type, n),
                        format="%(asctime)s:%(levelname)s:%(message)s",
                        level=logging.NOTSET)
    perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
    convergence_logger = ConvergenceMetric(logger='shell')
    coherence_cv_logger = CoherenceMetric(corpus=corpus, logger='shell',
                                          coherence='c_v', texts=lemmatized_text)
    for iterations in n_iterations:
        # The start/end markers bracket each model's section of the log file
        logging.debug(f'Start of model: {iterations} iterations')
        ldamodel = LdaModel(
            corpus=corpus,
            num_topics=n,
            id2word=dictionary,
            passes=n_passes,
            alpha="auto",
            eta="auto",
            random_state=1,
            iterations=iterations,
            eval_every=eval_every,
            callbacks=[perplexity_logger, convergence_logger, coherence_cv_logger]
        )
        logging.debug(f'End of model: {iterations} iterations')
        ldamodel.save(path_tune_models.format(sf, group, id_type, str(n), str(iterations)))

### Chart convergence of 10 topics
see: https://www.meganstodel.com/posts/callbacks/

In [None]:
# Subforum whose tuning run is charted in the cells below
sf = "toddler"

In [None]:
# Reload the tuning model saved for 200 iterations and tabulate the metrics
# stored on it (ldamodel.metrics)
ldamodel = LdaModel.load(path_tune_models.format(sf, group, id_type, str(n), str(200)))
df = pd.DataFrame.from_dict(ldamodel.metrics)

In [None]:
# Plot the "Convergence" metric column against the row index of the metrics table
plt.plot(df.index, df["Convergence"])
plt.show()

In [None]:
# Plot the "Coherence" metric column against the row index of the metrics table
plt.plot(df.index, df["Coherence"])
plt.show()

In [None]:
# Plot the "Perplexity" metric column against the row index of the metrics table
plt.plot(df.index, df["Perplexity"])
plt.show()

In [None]:
# Parse the converged / total document counts out of the training log.
# The second argument (200) is accepted but not used by find_doc_convergence.
con_num, con_den = find_doc_convergence(path_log.format(sf, group, id_type, n), 200)

In [None]:
# Number of convergence readings found in the log
len(con_num)

In [None]:
# Label every convergence reading with an epoch number and a position inside
# that epoch.
# NOTE(review): 37 and 198 are hard-coded -- presumably the number of readings
# per epoch and the number of epochs in the parsed log; confirm against
# len(con_num). zip() stops at the shortest input, so a mismatch silently
# truncates the table instead of raising.
eval_points = list(range(37)) * 198
epochs = [i for i in range(198) for _ in range(37)]
convergence = pd.DataFrame(list(zip(epochs, eval_points, con_num, con_den)),
                               columns = ["epoch","eval_point","converged","total"])

In [None]:
# Fractional x-position of each reading: the readings of one epoch are spread
# evenly over [epoch, epoch + 1). The divisor must equal the number of readings
# per epoch; the previous constant 13 was smaller than the largest eval_point,
# which pushed readings into later epochs and made the x-axis non-monotonic.
points_per_epoch = convergence['eval_point'].max() + 1
convergence['epoch_point'] = convergence['epoch'] + convergence['eval_point'] / points_per_epoch
# Share of documents that converged at each reading
convergence['per_converged'] = convergence['converged'] / convergence['total']

In [None]:
# Plot the share of converged documents against the fractional epoch position
plt.plot(convergence['epoch_point'], convergence['per_converged'])
plt.show()

In [None]:
# Show the first 20 rows of the convergence table
convergence.head(20)

For special needs: after testing 10, 100, and 200 iterations, we need at least 200 iterations for the documents built from threads (grouped on family_id) to converge. About 100 passes appear to be enough for the convergence, perplexity, and coherence metrics to stabilize.

### Iterate through different topic counts to compare results

In [None]:
# Settings for the topic-count comparison below
# Subforum, group and id type: substituted into the data / model file names
sf = 'toddler'
group = 'all'
id_type = 'family_id'
# Number of words shown per topic when topics are exported
n_words = 10
# Passed to LdaModel as `passes` and `iterations`
n_passes = 35
n_iterations = 200
# Candidate topic counts; one model is trained and saved per value
n_topics = [5, 10, 15, 20, 25, 30, 40, 50]
# n_topics = [40, 50]
# Passed to LdaModel as `eval_every`
eval_every = 20

In [None]:
# Load the lemmatized text, corpus and dictionary for the selected slice
lemmatized_text, corpus, dictionary = load_data(sf, group, id_type)

In [None]:
# Train one LDA model per candidate topic count and save each to disk.
# Everything except num_topics is shared between the runs.
shared_lda_kwargs = dict(
    corpus=corpus,
    id2word=dictionary,
    passes=n_passes,
    alpha="auto",
    eta="auto",
    random_state=1,
    iterations=n_iterations,
    eval_every=eval_every,
)
for n in n_topics:
    print("number of topics: ", n)
    ldamodel = LdaModel(num_topics=n, **shared_lda_kwargs)
    model_file = path_ntopic_models.format(sf, group, id_type, str(n))
    ldamodel.save(model_file)

In [None]:
# Export the topics of every saved model to a single CSV file.
# write_list() appends, so re-running this cell adds a second header and a
# second copy of the rows to an existing file.
path_topics = str(path_parent / "clean_data" / "lda_topics_{0}_{1}_{2}.csv")
topics_csv = path_topics.format(sf, group, id_type)
write_list(topics_csv, ["n_topics","topic_n","topics"])
for n in n_topics:
    model_file = path_ntopic_models.format(sf, group, id_type, str(n))
    ldamodel = LdaModel.load(model_file)
    topics = ldamodel.print_topics(num_topics=n, num_words=n_words)
    # One row per topic: model size, first item of the pair, second item of the pair
    for topic_id, topic_terms in topics:
        write_list(topics_csv, [n, topic_id, topic_terms])

In [None]:
# Inspect the learned alpha of the 10-topic tuning model.
# path_tune_models takes five values (subforum, group, id type, topic count,
# iterations); the subforum variable in this notebook is `sf`.
# NOTE(review): 200 iterations matches the tuning model loaded earlier --
# confirm this is the model meant here.
ldamodel = LdaModel.load(path_tune_models.format(sf, group, id_type, str(10), str(200)))
ldamodel.alpha

### Find dominant topics for each message

#### Data used to create the model

In [None]:
from scraping import create_connection
from lemmatize import * 

In [None]:
# load df of clean text from csv

#### For each topic count n, find the dominant topic of each message

In [None]:
# For every topic count: load the saved model, find each document's dominant
# topic and write it next to the cleaned text.
# NOTE(review): `df` must hold the cleaned posts (columns message_id and
# text_clean) in the same order as `corpus`; load it before running this cell.
for n in n_topics:
    # The per-topic-count models were saved with path_ntopic_models, and both
    # templates used here take four values (subforum, group, id type, n)
    ldamodel = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(n)))
    topic_sentences = format_topics_sentences(ldamodel, corpus)
    # Align positionally: both indexes are reset so rows pair up by order
    df_joined = pd.concat([df.reset_index(drop=True), topic_sentences.reset_index(drop=True)], axis=1)
    df_joined[["message_id","text_clean","Dominant_Topic","Perc_Contribution"]].to_csv(path_dom_topic.format(sf, group, id_type, str(n)))

In [None]:
# For every dominant topic keep the three rows with the highest contribution,
# then stack the per-topic slices into one table
topic_grouped = df_joined.groupby('Dominant_Topic')
top_rows_per_topic = [
    grp.sort_values('Perc_Contribution', ascending=False).head(3)
    for _, grp in topic_grouped
]
# The leading empty frame keeps the result an empty DataFrame when there are no groups
topic_dominant = pd.concat([pd.DataFrame(), *top_rows_per_topic], axis=0)
topic_dominant.reset_index(drop=True, inplace=True)
topic_dominant.info()

In [None]:
# Show the first rows of the per-topic top documents
topic_dominant.head()

In [None]:
# Print every selected text followed by blank lines as a visual separator
for text in topic_dominant['text_clean']:
    print(text, "\n\n", sep="\n")

### Plot differences between topics

In [None]:
def plot_difference_plotly(mdiff, title="", annotation=None):
    """Draw the topic-difference matrix as a plotly heatmap.

    Args:
        mdiff: Matrix of values to plot (heatmap ``z``).
        title: Figure title.
        annotation: Optional nested sequence with one
            ``(int_tokens, diff_tokens)`` pair per cell; each pair becomes
            the hover text of that cell.
    """
    import plotly.graph_objs as go
    import plotly.offline as py

    def _cell_text(int_tokens, diff_tokens):
        # "+++" line lists int_tokens, "---" line lists diff_tokens
        return "+++ {}<br>--- {}".format(", ".join(int_tokens), ", ".join(diff_tokens))

    if annotation is None:
        annotation_html = None
    else:
        annotation_html = []
        for row in annotation:
            annotation_html.append(
                [_cell_text(int_tokens, diff_tokens) for (int_tokens, diff_tokens) in row]
            )

    heatmap = go.Heatmap(z=mdiff, colorscale='RdBu', text=annotation_html)
    axis_x = dict(title="topic")
    axis_y = dict(title="topic")
    layout = go.Layout(width=950, height=950, title=title, xaxis=axis_x, yaxis=axis_y)
    py.iplot(dict(data=[heatmap], layout=layout))


def plot_difference_matplotlib(mdiff, title="", annotation=None):
    """Draw the topic-difference matrix as a matplotlib image.

    Args:
        mdiff: Matrix of values to plot.
        title: Figure title.
        annotation: Accepted so the signature matches the plotly variant;
            not used by this backend.
    """
    import matplotlib.pyplot as plt

    _, axes = plt.subplots(figsize=(18, 14))
    image = axes.imshow(mdiff, cmap='RdBu_r', origin='lower')
    plt.title(title)
    plt.colorbar(image)


# Pick the plotting backend once: plotly when get_ipython() and the plotly
# import both succeed, matplotlib otherwise. `plot_difference` is the name the
# later cells call.
try:
    get_ipython()
    import plotly.offline as py
except Exception:
    #
    # Fall back to matplotlib if we're not in a notebook, or if plotly is
    # unavailable for whatever reason.
    #
    plot_difference = plot_difference_matplotlib
else:
    # Only reached when the try block raised nothing
    py.init_notebook_mode()
    plot_difference = plot_difference_plotly

In [None]:
# Reload the model saved for every candidate topic count.
# The subforum variable in this notebook is `sf` (the same value that was
# used when the models were saved); `forum` is not defined in this notebook.
ldamodel5 = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(5)))
ldamodel10 = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(10)))
ldamodel15 = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(15)))
ldamodel20 = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(20)))
ldamodel25 = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(25)))
ldamodel30 = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(30)))
ldamodel40 = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(40)))
ldamodel50 = LdaModel.load(path_ntopic_models.format(sf, group, id_type, str(50)))

In [None]:
# Compare the 10-topic model with the 5-topic model (Hellinger distance, 50
# words per topic) and plot the resulting difference matrix
mdiff, annotation = ldamodel10.diff(ldamodel5, distance="hellinger", num_words=50)
plot_difference(mdiff, title="topic difference", annotation=annotation)

Visualize the topics. See: https://www.objectorientedsubject.net/2018/08/experiments-on-topic-modeling-pyldavis/

## What is the Dominant Topic in each Post?

In [None]:
# Dominant topic per document for whichever model `ldamodel` currently refers to
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel, corpus=corpus)
df_topic_sents_keywords.info()
df_topic_sents_keywords.head()

In [None]:
import sqlite3
path_db = str(path_parent / "database" / "youbemom-merged.db")
# Texts of the special-needs posts whose parent_id is empty.
# String literals are single-quoted: in SQL double quotes denote identifiers,
# and SQLite only accepts them as strings through a legacy fallback.
sql = """
    SELECT s.text_no_url AS text_no_url, s.text as text
    FROM sentiment AS s
    JOIN posts AS p
    ON s.message_id = p.message_id
    WHERE p.subforum='special-needs' AND p.parent_id=''
"""
conn = create_connection(path_db)
sn = pd.read_sql_query(sql, conn)
sn.info()

## Save Model Topics and Keywords in New Database

In [None]:
# Write the dominant-topic table to the `topicmodel` table of the database at
# path_db, replacing any existing table of that name. The connection is
# always closed afterwards, even if the write fails.
conn = sqlite3.connect(path_db)
try:
    df_topic_sents_keywords.to_sql('topicmodel', conn, if_exists='replace', index=False)
finally:
    conn.close()