# Explore Topic Models
Explores the topic models of forum posts with LDA (Latent Dirichlet Allocation)

## Data Sources
- topicmodel (created with 4-Generate_Topic_Models.ipynb)
- sentiments (created with 2-Sentiment_Analysis.ipynb)

## Changes
- 2020-09-17: Created

## TODO
- Tutorial
 - https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
 - https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python


## Imports

In [1]:
from gensim import corpora, models
import pickle
from pathlib import Path
from io import FileIO
import pyLDAvis.gensim
from gensim.models import LdaModel, LdaMulticore
from lemmatize import *
from scraping import create_connection

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  and should_run_async(code)


## Functions

In [3]:
def load_text_data(forum="special-needs", group="parent"):
    corpus = pickle.load(open(path_corpus_pkl.format(forum, group), 'rb'))
    dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group))
    return corpus, dictionary

In [4]:
def load_db_data(subforum="special-needs", group="parent"):
    conn = create_connection(path_db)
    sql = gen_sql_dates(subforum, group)
    df = pd.read_sql_query(sql, conn)
    conn.close()
    df = replace_email(df)
    df = replace_lonely_numbers(df)
    df = drop_nonalpha(df)
    return df

In [5]:
def format_topics_sentences(ldamodel, corpus):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    return(sent_topics_df)

## File Locations

In [6]:
p = Path.cwd()
path_parent = p.parents[0]

In [7]:
path_db = str(path_parent / "database" / "youbemom-merged.db")
path_model = str(path_parent / "clean_data" / "lda_tune_special-needs_parent_{}.gensim")
path_corpus_pkl = str(path_parent / "clean_data" / "corpus_{0}_{1}.pkl")
path_dictionary_gensim = str(path_parent / "clean_data" / "dictionary_{0}_{1}.gensim")

In [8]:
path_topic = str(path_parent / "clean_data" / "topic_model_{0}_{1}.txt")

## Load Data

In [14]:
n_topics = 15
forum = 'special-needs'
group = 'all'
path_tune_models = str(path_parent / "clean_data" / "lda_tune_{0}_{1}_{2}.gensim")
ldamodel = LdaModel.load(path_tune_models.format(forum,group,str(n_topics)))

In [10]:
corpus, dictionary = load_text_data('special-needs', 'all')

In [11]:
df = load_db_data('special-needs', 'all')

In [15]:
df.head()

Unnamed: 0,message_id,text_clean,date_created
0,104900532,Son has mild Sensory Issuesany privates in Man...,2018-02-18 20:04:00
1,104900615,That’s all he has going on?,2018-02-18 20:09:00
2,104900681,So far. Started some NP testing to determine m...,2018-02-18 20:13:00
3,104900694,well how old is he?,2018-02-18 20:14:00
4,104900704,just turned 5,2018-02-18 20:14:00


## Visualize Topics

In [16]:
pyLDAvis.display(pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False))

## Most Representative Post for Each Topic

In [17]:
topic_sentences = format_topics_sentences(ldamodel, corpus)

In [18]:
topic_sentences.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276510 entries, 0 to 276509
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dominant_Topic     276510 non-null  float64
 1   Perc_Contribution  276510 non-null  float64
 2   Topic_Keywords     276510 non-null  object 
dtypes: float64(2), object(1)
memory usage: 6.3+ MB


In [19]:
df_joined = pd.concat([df.reset_index(drop=True), topic_sentences.reset_index(drop=True)], axis=1)

In [20]:
df_joined.to_csv(path_topic.format('special-needs', 'all'), sep='\t', index=False)

In [None]:
topic_dominant = pd.DataFrame()
topic_grouped = df_joined.groupby('Dominant_Topic')
for i, grp in topic_grouped:
    topic_dominant = pd.concat([topic_dominant,
                                grp.sort_values(['Perc_Contribution'],
                                                ascending=[0]).head(3)],
                               axis=0)
topic_dominant.reset_index(drop=True, inplace=True)
topic_dominant.info()

In [None]:
for t in topic_dominant['text_clean']:
    print(t)