# How do mothers and fathers talk about parenting to different audiences? 

# 4. LDA Topic modelling

### Import modules

In [None]:
## Load needed modules
# Import required packages
import pandas as pd
import numpy as np
import logging
import gensim
import json
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array
%config Completer.use_jedi = False
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import logging
logging.basicConfig(filename='gensim.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)

### Necessary functions to calculate coherence values and run the topic model

In [None]:
#coherence values LdaModel
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Fit one Gensim LdaModel per candidate topic count and score each with c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) of the topic counts that are tried
    start : First topic count that is tried
    step : Increment between two consecutive topic counts

    Returns:
    -------
    model_list : List of LDA topic models, one per value in range(start, limit, step)
    coherence_values : c_v coherence of each model, in the same order as model_list
    """
    fitted_models, scores = [], []
    for topic_count in range(start, limit, step):
        # Fixed seed and training settings so runs with different topic counts are comparable
        lda = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topic_count,
            random_state=100,
            chunksize=10000,
            passes=20,
            iterations=100,
        )
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        fitted_models.append(lda)
        scores.append(scorer.get_coherence())

    return fitted_models, scores

In [None]:
#coherence values LdaMallet
def mallet_compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Fit one LdaMallet model per candidate topic count and score each with c_v coherence.

    The path to the MALLET binary is read from the module-level variable
    ``mallet_path``, which must be defined before this function is called.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) of the topic counts that are tried
    start : First topic count that is tried
    step : Increment between two consecutive topic counts

    Returns:
    -------
    model_list : List of LdaMallet topic models, one per value in range(start, limit, step)
    coherence_values : c_v coherence of each model, in the same order as model_list
    """
    fitted_models, scores = [], []
    for topic_count in range(start, limit, step):
        mallet_lda = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=topic_count,
            id2word=dictionary,
            random_seed=100,
        )
        scorer = CoherenceModel(model=mallet_lda, texts=texts, dictionary=dictionary, coherence='c_v')
        fitted_models.append(mallet_lda)
        scores.append(scorer.get_coherence())

    return fitted_models, scores

### Testing with nouns and verbs

In [None]:
# Load the tokenized dataset that keeps nouns and verbs.
# NOTE(review): read_pickle can execute code stored in the file - only load pickles from a trusted source.
preprocessed = pd.read_pickle('tokenized_nouns_verbs.pkl')

In [None]:
from sklearn.feature_extraction import text
# Corpus-specific words removed in addition to scikit-learn's English stop-word list
# (the original list named "day" twice; the duplicate had no effect and is dropped).
add_stop_words = ["time", "hair", "day", "month", "week", "hour", "year", "minute", "idea", "adult", "age", "comment", "people", "person", "man", "sure"]
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
# Remove stop words from every token list in the 'body' column
preprocessed['body'] = preprocessed['body'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Remove comments whose token list is empty.
# The length of each 'body' list is checked directly, which avoids converting
# every column of the DataFrame to strings just to compare against "[]".
preprocessed = preprocessed[preprocessed['body'].map(len) > 0]
# Reset the index so the remaining rows are numbered consecutively again
preprocessed = preprocessed.reset_index(drop=True)

In [None]:
# Convert the 'body' column to a plain Python list (one token list per comment).
# NOTE(review): Series.copy() does not copy the contained Python objects recursively, so the
# token lists in docs_old are presumably the same objects as those stored in preprocessed['body'].
docs_old = list(preprocessed['body'].copy())

In [None]:
# Create the bigram model (a single Phrases pass is run, so no trigrams are built here)
from gensim.models import Phrases
# min_count=20: only pairs that appear 20 times or more are considered.
bigram = Phrases(docs_old, min_count=20)

for doc_tokens in docs_old:
    # Tokens containing '_' are treated as detected bigrams; they are appended
    # to the document in place, in addition to the original single tokens.
    doc_tokens.extend([tok for tok in bigram[doc_tokens] if '_' in tok])

In [None]:
# Shallow copy: docs is a new outer list that holds the same token-list objects as docs_old.
docs = docs_old.copy()

In [None]:
# Build the token -> integer id mapping from the documents.
dictionary = Dictionary(docs)
# Gensim filter_extremes: drop tokens that appear in fewer than 5 documents (absolute number)
# or in more than 20% of the documents (fraction of total corpus size, not absolute number).
dictionary.filter_extremes(no_below=5, no_above=0.20)
# Bag-of-words representation of every document, based on the pruned dictionary
corpus = list(map(dictionary.doc2bow, docs))
print(f'Number of unique tokens: {len(dictionary)}')
print(f'Number of documents: {len(corpus)}')
print(corpus[:5])

In [None]:
# Count the documents whose bag-of-words is empty
empty_comments = sum(1 for bow in corpus if len(bow) == 0)
print(empty_comments)

In [None]:
# Keep only the documents that still contain at least one dictionary token
corpus = [bow for bow in corpus if bow]

In [None]:
# Train one LdaModel for every topic count from 2 to 49 and collect the c_v coherence of each
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=docs, start=2, limit=50, step=1)

In [None]:
# Show graph: coherence score against number of topics (nouns + verbs model)
import matplotlib.pyplot as plt
limit=50; start=2; step=1;
x = range(start, limit, step)
# The label is attached to the line itself. The previous plt.legend(("coherence_values"), ...)
# passed a plain string (the parentheses do not make a tuple), so the legend was built
# from its individual characters instead of the full label.
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()
# Coherence values were lower than for the model with only nouns, so I decided to keep the latter

### Testing with only nouns

In [None]:
# Load the tokenized dataset that keeps only nouns.
# NOTE(review): read_pickle can execute code stored in the file - only load pickles from a trusted source.
preprocessed_nouns = pd.read_pickle('tokenized_nouns.pkl')

In [None]:
from sklearn.feature_extraction import text
# Corpus-specific words removed in addition to scikit-learn's English stop-word list
add_stop_words = ["time", "hair", "day", "month", "week", "hour", "year", "minute", "idea", "adult", "age", "comment", "people", "person", "man", "sure"]
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
# Remove stop words from every token list in the 'body' column
preprocessed_nouns['body'] = preprocessed_nouns['body'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Remove comments whose token list is empty.
# The length of each 'body' list is checked directly, which avoids converting
# every column of the DataFrame to strings just to compare against "[]".
preprocessed_nouns = preprocessed_nouns[preprocessed_nouns['body'].map(len) > 0]
# Reset the index so the remaining rows are numbered consecutively again
preprocessed_nouns = preprocessed_nouns.reset_index(drop=True)

In [None]:
# Convert the 'body' column to a plain Python list (one token list per comment).
# NOTE(review): Series.copy() does not copy the contained Python objects recursively, so the
# token lists here are presumably the same objects as those stored in preprocessed_nouns['body'].
docs_old_nouns = list(preprocessed_nouns['body'].copy())

In [None]:
# Create the bigram model (a single Phrases pass is run, so no trigrams are built here)
from gensim.models import Phrases
# min_count=20: only pairs that appear 20 times or more are considered.
bigram = Phrases(docs_old_nouns, min_count=20)

for doc_tokens in docs_old_nouns:
    # Tokens containing '_' are treated as detected bigrams; they are appended
    # to the document in place, in addition to the original single tokens.
    doc_tokens.extend([tok for tok in bigram[doc_tokens] if '_' in tok])

In [None]:
# Shallow copy: docs_nouns is a new outer list that holds the same token-list objects as docs_old_nouns.
docs_nouns = docs_old_nouns.copy()

In [None]:
# Build the token -> integer id mapping from the documents.
dictionary_nouns = Dictionary(docs_nouns)
# Gensim filter_extremes: drop tokens that appear in fewer than 5 documents (absolute number)
# or in more than 20% of the documents (fraction of total corpus size, not absolute number).
dictionary_nouns.filter_extremes(no_below=5, no_above=0.20)
# Bag-of-words representation of every document, based on the pruned dictionary
corpus_nouns = list(map(dictionary_nouns.doc2bow, docs_nouns))
print(f'Number of unique tokens: {len(dictionary_nouns)}')
print(f'Number of documents: {len(corpus_nouns)}')
print(corpus_nouns[:5])

In [None]:
# Count the documents whose bag-of-words is empty
empty_comments = sum(1 for bow in corpus_nouns if len(bow) == 0)
print(empty_comments)

In [None]:
# Keep only the documents that still contain at least one dictionary token
corpus_nouns = [bow for bow in corpus_nouns if bow]

In [None]:
# Trying LdaMallet (wrapper around the external MALLET binary).
# NOTE(review): gensim.models.wrappers is presumably unavailable in Gensim >= 4.0 - confirm the installed version.
mallet_path = '/Users/melodys/Downloads/mallet-2.0.8/bin/mallet' # update this path
# NOTE(review): this 10-topic model is not used before `ldamallet` is reassigned further down
# (13 topics); it looks like a smoke test and costs a full training run - confirm it is still needed.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_nouns, num_topics=10, id2word=dictionary_nouns)
# Coherence sweep over 2..49 topics; the helper reads the module-level `mallet_path` set above
mallet_model_list, mallet_coherence_values = mallet_compute_coherence_values(dictionary=dictionary_nouns, corpus=corpus_nouns, texts=docs_nouns, start=2, limit=50, step=1)

In [None]:
# Show graph: coherence score against number of topics (LdaMallet, nouns only)
import matplotlib.pyplot as plt  # imported here so the cell does not rely on an earlier plotting cell
limit=50; start=2; step=1;
x = range(start, limit, step)
# The label is attached to the line itself. The previous plt.legend(("coherence_values"), ...)
# passed a plain string (the parentheses do not make a tuple), so the legend was built
# from its individual characters instead of the full label.
plt.plot(x, mallet_coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# Train a 13-topic LdaMallet model with a fixed seed and show its topics
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_nouns, num_topics=13, id2word=dictionary_nouns, random_seed=100)
ldamallet.print_topics()
# The resulting models had higher coherence scores but were less interpretable than Gensim's LdaModel, so I decided to continue with the latter

In [None]:
# Train one LdaModel for every topic count from 2 to 49 on the nouns-only corpus and collect the c_v coherence of each
model_list_nouns, coherence_values_nouns = compute_coherence_values(dictionary=dictionary_nouns, corpus=corpus_nouns, texts=docs_nouns, start=2, limit=50, step=1)

In [None]:
# Plot coherence score against number of topics (nouns-only LdaModel) and save it to disk
import matplotlib.pyplot as plt
start, limit, step = 2, 50, 1
x = range(start, limit, step)
fig = plt.figure()
coherence_axes = fig.gca()
coherence_axes.plot(x, coherence_values_nouns)
coherence_axes.set_xlabel("Number of Topics")
coherence_axes.set_ylabel("Coherence score")
fig.savefig('coherence score graph.jpeg', bbox_inches='tight', dpi=600)

In [None]:
#once we have chosen the number of topics we want, then we calculate the coherence scores for different values of the hyperparameters alpha and beta
def compute_coherence_values_a_b(dictionary, corpus, texts, num_topics=12):
    """
    Grid-search the LDA hyperparameters alpha and beta (Gensim's ``eta``) for a fixed number of topics.

    One LdaModel is trained for every (alpha, beta) combination and scored with c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    num_topics : Number of topics of every trained model (defaults to 12, the value used so far)

    Returns:
    -------
    pandas DataFrame with one row per trained model and the columns
    'Topics', 'Alpha', 'Beta' and 'Coherence'
    """
    # Numeric candidates shared by both priors: np.arange(0.01, 0.62, 0.3) -> about 0.01, 0.31, 0.61
    numeric_priors = list(np.arange(0.01, 0.62, 0.3))
    # Alpha parameter: numeric candidates plus the named options
    alpha = numeric_priors + ['symmetric', 'asymmetric']
    # Beta parameter: numeric candidates plus 'symmetric'
    beta = numeric_priors + ['symmetric']

    model_results = {'Topics': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                     }

    for a in alpha:
        # iterate through beta values
        for b in beta:
            lda_model = LdaModel(corpus=corpus,
                                 id2word=dictionary,
                                 num_topics=num_topics,
                                 random_state=100,
                                 chunksize=1000,
                                 passes=20,
                                 iterations=100,
                                 alpha=a,
                                 eta=b)
            coherencemodel = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
            model_results['Topics'].append(num_topics)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(coherencemodel.get_coherence())

    return pd.DataFrame(model_results)

In [None]:
# Coherence of the 12-topic nouns-only model for every alpha/beta combination in the grid
alpha_beta_12 = compute_coherence_values_a_b(dictionary=dictionary_nouns, corpus=corpus_nouns, texts=docs_nouns)

In [None]:
# Display the grid-search results (Topics, Alpha, Beta, Coherence)
alpha_beta_12

In [None]:
# Compute Coherence Score using c_v
coherence_model_lda_12 = CoherenceModel(model=lda_model_12, texts=docs_nouns, dictionary=dictionary_nouns, coherence='c_v')
coherence_lda_12 = coherence_model_lda_12.get_coherence()
print('\nCoherence Score: ', coherence_lda_12)

In [None]:
# Set the parameters of the LDA model with 12 topics
lda_model_12 = LdaModel(corpus=corpus_nouns,
                        id2word=dictionary_nouns,
                        num_topics = 12, 
                        random_state = 100,
                        chunksize = 5000,
                        passes = 40,
                        iterations = 1000,
                        alpha = 0.01,
                        eta = 0.61)
# Print the Keyword in the 12 topics
lda_model_12.print_topics()

In [None]:
# Visualise the topic model with pyLDAvis.
# `import pyLDAvis.gensim_models as gensimvis` binds only the name `gensimvis`,
# so the top-level package is imported explicitly for pyLDAvis.enable_notebook().
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model_12, corpus_nouns, dictionary=lda_model_12.id2word)
vis

In [None]:
# Read the cleaned comments that will be scored with the topic model.
# NOTE(review): read_pickle can execute code stored in the file - only load pickles from a trusted source.
data_clean = pd.read_pickle('NEW_data_clean.pkl')

In [None]:
# Display the dataframe to check its contents
data_clean

In [None]:
# First, get the list of comments (the 'lemmatized' column), one entry per row of data_clean
CompletePosts = list(data_clean["lemmatized"])

# Create a bag of words for each comment with doc2bow (document to bag of words),
# using the same dictionary the model was trained with
new_doc2bow = [dictionary_nouns.doc2bow(post) for post in CompletePosts]
# For every comment, the (topic id, weight) pairs reported by the 12-topic model.
# NOTE(review): Gensim presumably omits topics whose weight is below the model's
# minimum_probability; those entries end up as 0 below - confirm this is intended.
vector = lda_model_12.get_document_topics(new_doc2bow)

# One {topic id: weight} dictionary per comment
newdictlist = [dict(doc_topics) for doc_topics in vector]

# Build the score table with exactly one column per topic id (0 .. num_topics-1), in order.
# Reindexing against the full topic range (instead of only the ids that happen to occur)
# guarantees that a topic no comment reports still gets a column, so the later
# column renaming and per-topic statistics cannot silently shift or fail.
topic_bow = pd.DataFrame(newdictlist).reindex(columns=range(lda_model_12.num_topics))

# Missing values mean the topic was not reported for that comment: fill with 0's
topic_bow.fillna(0, inplace=True)

In [None]:
# Create the list of topic names to be used as column names of the score table
TopicNum = lda_model_12.num_topics
TopicList = ['Topic_' + str(i) for i in range(TopicNum)]

# The integer column ids are derived from the model's topic count instead of a
# hard-coded 14, which no longer matched the 12-topic model.
old_column_names = list(range(TopicNum))
new_column_names = TopicList
# Rename each of the columns so that each column is called Topic_X
topic_bow.rename(columns=dict(zip(old_column_names, new_column_names)), inplace=True)

In [None]:
# Attach the topic scores to the comments, column-wise.
# NOTE(review): concat with axis=1 aligns on the index; this assumes data_clean has the
# default 0..n-1 index so that it lines up with topic_bow - confirm.
comments_topics = pd.concat([data_clean, topic_bow], axis=1)

In [None]:
# Mean Topic_0 score for every (gender, subreddit) combination
topics_gender_subreddit = pd.DataFrame(comments_topics["Topic_0"].groupby([comments_topics['gender'], comments_topics['subreddit']]).mean())

In [None]:
# Display the per-group means
topics_gender_subreddit

In [None]:
old_column_names = [i for i in range(12)]
new_column_names = ["Thank you/appreciation", "Medical care", "Education/Family advice", "Furniture/Design", "Birth/Pregnancy", "Change/Potty training", "Physical appearance/Picture", "Work/Raise children", "Food", "Leisure activities", "School/Teaching", "Sleep training"]

# Mean score of every topic for each (gender, subreddit) combination.
# The topic columns are selected by name (Topic_0 .. Topic_11) rather than by
# position (iloc[:, 8:19]), so the result no longer depends on how many columns
# data_clean has, and all twelve topics are computed in one pass.
topic_columns = ['Topic_' + str(i) for i in old_column_names]
topics_gender_subreddit = comments_topics.groupby(['gender', 'subreddit'])[topic_columns].mean()

# Replace Topic_0 .. Topic_11 with the human-readable topic labels (same order)
topics_gender_subreddit.columns = new_column_names


topics_gender_subreddit

In [None]:
# Save the table with the mean score of each topic for each (gender, subreddit) category to Excel
topics_gender_subreddit.to_excel('table_topics_gender_subreddits.xlsx')

In [None]:
# Reload the saved table; the first two columns are used as the row index
topics_gender_subreddit = pd.read_excel('table_topics_gender_subreddits.xlsx', index_col = [0,1])

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Row labels for the four author groups.
# NOTE(review): assumes the rows of topics_gender_subreddit come in exactly this order - confirm.
subreddits = ['Fathers/Parenting', 'Fathers/Daddit', 'Mothers/Mommit', 'Mothers/Parenting']

# Heatmap of the mean topic score per group
fig = plt.figure(figsize=(12, 6))
plt.pcolor(topics_gender_subreddit, norm=None, cmap='Blues')

# N rows (groups) and K columns (topics)
N, K = topics_gender_subreddit.shape

# Put the major ticks at the middle of each cell
plt.yticks(np.arange(N) + 0.5, subreddits, rotation=0, fontsize=10)
plt.xticks(np.arange(K) + 0.5, new_column_names, rotation=0, fontsize=8.5)
plt.ylabel('Subreddits', fontsize=20)
plt.xlabel('LDA Topics', fontsize=20)

# Flip the y-axis so the labels are in the order we anticipate
plt.gca().invert_yaxis()

plt.tick_params('x', length=10, width=2, which='major')

# Add a colour scale
plt.colorbar(cmap='Blues')

plt.tight_layout()
plt.show()
fig.set_size_inches(15, 15)
#fig.savefig('RedditCompare.png')

In [None]:
# New frame with the topic means plus a 'group' column that names each category
# (assign returns a copy, so topics_gender_subreddit itself is left untouched)
plot_df = topics_gender_subreddit.assign(group=subreddits)
plot_df

In [None]:
fig_2 = plt.figure(figsize=(5, 2))
# One scatter series per author group: topic position on x, mean topic score on y
for group_name in ("Fathers/Parenting", "Fathers/Daddit", "Mothers/Mommit", "Mothers/Parenting"):
    # First matching row; the last value is the 'group' label itself, so it is cut off
    group_row = plot_df[plot_df["group"] == group_name].values[0]
    plt.scatter(x=np.arange(12), y=group_row[:-1], label=group_name)

plt.xticks(np.arange(topics_gender_subreddit.shape[1]), new_column_names, rotation=0, fontsize=9)
plt.ylabel('Average score', fontsize=17)
plt.xlabel('LDA Topics', fontsize=17)

plt.tick_params('x', length=10, width=2, which='major')

plt.tight_layout()
plt.legend(loc="best")
plt.grid()
fig_2.set_size_inches(25, 25)
plt.show()
fig_2.savefig('score topics all authors.jpeg', bbox_inches='tight', dpi=600)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Tick labels for the twelve topics
new_column_names = ["Thank you/Appreciation", "Medical care", "Education/Family advice", "Furniture/Design", "Birth/Pregnancy", "Change/Potty training", "Physical appearance/Picture", "Work/Raise children", "Food", "Leisure activities", "School/Teaching", "Sleep training"]
# Draw grid lines below the markers
plt.rc('axes', axisbelow=True)

fig_3 = plt.figure(figsize=(5, 2))
plt.grid()
# One large-marker scatter series per author group, each with its own colour
group_colours = (
    ("Fathers/Daddit", "orangered"),
    ("Fathers/Parenting", "pink"),
    ("Mothers/Mommit", "royalblue"),
    ("Mothers/Parenting", "skyblue"),
)
for group_name, colour in group_colours:
    # First matching row; the last value is the 'group' label itself, so it is cut off
    group_row = plot_df[plot_df["group"] == group_name].values[0]
    plt.scatter(x=np.arange(12), y=group_row[:-1], s=400, label=group_name, c=colour)

plt.xticks(np.arange(plot_df.shape[1] - 1), new_column_names, rotation=40, fontsize=20, ha="right")
plt.yticks(fontsize=20)

plt.ylabel('Average score', fontsize=22)
plt.xlabel('LDA Topics', fontsize=22)

plt.tick_params('x', length=10, width=2, which='major')

plt.tight_layout()
plt.legend(loc="best", fontsize='large', ncol=2)

fig_3.set_size_inches(20, 10)
plt.show()
fig_3.savefig('comparison groups per topic.jpeg', bbox_inches='tight', dpi=600)


In [None]:
# Table with one row per topic and one column per author group.
# set_index returns a new frame and replaces the existing index with 'group', so the
# former reset_index(drop=True) call (whose result was discarded) and the explicit
# copy are not needed; plot_df itself is not modified.
plot_df_pergroup = plot_df.set_index('group').T
cols = ["Fathers/Parenting", "Fathers/Daddit", "Mothers/Mommit", "Mothers/Parenting"]
# To have percentages, uncomment the line below
# NOTE(review): the snippet previously referred to an undefined perc_plot_df_pergroup; untested.
#plot_df_pergroup[cols] = plot_df_pergroup[cols].div(plot_df_pergroup[cols].sum(axis=0), axis=1).multiply(100)

In [None]:
fig_4 = plt.figure(figsize=(5, 2))
# One marker series plus a connecting line per topic, across the author groups.
# Looping over the rows of plot_df_pergroup replaces twelve copy-pasted scatter/plot
# pairs and takes the topic labels from the table itself, so they cannot drift apart.
group_positions = np.arange(plot_df_pergroup.shape[1])
for topic_name in plot_df_pergroup.index:
    topic_scores = plot_df_pergroup.loc[topic_name, :].values
    plt.scatter(x=group_positions, y=topic_scores, s=300, label=topic_name)
    plt.plot(group_positions, topic_scores)


plt.xticks(np.arange(topics_gender_subreddit.shape[0]), cols, rotation = 0, fontsize = 20)
plt.yticks(fontsize = 20)
plt.ylabel('Average score', fontsize=22)
plt.xlabel('Categories', fontsize=22)

plt.tick_params('x', length=10, width=2, which='major')

plt.tight_layout() 
plt.legend(loc="upper right", ncol=2, fontsize='large')
plt.grid()
fig_4.set_size_inches(20, 30)
plt.show()
fig_4.savefig('topics per group.jpeg', bbox_inches='tight', dpi=600)



# Find the most representative comments for each topic

In [None]:
# Topic 0: comments whose Topic_0 score is above 0.99
comments_topics.loc[comments_topics["Topic_0"].gt(0.99)]

In [None]:
# Topic 1: comments whose Topic_1 score is above 0.994
comments_topics.loc[comments_topics["Topic_1"].gt(0.994)]

In [None]:
# Topic 2: text ('body') of the comments whose Topic_2 score is above 0.995
comments_topics.loc[comments_topics["Topic_2"].gt(0.995), "body"]

In [None]:
# Topic 3: text ('body') of the comments whose Topic_3 score is above 0.99
comments_topics.loc[comments_topics["Topic_3"].gt(0.99), "body"]

In [None]:
# Topic 4: comments whose Topic_4 score is above 0.98
comments_topics.loc[comments_topics["Topic_4"].gt(0.98)]

In [None]:
# Topic 5: comments whose Topic_5 score is above 0.98
comments_topics.loc[comments_topics["Topic_5"].gt(0.98)]

In [None]:
# Topic 6: comments whose Topic_6 score is above 0.983
comments_topics.loc[comments_topics["Topic_6"].gt(0.983)]

In [None]:
# Topic 7: comments whose Topic_7 score is above 0.987
comments_topics.loc[comments_topics["Topic_7"].gt(0.987)]

In [None]:
# Topic 8: comments whose Topic_8 score is above 0.99
comments_topics.loc[comments_topics["Topic_8"].gt(0.99)]

In [None]:
# Topic 9: comments whose Topic_9 score is above 0.99
comments_topics.loc[comments_topics["Topic_9"].gt(0.99)]

In [None]:
# Topic 10: comments whose Topic_10 score is above 0.992
comments_topics.loc[comments_topics["Topic_10"].gt(0.992)]

In [None]:
# Topic 11: comments whose Topic_11 score is above 0.994
comments_topics.loc[comments_topics["Topic_11"].gt(0.994)]