# Digital Methods - Topic Modelling
_____

## Table of Content

1. [Libraries](#libraries)
2. [Data Preprocessing](#data-preprocessing)
3. [Topic Modelling](#topic-modelling)
_____

## Libraries

All libraries which are needed to execute the code are listed here. Install the packages by using the `requirements.txt` file. 

The documentation can be found in the [README.md](README.md) file.

In [33]:
# import packages
import pandas as pd 
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm 
from gensim.corpora import Dictionary
from gensim.models import LdaModel, LsiModel, HdpModel

# import functions
from preprocessing_functions import *

## Data Preprocessing
___

In [34]:
# set working directory
df = pd.read_csv('comments.csv', index_col=0)

#set pandas option to show more text
pd.set_option('display.max_colwidth', 2000)

In [35]:
# process data with using functions from functions.py
processed_df = (
    df.pipe(remove_users, 'text')
      .pipe(lowercase_text, 'text')
      .pipe(remove_whitespace, 'text')
      .pipe(remove_stopwords, 'text')
      .pipe(remove_punctuation, 'text')
)

In [36]:
# text column to string
processed_df['text'] = processed_df['text'].astype('str')
processed_df['text'] = processed_df['text'].str.replace('\'', '')

In [37]:
# use stemming to reduce words to their root words
processed_df = stem_words(processed_df, 'text')

In [38]:
# use lemmatization to reduce words to their root form
processed_df = lemmatize_words(processed_df, 'text')

In [39]:
# convert date format
processed_df = convert_date_format(processed_df, 'published_at')

In [40]:
# Replacing NaN-values and aggregating data by date
processed_df.lemmatized_text = processed_df.lemmatized_text.apply(lambda x: '' if str(x) == 'nan' else x)
processed_df.stemmed_text = processed_df.stemmed_text.apply(lambda x: '' if str(x) == 'nan' else x)


# aggregating tweet data by dates and affiliation 
df_agg = processed_df.groupby(['published_at', 'video_id'], as_index = False).agg({'text': ' '.join, 
                                                                            'lemmatized_text': ' '.join,
                                                                            'stemmed_text': ' '.join})
# checking dimensions of new dataset and viewing the dataset
print(df_agg.shape)

#Defining NLTK's TweetTokenizer
tokenizer = TweetTokenizer()

tqdm.pandas()

# tokenizing and creating a column of unigrams from the stemmed tweet text. 
df_agg['unigrams'] = df_agg['stemmed_text'].progress_apply(lambda x: tokenizer.tokenize(x))

(7333, 5)


100%|██████████| 7333/7333 [00:05<00:00, 1226.80it/s]


In [41]:
#Creating a column with bigrams by applying function to column of unigrams
df_agg['bigrams'] = df_agg.unigrams.progress_apply(lambda x: bigrams(x))
df_agg['tokens'] = df_agg.unigrams+df_agg.bigrams

100%|██████████| 7333/7333 [00:00<00:00, 27452.09it/s]


In [42]:
# insert the column where you saved unigram and bigram tokens between the parentheses
id2word = Dictionary(df_agg['tokens']) 

# viewing how many words are in our vocabulary
print(len(id2word))

928879


In [43]:
# removing very frequent and infrequent words
id2word.filter_extremes(no_below=10, 
                        no_above=.999,
                        keep_n=None) 

# viewing how many words are now in our vocabulary
print(len(id2word))

19559


In [44]:
# creating corpus
corpus = [id2word.doc2bow(doc) for doc in df_agg['tokens']] 

## Topic Modelling
___


- using topic modelling to explore for keywords.
- using LSI, HDP, and LDA to get an impression on the topics of our observed data

In [45]:
# creating LSI Model
lsi_model = LsiModel(corpus=corpus, num_topics=10, id2word=id2word)
lsi_model.show_topics(num_topics=10)

[(0,
  '0.386*"peopl" + 0.305*"’" + 0.229*"govern" + 0.184*"land" + 0.165*"fire" + 0.161*"like" + 0.155*"get" + 0.128*"go" + 0.121*"climat" + 0.117*"need"'),
 (1,
  '0.507*"climat" + 0.341*"chang" + 0.264*"climat_chang" + -0.186*"land" + -0.174*"govern" + -0.157*"fire" + -0.142*"peopl" + 0.131*"year" + -0.126*"maui" + -0.123*"tulsi"'),
 (2,
  '0.287*"kerri" + -0.259*"chang" + 0.222*"😂" + 0.210*"fuel" + -0.209*"climat" + -0.197*"climat_chang" + 0.177*"fossil" + 0.168*"lie" + 0.159*"fossil_fuel" + 0.151*"like"'),
 (3,
  '-0.428*"kerri" + 0.304*"fuel" + 0.279*"fossil" + 0.252*"fossil_fuel" + -0.211*"john" + -0.172*"lie" + -0.171*"john_kerri" + -0.159*"fli" + 0.152*"oil" + 0.136*"energi"'),
 (4,
  '0.340*"😂" + 0.265*"expert" + 0.202*"kid" + 0.190*"chang" + -0.168*"kerri" + -0.160*"thank" + 0.156*"😂_😂" + -0.153*"toni" + 0.141*"climat_chang" + 0.140*"greta"'),
 (5,
  '0.264*"greta" + 0.204*"expert" + 0.186*"know" + 0.177*"co2" + -0.168*"climat_chang" + -0.167*"chang" + -0.164*"climat" + 0.15

In [46]:
# Creating HDP Model
hdp_model = HdpModel(corpus=corpus, id2word=id2word)
hdp_model.show_topics()[:10]

[(0,
  '0.011*climat + 0.008*’ + 0.008*peopl + 0.007*chang + 0.005*like + 0.005*climat_chang + 0.005*year + 0.004*get + 0.004*one + 0.004*would + 0.004*go + 0.003*know + 0.003*us + 0.003*need + 0.003*say + 0.003*co2 + 0.003*time + 0.003*use + 0.003*make + 0.003*thank'),
 (1,
  '0.009*climat + 0.005*peopl + 0.005*chang + 0.005*’ + 0.004*one + 0.004*like + 0.004*would + 0.004*climat_chang + 0.003*world + 0.003*year + 0.003*energi + 0.003*co2 + 0.003*get + 0.003*make + 0.003*thank + 0.003*know + 0.003*scienc + 0.003*use + 0.003*go + 0.003*us'),
 (2,
  '0.011*climat + 0.006*chang + 0.005*’ + 0.004*climat_chang + 0.004*peopl + 0.004*like + 0.004*year + 0.003*would + 0.003*one + 0.003*know + 0.003*thank + 0.003*go + 0.003*say + 0.003*time + 0.003*get + 0.002*world + 0.002*scienc + 0.002*think + 0.002*dr + 0.002*warm'),
 (3,
  '0.008*climat + 0.005*chang + 0.004*climat_chang + 0.004*’ + 0.004*peopl + 0.003*like + 0.003*year + 0.003*would + 0.003*get + 0.003*thank + 0.003*go + 0.002*one + 0.00

In [47]:
# Creating LDA Model
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)
lda_model.show_topics()

[(0,
  '0.015*"toni" + 0.010*"thank" + 0.009*"climat" + 0.008*"year" + 0.007*"peopl" + 0.007*"chang" + 0.005*"like" + 0.005*"climat_chang" + 0.004*"thank_toni" + 0.004*"’"'),
 (1,
  '0.008*"climat" + 0.008*"’" + 0.008*"peopl" + 0.007*"😂" + 0.006*"know" + 0.006*"like" + 0.005*"would" + 0.005*"get" + 0.005*"go" + 0.004*"chang"'),
 (2,
  '0.012*"climat" + 0.007*"co2" + 0.006*"year" + 0.006*"’" + 0.005*"chang" + 0.005*"earth" + 0.005*"like" + 0.004*"carbon" + 0.004*"climat_chang" + 0.004*"would"'),
 (3,
  '0.018*"climat" + 0.012*"chang" + 0.008*"climat_chang" + 0.007*"’" + 0.006*"scienc" + 0.005*"peopl" + 0.005*"scientist" + 0.004*"one" + 0.004*"like" + 0.004*"know"'),
 (4,
  '0.023*"climat" + 0.013*"chang" + 0.010*"climat_chang" + 0.007*"peopl" + 0.006*"’" + 0.005*"earth" + 0.005*"one" + 0.005*"greenhous" + 0.004*"warm" + 0.004*"co2"'),
 (5,
  '0.015*"climat" + 0.012*"chang" + 0.009*"climat_chang" + 0.009*"’" + 0.008*"peopl" + 0.006*"like" + 0.005*"year" + 0.004*"get" + 0.004*"one" + 0.00