# Digital Methods - Data Analysis
_____

## Table of Contents

1. [Libraries](#libraries)
2. [Data Preprocessing](#data-preprocessing)
3. [Topic Modelling](#topic-modelling)
_____

## Libraries

All libraries needed to run the code are imported here. Install the packages using the `requirements.txt` file.

The documentation can be found in the [README.md](README.md) file.

In [3]:
# import packages
import pandas as pd 
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm 
from gensim.corpora import Dictionary
from gensim.models import LdaModel, LsiModel, HdpModel

# import functions
from preprocessing_functions import *

## Data Preprocessing
___

In [4]:
# widen pandas' column display so long comment texts are not cut off
pd.options.display.max_colwidth = 2000

# load the comments from the CSV file, using its first column as the index
df = pd.read_csv('comments.csv', index_col=0)

In [5]:
# Clean the 'text' column by applying the cleaning helpers one after another.
# The helpers are not defined in this notebook; presumably they come from the
# star import of preprocessing_functions -- confirm.
# NOTE(review): stopwords are removed before punctuation is stripped; whether
# that order matters depends on how the helpers are implemented -- confirm.
processed_df = df
for cleaning_step in (
    remove_users,
    lowercase_text,
    remove_numbers,
    remove_whitespace,
    remove_stopwords,
    remove_punctuation,
):
    # same call that processed_df.pipe(cleaning_step, 'text') would make
    processed_df = cleaning_step(processed_df, 'text')

In [6]:
# Cast the text column to strings. Missing values are replaced with an empty
# string first: astype('str') on its own would turn NaN into the literal word
# 'nan', which would then end up in the aggregated text.
processed_df['text'] = processed_df['text'].fillna('').astype('str')

In [7]:
# stemming: reduce the words in the 'text' column to their stems
# (presumably stored in the `stemmed_text` column that later cells read -- confirm)
processed_df = processed_df.pipe(stem_words, 'text')

In [8]:
# lemmatization: reduce the words in the 'text' column to their base form
# (presumably stored in the `lemmatized_text` column that later cells read -- confirm)
processed_df = processed_df.pipe(lemmatize_words, 'text')

In [9]:
# convert the format of the 'published_at' column
# (the target format is decided by convert_date_format -- see its definition)
processed_df = processed_df.pipe(convert_date_format, 'published_at')

In [10]:
# Blank out every value whose string form is 'nan' in both derived text
# columns, so missing values do not break the string join below.
for text_column in ('lemmatized_text', 'stemmed_text'):
    processed_df[text_column] = processed_df[text_column].apply(
        lambda value: '' if str(value) == 'nan' else value
    )

# Collapse the data to one row per (published_at, video_id) pair by joining
# each of the three text columns with single spaces.
df_agg = (
    processed_df
    .groupby(['published_at', 'video_id'], as_index=False)
    .agg(dict.fromkeys(('text', 'lemmatized_text', 'stemmed_text'), ' '.join))
)

# report (rows, columns) of the aggregated frame
print(df_agg.shape)

# register tqdm's progress_apply on pandas objects
tqdm.pandas()

# split each aggregated stemmed text into unigram tokens with NLTK's TweetTokenizer
tokenizer = TweetTokenizer()
df_agg['unigrams'] = df_agg['stemmed_text'].progress_apply(tokenizer.tokenize)

(7333, 5)


100%|██████████| 7333/7333 [00:05<00:00, 1333.30it/s]


In [11]:
# Build a bigram column from each row's unigram list with the `bigrams` helper
# (not defined in this notebook; presumably provided by the star import -- confirm).
df_agg['bigrams'] = df_agg['unigrams'].progress_apply(bigrams)

# Element-wise `+` combines unigrams and bigrams per row
# (list concatenation, assuming both columns hold lists).
df_agg['tokens'] = df_agg['unigrams'] + df_agg['bigrams']

100%|██████████| 7333/7333 [00:00<00:00, 16481.80it/s]


In [12]:
# build the token <-> id dictionary from the combined unigram/bigram column
id2word = Dictionary(documents=df_agg['tokens'])

# vocabulary size before any filtering
print(len(id2word))

896734


In [13]:
# Prune the vocabulary in place:
#   no_below=10    -> drop tokens that occur in fewer than 10 documents
#   no_above=0.999 -> drop tokens that occur in more than 99.9% of documents
#   keep_n=None    -> put no cap on how many tokens are kept afterwards
id2word.filter_extremes(no_below=10, no_above=0.999, keep_n=None)

# vocabulary size after pruning
print(len(id2word))

19198


In [14]:
# bag-of-words corpus: one doc2bow result per row of df_agg['tokens']
corpus = list(map(id2word.doc2bow, df_agg['tokens']))

## Topic Modelling
___


- Using topic modelling to explore the data for keywords.
- Using LSI, HDP, and LDA to get an impression of the topics in our observed data.

In [15]:
# fit a 10-topic LSI model on the bag-of-words corpus and display all 10 topics
lsi_model = LsiModel(corpus, id2word=id2word, num_topics=10)
lsi_model.show_topics(num_topics=10)

[(0,
  '0.387*"peopl" + 0.306*"’" + 0.229*"govern" + 0.184*"land" + 0.165*"fire" + 0.162*"like" + 0.155*"get" + 0.128*"go" + 0.122*"climat" + 0.117*"need"'),
 (1,
  '0.507*"climat" + 0.341*"chang" + 0.264*"climat_chang" + -0.187*"land" + -0.174*"govern" + -0.157*"fire" + -0.143*"peopl" + 0.132*"year" + -0.126*"maui" + -0.124*"tulsi"'),
 (2,
  '0.289*"kerri" + -0.260*"chang" + 0.222*"😂" + -0.210*"climat" + 0.209*"fuel" + -0.197*"climat_chang" + 0.176*"fossil" + 0.169*"lie" + 0.159*"fossil_fuel" + 0.152*"like"'),
 (3,
  '0.427*"kerri" + -0.304*"fuel" + -0.279*"fossil" + -0.252*"fossil_fuel" + 0.211*"john" + 0.172*"lie" + 0.171*"john_kerri" + 0.159*"fli" + -0.152*"oil" + -0.136*"energi"'),
 (4,
  '-0.340*"😂" + -0.266*"expert" + -0.203*"kid" + -0.190*"chang" + 0.169*"kerri" + 0.159*"thank" + -0.156*"😂_😂" + 0.153*"toni" + -0.142*"greta" + -0.140*"climat_chang"'),
 (5,
  '0.263*"greta" + 0.202*"expert" + 0.185*"know" + 0.181*"co" + -0.171*"climat_chang" + -0.170*"chang" + -0.165*"climat" + 0

In [16]:
# Fit an HDP model on the bag-of-words corpus (no topic count is passed in).
# random_state is pinned so that Restart & Run All yields the same topics;
# without it every run produces a different result.
hdp_model = HdpModel(corpus=corpus, id2word=id2word, random_state=42)

# display only the first 10 of the returned topics
hdp_model.show_topics()[:10]

[(0,
  '0.011*climat + 0.008*’ + 0.008*peopl + 0.007*chang + 0.005*like + 0.005*climat_chang + 0.005*year + 0.004*get + 0.004*one + 0.004*would + 0.004*go + 0.004*know + 0.003*us + 0.003*co + 0.003*need + 0.003*say + 0.003*time + 0.003*make + 0.003*use + 0.003*world'),
 (1,
  '0.011*climat + 0.006*chang + 0.005*’ + 0.005*peopl + 0.004*climat_chang + 0.004*like + 0.004*year + 0.004*would + 0.004*one + 0.003*scienc + 0.003*co + 0.003*thank + 0.003*get + 0.003*use + 0.003*time + 0.003*know + 0.003*scientist + 0.003*earth + 0.003*say + 0.003*warm'),
 (2,
  '0.006*climat + 0.004*peopl + 0.003*chang + 0.003*’ + 0.003*energi + 0.003*like + 0.003*use + 0.002*climat_chang + 0.002*one + 0.002*world + 0.002*year + 0.002*would + 0.002*need + 0.002*get + 0.002*make + 0.002*dr + 0.002*know + 0.002*us + 0.002*power + 0.002*thank'),
 (3,
  '0.007*climat + 0.004*chang + 0.003*peopl + 0.003*climat_chang + 0.003*co + 0.003*scienc + 0.003*year + 0.003*global + 0.003*like + 0.002*say + 0.002*thank + 0.002*

In [17]:
# Creating LDA Model
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)
lda_model.show_topics()

[(0,
  '0.011*"’" + 0.009*"climat" + 0.006*"know" + 0.006*"peopl" + 0.005*"like" + 0.005*"chang" + 0.005*"would" + 0.005*"year" + 0.004*"😂" + 0.004*"climat_chang"'),
 (1,
  '0.008*"energi" + 0.007*"co" + 0.007*"climat" + 0.005*"use" + 0.004*"would" + 0.004*"temperatur" + 0.004*"tom" + 0.004*"thank" + 0.004*"chang" + 0.004*"atmospher"'),
 (2,
  '0.034*"fossil" + 0.028*"fuel" + 0.025*"fossil_fuel" + 0.014*"oil" + 0.008*"climat" + 0.006*"energi" + 0.006*"use" + 0.005*"’" + 0.004*"would" + 0.004*"like"'),
 (3,
  '0.015*"climat" + 0.010*"chang" + 0.008*"climat_chang" + 0.007*"peopl" + 0.007*"year" + 0.006*"’" + 0.005*"like" + 0.005*"get" + 0.005*"toni" + 0.004*"know"'),
 (4,
  '0.014*"climat" + 0.009*"’" + 0.007*"chang" + 0.007*"peopl" + 0.007*"year" + 0.005*"climat_chang" + 0.005*"scienc" + 0.005*"thank" + 0.004*"govern" + 0.004*"get"'),
 (5,
  '0.016*"climat" + 0.008*"co" + 0.008*"chang" + 0.007*"year" + 0.006*"thank" + 0.005*"’" + 0.005*"like" + 0.005*"climat_chang" + 0.004*"earth" + 0.0