# Digital Methods - Data Analysis
_____

## Table of Contents

1. [Libraries](#libraries)
2. [Data Preprocessing](#data-preprocessing)
3. [Topic Modelling](#topic-modelling)

_____

## Libraries

All libraries needed to run the code are imported here. Install them from the `requirements.txt` file (e.g. `pip install -r requirements.txt`).

The documentation can be found in the [README.md](README.md) file.

In [30]:
# import packages
import pandas as pd 
import os

from nltk.tokenize import TweetTokenizer
import nltk
import string
import re
from collections import defaultdict
from tqdm import tqdm
import spacy 
from spacy import displacy
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel

from preprocessing_function import *

## Data Preprocessing

In [10]:
# set working directory
# The data folder can be overridden with the DIGITAL_METHODS_DATA_DIR environment
# variable so the notebook also runs on other machines; the original local path
# is kept as the default.
DATA_DIR = os.environ.get(
    'DIGITAL_METHODS_DATA_DIR',
    '/Users/marco/Documents/Master Social Data Science/Semester 2/Digital Methods/data',
)
os.chdir(DATA_DIR)  # later cells read and write CSV files relative to this folder
df = pd.read_csv('comments.csv', index_col=0)

# set pandas option to show more text per cell (up to 2000 characters)
pd.set_option('display.max_colwidth', 2000)

In [11]:
# Clean the 'text' column with the star-imported helper functions (presumably from
# preprocessing_function, the only star import above — not functions.py).
# DataFrame.pipe calls each helper as helper(frame, 'text'), in the order listed.
# NOTE(review): the typographic apostrophe ’ still appears as a token in the cleaned
# text and among the top topic words further down — presumably remove_punctuation
# only strips ASCII punctuation; verify in preprocessing_function.
processed_df = (
    df.pipe(remove_users, 'text')
      .pipe(lowercase_text, 'text')
      .pipe(remove_numbers, 'text')
      .pipe(remove_whitespace, 'text')
      .pipe(remove_stopwords, 'text')
      .pipe(remove_punctuation, 'text')
)

In [13]:
# Make sure every entry of 'text' is a string. Missing values are replaced with an
# empty string first, because astype('str') would otherwise turn NaN into the
# literal word 'nan', which would then be stemmed, lemmatized and counted as a token.
processed_df['text'] = processed_df['text'].fillna('').astype('str')
processed_df.dtypes

video_id        object
published_at    object
like_count       int64
text            object
author          object
dtype: object

In [14]:
# use stemming to reduce the words in 'text' to their stems
# (appears to add a 'stemmed_text' column, e.g. "reducing" -> "reduc" — see the data preview below)
processed_df = stem_words(processed_df, 'text')

In [15]:
# use lemmatization to reduce the words in 'text' to their dictionary form
# (applied to 'text', not to the stemmed text; appears to add a 'lemmatized_text'
# column, e.g. "throws" -> "throw" — see the data preview below)
processed_df = lemmatize_words(processed_df, 'text')

In [34]:
# save the preprocessed comments (relative to the current working directory) so they can be reloaded in the next cell
processed_df.to_csv("processed_comments.csv")

In [145]:
# reload the preprocessed comments; index_col=0 uses the first CSV column (the index written by to_csv) as index
# NOTE(review): read_csv parses empty fields as NaN by default, so comments whose
# text became empty during cleaning come back as NaN (float) instead of ''.
processed_df = pd.read_csv('processed_comments.csv', index_col=0)

In [16]:
def convert_date_format(df, column_name):
    """Return a copy of ``df`` whose ``column_name`` holds 'YYYY-MM-DD' strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the date column. It is not modified.
    column_name : label
        Column with values that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order, in which
        ``column_name`` is formatted with ``'%Y-%m-%d'`` (time of day dropped).
    """
    # Parse whatever representation the column currently holds into datetimes
    parsed_dates = pd.to_datetime(df[column_name])

    # Work on a copy so the caller's frame is not silently changed
    result = df.copy()

    # Format the dates back to 'yyyy-mm-dd' strings
    result[column_name] = parsed_dates.dt.strftime('%Y-%m-%d')

    return result

# reduce 'published_at' to the calendar date; it is used as a grouping key when aggregating the comments
processed_df = convert_date_format(processed_df, 'published_at')

Unnamed: 0,video_id,published_at,like_count,text,author,stemmed_text,lemmatized_text
0,uW6fi2tCnAc,2023-02-19,1,answer china india help matter much money rest world throws reducing carbon footprint complete waste trillion dollars 🤦‍♀️,@ilandgrl,answer china india help matter much money rest world throw reduc carbon footprint complet wast trillion dollar 🤦‍♀️,answer china india help matter much money rest world throw reduce carbon footprint complete waste trillion dollar 🤦‍♀️
1,uW6fi2tCnAc,2023-02-19,2,guy expert screwed,@lobusdiMortis,guy expert screw,guy expert screw
2,uW6fi2tCnAc,2023-02-18,4,kennedy gem,@skitz1337,kennedi gem,kennedy gem
3,uW6fi2tCnAc,2023-02-18,0,get nation like china cooperate even trust telling truth say absolute ridiculousness occurring western nations supposedly advanced capable leads science technology willing cripple impoverish millions sake iffy pseudo science experts care anyway wealthy elitists least affected ramifications extreme measures would promote rest world,@dodieodie498,get nation like china cooper even trust tell truth say absolut ridicul occur western nation suppos advanc capabl lead scienc technolog will crippl impoverish million sake iffi pseudo scienc expert care anyway wealthi elitist least affect ramif extrem measur would promot rest world,get nation like china cooperate even trust tell truth say absolute ridiculousness occur western nation supposedly advance capable lead science technology willing cripple impoverish million sake iffy pseudo science expert care anyway wealthy elitist least affected ramification extreme measure would promote rest world
4,uW6fi2tCnAc,2023-02-18,3,man going oscar,@karenh4458,man go oscar,man go oscar
...,...,...,...,...,...,...,...
96590,eAnVFKndFoY,2023-01-23,0,course ’ theatre global warming hoax use fear control low iq individuals,@internetexplorer6097,cours ’ theatr global warm hoax use fear control low iq individu,course ’ theatre global warm hoax use fear control low iq individual
96591,eAnVFKndFoY,2023-01-23,4,yup,@bonitabeach3127,yup,yup
96592,eAnVFKndFoY,2023-01-23,5,staged,@kevinford6372,stage,stag
96593,eAnVFKndFoY,2023-01-23,1,greta thunberg waste skin oxygen,@stevenewberry6460,greta thunberg wast skin oxygen,greta thunberg waste skin oxygen


In [17]:
# 3. Replacing NaN-values and aggregating data by date
# read_csv loads empty fields as NaN (float) and ' '.join only accepts strings, so
# every column that is joined below is cleaned. This includes 'text' itself, which
# would otherwise make the aggregation raise a TypeError after reloading the CSV.
for text_column in ['text', 'lemmatized_text', 'stemmed_text']:
    processed_df[text_column] = processed_df[text_column].apply(lambda x: '' if str(x) == 'nan' else x)


# Aggregating comments by publication date and video
# agg() takes a dictionary that maps each column to the function used to combine its
# values within a group; here all comments of a group are joined with a space.
# as_index=False keeps 'published_at' and 'video_id' as regular columns.
df_agg = processed_df.groupby(['published_at', 'video_id'], as_index=False).agg({
    'text': ' '.join,
    'lemmatized_text': ' '.join,
    'stemmed_text': ' '.join,
})
# Checking dimensions of the new dataset
print(df_agg.shape)

# 4. Tokenizing stemmed text

# Register progress_apply on pandas objects; it behaves like apply but shows a
# progress bar (nice to have, not required)
tqdm.pandas()

# NLTK's TweetTokenizer splits each aggregated document into tokens
tokenizer = TweetTokenizer()

# One list of unigrams per aggregated document, built from the stemmed text
stemmed_documents = df_agg['stemmed_text']
df_agg['unigrams'] = stemmed_documents.progress_apply(tokenizer.tokenize)

(7333, 5)


100%|██████████| 7333/7333 [00:05<00:00, 1332.39it/s]


In [20]:
# Helper that turns a document's unigrams into bigram tokens
def bigrams(doc):
    """Return the bigrams of ``doc`` as underscore-joined strings.

    ``doc`` is a list of unigram tokens in their original order. Every pair of
    neighbouring tokens yielded by ``nltk.bigrams`` becomes one "left_right" string.
    """
    return ["_".join(pair) for pair in nltk.bigrams(doc)]

# Derive the bigram column from the unigram column (with a progress bar)
df_agg['bigrams'] = df_agg['unigrams'].progress_apply(bigrams)

100%|██████████| 7333/7333 [00:00<00:00, 27893.13it/s]


In [21]:
# Both columns hold lists, so '+' concatenates them row by row: unigrams followed by bigrams
df_agg['tokens'] = df_agg['unigrams'] + df_agg['bigrams']

## Topic Modelling

We use topic modelling (LSI, HDP and LDA) to explore the comments for keywords.


In [22]:
# 1. Create an id2word dictionary

# Build the vocabulary from the combined unigram + bigram tokens of every aggregated document
id2word = Dictionary(df_agg['tokens']) # A dictionary is created through which each token gets a unique id 

# Viewing how many distinct tokens are in our vocabulary
print(len(id2word))

896734


In [23]:
# 2. Removing very frequent and infrequent words
# filter_extremes changes id2word itself (nothing is assigned here); compare the two printed vocabulary sizes.
id2word.filter_extremes(no_below=10, # drop tokens appearing in fewer than 10 documents
                        no_above=.999, # drop tokens appearing in more than 99.9% of all documents
                        keep_n=None) # None disables the size cap; gensim's default (keep_n=100000) would keep only the 100,000 most frequent tokens

# Viewing how many tokens are left in our vocabulary after filtering
print(len(id2word))

19198


In [24]:
# 3. Creating corpus

# Convert documents into the bag-of-words (BoW) format using the filtered dictionary.
# We get a list of tuples with (token_id, token_count) for each document.
corpus = [id2word.doc2bow(doc) for doc in df_agg['tokens']] 

### Fitting the Topic Models

In [31]:
# Fit an LSI (Latent Semantic Indexing) model with 10 topics on the bag-of-words corpus
lsi_model = LsiModel(corpus=corpus, num_topics=10, id2word=id2word)
# Keep the textual description of all 10 topics for display in the next cell
topics = lsi_model.show_topics(num_topics=10)

In [33]:
# display the LSI topics as (topic id, signed weights of the top tokens)
topics

[(0,
  '0.387*"peopl" + 0.306*"’" + 0.229*"govern" + 0.184*"land" + 0.165*"fire" + 0.162*"like" + 0.155*"get" + 0.128*"go" + 0.122*"climat" + 0.117*"need"'),
 (1,
  '0.507*"climat" + 0.341*"chang" + 0.264*"climat_chang" + -0.187*"land" + -0.174*"govern" + -0.157*"fire" + -0.143*"peopl" + 0.132*"year" + -0.126*"maui" + -0.124*"tulsi"'),
 (2,
  '-0.289*"kerri" + 0.260*"chang" + -0.222*"😂" + 0.210*"climat" + -0.209*"fuel" + 0.197*"climat_chang" + -0.176*"fossil" + -0.169*"lie" + -0.159*"fossil_fuel" + -0.152*"like"'),
 (3,
  '0.427*"kerri" + -0.304*"fuel" + -0.279*"fossil" + -0.252*"fossil_fuel" + 0.211*"john" + 0.172*"lie" + 0.171*"john_kerri" + 0.159*"fli" + -0.152*"oil" + -0.136*"energi"'),
 (4,
  '0.340*"😂" + 0.266*"expert" + 0.203*"kid" + 0.190*"chang" + -0.169*"kerri" + -0.159*"thank" + 0.156*"😂_😂" + -0.153*"toni" + 0.142*"greta" + 0.140*"climat_chang"'),
 (5,
  '-0.263*"greta" + -0.202*"expert" + -0.185*"know" + -0.181*"co" + 0.171*"climat_chang" + 0.170*"chang" + 0.165*"climat" + 

In [28]:
# Fit an HDP (Hierarchical Dirichlet Process) model; no number of topics is passed.
# Training is stochastic, so the seed is fixed to make the topics reproducible
# across kernel restarts.
hdp_model = HdpModel(corpus=corpus, id2word=id2word, random_state=42)
# Show only the first 10 of the returned topics
hdp_model.show_topics()[:10]

[(0,
  '0.011*climat + 0.008*peopl + 0.008*’ + 0.007*chang + 0.005*like + 0.005*climat_chang + 0.005*year + 0.004*get + 0.004*one + 0.004*would + 0.004*go + 0.004*know + 0.003*us + 0.003*need + 0.003*co + 0.003*say + 0.003*make + 0.003*time + 0.003*use + 0.003*thank'),
 (1,
  '0.010*climat + 0.006*chang + 0.004*like + 0.004*peopl + 0.004*year + 0.004*climat_chang + 0.004*’ + 0.004*co + 0.004*would + 0.003*one + 0.003*scienc + 0.003*oil + 0.003*warm + 0.003*earth + 0.003*energi + 0.003*thank + 0.003*get + 0.003*time + 0.003*say + 0.003*fossil'),
 (2,
  '0.008*climat + 0.005*’ + 0.005*chang + 0.004*peopl + 0.004*like + 0.004*energi + 0.004*climat_chang + 0.003*one + 0.003*would + 0.003*year + 0.003*get + 0.003*need + 0.003*thank + 0.003*world + 0.003*use + 0.003*know + 0.003*go + 0.002*time + 0.002*us + 0.002*make'),
 (3,
  '0.009*climat + 0.006*chang + 0.005*’ + 0.005*climat_chang + 0.004*peopl + 0.004*thank + 0.003*year + 0.003*would + 0.003*one + 0.003*like + 0.003*co + 0.003*scienc +

In [36]:
# Fit an LDA (Latent Dirichlet Allocation) model with 10 topics.
# Training is stochastic, so the seed is fixed to make the topics reproducible
# across kernel restarts.
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=id2word, random_state=42)
lda_model.show_topics()

[(0,
  '0.012*"’" + 0.008*"peopl" + 0.007*"climat" + 0.006*"get" + 0.005*"would" + 0.005*"like" + 0.005*"go" + 0.005*"us" + 0.005*"know" + 0.004*"fire"'),
 (1,
  '0.015*"climat" + 0.011*"chang" + 0.009*"peopl" + 0.009*"’" + 0.008*"climat_chang" + 0.007*"year" + 0.006*"like" + 0.005*"go" + 0.004*"one" + 0.004*"get"'),
 (2,
  '0.018*"climat" + 0.009*"chang" + 0.008*"toni" + 0.008*"thank" + 0.006*"year" + 0.006*"climat_chang" + 0.005*"’" + 0.005*"peopl" + 0.004*"one" + 0.004*"co"'),
 (3,
  '0.014*"’" + 0.013*"peopl" + 0.009*"govern" + 0.008*"get" + 0.007*"like" + 0.006*"one" + 0.005*"say" + 0.005*"know" + 0.005*"climat" + 0.004*"need"'),
 (4,
  '0.009*"’" + 0.008*"peopl" + 0.008*"climat" + 0.006*"like" + 0.006*"know" + 0.006*"chang" + 0.005*"want" + 0.004*"year" + 0.004*"climat_chang" + 0.004*"say"'),
 (5,
  '0.010*"😂" + 0.008*"climat" + 0.005*"😂_😂" + 0.005*"thank" + 0.005*"chang" + 0.005*"’" + 0.004*"peopl" + 0.004*"say" + 0.004*"climat_chang" + 0.004*"🤣"'),
 (6,
  '0.010*"climat" + 0.00