# Digital Methods - Data Analysis
_____

## Table of Contents

1. [Libraries](#libraries)
2. [Data Preprocessing](#data-preprocessing)
3. [Topic Modelling](#topic-modelling)
_____

## Libraries

All libraries needed to run the code are imported here. Install the required packages using the `requirements.txt` file.

The documentation can be found in the [README.md](README.md) file.

In [None]:
# import packages
import pandas as pd 
import os
from nltk.tokenize import TweetTokenizer
import nltk
import string
import re
from collections import defaultdict
from tqdm import tqdm
import spacy 
from spacy import displacy
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel

# import functions
from preprocessing_functions import *

## Data Preprocessing
___

In [None]:
# widen the column display so text cells are shown up to 2000 characters
pd.set_option('display.max_colwidth', 2000)

# load the comments; the first CSV column becomes the DataFrame index
df = pd.read_csv('comments.csv', index_col=0)

In [None]:
# clean the 'text' column by running the helper functions (presumably provided
# by the preprocessing_functions star import) one after another, in this order
text_cleaning_steps = (
    remove_users,
    lowercase_text,
    remove_numbers,
    remove_whitespace,
    remove_stopwords,
    remove_punctuation,
)

processed_df = df
for cleaning_step in text_cleaning_steps:
    # each step receives the current frame plus the name of the column to clean
    processed_df = cleaning_step(processed_df, 'text')

In [None]:
# cast the 'text' column to strings, presumably so that the stemming and
# lemmatization steps that follow only receive str values
# NOTE(review): depending on the pandas version, missing values may become the
# literal string 'nan' here; the str(x) == 'nan' check before the aggregation
# looks like it accounts for that — confirm
processed_df['text'] = processed_df['text'].astype('str')

In [None]:
# stem the words of the 'text' column
# (the 'stemmed_text' column used further down is presumably added by this step)
processed_df = processed_df.pipe(stem_words, 'text')

In [None]:
# lemmatize the words of the 'text' column
# (the 'lemmatized_text' column used further down is presumably added by this step)
processed_df = processed_df.pipe(lemmatize_words, 'text')

In [None]:
# convert the format of the 'published_at' column; it serves as a grouping key
# when the comments are aggregated
processed_df = processed_df.pipe(convert_date_format, 'published_at')

In [None]:
# Replace missing values with empty strings. The test is made on the string
# form, so NaN as well as the literal text 'nan' are blanked out; str.join in
# the aggregation below needs every item to be a string.
for text_col in ('lemmatized_text', 'stemmed_text'):
    processed_df[text_col] = processed_df[text_col].apply(
        lambda value: '' if str(value) == 'nan' else value
    )

# Collapse all rows that share the same 'published_at' value and 'video_id'
# into one document by joining their texts with single spaces.
text_columns = ('text', 'lemmatized_text', 'stemmed_text')
df_agg = (
    processed_df
    .groupby(['published_at', 'video_id'], as_index=False)
    .agg({column: ' '.join for column in text_columns})
)

# report the dimensions (rows, columns) of the aggregated dataset
print(df_agg.shape)

# enable progress bars for pandas (provides Series.progress_apply)
tqdm.pandas()

# NLTK's TweetTokenizer, used to split each aggregated text into tokens
tokenizer = TweetTokenizer()

# column of unigram tokens, built from the stemmed text
df_agg['unigrams'] = df_agg['stemmed_text'].progress_apply(tokenizer.tokenize)

In [None]:
# column of bigram tokens, derived from each row's list of unigrams
# NOTE(review): `bigrams` is presumably supplied by the preprocessing_functions
# star import and has to return a list for the concatenation below — confirm
df_agg['bigrams'] = df_agg['unigrams'].progress_apply(bigrams)

# per-row concatenation of both token lists: unigrams first, then bigrams
df_agg['tokens'] = df_agg['unigrams'] + df_agg['bigrams']

In [None]:
# build the gensim vocabulary (token <-> integer id mapping) from the combined
# unigram + bigram token lists, one list per aggregated row
id2word = Dictionary(documents=df_agg['tokens'])

# size of the freshly built vocabulary
print(len(id2word))

In [None]:
# prune the vocabulary in place:
#   no_below=10    -> drop tokens that occur in fewer than 10 documents
#   no_above=0.999 -> drop tokens that occur in more than 99.9% of the documents
#   keep_n=None    -> no cap on how many of the remaining tokens are kept
id2word.filter_extremes(no_below=10, no_above=0.999, keep_n=None)

# size of the vocabulary after pruning
print(len(id2word))

In [None]:
# bag-of-words corpus: every token list is converted with the (pruned)
# dictionary, giving one bag-of-words document per aggregated row
corpus = list(map(id2word.doc2bow, df_agg['tokens']))

## Topic Modelling
___


- Using topic modelling to explore keywords.
- Using LSI, HDP, and LDA to get an impression of the topics in our observed data.

In [None]:
# fit a Latent Semantic Indexing (LSI) model with 10 topics on the
# bag-of-words corpus
lsi_model = LsiModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
)

# last expression of the cell: shows the 10 topics
lsi_model.show_topics(num_topics=10)

In [None]:
# fit a Hierarchical Dirichlet Process (HDP) model on the bag-of-words corpus;
# unlike LSI / LDA no number of topics is passed in
# random_state is fixed so that re-running the notebook yields the same topics
hdp_model = HdpModel(corpus=corpus, id2word=id2word, random_state=42)

# last expression of the cell: shows only the first 10 of the returned topics
hdp_model.show_topics()[:10]

In [None]:
# fit a Latent Dirichlet Allocation (LDA) model with 10 topics on the
# bag-of-words corpus
# random_state is fixed so that re-running the notebook yields the same topics
lda_model = LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
    random_state=42,
)

# last expression of the cell: shows the topics of the fitted model
lda_model.show_topics()