<a href="https://colab.research.google.com/github/milanbargiel/csma/blob/main/Cluster_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

In this notebook, we apply several methods for making sense of the cluster data.

To do so, we preprocess the data and then print the following for a selected cluster:

- Top 10 most frequently used words
- Word Cloud
- Topic Modeling

# Install Packages

In [148]:
!pip install fasttext
!pip install contractions



# Import Packages

In [149]:
import pandas as pd
import numpy as np
# packages for data preparation
import nltk
import string
import fasttext
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
# Download the NLTK resources needed by the tokenizer, stop-word list,
# part-of-speech tagger and WordNet lemmatizer used in this notebook.
# NOTE(review): 'punkt_tab', 'averaged_perceptron_tagger_eng' and 'omw-1.4' are the
# resource ids that newer NLTK releases look up for the same calls; they are fetched
# in addition to the original ids so the notebook runs on old and new NLTK versions.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
]

for resource in NLTK_RESOURCES:
    # quiet=True suppresses the per-package log lines; nltk.download returns False
    # on failure, so report failures explicitly instead of losing them.
    if not nltk.download(resource, quiet=True):
        print(f"Could not download NLTK resource '{resource}'")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [150]:
# Set pandas printing options to improve readability
pd.set_option('display.max_rows', None)     # do not truncate the number of printed rows
pd.set_option('display.max_columns', None)  # do not truncate the number of printed columns
pd.set_option('display.width', None)        # no fixed display width
# Truncate long cell contents (e.g. comment texts) after 100 characters.
# (The option was previously set to None and then immediately overridden by 100;
# only the effective setting is kept.)
pd.set_option('display.max_colwidth', 100)
# Disable pandas' chained-assignment warning (SettingWithCopyWarning).
pd.options.mode.chained_assignment = None

# Import clusters

In [151]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): presumably the same interpreter the kernel runs on — confirm.
!python --version

Python 3.7.10


In [152]:
from google.colab import drive

# Mount the Drive at /content/drive so its files can be read through regular file paths.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [153]:
# Folder on the mounted Drive that is used as the prefix when loading the clustered data.
input_path = "/content/drive/My Drive/CSMA/Data/Clusters/grouped_by_year/"

In [154]:
# Load the clustered comments of 2020.
# A trailing slash on input_path is stripped before the file name is appended, so the
# resulting path never contains a doubled slash ("...grouped_by_year//data_...").
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
data = pd.read_pickle(input_path.rstrip('/') + '/data_clustered_2020.pkl')
# Sort by cluster label, highest label first.
data = data.sort_values(by='label_kmedoids', ascending=False)
data.head()

Unnamed: 0.1,Unnamed: 0,publishedAt,authorName,text,isReply,video_id,cleaned,label_manual,label_kmedoids,distance_kmedoids,highlight
6426,29903,2020-07-28 00:46:33,Cody,"@Elyjah Stark During this plandemic, it's nice that they geoengineers have scaled back their wea...",1,wpSRp_R0J9A,"@ During this plandemic, it's nice that they geoengineers have scaled back their weather manipul...",-1.0,51,3.2e-05,0.0
11775,53832,2020-11-27 06:16:46,Matt,"@Vnimaniye I'd check out the US senate bill 517 of the 109th Congress (""the weather modification...",1,b1Enrzgrl1w,"@ I'd check out the US senate bill 517 of the 109th Congress (""the weather modification and deve...",-1.0,51,6.4e-05,0.0
11778,53835,2020-11-25 12:52:56,Vnimaniye,You can manipulate weather and climate,1,b1Enrzgrl1w,You can manipulate weather and climate,-1.0,51,5.3e-05,0.0
11789,53846,2020-11-09 05:26:25,perf b,"we have been unintentionally geoengineering for 100 years, time to get intentional about it, ass...",0,b1Enrzgrl1w,"we have been unintentionally geoengineering for 100 years, time to get intentional about it, ass...",-1.0,51,5.4e-05,0.0
11793,53850,2020-11-08 11:35:20,Slaterdom,I have 10years of photos and videos documenting the change in the skys. the already massive geoe...,0,b1Enrzgrl1w,I have 10years of photos and videos documenting the change in the skys. the already massive geoe...,-1.0,51,8e-05,0.0


# Select a cluster

In [158]:
# Choose a cluster (the k-medoids label of the cluster to explore)
cluster_number = 12

# Store the k-medoids labels as a categorical column
data['label_kmedoids'] = data['label_kmedoids'].astype('category')

# Keep only the rows whose label matches the chosen cluster
in_cluster = data['label_kmedoids'] == cluster_number
cluster = data.loc[in_cluster]
cluster.head()

Unnamed: 0.1,Unnamed: 0,publishedAt,authorName,text,isReply,video_id,cleaned,label_manual,label_kmedoids,distance_kmedoids,highlight
13400,55548,2020-01-02 16:29:45,amorag59,Sinister To see if it works ðŸ˜‚,1,1hhzrormtP4,Sinister To see if it works ðŸ˜‚,-1.0,12,4.7e-05,0.0
11854,53911,2020-11-02 15:13:33,LOOKUP2,Only Jesus Acts 4:12,1,b1Enrzgrl1w,Only Jesus Acts 4:12,-1.0,12,0.000101,0.0
6083,29543,2020-02-18 04:32:43,Helen Johnson- Tyus,@siouxperb5570 LOLOLOLOL!!!! LOLOLOL!!!! LOLOLOL!!!!,1,wpSRp_R0J9A,@ LOLOLOLOL!!!! LOLOLOL!!!! LOLOLOL!!!!,-1.0,12,0.000107,0.0
11851,53908,2020-11-01 14:49:40,Bartlemy,They&#39;recreating the fkn CC !,0,b1Enrzgrl1w,They'recreating the fkn CC !,-1.0,12,8.9e-05,0.0
6253,29713,2020-01-31 22:04:01,Gary Larocca,Lol. Narcissists lol.,0,wpSRp_R0J9A,Lol. Narcissists lol.,-1.0,12,6.2e-05,0.0


# Data Preparation

In [159]:
# Merge all comments of a cluster into one document.
# The comments are joined with a single space: appending them back-to-back fuses the
# last word of one comment with the first word of the next whenever a comment does not
# end in whitespace, which corrupts the token counts computed later.
cluster_merged = {'text': ' '.join(cluster['cleaned'])}

# Create a one-row dataframe out of the dictionary
cdf = pd.DataFrame(cluster_merged, index=[0])

# Do data preparation
# 1. Expand contractions (We wouldn't -> We would not)
cdf['no_contract'] = cdf['text'].apply(lambda text: [contractions.fix(word) for word in text.split()])
cdf['text_str'] = [' '.join(map(str, words)) for words in cdf['no_contract']]  # Detokenize

# 2. Tokenization
cdf['tokenized'] = cdf['text_str'].apply(word_tokenize)

# 3. Convert to lower case
cdf['lower'] = cdf['tokenized'].apply(lambda tokens: [word.lower() for word in tokens])

# 4. Removing punctuation
# `punc` is a string, so `word not in punc` is a substring test: it only drops tokens
# that happen to be a contiguous slice of string.punctuation and keeps multi-character
# punctuation tokens such as "...", "''" or "``". Instead, drop every token that
# consists solely of punctuation characters.
punc = string.punctuation
cdf['no_punc'] = cdf['lower'].apply(
    lambda tokens: [word for word in tokens if not all(char in punc for char in word)]
)

# 5. Removing stopwords
stop_words = set(stopwords.words('english'))
cdf['stopwords_removed'] = cdf['no_punc'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# 6. Lemmatization
# Apply part of speech tags: Determine the part of speech (i.e. noun, verb, adverb, etc.) for each word.
cdf['pos_tags'] = cdf['stopwords_removed'].apply(nltk.tag.pos_tag)

# Convert to wordnet pos for NLTK's word lemmatizer
def get_wordnet_pos(tag):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    Only the first character of ``tag`` is inspected: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Every other tag falls back
    to noun.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which is not a key and therefore yields NOUN
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)

# Replace the tagger's POS tag of every (token, tag) pair with its WordNet POS constant
cdf['wordnet_pos'] = cdf['pos_tags'].apply(
    lambda tagged: [(token, get_wordnet_pos(tagger_tag)) for token, tagger_tag in tagged]
)

# Apply NLTK's word lemmatizer
wnl = WordNetLemmatizer()
cdf['lemmatized'] = cdf['wordnet_pos'].apply(
    lambda pairs: [wnl.lemmatize(token, wn_pos) for token, wn_pos in pairs]
)
cdf.head()

Unnamed: 0,text,no_contract,text_str,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,Sinister To see if it works ðŸ˜‚ Only Jesus Acts 4:12 @ LOLOLOLOL!!!! LOLOLOL!!!! LOLOLOL!!!! Th...,"[Sinister, To, see, if, it, works, ðŸ˜‚, Only, Jesus, Acts, 4:12, @, LOLOLOLOL!!!!, LOLOLOL!!!!, LO...",Sinister To see if it works ðŸ˜‚ Only Jesus Acts 4:12 @ LOLOLOLOL!!!! LOLOLOL!!!! LOLOLOL!!!! They'...,"[Sinister, To, see, if, it, works, ðŸ˜‚, Only, Jesus, Acts, 4:12, @, LOLOLOLOL, !, !, !, !, LOLOLOL...","[sinister, to, see, if, it, works, ðŸ˜‚, only, jesus, acts, 4:12, @, lolololol, !, !, !, !, lololol...","[sinister, to, see, if, it, works, ðŸ˜‚, only, jesus, acts, 4:12, lolololol, lololol, lololol, they...","[sinister, see, works, ðŸ˜‚, jesus, acts, 4:12, lolololol, lololol, lololol, they'recreating, fkn, ...","[(sinister, NN), (see, VBP), (works, VBZ), (ðŸ˜‚, NNP), (jesus, NN), (acts, VBZ), (4:12, CD), (lolo...","[(sinister, n), (see, v), (works, v), (ðŸ˜‚, n), (jesus, n), (acts, v), (4:12, n), (lolololol, n), ...","[sinister, see, work, ðŸ˜‚, jesus, act, 4:12, lolololol, lololol, lololol, they'recreating, fkn, cc..."
