# Digital Methods - Dictionary Classifier
_____

## Table of Content

1. [Libraries](#libraries)
2. [Load Data](#load-data)
3. [Data Preprocessing](#preprocessing-of-the-data)
4. [Set Up of Dictionary](#building-dictionary)
5. [Classifier](#classifier)
_____

## Libraries

In [13]:
# import packages
import pandas as pd 
import os
from tqdm import tqdm
from nltk.tokenize import TweetTokenizer
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams

from preprocessing_functions import *

## Load Data

Load collected YouTube comments from the Data Collection.

In [14]:
df = pd.read_csv('data/comments_final.csv', index_col=0)

In [15]:
# process data with using functions from functions.py
processed_df = (
    df.pipe(remove_users, 'text')
      .pipe(lowercase_text, 'text')
      .pipe(remove_whitespace, 'text')
      .pipe(remove_punctuation, 'text')
)

In [16]:
# heading the processed df (removed users, lowercased, removed whitspace and punctuation, stemmed and lemmatized)
processed_df.head()

Unnamed: 0,video_id,published_at,like_count,text,author
0,uW6fi2tCnAc,2023-02-19T21:22:45Z,1,the answer is if china and india dont help it ...,0
1,uW6fi2tCnAc,2023-02-19T00:43:40Z,2,and that guy is an expert were screwed,1
2,uW6fi2tCnAc,2023-02-18T22:57:38Z,4,kennedy is a gem,2
3,uW6fi2tCnAc,2023-02-18T22:22:49Z,0,and just how do we get a nation like china to ...,3
4,uW6fi2tCnAc,2023-02-18T21:44:49Z,3,that man was going for an oscar,4


# Preprocessing of the Data

- Stemming and Lemmatizing
- Tokenization of the comments
- Building N-grams

In [17]:
# text column to string
processed_df['text'] = processed_df['text'].astype('str')
processed_df['text'] = processed_df['text'].str.replace('\'', '')

# use stemming to reduce words to their root words
processed_df = stem_words(processed_df, 'text')

# use lemmatization to reduce words to their root form
processed_df = lemmatize_words(processed_df, 'text')

# convert date format
processed_df = convert_date_format(processed_df, 'published_at')

In [18]:
#dropping the na from lemmatized & stemmed text to avoid issues with creating n_grams
processed_df.lemmatized_text = processed_df.lemmatized_text.apply(lambda x: '' if str(x) == 'nan' else x)
processed_df.stemmed_text = processed_df.stemmed_text.apply(lambda x: '' if str(x) == 'nan' else x)

In [19]:
# Tokenizing the lemmatized and stemmed text before creating n-grams

def tokenize_words(text):
    words = word_tokenize(text)
    #words_with_quotes = [f"'{word}'" for word in words]
    return words

processed_df["stemmed_tokens"] = processed_df["stemmed_text"].apply(lambda x: tokenize_words(x))
processed_df["lemmatized_tokens"] = processed_df["lemmatized_text"].apply(lambda x: tokenize_words(x))
processed_df.head()

Unnamed: 0,video_id,published_at,like_count,text,author,stemmed_text,lemmatized_text,stemmed_tokens,lemmatized_tokens
0,uW6fi2tCnAc,2023-02-19,1,the answer is if china and india dont help it ...,0,the answer is if china and india dont help it ...,the answer be if china and india dont help it ...,"[the, answer, is, if, china, and, india, dont,...","[the, answer, be, if, china, and, india, dont,..."
1,uW6fi2tCnAc,2023-02-19,2,and that guy is an expert were screwed,1,and that guy is an expert were screw,and that guy be an expert be screw,"[and, that, guy, is, an, expert, were, screw]","[and, that, guy, be, an, expert, be, screw]"
2,uW6fi2tCnAc,2023-02-18,4,kennedy is a gem,2,kennedi is a gem,kennedy be a gem,"[kennedi, is, a, gem]","[kennedy, be, a, gem]"
3,uW6fi2tCnAc,2023-02-18,0,and just how do we get a nation like china to ...,3,and just how do we get a nation like china to ...,and just how do we get a nation like china to ...,"[and, just, how, do, we, get, a, nation, like,...","[and, just, how, do, we, get, a, nation, like,..."
4,uW6fi2tCnAc,2023-02-18,3,that man was going for an oscar,4,that man was go for an oscar,that man be go for an oscar,"[that, man, was, go, for, an, oscar]","[that, man, be, go, for, an, oscar]"


In [21]:
# Creating n-grams 
tqdm.pandas() #Creates a progress bar and below use "progress_apply" instead of "apply" to create a progress bar (This is more of a "nice to have" than a "need to have")

#Defining a function that will create bigrams 
def bigrams(doc): # a doc is a list of tokens/unigrams in same order as in tweets 
    
    bigrams = [] #Empty list to save the bigrams
    
    for bigram in list(nltk.bigrams(doc)):  #Creating bigrams as tuples with nltk.bigrams and iterating over these them
        bigrams.append("_".join(bigram))    #Joining each bigram-tuple pair with an underscore and saving to list
    
    return bigrams

#Defining a function that will create bigrams 
def trigrams(doc): # a doc is a list of unigrams in same order as in tweets 
    
    trigrams = [] #Empty list to save the bigrams
    
    for trigram in list(nltk.trigrams(doc)):  #Creating bigrams as tuples with nltk.bigrams and iterating over these them
        trigrams.append("_".join(trigram))    #Joining each bigram-tuple pair with an underscore and saving to list
    
    return trigrams

#Defining a function that will create bigrams 
def fourgrams(doc): # a doc is a list of unigrams in same order as in tweets 
    
    fourgrams = [] #Empty list to save the bigrams
    
    for fourgram in list(ngrams(doc, 4)):  #Creating bigrams as tuples with nltk.bigrams and iterating over these them
        fourgrams.append("_".join(fourgram))    #Joining each bigram-tuple pair with an underscore and saving to list
    
    return fourgrams

#Creating a column with bigrams by applying function to column of unigrams
processed_df['bigrams_lemma'] = processed_df["lemmatized_tokens"].progress_apply(lambda x: bigrams(x))
processed_df['trigrams_lemma'] = processed_df['lemmatized_tokens'].progress_apply(lambda x : trigrams(x))
processed_df['fourgrams_lemma'] = processed_df['lemmatized_tokens'].progress_apply(lambda x : fourgrams(x))

processed_df['bigrams_stem'] = processed_df["stemmed_tokens"].progress_apply(lambda x: bigrams(x))
processed_df['trigrams_stem'] = processed_df['stemmed_tokens'].progress_apply(lambda x : trigrams(x))
processed_df['fourgrams_stem'] = processed_df['stemmed_tokens'].progress_apply(lambda x : fourgrams(x))


100%|██████████| 96595/96595 [00:00<00:00, 142119.22it/s]
100%|██████████| 96595/96595 [00:01<00:00, 84056.67it/s] 
100%|██████████| 96595/96595 [00:00<00:00, 125328.73it/s]
100%|██████████| 96595/96595 [00:00<00:00, 149434.13it/s]
100%|██████████| 96595/96595 [00:01<00:00, 79017.48it/s] 
100%|██████████| 96595/96595 [00:00<00:00, 129581.19it/s]


In [22]:
# creating one column with all n-grams (unigrams, bigrams, trigrams, fourgrams)
processed_df["all_n_grams_lemmatized"] = processed_df["lemmatized_tokens"] + processed_df["bigrams_lemma"] + processed_df["trigrams_lemma"] + processed_df["fourgrams_lemma"]
processed_df["all_n_grams_stemmed"] = processed_df["stemmed_tokens"] + processed_df["bigrams_stem"] + processed_df["trigrams_stem"] + processed_df["fourgrams_stem"]
processed_df.head()

Unnamed: 0,video_id,published_at,like_count,text,author,stemmed_text,lemmatized_text,stemmed_tokens,lemmatized_tokens,bigrams_lemma,trigrams_lemma,fourgrams_lemma,bigrams_stem,trigrams_stem,fourgrams_stem,all_n_grams_lemmatized,all_n_grams_stemmed
0,uW6fi2tCnAc,2023-02-19,1,the answer is if china and india dont help it ...,0,the answer is if china and india dont help it ...,the answer be if china and india dont help it ...,"[the, answer, is, if, china, and, india, dont,...","[the, answer, be, if, china, and, india, dont,...","[the_answer, answer_be, be_if, if_china, china...","[the_answer_be, answer_be_if, be_if_china, if_...","[the_answer_be_if, answer_be_if_china, be_if_c...","[the_answer, answer_is, is_if, if_china, china...","[the_answer_is, answer_is_if, is_if_china, if_...","[the_answer_is_if, answer_is_if_china, is_if_c...","[the, answer, be, if, china, and, india, dont,...","[the, answer, is, if, china, and, india, dont,..."
1,uW6fi2tCnAc,2023-02-19,2,and that guy is an expert were screwed,1,and that guy is an expert were screw,and that guy be an expert be screw,"[and, that, guy, is, an, expert, were, screw]","[and, that, guy, be, an, expert, be, screw]","[and_that, that_guy, guy_be, be_an, an_expert,...","[and_that_guy, that_guy_be, guy_be_an, be_an_e...","[and_that_guy_be, that_guy_be_an, guy_be_an_ex...","[and_that, that_guy, guy_is, is_an, an_expert,...","[and_that_guy, that_guy_is, guy_is_an, is_an_e...","[and_that_guy_is, that_guy_is_an, guy_is_an_ex...","[and, that, guy, be, an, expert, be, screw, an...","[and, that, guy, is, an, expert, were, screw, ..."
2,uW6fi2tCnAc,2023-02-18,4,kennedy is a gem,2,kennedi is a gem,kennedy be a gem,"[kennedi, is, a, gem]","[kennedy, be, a, gem]","[kennedy_be, be_a, a_gem]","[kennedy_be_a, be_a_gem]",[kennedy_be_a_gem],"[kennedi_is, is_a, a_gem]","[kennedi_is_a, is_a_gem]",[kennedi_is_a_gem],"[kennedy, be, a, gem, kennedy_be, be_a, a_gem,...","[kennedi, is, a, gem, kennedi_is, is_a, a_gem,..."
3,uW6fi2tCnAc,2023-02-18,0,and just how do we get a nation like china to ...,3,and just how do we get a nation like china to ...,and just how do we get a nation like china to ...,"[and, just, how, do, we, get, a, nation, like,...","[and, just, how, do, we, get, a, nation, like,...","[and_just, just_how, how_do, do_we, we_get, ge...","[and_just_how, just_how_do, how_do_we, do_we_g...","[and_just_how_do, just_how_do_we, how_do_we_ge...","[and_just, just_how, how_do, do_we, we_get, ge...","[and_just_how, just_how_do, how_do_we, do_we_g...","[and_just_how_do, just_how_do_we, how_do_we_ge...","[and, just, how, do, we, get, a, nation, like,...","[and, just, how, do, we, get, a, nation, like,..."
4,uW6fi2tCnAc,2023-02-18,3,that man was going for an oscar,4,that man was go for an oscar,that man be go for an oscar,"[that, man, was, go, for, an, oscar]","[that, man, be, go, for, an, oscar]","[that_man, man_be, be_go, go_for, for_an, an_o...","[that_man_be, man_be_go, be_go_for, go_for_an,...","[that_man_be_go, man_be_go_for, be_go_for_an, ...","[that_man, man_was, was_go, go_for, for_an, an...","[that_man_was, man_was_go, was_go_for, go_for_...","[that_man_was_go, man_was_go_for, was_go_for_a...","[that, man, be, go, for, an, oscar, that_man, ...","[that, man, was, go, for, an, oscar, that_man,..."


In [23]:
def contains_climate(lst):
    return 'climate' in lst

# Applying the function to create a boolean mask
climate_mask = processed_df['all_n_grams_lemmatized'].apply(contains_climate)

# Summing the rows where the mask is True
sum_rows_with_climate = climate_mask.sum()

print(f"Number of rows containing 'climate': {sum_rows_with_climate}")

Number of rows containing 'climate': 14821


## Building Dictionary

Building the Dictionary for the Classifier. This process is based on the qualitative research and Word2Vec Model + Topic Modelling.

In [25]:
# Creating Dictionary 

sc1_kw = ['no_climate_emergency', 'melting', 'arctic_ice', 'arctice_sea ice', 'sea_level_rise', 'extreme_weather', 'global_cooling', 'greenland_ice',
          'ice_cap', 'arctic_ice', 'extreme_heat', 'extreme_cold' ]
# unsure &  included: 'melting'
# unsure & not included: 'glacier', 'wildfires', 'climate emergency', 'unproven', 'global warming'

sc2_kw = ['natural_cycle', 'CO2_is_not_the_cause', 'greenhouse_gas', 'no_CO2_Greenhouse_Effect', 'no_effect', 'miniscule_effect', 'Man_has_no_control']
# unsure & not included: 'natural process'

sc3_kw = ['plant_food', 'plant_growth', 'thrive', 'carbon_element_is_essential', 'aveage_temperature_increase', '1_degree', 
          'more_fossil_fuels', 'no_co2', 'plant_food', 'not_pollution', '0.1C', 'ppm', 'not_a_pollutant']
# unsure & not included: 'beneficial'

sc4_kw = ['green_energy', 'renewable_energy', 'energy_production', 'windmills', 'solar_panel']
# unsure &  included: 'renewable energy'

sc5_kw = ['alarmism', 'catastrophist', 'doomsday_cult', 'climate_hysteric', 'unscientific', 'corrupt_politician', 'LIE_ABOUT_EVERYTHING',
          'idiocy', 'lunatics', 'CLIMATE_Worship', 'Climatists', 'alarmists', 'compliant media', 'climate_hysteria', 'climate_narrative', 'climate_cult',
             'scientism', 'climate_science_myths', 'lying_in_science', 'climate_apocalypse', 'propaganda', 'doomsayers', 'clown_show', 'fake_climate',
               'climate_change_agenda', 'money_made', 'fake_news', 'climate_terrorists']
# unsure & not included: 'scientist', 'global warming scam' (could also be sc7), 'greta', 'john kerry'

sc7_kw = ['globalist', 'globalist_elites', 'elitist', 'global_government', 'one_world', 'one_world_government', 'globalism', 
          'one_world_utopia', 'new_world_order', 'enriching_themselves', 'saving_the_planet', 'control_over_your_lives', 
          'tyranny', 'global_elite', 'wef', 'population_control']
# unsure & included: 'tyranny', 'population control'
# unsure & not included: 'totalitarian'

print(len(sc1_kw))
print(len(sc2_kw))
print(len(sc3_kw))
print(len(sc4_kw))
print(len(sc5_kw))
print(len(sc7_kw))

12
7
13
5
28
16


In [26]:
# lowercase and stemming/lemmatizing the keyword lists 

import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag

# Function to remove underscores and convert to lowercase
def preprocess_keywords(keywords):
    return [keyword.replace('_', ' ').lower() for keyword in keywords]

# Stem words
def stem_words(words):
    stemmer = SnowballStemmer(language='english')
    return [" ".join([stemmer.stem(word) for word in word_tokenize(keyword)]) for keyword in words]

# Lemmatize words
def lemmatize_words(words):
    lemmatizer = WordNetLemmatizer()
    
    # Mapping NLTK POS tags to WordNet POS tags
    tag_map = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'N': wordnet.NOUN
    }
    
    lemmatized_keywords = []
    for keyword in words:
        tokens = word_tokenize(keyword)
        pos_tags = pos_tag(tokens)
        lemmatized_tokens = [lemmatizer.lemmatize(word, tag_map.get(tag[0], wordnet.NOUN)) for word, tag in pos_tags]
        lemmatized_keywords.append(" ".join(lemmatized_tokens))
    
    return lemmatized_keywords


# Replacing the whitespaces with underscores again to create n-grams
def postprocess_keywords(keywords):
    return [keyword.replace(' ', '_') for keyword in keywords]

# Apply the preprocessing, stemming, and lemmatization
sc1_kw_lemmatized = postprocess_keywords(lemmatize_words(preprocess_keywords(sc1_kw)))
sc2_kw_lemmatized = postprocess_keywords(lemmatize_words(preprocess_keywords(sc2_kw)))
sc3_kw_lemmatized = postprocess_keywords(lemmatize_words(preprocess_keywords(sc3_kw)))
sc4_kw_lemmatized = postprocess_keywords(lemmatize_words(preprocess_keywords(sc4_kw)))
sc5_kw_lemmatized = postprocess_keywords(lemmatize_words(preprocess_keywords(sc5_kw)))
sc7_kw_lemmatized = postprocess_keywords(lemmatize_words(preprocess_keywords(sc7_kw)))

sc1_kw_stemmed = postprocess_keywords(stem_words(preprocess_keywords(sc1_kw)))
sc2_kw_stemmed = postprocess_keywords(stem_words(preprocess_keywords(sc2_kw)))
sc3_kw_stemmed = postprocess_keywords(stem_words(preprocess_keywords(sc3_kw)))
sc4_kw_stemmed = postprocess_keywords(stem_words(preprocess_keywords(sc4_kw)))
sc5_kw_stemmed = postprocess_keywords(stem_words(preprocess_keywords(sc5_kw)))
sc7_kw_stemmed = postprocess_keywords(stem_words(preprocess_keywords(sc7_kw)))

# print lemmatized dictionary
print("sc1_kw_lemmatized:", sc1_kw_lemmatized)
print("sc2_kw_lemmatized:", sc2_kw_lemmatized)
print("sc3_kw_lemmatized:", sc3_kw_lemmatized)
print("sc4_kw_lemmatized:", sc4_kw_lemmatized)
print("sc5_kw_lemmatized:", sc5_kw_lemmatized)
print("sc7_kw_lemmatized:", sc7_kw_lemmatized)

# print stemmed dictionary
print("sc1_kw_stemmed:", sc1_kw_stemmed)
print("sc2_kw_stemmed:", sc2_kw_stemmed)
print("sc3_kw_stemmed:", sc3_kw_stemmed)
print("sc4_kw_stemmed:", sc4_kw_stemmed)
print("sc5_kw_stemmed:", sc5_kw_stemmed)
print("sc7_kw_stemmed:", sc7_kw_stemmed)

sc1_kw_lemmatized: ['no_climate_emergency', 'melt', 'arctic_ice', 'arctice_sea_ice', 'sea_level_rise', 'extreme_weather', 'global_cooling', 'greenland_ice', 'ice_cap', 'arctic_ice', 'extreme_heat', 'extreme_cold']
sc2_kw_lemmatized: ['natural_cycle', 'co2_be_not_the_cause', 'greenhouse_gas', 'no_co2_greenhouse_effect', 'no_effect', 'miniscule_effect', 'man_have_no_control']
sc3_kw_lemmatized: ['plant_food', 'plant_growth', 'thrive', 'carbon_element_be_essential', 'aveage_temperature_increase', '1_degree', 'more_fossil_fuel', 'no_co2', 'plant_food', 'not_pollution', '0.1c', 'ppm', 'not_a_pollutant']
sc4_kw_lemmatized: ['green_energy', 'renewable_energy', 'energy_production', 'windmill', 'solar_panel']
sc5_kw_lemmatized: ['alarmism', 'catastrophist', 'doomsday_cult', 'climate_hysteric', 'unscientific', 'corrupt_politician', 'lie_about_everything', 'idiocy', 'lunatic', 'climate_worship', 'climatists', 'alarmist', 'compliant_medium', 'climate_hysteria', 'climate_narrative', 'climate_cult',

In [27]:
# Creating the dictionaries for the classifier
keyword_dict_lemmatized = {
    1: sc1_kw_lemmatized,
    2: sc2_kw_lemmatized,
    3: sc3_kw_lemmatized,
    4: sc4_kw_lemmatized,
    5: sc5_kw_lemmatized,
    7: sc7_kw_lemmatized
}


keyword_dict_stemmed = {
    1: sc1_kw_stemmed,
    2: sc2_kw_stemmed,
    3: sc3_kw_stemmed,
    4: sc4_kw_stemmed,
    5: sc5_kw_stemmed,
    7: sc7_kw_stemmed
}

keyword_dict_lemmatized_valid = {
    1: ['climate'],
    2: ['climate_change']
}

#print keyword dictionaries
print(keyword_dict_lemmatized)
print(keyword_dict_stemmed)

{1: ['no_climate_emergency', 'melt', 'arctic_ice', 'arctice_sea_ice', 'sea_level_rise', 'extreme_weather', 'global_cooling', 'greenland_ice', 'ice_cap', 'arctic_ice', 'extreme_heat', 'extreme_cold'], 2: ['natural_cycle', 'co2_be_not_the_cause', 'greenhouse_gas', 'no_co2_greenhouse_effect', 'no_effect', 'miniscule_effect', 'man_have_no_control'], 3: ['plant_food', 'plant_growth', 'thrive', 'carbon_element_be_essential', 'aveage_temperature_increase', '1_degree', 'more_fossil_fuel', 'no_co2', 'plant_food', 'not_pollution', '0.1c', 'ppm', 'not_a_pollutant'], 4: ['green_energy', 'renewable_energy', 'energy_production', 'windmill', 'solar_panel'], 5: ['alarmism', 'catastrophist', 'doomsday_cult', 'climate_hysteric', 'unscientific', 'corrupt_politician', 'lie_about_everything', 'idiocy', 'lunatic', 'climate_worship', 'climatists', 'alarmist', 'compliant_medium', 'climate_hysteria', 'climate_narrative', 'climate_cult', 'scientism', 'climate_science_myth', 'lie_in_science', 'climate_apocalypse

## Classifier

Classification of YouTube comments related to each claim, based on the text of the comment.

In [28]:
# Classifiying the comments into categories
def classify_comments(comments, keyword_dict):
    classifications = [] #initialize empty list of classifications
    
    for comment in comments: #loop through each comment of the df 
        categories = [] #initialize empty list of categories
        comment_str = ",".join(comment)  # Join the tokens of one comement into a single string for easier matching
        
        for category, keywords in keyword_dict.items(): #iterating through each key-value pair of the dictionary
            for keyword in keywords: #for each category: iterate through list of keywords. Check if each keyword is present in comment_str
                if keyword in comment_str:
                    categories.append(category) #if a keyworrd is found, the category is appended to list.
                    break  # Stop checking more keywords for this category
        
        if not categories:
            categories = ['uncategorized']
        
        classifications.append(categories)
    
    return classifications

# Apply classifier to lemmatized comments 
processed_df['category_lemmatized_comments'] = classify_comments(processed_df['all_n_grams_lemmatized'], keyword_dict_lemmatized)
processed_df.head()


Unnamed: 0,video_id,published_at,like_count,text,author,stemmed_text,lemmatized_text,stemmed_tokens,lemmatized_tokens,bigrams_lemma,trigrams_lemma,fourgrams_lemma,bigrams_stem,trigrams_stem,fourgrams_stem,all_n_grams_lemmatized,all_n_grams_stemmed,category_lemmatized_comments
0,uW6fi2tCnAc,2023-02-19,1,the answer is if china and india dont help it wont matter how much money the rest of the world throws at reducing carbon footprint complete waste of 50 trillion dollars 🤦‍♀️,0,the answer is if china and india dont help it wont matter how much money the rest of the world throw at reduc carbon footprint complet wast of 50 trillion dollar 🤦‍♀️,the answer be if china and india dont help it wont matter how much money the rest of the world throw at reduce carbon footprint complete waste of 50 trillion dollar 🤦‍♀️,"[the, answer, is, if, china, and, india, dont, help, it, wont, matter, how, much, money, the, rest, of, the, world, throw, at, reduc, carbon, footprint, complet, wast, of, 50, trillion, dollar, 🤦‍♀️]","[the, answer, be, if, china, and, india, dont, help, it, wont, matter, how, much, money, the, rest, of, the, world, throw, at, reduce, carbon, footprint, complete, waste, of, 50, trillion, dollar, 🤦‍♀️]","[the_answer, answer_be, be_if, if_china, china_and, and_india, india_dont, dont_help, help_it, it_wont, wont_matter, matter_how, how_much, much_money, money_the, the_rest, rest_of, of_the, the_world, world_throw, throw_at, at_reduce, reduce_carbon, carbon_footprint, footprint_complete, complete_waste, waste_of, of_50, 50_trillion, trillion_dollar, dollar_🤦‍♀️]","[the_answer_be, answer_be_if, be_if_china, if_china_and, china_and_india, and_india_dont, india_dont_help, dont_help_it, help_it_wont, it_wont_matter, wont_matter_how, matter_how_much, how_much_money, much_money_the, money_the_rest, the_rest_of, rest_of_the, of_the_world, the_world_throw, world_throw_at, throw_at_reduce, at_reduce_carbon, reduce_carbon_footprint, carbon_footprint_complete, footprint_complete_waste, complete_waste_of, waste_of_50, of_50_trillion, 50_trillion_dollar, trillion_dollar_🤦‍♀️]","[the_answer_be_if, answer_be_if_china, be_if_china_and, if_china_and_india, china_and_india_dont, and_india_dont_help, india_dont_help_it, dont_help_it_wont, help_it_wont_matter, it_wont_matter_how, wont_matter_how_much, matter_how_much_money, how_much_money_the, much_money_the_rest, money_the_rest_of, the_rest_of_the, rest_of_the_world, of_the_world_throw, the_world_throw_at, world_throw_at_reduce, throw_at_reduce_carbon, at_reduce_carbon_footprint, reduce_carbon_footprint_complete, carbon_footprint_complete_waste, footprint_complete_waste_of, complete_waste_of_50, waste_of_50_trillion, of_50_trillion_dollar, 50_trillion_dollar_🤦‍♀️]","[the_answer, answer_is, is_if, if_china, china_and, and_india, india_dont, dont_help, help_it, it_wont, wont_matter, matter_how, how_much, much_money, money_the, the_rest, rest_of, of_the, the_world, world_throw, throw_at, at_reduc, reduc_carbon, carbon_footprint, footprint_complet, complet_wast, wast_of, of_50, 50_trillion, trillion_dollar, dollar_🤦‍♀️]","[the_answer_is, answer_is_if, is_if_china, if_china_and, china_and_india, and_india_dont, india_dont_help, dont_help_it, help_it_wont, it_wont_matter, wont_matter_how, matter_how_much, how_much_money, much_money_the, money_the_rest, the_rest_of, rest_of_the, of_the_world, the_world_throw, world_throw_at, throw_at_reduc, at_reduc_carbon, reduc_carbon_footprint, carbon_footprint_complet, footprint_complet_wast, complet_wast_of, wast_of_50, of_50_trillion, 50_trillion_dollar, trillion_dollar_🤦‍♀️]","[the_answer_is_if, answer_is_if_china, is_if_china_and, if_china_and_india, china_and_india_dont, and_india_dont_help, india_dont_help_it, dont_help_it_wont, help_it_wont_matter, it_wont_matter_how, wont_matter_how_much, matter_how_much_money, how_much_money_the, much_money_the_rest, money_the_rest_of, the_rest_of_the, rest_of_the_world, of_the_world_throw, the_world_throw_at, world_throw_at_reduc, throw_at_reduc_carbon, at_reduc_carbon_footprint, reduc_carbon_footprint_complet, carbon_footprint_complet_wast, footprint_complet_wast_of, complet_wast_of_50, wast_of_50_trillion, of_50_trillion_dollar, 50_trillion_dollar_🤦‍♀️]","[the, answer, be, if, china, and, india, dont, help, it, wont, matter, how, much, money, the, rest, of, the, world, throw, at, reduce, carbon, footprint, complete, waste, of, 50, trillion, dollar, 🤦‍♀️, the_answer, answer_be, be_if, if_china, china_and, and_india, india_dont, dont_help, help_it, it_wont, wont_matter, matter_how, how_much, much_money, money_the, the_rest, rest_of, of_the, the_world, world_throw, throw_at, at_reduce, reduce_carbon, carbon_footprint, footprint_complete, complete_waste, waste_of, of_50, 50_trillion, trillion_dollar, dollar_🤦‍♀️, the_answer_be, answer_be_if, be_if_china, if_china_and, china_and_india, and_india_dont, india_dont_help, dont_help_it, help_it_wont, it_wont_matter, wont_matter_how, matter_how_much, how_much_money, much_money_the, money_the_rest, the_rest_of, rest_of_the, of_the_world, the_world_throw, world_throw_at, throw_at_reduce, at_reduce_carbon, reduce_carbon_footprint, carbon_footprint_complete, footprint_complete_waste, complete_waste_of, waste_of_50, of_50_trillion, 50_trillion_dollar, trillion_dollar_🤦‍♀️, the_answer_be_if, answer_be_if_china, be_if_china_and, if_china_and_india, china_and_india_dont, and_india_dont_help, india_dont_help_it, ...]","[the, answer, is, if, china, and, india, dont, help, it, wont, matter, how, much, money, the, rest, of, the, world, throw, at, reduc, carbon, footprint, complet, wast, of, 50, trillion, dollar, 🤦‍♀️, the_answer, answer_is, is_if, if_china, china_and, and_india, india_dont, dont_help, help_it, it_wont, wont_matter, matter_how, how_much, much_money, money_the, the_rest, rest_of, of_the, the_world, world_throw, throw_at, at_reduc, reduc_carbon, carbon_footprint, footprint_complet, complet_wast, wast_of, of_50, 50_trillion, trillion_dollar, dollar_🤦‍♀️, the_answer_is, answer_is_if, is_if_china, if_china_and, china_and_india, and_india_dont, india_dont_help, dont_help_it, help_it_wont, it_wont_matter, wont_matter_how, matter_how_much, how_much_money, much_money_the, money_the_rest, the_rest_of, rest_of_the, of_the_world, the_world_throw, world_throw_at, throw_at_reduc, at_reduc_carbon, reduc_carbon_footprint, carbon_footprint_complet, footprint_complet_wast, complet_wast_of, wast_of_50, of_50_trillion, 50_trillion_dollar, trillion_dollar_🤦‍♀️, the_answer_is_if, answer_is_if_china, is_if_china_and, if_china_and_india, china_and_india_dont, and_india_dont_help, india_dont_help_it, ...]",[uncategorized]
1,uW6fi2tCnAc,2023-02-19,2,and that guy is an expert were screwed,1,and that guy is an expert were screw,and that guy be an expert be screw,"[and, that, guy, is, an, expert, were, screw]","[and, that, guy, be, an, expert, be, screw]","[and_that, that_guy, guy_be, be_an, an_expert, expert_be, be_screw]","[and_that_guy, that_guy_be, guy_be_an, be_an_expert, an_expert_be, expert_be_screw]","[and_that_guy_be, that_guy_be_an, guy_be_an_expert, be_an_expert_be, an_expert_be_screw]","[and_that, that_guy, guy_is, is_an, an_expert, expert_were, were_screw]","[and_that_guy, that_guy_is, guy_is_an, is_an_expert, an_expert_were, expert_were_screw]","[and_that_guy_is, that_guy_is_an, guy_is_an_expert, is_an_expert_were, an_expert_were_screw]","[and, that, guy, be, an, expert, be, screw, and_that, that_guy, guy_be, be_an, an_expert, expert_be, be_screw, and_that_guy, that_guy_be, guy_be_an, be_an_expert, an_expert_be, expert_be_screw, and_that_guy_be, that_guy_be_an, guy_be_an_expert, be_an_expert_be, an_expert_be_screw]","[and, that, guy, is, an, expert, were, screw, and_that, that_guy, guy_is, is_an, an_expert, expert_were, were_screw, and_that_guy, that_guy_is, guy_is_an, is_an_expert, an_expert_were, expert_were_screw, and_that_guy_is, that_guy_is_an, guy_is_an_expert, is_an_expert_were, an_expert_were_screw]",[uncategorized]
2,uW6fi2tCnAc,2023-02-18,4,kennedy is a gem,2,kennedi is a gem,kennedy be a gem,"[kennedi, is, a, gem]","[kennedy, be, a, gem]","[kennedy_be, be_a, a_gem]","[kennedy_be_a, be_a_gem]",[kennedy_be_a_gem],"[kennedi_is, is_a, a_gem]","[kennedi_is_a, is_a_gem]",[kennedi_is_a_gem],"[kennedy, be, a, gem, kennedy_be, be_a, a_gem, kennedy_be_a, be_a_gem, kennedy_be_a_gem]","[kennedi, is, a, gem, kennedi_is, is_a, a_gem, kennedi_is_a, is_a_gem, kennedi_is_a_gem]",[uncategorized]
3,uW6fi2tCnAc,2023-02-18,0,and just how do we get a nation like china to cooperate or even trust them to be telling the truth if they say that they are absolute ridiculousness that is occurring in western nations supposedly the most advanced and capable with the leads in science and technology but willing to cripple and impoverish millions for the sake of iffy pseudo science and what do these experts care anyway the wealthy elitists will be the least affected by the ramifications for the extreme measures that they would promote for the rest of the world,3,and just how do we get a nation like china to cooper or even trust them to be tell the truth if they say that they are absolut ridicul that is occur in western nation suppos the most advanc and capabl with the lead in scienc and technolog but will to crippl and impoverish million for the sake of iffi pseudo scienc and what do these expert care anyway the wealthi elitist will be the least affect by the ramif for the extrem measur that they would promot for the rest of the world,and just how do we get a nation like china to cooperate or even trust them to be tell the truth if they say that they be absolute ridiculousness that be occur in western nation supposedly the most advanced and capable with the lead in science and technology but willing to cripple and impoverish million for the sake of iffy pseudo science and what do these expert care anyway the wealthy elitist will be the least affected by the ramification for the extreme measure that they would promote for the rest of the world,"[and, just, how, do, we, get, a, nation, like, china, to, cooper, or, even, trust, them, to, be, tell, the, truth, if, they, say, that, they, are, absolut, ridicul, that, is, occur, in, western, nation, suppos, the, most, advanc, and, capabl, with, the, lead, in, scienc, and, technolog, but, will, to, crippl, and, impoverish, million, for, the, sake, of, iffi, pseudo, scienc, and, what, do, these, expert, care, anyway, the, wealthi, elitist, will, be, the, least, affect, by, the, ramif, for, the, extrem, measur, that, they, would, promot, for, the, rest, of, the, world]","[and, just, how, do, we, get, a, nation, like, china, to, cooperate, or, even, trust, them, to, be, tell, the, truth, if, they, say, that, they, be, absolute, ridiculousness, that, be, occur, in, western, nation, supposedly, the, most, advanced, and, capable, with, the, lead, in, science, and, technology, but, willing, to, cripple, and, impoverish, million, for, the, sake, of, iffy, pseudo, science, and, what, do, these, expert, care, anyway, the, wealthy, elitist, will, be, the, least, affected, by, the, ramification, for, the, extreme, measure, that, they, would, promote, for, the, rest, of, the, world]","[and_just, just_how, how_do, do_we, we_get, get_a, a_nation, nation_like, like_china, china_to, to_cooperate, cooperate_or, or_even, even_trust, trust_them, them_to, to_be, be_tell, tell_the, the_truth, truth_if, if_they, they_say, say_that, that_they, they_be, be_absolute, absolute_ridiculousness, ridiculousness_that, that_be, be_occur, occur_in, in_western, western_nation, nation_supposedly, supposedly_the, the_most, most_advanced, advanced_and, and_capable, capable_with, with_the, the_lead, lead_in, in_science, science_and, and_technology, technology_but, but_willing, willing_to, to_cripple, cripple_and, and_impoverish, impoverish_million, million_for, for_the, the_sake, sake_of, of_iffy, iffy_pseudo, pseudo_science, science_and, and_what, what_do, do_these, these_expert, expert_care, care_anyway, anyway_the, the_wealthy, wealthy_elitist, elitist_will, will_be, be_the, the_least, least_affected, affected_by, by_the, the_ramification, ramification_for, for_the, the_extreme, extreme_measure, measure_that, that_they, they_would, would_promote, promote_for, for_the, the_rest, rest_of, of_the, the_world]","[and_just_how, just_how_do, how_do_we, do_we_get, we_get_a, get_a_nation, a_nation_like, nation_like_china, like_china_to, china_to_cooperate, to_cooperate_or, cooperate_or_even, or_even_trust, even_trust_them, trust_them_to, them_to_be, to_be_tell, be_tell_the, tell_the_truth, the_truth_if, truth_if_they, if_they_say, they_say_that, say_that_they, that_they_be, they_be_absolute, be_absolute_ridiculousness, absolute_ridiculousness_that, ridiculousness_that_be, that_be_occur, be_occur_in, occur_in_western, in_western_nation, western_nation_supposedly, nation_supposedly_the, supposedly_the_most, the_most_advanced, most_advanced_and, advanced_and_capable, and_capable_with, capable_with_the, with_the_lead, the_lead_in, lead_in_science, in_science_and, science_and_technology, and_technology_but, technology_but_willing, but_willing_to, willing_to_cripple, to_cripple_and, cripple_and_impoverish, and_impoverish_million, impoverish_million_for, million_for_the, for_the_sake, the_sake_of, sake_of_iffy, of_iffy_pseudo, iffy_pseudo_science, pseudo_science_and, science_and_what, and_what_do, what_do_these, do_these_expert, these_expert_care, expert_care_anyway, care_anyway_the, anyway_the_wealthy, the_wealthy_elitist, wealthy_elitist_will, elitist_will_be, will_be_the, be_the_least, the_least_affected, least_affected_by, affected_by_the, by_the_ramification, the_ramification_for, ramification_for_the, for_the_extreme, the_extreme_measure, extreme_measure_that, measure_that_they, that_they_would, they_would_promote, would_promote_for, promote_for_the, for_the_rest, the_rest_of, rest_of_the, of_the_world]","[and_just_how_do, just_how_do_we, how_do_we_get, do_we_get_a, we_get_a_nation, get_a_nation_like, a_nation_like_china, nation_like_china_to, like_china_to_cooperate, china_to_cooperate_or, to_cooperate_or_even, cooperate_or_even_trust, or_even_trust_them, even_trust_them_to, trust_them_to_be, them_to_be_tell, to_be_tell_the, be_tell_the_truth, tell_the_truth_if, the_truth_if_they, truth_if_they_say, if_they_say_that, they_say_that_they, say_that_they_be, that_they_be_absolute, they_be_absolute_ridiculousness, be_absolute_ridiculousness_that, absolute_ridiculousness_that_be, ridiculousness_that_be_occur, that_be_occur_in, be_occur_in_western, occur_in_western_nation, in_western_nation_supposedly, western_nation_supposedly_the, nation_supposedly_the_most, supposedly_the_most_advanced, the_most_advanced_and, most_advanced_and_capable, advanced_and_capable_with, and_capable_with_the, capable_with_the_lead, with_the_lead_in, the_lead_in_science, lead_in_science_and, in_science_and_technology, science_and_technology_but, and_technology_but_willing, technology_but_willing_to, but_willing_to_cripple, willing_to_cripple_and, to_cripple_and_impoverish, cripple_and_impoverish_million, and_impoverish_million_for, impoverish_million_for_the, million_for_the_sake, for_the_sake_of, the_sake_of_iffy, sake_of_iffy_pseudo, of_iffy_pseudo_science, iffy_pseudo_science_and, pseudo_science_and_what, science_and_what_do, and_what_do_these, what_do_these_expert, do_these_expert_care, these_expert_care_anyway, expert_care_anyway_the, care_anyway_the_wealthy, anyway_the_wealthy_elitist, the_wealthy_elitist_will, wealthy_elitist_will_be, elitist_will_be_the, will_be_the_least, be_the_least_affected, the_least_affected_by, least_affected_by_the, affected_by_the_ramification, by_the_ramification_for, the_ramification_for_the, ramification_for_the_extreme, for_the_extreme_measure, the_extreme_measure_that, extreme_measure_that_they, measure_that_they_would, that_they_would_promote, they_would_promote_for, would_promote_for_the, promote_for_the_rest, for_the_rest_of, the_rest_of_the, rest_of_the_world]","[and_just, just_how, how_do, do_we, we_get, get_a, a_nation, nation_like, like_china, china_to, to_cooper, cooper_or, or_even, even_trust, trust_them, them_to, to_be, be_tell, tell_the, the_truth, truth_if, if_they, they_say, say_that, that_they, they_are, are_absolut, absolut_ridicul, ridicul_that, that_is, is_occur, occur_in, in_western, western_nation, nation_suppos, suppos_the, the_most, most_advanc, advanc_and, and_capabl, capabl_with, with_the, the_lead, lead_in, in_scienc, scienc_and, and_technolog, technolog_but, but_will, will_to, to_crippl, crippl_and, and_impoverish, impoverish_million, million_for, for_the, the_sake, sake_of, of_iffi, iffi_pseudo, pseudo_scienc, scienc_and, and_what, what_do, do_these, these_expert, expert_care, care_anyway, anyway_the, the_wealthi, wealthi_elitist, elitist_will, will_be, be_the, the_least, least_affect, affect_by, by_the, the_ramif, ramif_for, for_the, the_extrem, extrem_measur, measur_that, that_they, they_would, would_promot, promot_for, for_the, the_rest, rest_of, of_the, the_world]","[and_just_how, just_how_do, how_do_we, do_we_get, we_get_a, get_a_nation, a_nation_like, nation_like_china, like_china_to, china_to_cooper, to_cooper_or, cooper_or_even, or_even_trust, even_trust_them, trust_them_to, them_to_be, to_be_tell, be_tell_the, tell_the_truth, the_truth_if, truth_if_they, if_they_say, they_say_that, say_that_they, that_they_are, they_are_absolut, are_absolut_ridicul, absolut_ridicul_that, ridicul_that_is, that_is_occur, is_occur_in, occur_in_western, in_western_nation, western_nation_suppos, nation_suppos_the, suppos_the_most, the_most_advanc, most_advanc_and, advanc_and_capabl, and_capabl_with, capabl_with_the, with_the_lead, the_lead_in, lead_in_scienc, in_scienc_and, scienc_and_technolog, and_technolog_but, technolog_but_will, but_will_to, will_to_crippl, to_crippl_and, crippl_and_impoverish, and_impoverish_million, impoverish_million_for, million_for_the, for_the_sake, the_sake_of, sake_of_iffi, of_iffi_pseudo, iffi_pseudo_scienc, pseudo_scienc_and, scienc_and_what, and_what_do, what_do_these, do_these_expert, these_expert_care, expert_care_anyway, care_anyway_the, anyway_the_wealthi, the_wealthi_elitist, wealthi_elitist_will, elitist_will_be, will_be_the, be_the_least, the_least_affect, least_affect_by, affect_by_the, by_the_ramif, the_ramif_for, ramif_for_the, for_the_extrem, the_extrem_measur, extrem_measur_that, measur_that_they, that_they_would, they_would_promot, would_promot_for, promot_for_the, for_the_rest, the_rest_of, rest_of_the, of_the_world]","[and_just_how_do, just_how_do_we, how_do_we_get, do_we_get_a, we_get_a_nation, get_a_nation_like, a_nation_like_china, nation_like_china_to, like_china_to_cooper, china_to_cooper_or, to_cooper_or_even, cooper_or_even_trust, or_even_trust_them, even_trust_them_to, trust_them_to_be, them_to_be_tell, to_be_tell_the, be_tell_the_truth, tell_the_truth_if, the_truth_if_they, truth_if_they_say, if_they_say_that, they_say_that_they, say_that_they_are, that_they_are_absolut, they_are_absolut_ridicul, are_absolut_ridicul_that, absolut_ridicul_that_is, ridicul_that_is_occur, that_is_occur_in, is_occur_in_western, occur_in_western_nation, in_western_nation_suppos, western_nation_suppos_the, nation_suppos_the_most, suppos_the_most_advanc, the_most_advanc_and, most_advanc_and_capabl, advanc_and_capabl_with, and_capabl_with_the, capabl_with_the_lead, with_the_lead_in, the_lead_in_scienc, lead_in_scienc_and, in_scienc_and_technolog, scienc_and_technolog_but, and_technolog_but_will, technolog_but_will_to, but_will_to_crippl, will_to_crippl_and, to_crippl_and_impoverish, crippl_and_impoverish_million, and_impoverish_million_for, impoverish_million_for_the, million_for_the_sake, for_the_sake_of, the_sake_of_iffi, sake_of_iffi_pseudo, of_iffi_pseudo_scienc, iffi_pseudo_scienc_and, pseudo_scienc_and_what, scienc_and_what_do, and_what_do_these, what_do_these_expert, do_these_expert_care, these_expert_care_anyway, expert_care_anyway_the, care_anyway_the_wealthi, anyway_the_wealthi_elitist, the_wealthi_elitist_will, wealthi_elitist_will_be, elitist_will_be_the, will_be_the_least, be_the_least_affect, the_least_affect_by, least_affect_by_the, affect_by_the_ramif, by_the_ramif_for, the_ramif_for_the, ramif_for_the_extrem, for_the_extrem_measur, the_extrem_measur_that, extrem_measur_that_they, measur_that_they_would, that_they_would_promot, they_would_promot_for, would_promot_for_the, promot_for_the_rest, for_the_rest_of, the_rest_of_the, rest_of_the_world]","[and, just, how, do, we, get, a, nation, like, china, to, cooperate, or, even, trust, them, to, be, tell, the, truth, if, they, say, that, they, be, absolute, ridiculousness, that, be, occur, in, western, nation, supposedly, the, most, advanced, and, capable, with, the, lead, in, science, and, technology, but, willing, to, cripple, and, impoverish, million, for, the, sake, of, iffy, pseudo, science, and, what, do, these, expert, care, anyway, the, wealthy, elitist, will, be, the, least, affected, by, the, ramification, for, the, extreme, measure, that, they, would, promote, for, the, rest, of, the, world, and_just, just_how, how_do, do_we, we_get, get_a, ...]","[and, just, how, do, we, get, a, nation, like, china, to, cooper, or, even, trust, them, to, be, tell, the, truth, if, they, say, that, they, are, absolut, ridicul, that, is, occur, in, western, nation, suppos, the, most, advanc, and, capabl, with, the, lead, in, scienc, and, technolog, but, will, to, crippl, and, impoverish, million, for, the, sake, of, iffi, pseudo, scienc, and, what, do, these, expert, care, anyway, the, wealthi, elitist, will, be, the, least, affect, by, the, ramif, for, the, extrem, measur, that, they, would, promot, for, the, rest, of, the, world, and_just, just_how, how_do, do_we, we_get, get_a, ...]",[7]
4,uW6fi2tCnAc,2023-02-18,3,that man was going for an oscar,4,that man was go for an oscar,that man be go for an oscar,"[that, man, was, go, for, an, oscar]","[that, man, be, go, for, an, oscar]","[that_man, man_be, be_go, go_for, for_an, an_oscar]","[that_man_be, man_be_go, be_go_for, go_for_an, for_an_oscar]","[that_man_be_go, man_be_go_for, be_go_for_an, go_for_an_oscar]","[that_man, man_was, was_go, go_for, for_an, an_oscar]","[that_man_was, man_was_go, was_go_for, go_for_an, for_an_oscar]","[that_man_was_go, man_was_go_for, was_go_for_an, go_for_an_oscar]","[that, man, be, go, for, an, oscar, that_man, man_be, be_go, go_for, for_an, an_oscar, that_man_be, man_be_go, be_go_for, go_for_an, for_an_oscar, that_man_be_go, man_be_go_for, be_go_for_an, go_for_an_oscar]","[that, man, was, go, for, an, oscar, that_man, man_was, was_go, go_for, for_an, an_oscar, that_man_was, man_was_go, was_go_for, go_for_an, for_an_oscar, that_man_was_go, man_was_go_for, was_go_for_an, go_for_an_oscar]",[uncategorized]


In [118]:
processed_df['category_lemmatized_comments'].value_counts()

category_lemmatized_comments
[uncategorized]    88645
[5]                 2958
[7]                 1891
[1]                 1065
[4]                  580
[3]                  570
[2]                  298
[5, 7]               141
[1, 5]                71
[3, 5]                51
[1, 3]                42
[2, 3]                37
[3, 7]                30
[4, 5]                28
[1, 7]                28
[1, 2]                24
[2, 5]                18
[1, 4]                18
[4, 7]                16
[3, 4]                11
[2, 4]                10
[1, 3, 5]              9
[1, 2, 3, 5]           7
[2, 7]                 7
[1, 5, 7]              5
[1, 2, 3]              5
[2, 3, 5]              5
[1, 2, 5]              4
[2, 3, 5, 7]           4
[1, 3, 7]              3
[1, 3, 4, 7]           3
[1, 2, 7]              2
[2, 5, 7]              1
[1, 2, 4]              1
[3, 5, 7]              1
[1, 2, 3, 4, 5]        1
[2, 3, 4]              1
[1, 3, 4, 5]           1
[1, 2, 3, 5, 7]      

In [74]:
# Apply classifier to stemmed comments 
processed_df['category_stemmed_comments'] = classify_comments(processed_df['all_n_grams_stemmed'], keyword_dict_stemmed)

In [102]:
# Apply classifier to lemmatized valid dictionariy to check if results make sense
processed_df['category_lemmatized_comments_validated'] = classify_comments(processed_df['all_n_grams_lemmatized'], keyword_dict_lemmatized_valid)

In [106]:
print(processed_df['category_lemmatized_comments'].value_counts())

category_lemmatized_comments
[uncategorized]    88645
[5]                 2958
[7]                 1891
[1]                 1065
[4]                  580
[3]                  570
[2]                  298
[5, 7]               141
[1, 5]                71
[3, 5]                51
[1, 3]                42
[2, 3]                37
[3, 7]                30
[4, 5]                28
[1, 7]                28
[1, 2]                24
[2, 5]                18
[1, 4]                18
[4, 7]                16
[3, 4]                11
[2, 4]                10
[1, 3, 5]              9
[1, 2, 3, 5]           7
[2, 7]                 7
[1, 5, 7]              5
[1, 2, 3]              5
[2, 3, 5]              5
[1, 2, 5]              4
[2, 3, 5, 7]           4
[1, 3, 7]              3
[1, 3, 4, 7]           3
[1, 2, 7]              2
[2, 5, 7]              1
[1, 2, 4]              1
[3, 5, 7]              1
[1, 2, 3, 4, 5]        1
[2, 3, 4]              1
[1, 3, 4, 5]           1
[1, 2, 3, 5, 7]      

In [103]:
print(processed_df['category_stemmed_comments'].value_counts())

category_stemmed_comments
[uncategorized]       86050
[7]                    4312
[5]                    2913
[1]                     807
[4]                     540
[3]                     521
[5, 7]                  339
[1, 7]                  282
[2]                     226
[3, 7]                   82
[2, 7]                   80
[1, 5]                   58
[4, 7]                   51
[3, 5]                   43
[1, 5, 7]                32
[4, 5]                   23
[1, 3]                   23
[1, 3, 7]                22
[2, 3, 7]                21
[1, 2]                   17
[2, 3]                   17
[2, 5]                   16
[3, 5, 7]                13
[1, 4]                   12
[3, 4]                   11
[1, 2, 7]                 8
[2, 4]                    8
[1, 3, 5, 7]              7
[2, 3, 5, 7]              6
[4, 5, 7]                 6
[1, 2, 3, 5]              5
[1, 2, 5, 7]              5
[2, 5, 7]                 5
[1, 2, 3]                 4
[1, 3, 4, 7]          

In [93]:
print(processed_df['category_lemmatized_comments_validated'].value_counts())

category_lemmatized_comments_validated
[uncategorized]    81774
[1]                 7698
[1, 2]              7123
Name: count, dtype: int64


In [69]:
keyword_dict_lemmatized.items()

dict_items([(1, ['no_climate_emergency', 'melt', 'arctic_ice', 'arctice_sea_ice', 'sea_level_rise', 'extreme_weather', 'global_cooling', 'greenland_ice', 'ice_cap', 'arctic_ice', 'extreme_heat', 'extreme_cold']), (2, ['natural_cycle', 'co2_be_not_the_cause', 'greenhouse_gas', 'no_co2_greenhouse_effect', 'no_effect', 'miniscule_effect', 'man_have_no_control']), (3, ['plant_food', 'plant_growth', 'thrive', 'carbon_element_be_essential', 'aveage_temperature_increase', '1_degree', 'more_fossil_fuel', 'no_co2', 'plant_food', 'not_pollution', '0.1c', 'ppm', 'not_a_pollutant']), (4, ['green_energy', 'renewable_energy', 'energy_production', 'windmill', 'solar_panel']), (5, ['alarmism', 'catastrophist', 'doomsday_cult', 'climate_hysteric', 'unscientific', 'corrupt_politician', 'lie_about_everything', 'idiocy', 'lunatic', 'climate_worship', 'climatists', 'alarmist', 'compliant_medium', 'climate_hysteria', 'climate_narrative', 'climate_cult', 'scientism', 'climate_science_myth', 'lie_in_science',

In [77]:
keyword_dict_stemmed.items()

dict_items([(1, ['no_climat_emerg', 'melt', 'arctic_ice', 'arctic_sea_ice', 'sea_level_rise', 'extrem_weather', 'global_cool', 'greenland_ice', 'ice_cap', 'arctic_ice', 'extrem_heat', 'extrem_cold']), (2, ['natur_cycl', 'co2_is_not_the_caus', 'greenhous_gas', 'no_co2_greenhous_effect', 'no_effect', 'miniscul_effect', 'man_has_no_control']), (3, ['plant_food', 'plant_growth', 'thrive', 'carbon_element_is_essenti', 'aveag_temperatur_increas', '1_degre', 'more_fossil_fuel', 'no_co2', 'plant_food', 'not_pollut', '0.1c', 'ppm', 'not_a_pollut']), (4, ['green_energi', 'renew_energi', 'energi_product', 'windmil', 'solar_panel']), (5, ['alarm', 'catastrophist', 'doomsday_cult', 'climat_hyster', 'unscientif', 'corrupt_politician', 'lie_about_everyth', 'idioci', 'lunat', 'climat_worship', 'climatist', 'alarmist', 'compliant_media', 'climat_hysteria', 'climat_narrat', 'climat_cult', 'scientism', 'climat_scienc_myth', 'lie_in_scienc', 'climat_apocalyps', 'propaganda', 'doomsay', 'clown_show', 'fake