In [2]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.stem import WordNetLemmatizer

import nltk
from nltk.corpus import words

lemm = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
eng_set  = set(words.words())

import string
punct = set(string.punctuation)

In [3]:
import gdown
import pandas as pd

url = 'https://drive.google.com/uc?id=1DcWCYXWuFmWNqlqfJT8L1OpaYuuzaI9e'
output = 'final_df.csv'

gdown.download(url, output, quiet=False)

df = pd.read_csv(output)

Downloading...
From (original): https://drive.google.com/uc?id=1DcWCYXWuFmWNqlqfJT8L1OpaYuuzaI9e
From (redirected): https://drive.google.com/uc?id=1DcWCYXWuFmWNqlqfJT8L1OpaYuuzaI9e&confirm=t&uuid=64648cde-f366-4bf0-b85e-f70c087bb1b1
To: /Users/vaibhavnakrani/calel/final_df.csv
100%|██████████| 366M/366M [01:05<00:00, 5.58MB/s] 


In [3]:
def clean_text(text:str):
    """
    Removes emojis, double quotation marks, and other non-word characters, 
    keeping only English words.

    Args:
        text (str): The input text string.

    Returns:
        str: The cleaned text with only English words and spaces.
    """
    if not isinstance(text, str):
        text = str(text)
    #Lower case all text
    text = text.lower()
    
    if isinstance(text, float):  # Check if the input is a float
        text = str(text)
    # 1. Emoji Removal:
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # Flags 
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # Enclosed characters
        "]+", flags=re.UNICODE)
    
    text = emoji_pattern.sub(r'', text) 

    # 2. Remove double quotation marks:
    text = text.replace('"', '')

    #Remove mentions
    pattern = r"@\w+"

    text = re.sub(pattern, '', text)

    #Clean single letters and numbers
    pattern = r'(\b[a-zA-Z]\s|\d+)'
    text = re.sub(pattern, '', text)

    # Remove Punctuations
    text = " ".join([word for word in text.split() if word not in punct])
    
    #Remove Stop words
    text = " ".join([word for word in text.split() if word not in stopwords])

    #Only words in english language
    text = " ".join([word for word in text.split() if word.lower() in eng_set])

    #Clean the corpus on what it was mined for.
    sxt = ['q', 'qresearch', 'qanon', 'Qanon', 'QAnon', 'QANon', 'RT', 'QANOn', 'QANON', 'Q', 'trump', 'Trump','would', 'like', 'trump', 'post', 'thread']
    
    text = " ".join(word for word in text.split() if word not in sxt)

    #Lemmatization
    text = lemm.lemmatize(text)

    text = text.strip()
    
    return text
df['clean_text'] = df['text'].apply(clean_text)

#### Length of post 

In [4]:
def length(row):
    text = row['clean_text']
    lst = text.split()
    return len(lst)
df['len'] = df.apply(length, axis=1)

Removes all posts less than 5 words

In [5]:
len(df)
df = df[df['len'] > 5]
len(df)

553003

Truncate very Lengthy text 

In [6]:
len(df)
df = df[df['len'] < 200]
len(df)

551543

In [7]:
all_words_clean = df.clean_text.str.split(expand=True).unstack().value_counts()
all_words_clean = all_words_clean[0:50]
px.bar(all_words_clean,x = all_words_clean.index,y = all_words_clean.values, color_continuous_scale='viridis',
       color = all_words_clean.values)

In [8]:
df['group'] = df.index // 20

df_grouped = df.groupby('group').agg({
    'timestamp': 'last',
    'clean_text': ' '.join
}).reset_index(drop=True)

len(df_grouped)

49081

In [9]:
df_grouped['len'] = df_grouped.apply(length, axis = 1)

In [10]:
def tokenize(text,clean_further = False):
        return nltk.word_tokenize(text)

text_col = 'clean_text'
df_grouped['tokens'] = df_grouped[text_col].apply(tokenize)

In [11]:
from gensim import corpora, models

documents = df_grouped['tokens'].to_list()

# Create a dictionary from the documents
dictionary = corpora.Dictionary(documents)

# Create a corpus (bag of words representation) from the dictionary
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [12]:
from gensim import models

num_topics = 5

lda_model = models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes = 1)

for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.011*"people" + 0.009*"us" + 0.008*"one" + 0.007*"think" + 0.006*"get" + 0.005*"know" + 0.005*"new" + 0.005*"time" + 0.005*"going" + 0.005*"even"
Topic 1: 0.009*"people" + 0.009*"think" + 0.008*"one" + 0.007*"us" + 0.006*"know" + 0.005*"get" + 0.005*"graphics" + 0.005*"research" + 0.004*"see" + 0.004*"need"
Topic 2: 0.010*"people" + 0.009*"us" + 0.006*"think" + 0.005*"know" + 0.005*"see" + 0.005*"get" + 0.005*"one" + 0.005*"new" + 0.004*"research" + 0.004*"time"
Topic 3: 0.009*"people" + 0.007*"one" + 0.007*"know" + 0.007*"see" + 0.007*"think" + 0.006*"us" + 0.005*"get" + 0.005*"congress" + 0.004*"must" + 0.004*"go"
Topic 4: 0.008*"people" + 0.008*"us" + 0.007*"think" + 0.005*"one" + 0.005*"said" + 0.005*"know" + 0.005*"get" + 0.005*"time" + 0.005*"see" + 0.004*"good"


In [13]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

text = df_grouped.clean_text.to_list()
vect = TfidfVectorizer(stop_words='english',smooth_idf=True)
input_matrix = vect.fit_transform(text)


NMF_model = NMF(n_components=10, random_state=1)
W = NMF_model.fit_transform(input_matrix)
H = NMF_model.components_

In [None]:
vocab = vect.get_feature_names_out()
topic_word_list = []
def get_topics(components): 
  for i, comp in enumerate(components):
    terms_comp = zip(vocab,comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    topic=" "
    for t in sorted_terms:
        topic= topic + ' ' + t[0]
        topic_word_list.append(topic)
        # print(topic_word_list)
  return topic_word_list
get_topics(H)