In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer
import re
from pprint import pprint
nltk.download('punkt_tab')
nltk.download('stopwords') # Download stopwords data
nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [4]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
df = pd.read_csv("/content/drive/My Drive/tweets.csv")

In [6]:
df.info()
#Check if there is any missing values
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52542 entries, 0 to 52541
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   author            52542 non-null  object 
 1   content           52542 non-null  object 
 2   country           36 non-null     object 
 3   date_time         52542 non-null  object 
 4   id                52542 non-null  float64
 5   language          52542 non-null  object 
 6   latitude          1 non-null      float64
 7   longitude         1 non-null      float64
 8   number_of_likes   52542 non-null  int64  
 9   number_of_shares  52542 non-null  int64  
dtypes: float64(3), int64(2), object(5)
memory usage: 4.0+ MB


Unnamed: 0,0
author,0
content,0
country,52506
date_time,0
id,0
language,0
latitude,52541
longitude,52541
number_of_likes,0
number_of_shares,0


In [7]:
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [8]:
def preprocess_text(text):
    if not isinstance(text, str):
        return []

    # Lowercase
    text = text.lower()

    #remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove all non-word and non-whitespace characters
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    return tokens

def porterstem(tokens):
    return [porter.stem(t) for t in tokens]

def lancasterstem(tokens):
    return [lancaster.stem(t) for t in tokens]

# Apply preprocessing
df['token_content'] = df['content'].apply(preprocess_text)
df['Porter_content'] = df['token_content'].apply(porterstem)
df['Lancaster_content'] = df['token_content'].apply(lancasterstem)

# Display the processed DataFrame
print(df[['content', 'token_content']].head())

                                             content  \
0  Is history repeating itself...?#DONTNORMALIZEH...   
1  @barackobama Thank you for your incredible gra...   
2                Life goals. https://t.co/XIn1qKMKQl   
3            Me right now 🙏🏻 https://t.co/gW55C1wrwd   
4  SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...   

                                       token_content  
0  [is, history, repeating, itselfdontnormalizehate]  
1  [barackobama, thank, you, for, your, incredibl...  
2                                      [life, goals]  
3                                   [me, right, now]  
4          [sisters, are, doin, it, for, themselves]  


In [9]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

# Tokenize the 'content' column
data_words = list(sent_to_words(df['token_content']))

print(data_words[:10])


[['is', 'history', 'repeating'], ['barackobama', 'thank', 'you', 'for', 'your', 'incredible', 'grace', 'in', 'leadership', 'and', 'for', 'being', 'an', 'exceptional'], ['life', 'goals'], ['me', 'right', 'now'], ['sisters', 'are', 'doin', 'it', 'for', 'themselves'], ['happy', 'th', 'gma', 'fourmoreyears', 'lacma', 'los', 'angeles', 'county', 'museum', 'of', 'art'], ['kyoto', 'japan'], ['sanrio', 'puroland'], ['resolution', 'to', 'embody', 'authenticity'], ['sisters']]


In [10]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

In [11]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [12]:
print(trigram_mod[bigram_mod[data_words[0]]])

['is', 'history', 'repeating']


In [13]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    stop_words = gensim.utils.simple_preprocess('english')
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'VERB']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def keep_nouns_verbs(texts, chunk_size=50000):

    texts_out = []
    combined_text = " ".join([" ".join(sent) for sent in texts])  # Join all sentences into a single string

    # Process the text in chunks
    for i in range(0, len(combined_text), chunk_size):
        chunk = combined_text[i:i + chunk_size]
        doc = nlp(chunk)  # Process the chunk

        for sent in doc.sents:
            # Extract only the nouns and verbs from each sentence
            sentence = [token.text for token in sent if token.pos_ in ['NOUN']]
            texts_out.append(sentence)

    return texts_out


In [15]:
  # !pip install spacy
  # !python -m spacy download en_core_web_sm
  import spacy

  # Remove Stop Words
  data_words_nostops = remove_stopwords(data_words)




In [16]:
# Flatten the list of lists
flat_tokens = [token for sublist in data_words_nostops for token in sublist]

# Create the frequency distribution
freq = nltk.FreqDist(flat_tokens)

# Display the most common words
print(freq.most_common(15))

[('the', 18836), ('to', 14207), ('you', 11699), ('and', 8608), ('for', 8142), ('in', 7889), ('of', 7831), ('my', 6892), ('on', 6769), ('is', 6021), ('this', 4878), ('with', 4645), ('it', 4172), ('so', 3950), ('love', 3490)]


In [17]:
import nltk

# Flatten the list of lists to calculate frequency
flat_tokens = [token for sublist in data_words_nostops for token in sublist]

# Create frequency distribution
freq = nltk.FreqDist(flat_tokens)

# Define the frequency threshold (e.g., remove words that appear less than 5 times)
frequency_threshold = 5

# Get the list of words to keep (words that appear more than the threshold)
words_to_keep = [word for word, count in freq.items() if count >= frequency_threshold]

# Remove low-frequency words from data_words_nostops
data_words_nostops_filtered1 = [
    [word for word in sublist if word in words_to_keep]
    for sublist in data_words_nostops
]

# Check the filtered data
print(data_words_nostops_filtered1[:5])


[['is', 'history'], ['barackobama', 'thank', 'you', 'for', 'your', 'incredible', 'grace', 'in', 'leadership', 'and', 'for', 'being', 'an'], ['life', 'goals'], ['me', 'right', 'now'], ['sisters', 'are', 'doin', 'it', 'for', 'themselves']]


In [18]:
# Flatten the list of lists to calculate frequency
flat_tokens = [token for sublist in data_words_nostops_filtered1 for token in sublist]

# Create frequency distribution
freq = nltk.FreqDist(flat_tokens)

# Get the top 10 most common words
most_common_words = [word for word, count in freq.most_common(10)]
print("Most common words to remove:", most_common_words)

# Remove the top 10 most common words from data_words_nostops
data_words_nostops_filtered = [
    [word for word in sublist if word not in most_common_words]
    for sublist in data_words_nostops_filtered1
]

# Check the filtered data
print(data_words_nostops_filtered[:5])  # Print the first 5 rows


Most common words to remove: ['the', 'to', 'you', 'and', 'for', 'in', 'of', 'my', 'on', 'is']
[['history'], ['barackobama', 'thank', 'your', 'incredible', 'grace', 'leadership', 'being', 'an'], ['life', 'goals'], ['me', 'right', 'now'], ['sisters', 'are', 'doin', 'it', 'themselves']]


In [19]:
words_to_remove = ['time', 'fun', 'video', 'tag', 'thanks', 'night', 'guys', 'babes', 'tomorrow', 'today', 'year', 'stops', 'days', 'twitter', 'ways', 'este', 'tweets', 'por', 'day','years','ver','people','baby']

In [20]:
data_words_nostops_filtered2 = [
    [word for word in sublist if word not in words_to_remove]
    for sublist in data_words_nostops_filtered
]

In [21]:
noun_verbs = keep_nouns_verbs(data_words_nostops_filtered2)

In [22]:
# # Form Bigrams
data_words_bigrams = make_bigrams(noun_verbs)

In [23]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)]]


In [24]:
id2word[0]

'authenticity'

In [25]:
frq = [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]]

In [26]:
frq

[[('authenticity', 1),
  ('barackobama', 1),
  ('carol', 1),
  ('cheer', 1),
  ('corner', 1),
  ('earth', 1),
  ('fam', 1),
  ('grace', 1),
  ('history', 1),
  ('holiday', 2),
  ('holidays', 1),
  ('leadership', 1),
  ('life', 1),
  ('light', 1),
  ('love', 2),
  ('music', 1),
  ('pls', 1),
  ('presents', 1),
  ('program', 1),
  ('resolution', 1),
  ('santa', 1),
  ('shannonwoodward', 1),
  ('shopping', 1),
  ('singalong', 1),
  ('sisters', 2),
  ('therapy', 1),
  ('tho', 1),
  ('tour', 1),
  ('tryna', 1),
  ('week', 1),
  ('wrap', 1)],
 [('holiday', 1),
  ('cloud', 1),
  ('cut', 1),
  ('family', 1),
  ('girl', 1),
  ('help', 1),
  ('imma', 1),
  ('jump', 1),
  ('mall', 1),
  ('masterpiece', 1),
  ('nights', 1),
  ('omg', 1),
  ('performance', 1),
  ('petition', 1),
  ('photos', 1),
  ('playlist', 1),
  ('proof', 1),
  ('rock', 1),
  ('season', 1),
  ('support', 1),
  ('taste', 1),
  ('text', 1),
  ('timeless', 1),
  ('unicef', 1),
  ('voice', 1),
  ('want', 1),
  ('water', 1)],
 [('li

In [27]:
type(corpus)

list

In [28]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]]

[[('authenticity', 1),
  ('barackobama', 1),
  ('carol', 1),
  ('cheer', 1),
  ('corner', 1),
  ('earth', 1),
  ('fam', 1),
  ('grace', 1),
  ('history', 1),
  ('holiday', 2),
  ('holidays', 1),
  ('leadership', 1),
  ('life', 1),
  ('light', 1),
  ('love', 2),
  ('music', 1),
  ('pls', 1),
  ('presents', 1),
  ('program', 1),
  ('resolution', 1),
  ('santa', 1),
  ('shannonwoodward', 1),
  ('shopping', 1),
  ('singalong', 1),
  ('sisters', 2),
  ('therapy', 1),
  ('tho', 1),
  ('tour', 1),
  ('tryna', 1),
  ('week', 1),
  ('wrap', 1)],
 [('holiday', 1),
  ('cloud', 1),
  ('cut', 1),
  ('family', 1),
  ('girl', 1),
  ('help', 1),
  ('imma', 1),
  ('jump', 1),
  ('mall', 1),
  ('masterpiece', 1),
  ('nights', 1),
  ('omg', 1),
  ('performance', 1),
  ('petition', 1),
  ('photos', 1),
  ('playlist', 1),
  ('proof', 1),
  ('rock', 1),
  ('season', 1),
  ('support', 1),
  ('taste', 1),
  ('text', 1),
  ('timeless', 1),
  ('unicef', 1),
  ('voice', 1),
  ('want', 1),
  ('water', 1)],
 [('li

In [56]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state= 50,
                                           update_every=1,
                                           chunksize=50,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

  and should_run_async(code)


In [57]:
# Print the Keyword in the 20 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.000*"athens" + 0.000*"bother" + 0.000*"hausoffanis" + 0.000*"scent" + '
  '0.000*"triumph" + 0.000*"core" + 0.000*"stomach" + 0.000*"rouge" + '
  '0.000*"asktonyandgaga" + 0.000*"horn"'),
 (1,
  '0.000*"athens" + 0.000*"bother" + 0.000*"hausoffanis" + 0.000*"scent" + '
  '0.000*"triumph" + 0.000*"core" + 0.000*"stomach" + 0.000*"rouge" + '
  '0.000*"asktonyandgaga" + 0.000*"horn"'),
 (2,
  '0.152*"weekend" + 0.105*"watch" + 0.086*"food" + 0.061*"makeup" + '
  '0.056*"hair" + 0.046*"tweet" + 0.039*"halloween" + 0.034*"surprise" + '
  '0.026*"olympics" + 0.024*"rain"'),
 (3,
  '0.365*"instagram" + 0.182*"art" + 0.127*"studio" + 0.075*"times" + '
  '0.019*"back" + 0.017*"table" + 0.007*"seat" + 0.000*"dailyfluff" + '
  '0.000*"project" + 0.000*"weekends_hashtag"'),
 (4,
  '0.194*"tonights" + 0.177*"ad" + 0.068*"president" + 0.035*"drop" + '
  '0.005*"run" + 0.003*"debates" + 0.000*"nbcthevoice" + 0.000*"album" + '
  '0.000*"shaks" + 0.000*"shakhq"'),
 (5,
  '0.393*"life" + 0.165

  and should_run_async(code)


In [58]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts = data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Perplexity:  -21.068377433924155

Coherence Score:  0.38609007782898425


In [49]:
import numpy as np

# Check if there are any complex numbers in the corpus
for doc in corpus:
    for word_id, count in doc:
        if np.iscomplex(count):
            print(f"Complex number found: word_id={word_id}, count={count}")



  and should_run_async(code)


In [52]:
# Rebuild the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=id2word)


  and should_run_async(code)


In [53]:
# Visualize the topics
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models # Import the gensim_models submodule

pyLDAvis.enable_notebook()
# Use gensim_models instead of gensim
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  and should_run_async(code)




In [54]:
# Use LdaMulticore
ldamallet = gensim.models.LdaMulticore(corpus=corpus, num_topics=20, id2word=id2word, random_state=100, passes=10)

  and should_run_async(code)


In [55]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts= data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

  and should_run_async(code)


[(9,
  [('love', 0.061139632),
   ('song', 0.023681197),
   ('album', 0.01328627),
   ('life', 0.0118155265),
   ('tonight', 0.011706211),
   ('hint', 0.009409953),
   ('heart', 0.009068973),
   ('things', 0.008224596),
   ('woman', 0.008148724),
   ('show', 0.007847193)]),
 (18,
  [('dailyfluff', 0.03752059),
   ('project', 0.01976846),
   ('weekends_hashtag', 0.018240696),
   ('world', 0.017412137),
   ('life', 0.015256238),
   ('instagram', 0.01490524),
   ('submissions', 0.010218441),
   ('instameet', 0.009346071),
   ('week', 0.008815792),
   ('art', 0.008792493)]),
 (12,
  [('world', 0.024389222),
   ('love', 0.019418051),
   ('fans', 0.016214157),
   ('hondacivictour_futurenow', 0.014625172),
   ('demiworldtour', 0.014509308),
   ('tonight', 0.011860295),
   ('life', 0.011042753),
   ('honor', 0.0093338275),
   ('brisbane', 0.00921165),
   ('story', 0.009039124)]),
 (0,
  [('instagram', 0.014729318),
   ('tour', 0.014620946),
   ('world', 0.014114779),
   ('project', 0.013697872

In [37]:
vis = pyLDAvis.gensim_models.prepare(ldamallet, corpus, id2word)
vis

  and should_run_async(code)


In [38]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=None): # Change data to texts=None
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = pd.concat([sent_topics_df, pd.DataFrame([pd.Series([int(topic_num), round(prop_topic,4), topic_keywords])])], ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    # Check if texts is provided, otherwise use an empty Series
    if texts is not None:  # Check if texts is provided
        contents = pd.Series(texts)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_words_bigrams)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

  and should_run_async(code)


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,3,0.7709,"project, explore, tonight, world, love, week, ...","[history, barackobama, grace, leadership, life..."
1,1,1,0.4439,"tonight, fallontonight, show, president, photo...","[imma, playlist, taste, omg, cut, proof, voice..."
2,2,4,0.9396,"project, tonight, instagram_story, world, life...","[bridge, guide, nytimes, tweet, realdonaldtrum..."
3,3,1,0.4588,"tonight, fallontonight, show, president, photo...","[way, turn, words, steps, men, women, freedom,..."
4,4,0,0.1,"project, love, week, world, artist, tonight, i...",[]
5,5,9,0.5499,"tonight, love, workout, game, world, twister, ...",[history]
6,6,5,0.8151,"submissions, life, instagram, birthday, love, ...","[omg, idk, text, roar, score, singing, airline..."
7,7,6,0.5905,"love, life, world, girl, week, fuck, tonight, ...","[strength, book, pic, world, covergirl, home, ..."
8,8,9,0.574,"tonight, love, workout, game, world, twister, ...","[birthday, birthday, birthday, purchase, lifes..."
9,9,0,0.1,"project, love, week, world, artist, tonight, i...",[]


In [39]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)],
                                            axis=0)

# Reset Index
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head(15)

  and should_run_async(code)


Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.9861,"project, love, week, world, artist, tonight, i...","[code, yesterday, flood, text, flood, support,..."
1,1,0.9735,"tonight, fallontonight, show, president, photo...","[volunteers, president, ofafallsummit, office,..."
2,2,0.9891,"tonight, love, show, photographer, song, music...","[heres, performance, woods, hollywood, blvd, t..."
3,3,0.9939,"project, explore, tonight, world, love, week, ...","[tweet, love, love, lovetwitter, thing, stpatr..."
4,4,0.9896,"project, tonight, instagram_story, world, life...","[takes, asktonyandgaga, jazz, roots, rules, as..."
5,5,0.9845,"submissions, life, instagram, birthday, love, ...","[community, lovetwitter, lifes, chrishemsworth..."
6,6,0.9867,"love, life, world, girl, week, fuck, tonight, ...","[season, periscope, gap, pop, shop, pop, shop,..."
7,7,0.9849,"world, videos, tonight, submissions, shadows, ...","[soup, therock, degree, reality, adventure, es..."
8,8,0.9883,"love, tonight, tour, president, health, brisba...","[ones, conditions, obamacare, plan, progress, ..."
9,9,0.9763,"tonight, love, workout, game, world, twister, ...","[hankgreen, minute, field, timeline, nights, m..."


In [40]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics_sorted = df_dominant_topics.sort_values(by=['Perc_Documents'], ascending=False)
df_dominant_topics_sorted.head()

  and should_run_async(code)


Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
2,4,"project, tonight, instagram_story, world, life...",907.0,0.2113
0,3,"project, explore, tonight, world, love, week, ...",890.0,0.2074
6,5,"submissions, life, instagram, birthday, love, ...",630.0,0.1468
4,0,"project, love, week, world, artist, tonight, i...",342.0,0.0797
5,9,"tonight, love, workout, game, world, twister, ...",332.0,0.0774


In [41]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample data for the Twitter topics and Newsgroup topics
twitter_topics = [
    (9, [('love', 0.06115393), ('song', 0.02379765), ('album', 0.013305273), ('life', 0.011848725),
         ('tonight', 0.011720518), ('hint', 0.009394263), ('heart', 0.009067771), ('things', 0.008216149),
         ('show', 0.0078661395), ('stone', 0.00774152)]),
    (12, [('world', 0.024278637), ('love', 0.01929273), ('fans', 0.016112046), ('hondacivictour_futurenow', 0.014531105),
          ('demiworldtour', 0.01441953), ('tonight', 0.011775983), ('life', 0.011038898),
          ('honor', 0.009267515), ('story', 0.009238742), ('brisbane', 0.009152281)]),
    (10, [('love', 0.032979287), ('thank', 0.021915438), ('album', 0.013777344), ('devonnebydemi', 0.01373335),
          ('track', 0.013673817), ('girl', 0.012695572), ('purpose', 0.012589291), ('work', 0.011562192),
          ('life', 0.009516285), ('song', 0.009241227)]),
    (14, [('tonight', 0.11773955), ('show', 0.05404548), ('music', 0.0405931), ('fallontonight', 0.036717724),
          ('game', 0.013194365), ('song', 0.011869849), ('lets', 0.008769403), ('performance', 0.008713063),
          ('album', 0.008026071), ('thank', 0.007476611)]),
    (17, [('love', 0.11177825), ('lol', 0.0153613025), ('girl', 0.014663266), ('life', 0.013605122),
          ('babies', 0.012773862), ('heart', 0.011598225), ('tonight', 0.011394098), ('hope', 0.011304251),
          ('thank', 0.010107974), ('omg', 0.010077409)]),
    (8, [('tonight', 0.047379334), ('pieceofme', 0.025228307), ('stage', 0.01873429), ('tune', 0.01708145),
         ('amp', 0.016622458), ('weekend', 0.015006226), ('fuck', 0.014996029), ('love', 0.013960137),
         ('birthday', 0.013671034), ('thank', 0.0134208575)]),
    (4, [('police', 0.018693691), ('officials', 0.013928789), ('tour', 0.0138496), ('trump', 0.01286801),
         ('official', 0.008882677), ('vote', 0.007826951), ('man', 0.0074702096), ('attack', 0.0067741824),
         ('plane', 0.0062467726), ('tonight', 0.0060603768)]),
]

newsgroup_topics = [
    (0, [('week', 0.008637846), ('video', 0.00804256), ('love', 0.0040145474), ('child', 0.0040109195),
         ('world', 0.0040104985), ('work', 0.0038012052), ('wedding', 0.0035145862), ('job', 0.0034305998),
         ('hour', 0.0033325676), ('bowl', 0.003236318)]),
    (1, [('state', 0.00672022), ('week', 0.006051368), ('election', 0.005339939), ('president', 0.0049975333),
         ('man', 0.004568345), ('world', 0.0044991532), ('show', 0.0043784673), ('police', 0.0042377845),
         ('child', 0.0040631685), ('attack', 0.0038225085)]),
    (2, [('week', 0.014457856), ('video', 0.010279458), ('love', 0.0063664317), ('world', 0.006342215),
         ('mother', 0.006011352), ('child', 0.0045926217), ('night', 0.003906293), ('story', 0.003892944),
         ('work', 0.0037842656), ('home', 0.0035902534)]),
    (3, [('video', 0.0120646125), ('week', 0.009013428), ('night', 0.005106818), ('school', 0.0045635863),
         ('family', 0.004543826), ('state', 0.004443208), ('film', 0.0042448225), ('show', 0.003398651),
         ('child', 0.0033762364), ('game', 0.0032092095)]),
    (4, [('world', 0.009449489), ('child', 0.006794866), ('work', 0.0050347704), ('family', 0.0048021455),
         ('week', 0.0041088997), ('business', 0.004108848), ('part', 0.004079357), ('school', 0.0037438832),
         ('country', 0.0037157224), ('community', 0.003535347)]),
    (5, [('video', 0.0063205487), ('world', 0.0049284063), ('love', 0.0041358187), ('child', 0.004021161),
         ('home', 0.0038003777), ('week', 0.003712925), ('help', 0.0034921353), ('work', 0.003426201),
         ('part', 0.0033138874), ('food', 0.0031798477)]),
    (6, [('child', 0.008918958), ('video', 0.007994141), ('wedding', 0.0078018373), ('week', 0.006504108),
         ('food', 0.0063826116), ('world', 0.0059692813), ('home', 0.0054620844), ('love', 0.0054596025),
         ('kid', 0.005254038), ('parent', 0.0052448753)]),
    (7, [('week', 0.0053595034), ('world', 0.0053306273), ('school', 0.0052033137), ('man', 0.005121191),
         ('video', 0.004879572), ('student', 0.00443848), ('show', 0.004308636), ('state', 0.0041216444),
         ('family', 0.0038493867), ('child', 0.0036705618)])
]

# Create a dictionary to hold all the words and their weights
words = set()

# Process Twitter topics
twitter_dict = {}
for topic_id, topic_words in twitter_topics:
    for word, weight in topic_words:
        twitter_dict[(topic_id, word)] = weight
        words.add(word)

# Process Newsgroup topics
newsgroup_dict = {}
for topic_id, topic_words in newsgroup_topics:
    for word, weight in topic_words:
        newsgroup_dict[(topic_id, word)] = weight
        words.add(word)

# Convert the 'words' set to a list
words = list(words)

# Create DataFrames for Twitter and Newsgroup topics
twitter_df = pd.DataFrame(index=range(len(twitter_topics)), columns=words, data=0)
newsgroup_df = pd.DataFrame(index=range(len(newsgroup_topics)), columns=words, data=0)

# Fill DataFrames with the word weights from both Twitter and Newsgroup topics
for (topic_id, word), weight in twitter_dict.items():
    twitter_df.at[topic_id, word] = weight

for (topic_id, word), weight in newsgroup_dict.items():
    newsgroup_df.at[topic_id, word] = weight

# Aggregate the word weights for each topic set (by summing or averaging)
twitter_vector = twitter_df.sum(axis=0).values.reshape(1, -1)  # Sum the words for all Twitter topics
newsgroup_vector = newsgroup_df.sum(axis=0).values.reshape(1, -1)  # Sum the words for all Newsgroup topics

# Compute Cosine Similarity between the two topic sets
cosine_sim = cosine_similarity(twitter_vector, newsgroup_vector)

# Output the similarity score
print(f"Cosine Similarity between Twitter topics and Newsgroup topics: {cosine_sim[0][0]}")


  and should_run_async(code)
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_i

Cosine Similarity between Twitter topics and Newsgroup topics: 0.18645007838152536


  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
  newsgroup_df.at[topic_id, word] = weight
