#### **This notebook tests the topic of the original tweets**

#### **Test LDA**

In [9]:
import gensim
from gensim import corpora

# Sample documents
documents = [
    "I love to play football",
    "Football is a popular sport",
    "I enjoy playing basketball",
    "Basketball is my favorite sport",
    "Soccer is a great game"
]

# Tokenize the documents
tokenized_docs = [doc.lower().split() for doc in documents]

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(tokenized_docs)

# Convert tokenized documents to vectors (bag of words representation)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
# print(corpus)

# Perform LDA topic modeling
num_topics = 2
lda_model = gensim.models.LdaModel(corpus, 
                                   num_topics=num_topics, 
                                   id2word=dictionary, 
                                   passes=10
                                  )

# Print the topics and their corresponding words
print(len(lda_model.print_topics()[0]))
for topic in lda_model.print_topics():
    print(topic)

# Get the topic distribution for each document
for i, doc in enumerate(corpus):
    print(f"Document {i+1} - Topic Distribution:", lda_model.get_document_topics(doc))


2
(0, '0.092*"i" + 0.092*"basketball" + 0.092*"football" + 0.092*"sport" + 0.088*"is" + 0.055*"to" + 0.055*"love" + 0.055*"play" + 0.055*"enjoy" + 0.055*"playing"')
(1, '0.121*"a" + 0.117*"is" + 0.107*"great" + 0.107*"game" + 0.107*"soccer" + 0.038*"popular" + 0.038*"sport" + 0.037*"football" + 0.037*"basketball" + 0.037*"my"')
Document 1 - Topic Distribution: [(0, 0.91029227), (1, 0.08970776)]
Document 2 - Topic Distribution: [(0, 0.85039765), (1, 0.14960231)]
Document 3 - Topic Distribution: [(0, 0.892833), (1, 0.10716701)]
Document 4 - Topic Distribution: [(0, 0.90283024), (1, 0.09716974)]
Document 5 - Topic Distribution: [(0, 0.09132651), (1, 0.90867347)]


#### **Import Packages**

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import datetime
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
import sys
import os

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as viz_hp
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config_hp
import helper.pandas_helper as pd_hp
import helper.twitter_helper as twitter_hp

#### **Original tweets for IO: This includes all the original tweetids**

In [2]:
## For alive tweets
config = config_hp.config()
poster_path = config['POSTER_PATH']
poster_org_tweets = poster_path['parsed_poster_org_tweets']
# poster_alive_file = poster_path['poster_alive_file']

df_org = pd.read_pickle(poster_org_tweets)

In [49]:
df_org['tweetid'].nunique()

96041

In [31]:
df_org.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96041 entries, 0 to 96040
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   text                    96041 non-null  object 
 1   conversation_id         34185 non-null  object 
 2   lang                    34185 non-null  object 
 3   entities                0 non-null      object 
 4   possibly_sensitive      34185 non-null  object 
 5   reply_settings          34185 non-null  object 
 6   created_at              96041 non-null  object 
 7   edit_history_tweet_ids  34185 non-null  object 
 8   tweetid                 96041 non-null  object 
 9   author_id               34185 non-null  object 
 10  retweet_count           34185 non-null  float64
 11  reply_count             34185 non-null  float64
 12  like_count              34185 non-null  float64
 13  quote_count             34185 non-null  float64
 14  impression_count        34185 non-null

In [3]:
df_org['text'].unique()

array(['Could not find tweet with ids: [64953578015571968].',
       'Could not find tweet with ids: [138715327608524800].',
       'Could not find tweet with ids: [175649582045347840].', ...,
       'Could not find tweet with ids: [1387924316130619392].',
       'Could not find tweet with ids: [1387938253119033088].',
       '@rinasketch 仕事終わりに買ってきます👋😃'], dtype=object)

In [6]:
df_org.columns

Index(['text', 'conversation_id', 'lang', 'entities', 'possibly_sensitive',
       'reply_settings', 'created_at', 'edit_history_tweet_ids', 'tweetid',
       'author_id', 'retweet_count', 'reply_count', 'like_count',
       'quote_count', 'impression_count', 'expanded_url', 'display_url',
       'in_reply_to_user_id', 'referenced_tweets', 'context_annotations',
       'entity_annotations', 'cashtags', 'hashtags', 'mentions'],
      dtype='object')

#### **Load Replies**

In [4]:
config = config_hp.config()
balanced = config['BALANCED']

balanced_neg_conversation = balanced['balanced_neg_conversation']
balanced_pos_conversation = balanced['balanced_pos_conversation']

#### Load positive 
df_pos = pd.read_pickle(balanced_pos_conversation)
print('Unique poster tweetid: ', 
      df_pos['poster_tweetid'].nunique())

# #### Load negative 
# df_neg = pd.read_pickle(balanced_neg_conversation)
# print('Unique poster tweetid: ', 
#       df_neg['poster_tweetid'].nunique())

Unique poster tweetid:  3866


In [5]:
df_pos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2160484 entries, 0 to 2673080
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   tweet_text       object 
 1   conversation_id  object 
 2   replier_tweetid  object 
 3   replier_userid   object 
 4   poster_userid    object 
 5   poster_tweetid   object 
 6   tweet_time       object 
 7   tweet_language   object 
 8   replier_label    int64  
 9   year             object 
 10  campaign         object 
 11  tweet_label      int64  
 12  tweet_time_year  object 
 13  common           float64
 14  id               object 
 15  username         object 
dtypes: float64(1), int64(2), object(13)
memory usage: 280.2+ MB


#### **Check the missing campaign**

In [9]:
df_campaign_null = df_pos.loc[df_pos['campaign'].isnull()]

In [10]:
df_campaign_null['replier_label'].nunique()

1

In [26]:
df_campaign_null['poster_tweetid'].count()

2093113

In [16]:
df_campaign_null['poster_tweetid'].nunique()

3866

In [17]:
df_campaign = df_pos.loc[~df_pos['campaign'].isnull()]

print(df_campaign['campaign'].nunique())
print(df_campaign['poster_tweetid'].nunique())

28
3866


In [18]:
len(
    set(
        df_campaign['poster_tweetid']).intersection(
        set(df_campaign_null['poster_tweetid'])
    )
)

3866

In [37]:
keys = list(df_campaign_null
            .groupby(['poster_tweetid'])
            .groups.keys()
           )

df_poster_tweetid = pd.DataFrame(data=keys,
                              columns=['poster_tweetid']
                              )

print(df_poster_tweetid.info())

keys = list(df_campaign
            .groupby(['poster_tweetid', 'campaign'])
            .groups.keys()
           )

df_pos_campaign = pd.DataFrame(data=keys,
                              columns=['poster_tweetid', 'campaign']
                              )
print(df_pos_campaign['campaign'].unique())

df_merge = df_poster_tweetid[['poster_tweetid']].merge(
    df_pos_campaign[['poster_tweetid', 'campaign']])

print(df_merge.info())
print(df_merge['poster_tweetid'].nunique())
print(df_merge['campaign'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3866 entries, 0 to 3865
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   poster_tweetid  3866 non-null   object
dtypes: object(1)
memory usage: 30.3+ KB
None
['serbia_022020' 'saudi_arabia_112019' 'ecuador_082019' 'turkey_052020'
 'sa_eg_ae_022020' 'uae_082019' 'iran_201906' 'honduras_022020'
 'egypt_022020' 'iran_201901_1' 'iranian' 'uganda_0621' 'china_082019'
 'qatar_082020' 'iran_202012' 'venezuela_201901' 'indonesia_022020'
 'spain_082019' 'egypt_uae_082019' 'cuba_082020' 'china_052020'
 'thailand_092020' 'MX_0621' 'Tanzania_0621' 'CNHU_0621' 'Venezuela_0621'
 'ira' 'russia_201901_1']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4271 entries, 0 to 4270
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   poster_tweetid  4271 non-null   object
 1   campaign        4271 non-null   ob

In [38]:
df_org.loc[df_org['tweetid'].isin(df_merge['poster_tweetid']), 
           'campaign'] = df_merge['campaign']

In [47]:
print(df_merge['campaign'].nunique())

28


In [48]:
df_org['campaign'].nunique()

15

In [51]:
df_org.columns

Index(['text', 'conversation_id', 'lang', 'entities', 'possibly_sensitive',
       'reply_settings', 'created_at', 'edit_history_tweet_ids', 'tweetid',
       'author_id', 'retweet_count', 'reply_count', 'like_count',
       'quote_count', 'impression_count', 'expanded_url', 'display_url',
       'in_reply_to_user_id', 'referenced_tweets', 'context_annotations',
       'entity_annotations', 'cashtags', 'hashtags', 'mentions', 'campaign'],
      dtype='object')

In [41]:
config = config_hp.config()
path = config['POSTER_PATH']

org_tweet = path['poster_org_tweet_with_campaign']

df_org.to_pickle(org_tweet)

#### **Get text of original post with campaign?**

In [53]:
df_merge.loc[df_merge['poster_tweetid'].isin(df_org['tweetid']),
             'tweet_text'
            ] = df_org['text']

In [63]:
df_merge.loc[df_merge['poster_tweetid'].isin(df_org['tweetid']),
             'author_id'
            ] = df_org['author_id']

In [74]:
df_merge.loc[df_merge['poster_tweetid'].isin(df_org['tweetid']),
             'lang'
            ] = df_org['lang']

In [75]:
df_merge['campaign'].unique()

array(['serbia_022020', 'saudi_arabia_112019', 'ecuador_082019',
       'turkey_052020', 'sa_eg_ae_022020', 'uae_082019', 'iran_201906',
       'honduras_022020', 'egypt_022020', 'iran_201901_1', 'iranian',
       'uganda_0621', 'china_082019', 'qatar_082020', 'iran_202012',
       'venezuela_201901', 'indonesia_022020', 'spain_082019',
       'egypt_uae_082019', 'cuba_082020', 'china_052020',
       'thailand_092020', 'MX_0621', 'Tanzania_0621', 'CNHU_0621',
       'Venezuela_0621', 'ira', 'russia_201901_1'], dtype=object)

In [66]:
# df_merge['author_id'].unique()

In [56]:
df_merge['poster_tweetid'].nunique()

3866

In [76]:
config = config_hp.config()
path = config['POSTER_PATH']

coordinated_org_tweet = path['coordinated_org_tweet']

df_to_save = df_merge.loc[~df_merge['author_id'].isnull()]
df_to_save.to_pickle(coordinated_org_tweet)

#### **Topic for each campaings**

- Problems: one campaign might have multiple languages

In [83]:
campaigns = df_to_save['campaign'].unique().tolist()

for campaign in campaigns:
    print(campaign)
    df_campaign = df_to_save.loc[df_to_save['campaign'] == campaign]
    
    print(df_campaign[['tweet_text', 'lang']])
    
    print(df_campaign['lang'].unique())
    
    df_grp = (df_campaign.groupby(['lang'])
              .size()
              .to_frame('count')
              .reset_index()
             )
    
    language = 
    df_tr = df_campaign.loc[df_campaign['
    
    
    break

turkey_052020
                                             tweet_text lang
42    Как я голосовал в режиме он-лайн - http://t.co...   ru
69    Куда идти 1 сентября детям Донбасса? http://t....   ru
498                    This is so fucking aggravating 😑   en
693   @LBC consultants have a life too. Just like th...   en
798   @AkilahObviously stop talking. Hot tea. I ice ...   en
...                                                 ...  ...
4083  Is DJ Shiru Uganda's all-time best Deejay? htt...   en
4150  VUČIĆ OTKRIO: Nemačka kompanija gradiće istraž...  und
4183  "Ko je imao sreće da se jutros probudi u Beogr...  und
4197  #ArpiaIII en curso. Asegurando e inspeccionand...   es
4246  Continúa operación #ArpiaIII primeras cápsulas...   es

[235 rows x 2 columns]
['ru' 'en' 'und' 'zxx' 'sr' 'es' 'qht' 'fa' 'tr' 'ar' 'qme' 'ja']
   lang  count
0    ar     69
1    en     20
2    es     32
3    fa      2
4    ja      1
5   qht      1
6   qme      1
7    ru      3
8    sr     37
9    tr      7

#### **Embedding based topic modeling models**

mBERTopic, MUSE, LASER

https://www.pinecone.io/learn/bertopic/

https://github.com/ddangelov/Top2Vec