In [1]:
import pandas as pd
from IPython.display import clear_output, display
import os, datetime
import numpy as np
import networkx as nx

## Fetching tweets with selected hashtags

In [2]:
DATA_PATH = 'data/tweets/'

# Every sub-directory of DATA_PATH is treated as one extracted week;
# Jupyter's '.ipynb_checkpoints' folder is skipped.
week_list = []
for entry in os.listdir(DATA_PATH):
    if entry.endswith('.ipynb_checkpoints'):
        continue
    if os.path.isdir(DATA_PATH + entry):
        week_list.append(entry)
print('Amount of extracted weeks: ', len(week_list))

Amount of extracted weeks:  26


In [3]:
# A plain sort places every 'week_pr_*' folder after the numbered weeks,
# so each one is moved to sit right after the entry it should follow.
week_list.sort()
for pr_week, previous_week in (
    ('week_pr_01', 'week_12'),
    ('week_pr_02', 'week_pr_01'),
    ('week_pr_03', 'week_17'),
    ('week_pr_04', 'week_21'),
):
    week_list.remove(pr_week)
    week_list.insert(week_list.index(previous_week) + 1, pr_week)

print(week_list)

['week_01', 'week_02', 'week_03', 'week_04', 'week_05', 'week_06', 'week_07', 'week_08', 'week_09', 'week_10', 'week_11', 'week_12', 'week_pr_01', 'week_pr_02', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17', 'week_pr_03', 'week_18', 'week_19', 'week_20', 'week_21', 'week_pr_04', 'week_22']


In [4]:
# Hashtag groups used to select tweets (neutral / against-CPI / pro-CPI, going by
# the variable names). Candidates that are currently left out stay in the trailing comments.
neutral_hashtags = [
    'cpidacovid',
    'cpidapandemia',
]  # left out: 'covid19', 'cpidacovid19', 'brasil'
agt_cpi_hashtags = [
    'renanvagabundo',
    'cpidocirco',
]  # left out: 'renansabiadetudo', 'euautorizopresidente', 'cpidotse', 'bolsonaropresidenteate2026'
pro_cpi_hashtags = [
    'forabolsonaro',
    'forabolsonarogenocida',
]  # left out: 'bolsonarogenocida', 'cpidogenocidio', '29mforabolsonaro', '3jforabolsonaro'

# Flat list of every hashtag that qualifies a tweet, in group order.
selected_hashtags = [*neutral_hashtags, *agt_cpi_hashtags, *pro_cpi_hashtags]

In [5]:
# Show the combined hashtag list as a sanity check.
selected_hashtags

['cpidacovid',
 'cpidapandemia',
 'renanvagabundo',
 'cpidocirco',
 'forabolsonaro',
 'forabolsonarogenocida']

In [6]:
# Accumulator for the selected tweets: one list per tweet, in the order given by `columns`.
split_tts = []
# Column names of the DataFrame later built from `split_tts`.
columns = ['author', 'user_description', 'tweet', 'hashtags']

In [7]:
# Collect every tweet that carries at least one of the selected hashtags.
# Each match is appended to `split_tts` as [username, description, content, hashtags].
selected_hashtag_set = set(selected_hashtags)  # constant-time membership tests

for week in week_list:
    print(f'Starting {week}:')
    # Only these three columns are read below, so the others are not loaded.
    week_df = pd.read_parquet(
        f'{DATA_PATH+week}/merged_{week}.parquet',
        columns=['user', 'hashtags', 'content'],
    )
    # Iterate over the columns directly; DataFrame.iterrows() builds a Series per row.
    for user_repr, hashtags_repr, content in zip(
        week_df['user'], week_df['hashtags'], week_df['content']
    ):
        # SECURITY: eval() executes whatever Python expression is stored in the file,
        # so these parquet files must come from a trusted source.
        # NOTE(review): the stored reprs may reference names such as `datetime`
        # (imported at the top of the notebook) - confirm before switching to
        # ast.literal_eval.
        hashtags = eval(hashtags_repr)
        if not hashtags:
            continue
        # Hashtags are compared case-insensitively; one match is enough.
        if any(hashtag.lower() in selected_hashtag_set for hashtag in hashtags):
            # The user field is only evaluated for tweets that are actually kept.
            user = eval(user_repr)
            split_tts.append([user['username'], user['description'], content, hashtags])
    # Clear *before* printing the summary (wait=True defers the clear until the next
    # output), so the latest summary stays visible instead of being wiped right away.
    clear_output(wait=True)
    print(f'''
    Finished {week}:
    Total number of tweets: {len(split_tts)}
    \n
    ''')


In [8]:
# Build the frame and drop repeated (author, description, tweet) rows in one chain,
# keeping the last occurrence and renumbering the index.
split_tt_df = (
    pd.DataFrame(data=split_tts, columns=columns)
    .drop_duplicates(
        subset=['author', 'user_description', 'tweet'],
        keep='last',
        ignore_index=True,
    )
)
split_tt_df.shape

(1554613, 4)

In [9]:
# Preview the first rows of the de-duplicated selection.
split_tt_df.head()

Unnamed: 0,author,user_description,tweet,hashtags
0,Sandroka131,“O Brasil que come não conhece o Brasil que te...,@zehdeabreu @verabr1 Mais de 100 pedidos de im...,"[CPIdaCovid, GenocidaDesgracado]"
1,thivagando,🏳️‍🌈 O antropólogo que logo sou...\nPalavras-c...,Quase 400 mil mortos no Brasil por Covid-19 e ...,"[BolsonaroGenocida, ForaBolsonaro]"
2,AndreaPorto2021,,@jnascim @planalto Que horror esse @jairbolson...,"[ImpeachmentDeBolsonaroUrgente, CPIdaCovid]"
3,Sandroka131,“O Brasil que come não conhece o Brasil que te...,O PT está se reunindo com especialistas da saú...,"[CPIdaCovid, FINES]"
4,HSarapeck,"Rubro-negro, carioca, eleitor de Lula e do PT ...","Intervenção militar para abrir boates, shoppin...","[CPIdaCovid, ImpeachmentDeBolsonaroUrgente]"


In [10]:
# Persist the selected tweets, then drop the name so the frame can be garbage-collected.
selected_tt_path = DATA_PATH + 'selected_tt_split.parquet'
split_tt_df.to_parquet(selected_tt_path, index=False)
del split_tt_df

## Social Network Graph

In [11]:
# Reload the saved selection and list its distinct authors
# (value_counts orders them from most to least frequent).
tt_df = pd.read_parquet(DATA_PATH + 'selected_tt_split.parquet')
author_counts = tt_df['author'].value_counts()
usr_list = author_counts.index.tolist()
len(usr_list)

274172

In [12]:
# Load the complete interaction network from its GEXF file.
tt_network = nx.read_gexf('data/networks/complete_network.gexf')

In [13]:
# Will hold (username, tt_id, description) tuples for the selected users.
nodes = []

In [14]:
# Will hold (source, target, interaction type) tuples between selected users.
edges = []

In [15]:
# Keep the nodes of the complete network whose username authored a selected tweet.
nodes_len = tt_network.number_of_nodes()

# Set lookups replace two linear scans per node: `usr in usr_list` walked the whole
# author list, and `usr not in nodes` compared a string against tuples, so it was
# always True and never prevented duplicates.
usr_set = set(usr_list)
collected_usrs = {name for name, _, _ in nodes}  # users already in `nodes` (e.g. after a re-run)

for usr, data in tt_network.nodes(data=True):
    if usr in usr_set and usr not in collected_usrs:
        nodes.append((usr, data['tt_id'], data['description']))
        collected_usrs.add(usr)

KeyboardInterrupt: 

In [None]:
# Number of selected users found in the complete network.
len(nodes)

In [None]:
# Directed multigraph: several parallel edges may connect the same pair of users.
G = nx.MultiDiGraph()

In [None]:
# Insert every selected user at once, attaching its Twitter id and profile description.
G.add_nodes_from(
    (usr, {'twitter_id': tt_id, 'description': desc})
    for usr, tt_id, desc in nodes
)
G.number_of_nodes()

In [None]:
# Checkpoint the graph to disk (no edges have been added to G in the cells above).
nx.write_gexf(G, "data/networks/split_tt_nodes.gexf")

In [None]:
# Reload the node checkpoint. read_gexf returns a plain DiGraph when the file holds
# no parallel edges (this file holds no edges at all), and a DiGraph keeps only ONE
# edge per (source, target) pair: later add_edge calls would overwrite e.g. a
# 'mention' with a 'reply' between the same users. Converting back to a
# MultiDiGraph keeps every interaction as its own edge.
G = nx.MultiDiGraph(nx.read_gexf('data/networks/split_tt_nodes.gexf'))
G.number_of_nodes()

In [None]:
# Keep only the edges of the complete network whose two endpoints are both nodes of G.
# `node in G` is a constant-time lookup, whereas testing membership in a list of
# all nodes scanned that list twice for every edge.
for src, tgt, data in tt_network.edges(data=True):
    if src in G and tgt in G:
        edges.append((src, tgt, data['interac_type']))

In [None]:
# Number of interactions kept between selected users.
len(edges)

In [None]:
# Count how often each distinct (source, target, type) row occurs among the collected edges.
pd.DataFrame(data=edges, columns=['source', 'target', 'type']).value_counts()

In [None]:
# Add every collected interaction to G, storing its type as the 'interaction_type' edge attribute.
G.add_edges_from(
    (source, target, {'interaction_type': kind})
    for source, target, kind in edges
)

In [None]:
# Report the size of the assembled graph.
node_total = G.number_of_nodes()
edge_total = G.number_of_edges()
print(f'''
Number of nodes: {node_total}
Number of edges: {edge_total}
''')

In [None]:
# Save the graph of selected users and their interactions.
nx.write_gexf(G, 'data/networks/split_tt_network.gexf')

In [None]:
# Reload the saved network and peek at ten of its edges (with their attributes) as a sanity check.
G_complete = nx.read_gexf('data/networks/split_tt_network.gexf')
list(G_complete.edges(data=True))[20:30]

In [None]:
def _split_edges_by_type(graph, interaction_types):
    """Group the edges of ``graph`` by their ``interaction_type`` attribute.

    Parameters
    ----------
    graph : networkx graph
        Graph whose edges carry an ``interaction_type`` attribute.
    interaction_types : iterable of str
        The interaction types to collect.

    Returns
    -------
    dict
        Maps each requested type to a list of ``(source, target, data)`` tuples,
        in the order the graph yields them. Edges with another type, or without
        the attribute, are ignored.
    """
    grouped = {interaction: [] for interaction in interaction_types}
    # A single pass over the edges instead of one full pass per interaction type.
    for src, trg, data in graph.edges(data=True):
        bucket = grouped.get(data.get('interaction_type'))
        if bucket is not None:
            bucket.append((src, trg, data))
    return grouped


def _build_interaction_graph(edge_list):
    """Return a ``DiGraph`` made of ``edge_list``; parallel edges collapse into one.

    Only nodes that appear in an edge are created, so the graph cannot contain
    isolated (degree-0) nodes and no pruning step is needed afterwards.
    """
    graph = nx.DiGraph()
    graph.add_edges_from(edge_list)
    return graph


_edges_by_type = _split_edges_by_type(G_complete, ('mention', 'quoted_rt', 'reply'))
edges_mention = _edges_by_type['mention']
edges_quoted = _edges_by_type['quoted_rt']
edges_reply = _edges_by_type['reply']

G_mention = _build_interaction_graph(edges_mention)
G_quoted = _build_interaction_graph(edges_quoted)
G_reply = _build_interaction_graph(edges_reply)

# Node and edge counts are both taken from the graphs themselves, so they describe
# exactly what gets written to disk; the edge count can be lower than
# len(edges_*) when repeated interactions between the same pair were collapsed.
print(f'''
Mention Network: {G_mention.number_of_nodes()} nodes and {G_mention.number_of_edges()} edges
Quoted Retweet Network: {G_quoted.number_of_nodes()} nodes and {G_quoted.number_of_edges()} edges
Reply Network: {G_reply.number_of_nodes()} nodes and {G_reply.number_of_edges()} edges
''')

In [None]:
# Write each per-interaction network to its own GEXF file.
for interaction_graph, gexf_path in (
    (G_mention, 'data/networks/split_tt_mention_network.gexf'),
    (G_quoted, 'data/networks/split_tt_quoted_network.gexf'),
    (G_reply, 'data/networks/split_tt_reply_network.gexf'),
):
    nx.write_gexf(interaction_graph, gexf_path)