In [2]:
import pandas as pd
from IPython.display import clear_output, display
import os, datetime
import numpy as np
import networkx as nx

## Fetching tweets with selected hashtags

In [4]:
DATA_PATH = 'data/tweets/'

week_list = [week_dir for week_dir in os.listdir(DATA_PATH) if os.path.isdir(DATA_PATH+week_dir) and not week_dir.endswith('.ipynb_checkpoints')]
print('Amount of extracted weeks: ', len(week_list))

Amount of extracted weeks:  26


In [5]:
week_list.sort()
week_list.remove('week_pr_01')
week_list.insert(week_list.index('week_12')+1, 'week_pr_01')
week_list.remove('week_pr_02')
week_list.insert(week_list.index('week_pr_01')+1, 'week_pr_02')
week_list.remove('week_pr_03')
week_list.insert(week_list.index('week_17')+1, 'week_pr_03')
week_list.remove('week_pr_04')
week_list.insert(week_list.index('week_21')+1, 'week_pr_04')

print(week_list)

['week_01', 'week_02', 'week_03', 'week_04', 'week_05', 'week_06', 'week_07', 'week_08', 'week_09', 'week_10', 'week_11', 'week_12', 'week_pr_01', 'week_pr_02', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17', 'week_pr_03', 'week_18', 'week_19', 'week_20', 'week_21', 'week_pr_04', 'week_22']


In [4]:
# hashtag groups
neutral_hashtags = ['cpidacovid', 'cpidapandemia', 'covid19', 'cpidacovid19','brasil']
agt_cpi_hashtags = ['renanvagabundo', 'cpidocirco', 'renansabiadetudo', 'euautorizopresidente', 'cpidotse', 'bolsonaropresidenteate2026']
pro_cpi_hashtags = ['cpidogenocidio', 'forabolsonaro', 'forabolsonarogenocida', 'bolsonarogenocida', '29mforabolsonaro', '3jforabolsonaro']
selected_hashtags = neutral_hashtags+agt_cpi_hashtags+pro_cpi_hashtags

In [5]:
selected_hashtags

['cpidacovid',
 'cpidapandemia',
 'covid19',
 'cpidacovid19',
 'brasil',
 'renanvagabundo',
 'cpidocirco',
 'renansabiadetudo',
 'euautorizopresidente',
 'cpidotse',
 'bolsonaropresidenteate2026',
 'cpidogenocidio',
 'forabolsonaro',
 'forabolsonarogenocida',
 'bolsonarogenocida',
 '29mforabolsonaro',
 '3jforabolsonaro']

In [6]:
split_tts = []
columns = ['author', 'user_description', 'tweet', 'hashtags']

In [7]:
for week in week_list:
    print(f'Starting {week}:')
    week_df = pd.read_parquet(f'{DATA_PATH+week}/merged_{week}.parquet')
    for idx, tweet in week_df.iterrows():
        tt_fields = []
        user = eval(tweet['user'])
        hashtags = eval(tweet['hashtags'])
        if hashtags and not set(selected_hashtags).isdisjoint(hashtags):
            split_tts.append([user['username'], user['description'], tweet['content'], hashtags])            
    print(f'''
    Finished {week}:
    Total number of tweets: {len(split_tts)}
    \n
    ''')
    clear_output(wait=True)
    

Starting week_22:

    Finished week_22:
    Total number of tweets: 83870
    

    


In [8]:
split_tt_df = pd.DataFrame(columns=columns, data=split_tts)
split_tt_df.drop_duplicates(subset=['author', 'user_description', 'tweet'], keep='last', inplace=True, ignore_index=True)
split_tt_df.shape

(83101, 4)

In [9]:
split_tt_df.head()

Unnamed: 0,author,user_description,tweet,hashtags
0,FelipeFogal,"Se mudar a cor da grama, eleitor do bolsonazi ...",@Collor caiu por causa de uma Elba... @dilmabr...,"[imbecil, genocida, assassino, miliciano, bols..."
1,silasfiorotti,Doutor em Antropologia (USP) | Pesquisador | P...,O papel da igreja é denunciar e combater todos...,"[evangélicos, bolsonarogenocida, foragenocida,..."
2,silasfiorotti,Doutor em Antropologia (USP) | Pesquisador | P...,O papel da igreja é denunciar e combater todos...,"[evangélicos, bolsonarogenocida, foragenocida,..."
3,silasfiorotti,Doutor em Antropologia (USP) | Pesquisador | P...,"""A 6 dias do fim, abril se torna o mês mais le...","[bolsonarogenocida, impeachmentja, foragenocida]"
4,DiasSalles1,Humildade,@ZeRicardoAM Que chegue logo 2022.\n#MPdoApaga...,"[MPdoApagao, cpidacovid, SalveaEnergia]"


In [10]:
split_tt_df.to_parquet(DATA_PATH+'selected_tt_split.parquet', index=False)
del(split_tt_df)

## Social Network Graph

In [5]:
tt_df = pd.read_parquet(DATA_PATH+'selected_tt_split.parquet')
usr_list = list(tt_df['author'].value_counts().index)
len(usr_list)

33586

In [6]:
# loading the complete network graph
tt_network = nx.read_gexf('data/networks/complete_network.gexf')

In [7]:
nodes = []

In [8]:
edges = []

In [9]:
count = 0
nodes_len = len(list(tt_network.nodes()))
for usr, data in list(tt_network.nodes(data=True)):
    clear_output(wait=False)
    if usr not in nodes and usr in usr_list:
        nodes.append((usr, data['tt_id'], data['description']))
    count += 1
    print(f'{count} nodes processed, {nodes_len-count} to go...')

325154 nodes processed, 0 to go...


In [10]:
len(nodes)

19965

In [11]:
G = nx.MultiDiGraph()

In [12]:
for usr, tt_id, desc in nodes:
    G.add_node(usr, twitter_id=tt_id, description=desc)
len(G.nodes())

19965

In [14]:
nx.write_gexf(G, "data/networks/split_tt_nodes.gexf")

In [15]:
nodes_temp = list(G.nodes())
count = 0
edges_len = len(list(tt_network.edges()))
for src, tgt, data in list(tt_network.edges(data=True)):
    clear_output(wait=False)
    if src in nodes_temp and tgt in nodes_temp:
        edges.append((src, tgt, data['interac_type']))
    count += 1
    print(f'{count} edges finished, {edges_len-count} to go...')
del(nodes_temp)

5869 edges finished, 1699138 to go...


KeyboardInterrupt: 

In [None]:
len(edges)

In [None]:
pd.DataFrame(data=edges, columns=['source', 'target', 'type']).value_counts()

In [None]:
for src, tgt, int_type in edges:
    G.add_edge(src, tgt, interaction_type=int_type)

In [None]:
print(f'''
Number of nodes: {len(list(G.nodes()))}
Number of edges: {len(list(G.edges()))}
''')