# __Network Creation and Analysis__

In [10]:
import ast
from datetime import datetime, timedelta

import networkx as nx
import pandas as pd

### Data

In [11]:
# Relative folders used by this notebook: raw tweet dumps and network exports.
data_path = dict(
    tweets='../data/tweets/',
    network='../data/network/',
)

In [12]:
# Load the merged tweets and keep only the rows flagged as retweets.
# .copy() detaches the filtered rows from the frame returned by read_csv, so
# the later in-place edits to tweets_df (sorting, rewriting `created_at`) act
# on an independent frame and do not raise SettingWithCopyWarning.
tweets_df = pd.read_csv(data_path['tweets']+'merged_data.csv')
tweets_df = tweets_df.query('is_retweet == True').copy()

tweets_df.shape

(6875, 14)

In [13]:
# Peek at five randomly drawn retweets (unseeded, so rows differ on each run).
tweets_df.sample(n=5)

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
1783,2022-11-22 20:51:35,1595158073370640384,"Xandão está desesperado, pois sabe que pode se...",CludiaMascaren6,"{'id': 1358532762299416579, 'name': 'CláudiaMa...",False,,False,,False,,True,"{'user': 'joaovoltou', 'user_id': 158583396570...",['#BolsonaroReeleito']
3483,2022-11-22 23:08:51,1595192617112846336,SOBRE O RELATÓRIO DO P.L.: VAMOS AGUARDAR A AN...,MassisanaLima,"{'id': 317320395, 'name': 'Massisana Figueired...",False,,False,,False,,True,"{'user': 'JoaquinTeixeira', 'user_id': 4641281...",['#BolsonaroReeleito']
4859,2022-11-22 23:11:52,1595193372934897664,Bolsonaro 51% é a piada da vez! esse povo aluc...,DavidSa35681946,"{'id': 1474433629191344128, 'name': 'David San...",False,,False,,False,,True,"{'user': 'consallys', 'user_id': 1388223655029...",
1188,2022-11-22 22:05:52,1595176765269352450,O TSE tem que entregar o Código Fonte e reconh...,NovacSil,"{'id': 1577489902274494468, 'name': 'Sil 🇧🇷💛',...",False,,False,,False,,True,"{'user': 'vanete_araujo', 'user_id': 138290417...",['#BolsonaroReeleito']
4179,2022-11-23 00:36:41,1595214717966065665,O LULA TÁ ELEITO\n\nImprensa estrangeira: Cand...,Tatiana73985584,"{'id': 1592825763530768386, 'name': 'Tatiana M...",False,,False,,False,,True,"{'user': 'MarcelaSaraivas', 'user_id': 3075645...",


In [14]:
# Order the retweets chronologically.
# Reassigning the sorted result (instead of inplace=True) yields a fresh frame,
# which avoids SettingWithCopyWarning on the filtered data and keeps the cell
# safe to re-run.
tweets_df = tweets_df.sort_values(by='created_at')

In [15]:
# First row after sorting, i.e. the row with the smallest `created_at`.
tweets_df.head(n=1)

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
5127,2022-11-22 19:29:08,1595137323385376768,Bora ganhar também o 4º turno dessas eleições....,zurenemanique,"{'id': 1273599615535853569, 'name': 'zu maniqu...",False,,False,,False,,True,"{'user': '_Janoninho', 'user_id': 156389790744...",


In [16]:
# Last row after sorting, i.e. the row with the largest `created_at`.
tweets_df.tail(n=1)

Unnamed: 0,created_at,tweet_id,tweet_content,user,user_info,has_mention,mentions,is_reply,reply_to,is_quote,quoted_from,is_retweet,retweeted_from,hashtags
218,2022-11-23 02:58:59,1595250529164304390,Bolsonaro 51%\n#BolsonaroReeleito \n\nNão me C...,guiferreiracode,"{'id': 1497584853184921604, 'name': 'Guilherme...",False,,False,,False,,True,"{'user': 'KaiolOof', 'user_id': 15031679523712...",['#BolsonaroReeleito']


In [17]:
# converting date to GMT-3

def convert_2_gmt_br(date):
    """Shift a timestamp string three hours back.

    Parameters
    ----------
    date : str
        Timestamp formatted as ``'%Y-%m-%d %H:%M:%S'``.

    Returns
    -------
    str
        The same instant minus three hours, in the same format.

    Raises
    ------
    ValueError
        If ``date`` does not match the expected format.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    shifted = datetime.strptime(date, stamp_format) - timedelta(hours=3)
    return shifted.strftime(stamp_format)

# Run every timestamp (cast to str first) through convert_2_gmt_br and store
# the result back in the same column.
# NOTE(review): this overwrites `created_at`, so re-running the cell applies
# the shift again.
created_at_shifted = tweets_df['created_at'].apply(
    lambda raw_stamp: convert_2_gmt_br(str(raw_stamp))
)
tweets_df['created_at'] = created_at_shifted

# The timestamps are strings, so min/max compare them lexicographically.
earliest = tweets_df['created_at'].min()
latest = tweets_df['created_at'].max()

print(f'''
Min date/time: {earliest}
Max date/time: {latest}
''')


Min date/time: 2022-11-22 16:29:08
Max date/time: 2022-11-22 23:58:59



### Network

In [18]:
# creating edges dataframes

# Each retweet becomes one directed edge: the retweeting user (source) points
# to the author of the original tweet (target), stamped with `created_at`.
#
# `retweeted_from` holds the textual repr of a dict read from the CSV. It is
# parsed with ast.literal_eval instead of eval: literal_eval only accepts
# Python literals, so a crafted cell value cannot execute arbitrary code.
# NOTE(review): literal_eval raises ValueError if a cell contains anything
# other than plain literals - confirm the column is always a plain dict repr.
edges = {
    'created_at': tweets_df['created_at'].tolist(),
    'source': tweets_df['user'].tolist(),
    'target': [
        ast.literal_eval(raw_retweet)['user']
        for raw_retweet in tweets_df['retweeted_from']
    ],
}

edges_wt_df = pd.DataFrame(data=edges)

edges_wt_df.shape

(6875, 3)

In [19]:
# Edge list without the timestamp column.
edge_columns = ['source', 'target']
edges_df = edges_wt_df[edge_columns]

In [20]:
# Write both edge lists (without and with timestamps) to the network folder.
network_dir = data_path['network']
edges_df.to_csv(network_dir + 'edges.csv', index=False)
edges_wt_df.to_csv(network_dir + 'edges_with_time.csv', index=False)

In [21]:
# The five most frequent (source, target) pairs.
edges_df.value_counts().head(5).index

MultiIndex([(     'VidorLu',   'joaodomenech'),
            (     'VidorLu',     '_Janoninho'),
            (    'bozo_bot', 'marcosaraujjoo'),
            ('joaodomenech',   'joaodomenech'),
            ( 'GuedesMyrna',     '_Janoninho')],
           names=['source', 'target'])

In [22]:
# Collapse repeated (source, target) pairs into one weighted edge whose weight
# is the number of times the pair occurs in edges_df.
pair_counts = edges_df.value_counts()  # computed once and reused below

w_edges = {
    # Level 0 / level 1 of the MultiIndex are the source / target columns.
    'source': pair_counts.index.get_level_values(0).tolist(),
    'target': pair_counts.index.get_level_values(1).tolist(),
    'weight': pair_counts.tolist(),
}

w_edges_df = pd.DataFrame(data=w_edges)
w_edges_df.to_csv(data_path['network']+'weighted_edges.csv', index=False)
# Fixed typo: the original read `shape5`, which raises AttributeError.
w_edges_df.shape

(5896, 3)

In [24]:
# Ten randomly drawn weighted edges (unseeded, so rows differ on each run).
w_edges_df.sample(n=10)

Unnamed: 0,source,target,weight
2855,lucas33074852,euailze,1
2830,mae_tarjapreta,joaodomenech,1
2101,wagnerlbc,AdrianeNelci,1
107,analucia3g,_Janoninho,4
3428,Guirodox_,euailze,1
4476,Pitchucky,RonaldoJBBarros,1
3548,HumbertoSozo,CinthiaMagal,1
5047,Reginal14480771,EstelaEuMesmaSC,1
1388,andrew_cid,PatriaAmada0709,1
2414,jchaves1111,UdaJuris2022,1


In [26]:
# unique users

# Every account that appears on either end of a weighted edge.
users = set(w_edges_df['source']) | set(w_edges_df['target'])
len(users)

1560

### Time bins