# __Networks Creation__

Notebook to create the Reply, Quoted Retweet, and Mentions Networks:

- Reply Network: User A replies to user B creates edge A->B
- Quote Retweet Network: User A retweets with quote user B creates edge A->B
- Mention Network: User A mention user B creates edge A->B

Creates files to NetworkX and Gephi.

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import os, datetime

In [2]:
DATA_PATH_TT = 'data/tweets/'
DATA_PATH_NT = 'data/networks/'

In [3]:
def create_networks(tweets_df):
    nodes = []
    edges = []
    for u, rt, qrt, ir, mu in tweets_df[['user', 'retweetedTweet', 'quotedTweet', 'inReplyToUser', 'mentionedUsers']].values:
        user = eval(u)
        retweeted = eval(rt)
        quoted = eval(qrt)
        in_reply = eval(ir)
        mentioned = eval(mu)
        nodes.append((user['username'], user['displayname'], user['description'], user['id']))
        if retweeted:
            if user['username'] != retweeted['user']['username']:
                edges.append((user['username'], retweeted['user']['username'], 'retweet'))
                nodes.append((retweeted['user']['username'], retweeted['user']['displayname'], retweeted['user']['description'], retweeted['user']['id']))
        if quoted:
            if user['username'] != quoted['user']['username']:
                edges.append((user['username'], quoted['user']['username'], 'quoted_rt'))
                nodes.append((quoted['user']['username'], quoted['user']['displayname'], quoted['user']['description'], quoted['user']['id']))
        if in_reply:
            if user['username'] != in_reply['username']:
                edges.append((user['username'], in_reply['username'], 'reply'))
                nodes.append((in_reply['username'], in_reply['displayname'], in_reply['description'], in_reply['id']))
        if mentioned:
            for r_user in mentioned:
                if user['username'] != r_user['username']:
                    edges.append((user['username'], r_user['username'], 'mention'))
                    nodes.append((r_user['username'], r_user['displayname'], r_user['description'], r_user['id']))
                
    nodes_df = pd.DataFrame(nodes, columns=['Id', 'DisplayName', 'Description', 'TtId'])
    edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'Type'])
    nodes_df.drop_duplicates(subset=['Id'], keep='last', ignore_index=True, inplace=True)
    
    return nodes_df, edges_df

def df_2_networkx(nodes, edges):
    w_edges = []
    G = nx.DiGraph()
    for usr, disp_n, desc, tt_id in nodes.values:
        G.add_node(usr, displayname=disp_n, description=(desc if desc else ''), tt_id=tt_id)
    edges_tuples = [tuple(x) for x in edges.to_numpy()]
    edges_count = pd.Series(edges_tuples).value_counts()
    for i in range(edges_count.shape[0]):
        w_edges.append(edges_count.index[i]+(edges_count.values[i],))
    for edge in w_edges:
        G.add_edge(edge[0], edge[1], int_type=edge[2], weight=edge[3])
        
    return G
        
def network_2_file(tweets_df, data_path):
    nodes, edges = create_networks(tweets_df)
    edges_rt = edges.query('Type == "retweet"')
    edges_qt = edges.query('Type == "quoted_rt"')
    edges_rep = edges.query('Type == "reply"')
    edges_mn = edges.query('Type == "mention"')
    nodes.to_csv(f'{data_path}/gephi_nodes.csv', index=False)
    edges_rt.to_csv(f'{data_path}/gephi_edges_rt.csv', index=False)
    edges_qt.to_csv(f'{data_path}/gephi_edges_quote.csv', index=False)
    edges_rep.to_csv(f'{data_path}/gephi_edges_rep.csv', index=False)
    edges_mn.to_csv(f'{data_path}/gephi_edges_mention.csv', index=False)
    #G = df_2_networkx(nodes, edges)
    #nx.write_gexf(G, f'{data_path}/network.gexf')


In [4]:
week_list = [week_dir for week_dir in os.listdir(DATA_PATH_TT) if os.path.isdir(DATA_PATH_TT+week_dir) and not week_dir.endswith('.ipynb_checkpoints')]

week_list.sort()
week_list.remove('week_pr_01')
week_list.insert(week_list.index('week_12')+1, 'week_pr_01')
week_list.remove('week_pr_02')
week_list.insert(week_list.index('week_pr_01')+1, 'week_pr_02')
week_list.remove('week_pr_03')
week_list.insert(week_list.index('week_17')+1, 'week_pr_03')
week_list.remove('week_pr_04')
week_list.insert(week_list.index('week_21')+1, 'week_pr_04')

print(week_list)

['week_01', 'week_02', 'week_03', 'week_04', 'week_05', 'week_06', 'week_07', 'week_08', 'week_09', 'week_10', 'week_11', 'week_12', 'week_pr_01', 'week_pr_02', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17', 'week_pr_03', 'week_18', 'week_19', 'week_20', 'week_21', 'week_pr_04', 'week_22']


In [5]:
day_list = ['day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7']

for week in week_list:
    if not os.path.exists(DATA_PATH_NT+week):
        os.mkdir(DATA_PATH_NT+week)
    for day in day_list:
        if not os.path.exists(DATA_PATH_NT+week+'/'+day):
            os.mkdir(DATA_PATH_NT+week+'/'+day)
        tweets_df = pd.read_parquet(DATA_PATH_TT+week+f'/merged_{day}.parquet')
        network_2_file(tweets_df, DATA_PATH_NT+week+'/'+day)
    #file_name = [file for file in os.listdir(DATA_PATH_TT+week) if file.endswith('.parquet') and file.startswith('merged_week')]
    #tweets_df = pd.read_parquet(DATA_PATH_TT+week+'/'+file_name[0])
    #network_2_file(tweets_df, DATA_PATH_NT+week)

In [6]:
nodes = pd.read_csv('data/networks/week_10/day_4/gephi_nodes.csv')
edges_qrt = pd.read_csv('data/networks/week_10/day_4/gephi_edges_quote.csv')

In [7]:
edges_qrt.head()

Unnamed: 0,Source,Target,Type
0,tatytoyama,randolfeap,quoted_rt
1,juliarib,folha,quoted_rt
2,mariajulia512,folha,quoted_rt
3,felipe_augustow,anthunesarth,quoted_rt
4,gui_3ds,malugaspar,quoted_rt


In [8]:
G = df_2_networkx(nodes, edges_qrt)
G.remove_nodes_from(list(nx.isolates(G)))

In [9]:
list(G.edges(data=True))[:10]

[('juliarib', 'folha', {'int_type': 'quoted_rt', 'weight': 1}),
 ('felipe_augustow', 'anthunesarth', {'int_type': 'quoted_rt', 'weight': 1}),
 ('gui_3ds', 'malugaspar', {'int_type': 'quoted_rt', 'weight': 1}),
 ('ei_quel', 'folha', {'int_type': 'quoted_rt', 'weight': 1}),
 ('rchrdlvs', 'folha', {'int_type': 'quoted_rt', 'weight': 1}),
 ('sissiaraujo_', 'folha', {'int_type': 'quoted_rt', 'weight': 1}),
 ('lucaxcr', 'albertocalmeida', {'int_type': 'quoted_rt', 'weight': 1}),
 ('RenatoPedreira', 'LeandroColon', {'int_type': 'quoted_rt', 'weight': 1}),
 ('Luikls', 'folha', {'int_type': 'quoted_rt', 'weight': 1}),
 ('alwayslouies', 'folha', {'int_type': 'quoted_rt', 'weight': 1})]

In [10]:
sorted(G.in_degree, key=lambda x: x[1], reverse=True)[:10]

[('folha', 141),
 ('bicmuller', 78),
 ('desmentindobozo', 70),
 ('Metropoles', 70),
 ('ErikakHilton', 57),
 ('MarceloFreixo', 54),
 ('cirogomes', 44),
 ('UOLNoticias', 38),
 ('randolfeap', 36),
 ('OmarAzizSenador', 36)]

In [11]:
print(f'''
Number of nodes: {len(G.nodes)}
Number of edges: {len(G.edges)}
''')


Number of nodes: 2790
Number of edges: 2639



In [12]:
from networkx.algorithms.community.centrality import girvan_newman

communities = girvan_newman(G)

In [13]:
node_groups = []
for com in next(communities):
  node_groups.append(list(com))

print(len(node_groups))

393


In [14]:
for idx, cluster in enumerate(node_groups):
    if len(cluster) >= 10:
        print((idx, len(cluster)))

(0, 1797)
(20, 78)
(65, 12)


In [15]:
node_groups[0][:30]

['GGsavoy',
 'Politic_Nicki',
 'geraldorufino',
 'mario_rasec666',
 'boisonaroo',
 'Mary_Caty_Gomes',
 'psciarotta',
 'MoscaNaOrelha',
 'GuBaMontenegro',
 'IuliVieira',
 'GuilhermeBoulos',
 'KatiaFerreiram',
 'ddougsouza',
 'mota_thm',
 'LilaSan72505220',
 'Fabio_F_Lima',
 'heeey_johnny',
 'saulo3rios',
 'alicebiruta',
 'roodneyfb',
 'Conservadora191',
 'liviamunx',
 'zelafneto',
 'jornalqueri',
 'orlandosilva',
 'ZeroCool_BR21',
 'Bruno97352911',
 'AdrianaPrinceC1',
 'krause_kellen',
 'mauriciorenato7']

In [16]:
node_groups[20][:30]

['LessaFer_BR',
 'acdsp',
 'Countdown2023',
 'Sophia59671258',
 'PericlesVerdi',
 'RobertoLuan1988',
 'RenatoS76217808',
 'tercalivre',
 'hetero_do',
 'taoquei1',
 'RoseSouzaII',
 'vejaaverdadenua',
 'JulioCezarMD',
 'BolsonaroSP',
 'kimpaim',
 'MrciaMe11132434',
 'Elton37737929',
 'Marilen77315903',
 'RafaelKerr84',
 'rodollfo',
 'Taissabege',
 'vanice_a',
 'marianah50',
 'dimacgarcia',
 'brunosep89',
 'FATIMAA80856038',
 'AgNDigital',
 'Francis98663988',
 'FeSeu',
 'Andre_PMELLO']

In [17]:
node_groups[65][:30]

['1gorN4ascimento',
 'AnaPaul01881881',
 'CrisCrisDFBRA4',
 'prcristianomelo',
 'Ketley',
 'jairbolsonaro',
 'aleinformation',
 'Notlover8',
 'col_bacol',
 'nathmottadr',
 'CarlosBolsonaro',
 'cba_ze']

In [18]:
from networkx.algorithms.community.modularity_max import greedy_modularity_communities

communi = greedy_modularity_communities(G, weight='weight')

In [19]:
len(communi)

425

In [20]:
for idx, cluster in enumerate(communi):
    if len(cluster) >= 10:
        print((idx, len(cluster)))

(0, 237)
(1, 172)
(2, 120)
(3, 111)
(4, 104)
(5, 104)
(6, 103)
(7, 93)
(8, 85)
(9, 64)
(10, 63)
(11, 63)
(12, 61)
(13, 50)
(14, 47)
(15, 46)
(16, 43)
(17, 35)
(18, 33)
(19, 33)
(20, 33)
(21, 32)
(22, 28)
(23, 22)
(24, 19)
(25, 17)
(26, 12)
(27, 10)
(28, 10)


In [21]:
from networkx.algorithms.community.label_propagation import asyn_lpa_communities

label_prop_comm =  list(asyn_lpa_communities(G, weight='weight', seed=None))

In [22]:
len(label_prop_comm)

791

In [23]:
for idx, cluster in enumerate(label_prop_comm):
    if len(cluster) >= 10:
        print((idx, len(cluster)))

(0, 127)
(6, 30)
(7, 11)
(10, 13)
(11, 48)
(15, 29)
(17, 13)
(22, 41)
(23, 19)
(31, 30)
(36, 52)
(40, 28)
(42, 123)
(46, 14)
(48, 21)
(51, 36)
(55, 17)
(59, 10)
(65, 12)
(80, 12)
(92, 42)
(93, 15)
(99, 24)
(101, 70)
(116, 10)
(118, 13)
(120, 13)
(156, 16)
(190, 10)
(274, 11)
(277, 17)
(281, 10)
(316, 10)
(355, 30)
(367, 22)
(498, 10)
(683, 16)


In [24]:
reply_actors = []
quoted_rt_actors = []
mention_actors = []

for week in week_list:
    file_name = [file for file in os.listdir(DATA_PATH_TT+week) if file.startswith('merged_week') and file.endswith('.parquet')]
    tweets_df = pd.read_parquet(DATA_PATH_TT+week+f'/{file_name[0]}')
    nodes, edges = create_networks(tweets_df)
    G = df_2_networkx(nodes, edges)
    G_reply = nx.DiGraph([(u,v,d) for u,v,d in G.edges(data=True) if d['int_type']=='reply'])
    G_qrt = nx.DiGraph([(u,v,d) for u,v,d in G.edges(data=True) if d['int_type']=='quoted_rt'])
    G_mn = nx.DiGraph([(u,v,d) for u,v,d in G.edges(data=True) if d['int_type']=='mention'])
    
    reply_actors.append(sorted(G_reply.in_degree, key=lambda x: x[1], reverse=True)[:10])
    quoted_rt_actors.append(sorted(G_qrt.in_degree, key=lambda x: x[1], reverse=True)[:10])    
    mention_actors.append(sorted(G_mn.in_degree, key=lambda x: x[1], reverse=True)[:10])    

In [25]:
actors = []

for week in reply_actors:
    for actor in week:
        actors.append(actor[0])
        
actors = pd.Series(actors)

actors.value_counts()[:10]

jairbolsonaro      23
randolfeap         18
OmarAzizSenador    18
taoquei1           17
SenadoFederal      16
senadorhumberto    13
UOLNoticias        13
CarlaZambelli38    11
MarcosRogerio       9
BolsonaroSP         9
dtype: int64

In [26]:
actors = []

for week in quoted_rt_actors:
    for actor in week:
        actors.append(actor[0])
        
actors = pd.Series(actors)

actors.value_counts()[:10]

Metropoles         19
taoquei1           16
folha              15
CarlaZambelli38    14
randolfeap         11
UOLNoticias        11
Rconstantino       11
jairbolsonaro      10
MidiaNINJA         10
kimpaim             8
dtype: int64

In [27]:
actors = []

for week in mention_actors:
    for actor in week:
        actors.append(actor[0])
        
actors = pd.Series(actors)

actors.value_counts()[:10]

jairbolsonaro      26
randolfeap         25
OmarAzizSenador    24
renancalheiros     22
SenadoFederal      20
MarcosRogerio      15
LulaOficial         9
Sen_Alessandro      9
STF_oficial         9
senadorhumberto     9
dtype: int64