# __All Period CPI SNA__

In [1]:
import pandas as pd
import networkx as nx
import os, datetime

In [2]:
DATA_PATH_TT = 'data/tweets/'
DATA_PATH_NT = 'data/networks/'

nodes_ = pd.read_csv(DATA_PATH_NT+'all_nodes.csv')

In [3]:
def create_networks(tweets_df):
    nodes = []
    edges = []
    for u, rt, qrt, ir, mu in tweets_df[['user', 'retweetedTweet', 'quotedTweet', 'inReplyToUser', 'mentionedUsers']].values:
        user = eval(u)
        retweeted = eval(rt)
        quoted = eval(qrt)
        in_reply = eval(ir)
        mentioned = eval(mu)
        nodes.append((user['username'], user['displayname'], user['description'], user['id']))
        if retweeted:
            if user['username'] != retweeted['user']['username']:
                edges.append((user['username'], retweeted['user']['username'], 'retweet'))
                nodes.append((retweeted['user']['username'], retweeted['user']['displayname'], retweeted['user']['description'], retweeted['user']['id']))
        if quoted:
            if user['username'] != quoted['user']['username']:
                edges.append((user['username'], quoted['user']['username'], 'quoted_rt'))
                nodes.append((quoted['user']['username'], quoted['user']['displayname'], quoted['user']['description'], quoted['user']['id']))
        if in_reply:
            if user['username'] != in_reply['username']:
                edges.append((user['username'], in_reply['username'], 'reply'))
                nodes.append((in_reply['username'], in_reply['displayname'], in_reply['description'], in_reply['id']))
        if mentioned:
            for r_user in mentioned:
                if user['username'] != r_user['username']:
                    edges.append((user['username'], r_user['username'], 'mention'))
                    nodes.append((r_user['username'], r_user['displayname'], r_user['description'], r_user['id']))
                
    nodes_df = pd.DataFrame(nodes, columns=['Id', 'DisplayName', 'Description', 'TtId'])
    edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'LinkType'])
    nodes_df.drop_duplicates(subset=['TtId'], keep='last', ignore_index=True, inplace=True)
    
    return nodes_df, edges_df

def df_2_networkx(nodes, edges):
    w_edges = []
    G = nx.DiGraph()
    for usr, disp_n, desc, tt_id in nodes.values:
        G.add_node(str(usr), displayname=str(disp_n), description=str(desc if desc else ''), tt_id=tt_id)
        #G.add_node(tt_id, username=str(usr), displayname=str(disp_n), description=str(desc if desc else 'None'))
    edges_tuples = [tuple(x) for x in edges.to_numpy()]
    edges_count = pd.Series(edges_tuples).value_counts()
    for i in range(edges_count.shape[0]):
        w_edges.append(edges_count.index[i]+(edges_count.values[i],))
    for edge in w_edges:
        G.add_edge(str(edge[0]), str(edge[1]), interac_type=str(edge[2]), weight=int(edge[3]))
        
    G_quoted = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'quoted_rt'])
    G_rtweet = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'retweeted'])
    G_reply = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'reply'])
    G_mention = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'mention'])
    
    return G, G_quoted, G_rtweet, G_reply, G_mention

In [4]:
week_list = [week_dir for week_dir in os.listdir(DATA_PATH_TT) if os.path.isdir(DATA_PATH_TT+week_dir) and not week_dir.endswith('.ipynb_checkpoints')]

week_list.sort()
week_list.remove('week_pr_01')
week_list.insert(week_list.index('week_12')+1, 'week_pr_01')
week_list.remove('week_pr_02')
week_list.insert(week_list.index('week_pr_01')+1, 'week_pr_02')
week_list.remove('week_pr_03')
week_list.insert(week_list.index('week_17')+1, 'week_pr_03')
week_list.remove('week_pr_04')
week_list.insert(week_list.index('week_21')+1, 'week_pr_04')
#week_list.remove('week_22')

print(week_list)

['week_01', 'week_02', 'week_03', 'week_04', 'week_05', 'week_06', 'week_07', 'week_08', 'week_09', 'week_10', 'week_11', 'week_12', 'week_pr_01', 'week_pr_02', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17', 'week_pr_03', 'week_18', 'week_19', 'week_20', 'week_21', 'week_pr_04', 'week_22']


In [5]:
edges_list = []
#nodes_list = []

for week in week_list:
    print(f'Starting {week}...')
    f_name = [f for f in os.listdir(DATA_PATH_TT+week) if f.startswith('merged_week')][0]
    week_tt_df = pd.read_parquet(DATA_PATH_TT+week+'/'+f_name)
    nodes, edges = create_networks(week_tt_df)
    #nodes_list.append(nodes)
    edges_list.append(edges)

Starting week_01...
Starting week_02...
Starting week_03...
Starting week_04...
Starting week_05...
Starting week_06...
Starting week_07...
Starting week_08...
Starting week_09...
Starting week_10...
Starting week_11...
Starting week_12...
Starting week_pr_01...
Starting week_pr_02...
Starting week_13...
Starting week_14...
Starting week_15...
Starting week_16...
Starting week_17...
Starting week_pr_03...
Starting week_18...
Starting week_19...
Starting week_20...
Starting week_21...
Starting week_pr_04...
Starting week_22...


In [6]:
len(edges_list)

26

In [7]:
edges_ = pd.concat(edges_list, axis=0)
edges_.shape

(3808645, 3)

In [8]:
edges_.tail()

Unnamed: 0,Source,Target,LinkType
43466,_jeffaugusto,onyxlorenzoni,mention
43467,noemi_nk,Diego1Amorim,reply
43468,noemi_nk,Diego1Amorim,mention
43469,_jeffaugusto,AbrahamWeint,reply
43470,_jeffaugusto,AbrahamWeint,mention


In [9]:
#nodes_ = pd.concat(nodes_list, axis=0)
#nodes_.drop_duplicates(subset=['TtId'], keep='last', ignore_index=True, inplace=True)
#nodes_.shape

In [10]:
#nodes_.tail()

In [11]:
G, G_quoted, G_rtweet, G_reply, G_mention = df_2_networkx(nodes_, edges_)

In [12]:
list(G.nodes(data=True))[:5]

[('seguezaqueu',
  {'displayname': 'Zaqueu Castelamary 🏴🏳(SDV ? )',
   'description': 'nan',
   'tt_id': 1247224168850034694}),
 ('zaqueusdvPT',
  {'displayname': 'Segue de Volta Esquerdistas ?',
   'description': 'nan',
   'tt_id': 1573595167}),
 ('OSul_noticias',
  {'displayname': 'Jornal O Sul', 'description': 'nan', 'tt_id': 3245657679}),
 ('farah_julioo',
  {'displayname': 'Julio Patriota38🇧🇷🇧🇷🇧🇷',
   'description': 'nan',
   'tt_id': 1164340987071750145}),
 ('thivagando',
  {'displayname': 'Thiago Barreto',
   'description': '🏳️\u200d🌈 O antropólogo que logo sou...\nPalavras-chave: Gênero e Sexualidade; Estudos Sociais das Ciências; Política; Marcadores Sociais da Diferença.',
   'tt_id': 1177650794612563969})]

In [13]:
len(G.nodes()), len(G.edges())

(543334, 1705007)

In [14]:
len(G_quoted.nodes()), len(G_quoted.edges())

(112125, 348596)

In [15]:
len(G_reply.nodes()), len(G_reply.edges())

(187454, 610747)

In [16]:
len(G_mention.nodes()), len(G_mention.edges())

(199286, 745664)

In [20]:
G.remove_nodes_from(list(nx.isolates(G)))
G_quoted.remove_nodes_from(list(nx.isolates(G_quoted)))
G_reply.remove_nodes_from(list(nx.isolates(G_reply)))
G_mention.remove_nodes_from(list(nx.isolates(G_mention)))

In [22]:
len(G.nodes()), len(G.edges())

(325154, 1705007)

In [23]:
len(G_quoted.nodes()), len(G_quoted.edges())

(112125, 348596)

In [24]:
len(G_reply.nodes()), len(G_reply.edges())

(187454, 610747)

In [25]:
len(G_mention.nodes()), len(G_mention.edges())

(199286, 745664)

In [27]:
nx.write_gexf(G, DATA_PATH_NT+'complete_network.gexf')
nx.write_gexf(G_quoted, DATA_PATH_NT+'quoted_network.gexf')
nx.write_gexf(G_reply, DATA_PATH_NT+'reply_network.gexf')
nx.write_gexf(G_mention, DATA_PATH_NT+'mention_network.gexf')

In [28]:
sorted(G.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('jairbolsonaro', 34150),
 ('OmarAzizSenador', 24242),
 ('randolfeap', 19235),
 ('renancalheiros', 17346),
 ('taoquei1', 16096),
 ('CarlaZambelli38', 13610),
 ('MarcosRogerio', 11338),
 ('kimpaim', 11325),
 ('SenadoFederal', 10684),
 ('Rconstantino', 10082),
 ('BolsonaroSP', 9391),
 ('ArthurLira_', 9333),
 ('senadorhumberto', 9182),
 ('folha', 8947),
 ('UOLNoticias', 8277),
 ('LulaOficial', 8088),
 ('CNNBrasil', 7655),
 ('STF_oficial', 7377),
 ('Sen_Alessandro', 6942),
 ('Metropoles', 6789)]

In [29]:
sorted(G_quoted.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('taoquei1', 3854),
 ('Metropoles', 3588),
 ('MidiaNINJA', 3514),
 ('CarlaZambelli38', 3314),
 ('kimpaim', 2965),
 ('jairbolsonaro', 2704),
 ('folha', 2482),
 ('Rconstantino', 2239),
 ('UOLNoticias', 1917),
 ('randolfeap', 1906),
 ('Estadao', 1771),
 ('ALLuSapelli', 1706),
 ('BolsonaroSP', 1691),
 ('MarceloFreixo', 1675),
 ('GuilhermeBoulos', 1673),
 ('Biakicis', 1629),
 ('bolsomito_2', 1527),
 ('g1', 1524),
 ('jairmearrependi', 1520),
 ('desmentindobozo', 1502)]

In [30]:
sorted(G_reply.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('jairbolsonaro', 8703),
 ('taoquei1', 8522),
 ('OmarAzizSenador', 8505),
 ('CarlaZambelli38', 6403),
 ('randolfeap', 5464),
 ('kimpaim', 5311),
 ('Rconstantino', 5216),
 ('SenadoFederal', 4498),
 ('BolsonaroSP', 4272),
 ('folha', 3854),
 ('senadorhumberto', 3759),
 ('MarcosRogerio', 3645),
 ('UOLNoticias', 3581),
 ('o_antagonista', 3474),
 ('CNNBrasil', 3215),
 ('CarlosBolsonaro', 3127),
 ('renancalheiros', 3032),
 ('LaurinhaIronic', 3011),
 ('profpaulamarisa', 2809),
 ('MarceloFreixo', 2780)]

In [31]:
sorted(G_mention.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('jairbolsonaro', 22743),
 ('OmarAzizSenador', 14742),
 ('renancalheiros', 13996),
 ('randolfeap', 11865),
 ('ArthurLira_', 6599),
 ('MarcosRogerio', 6424),
 ('LulaOficial', 5675),
 ('SenadoFederal', 5580),
 ('STF_oficial', 5381),
 ('rpsenador', 5050),
 ('SimoneTebetms', 4681),
 ('FlavioBolsonaro', 4610),
 ('senadorhumberto', 4500),
 ('Sen_Alessandro', 4310),
 ('YouTube', 4283),
 ('ottoalencar', 4057),
 ('CarlaZambelli38', 3893),
 ('taoquei1', 3720),
 ('ContaratoSenado', 3546),
 ('BolsonaroSP', 3428)]

In [32]:
#all_nodes = []

#for u, d in list(G.nodes(data=True)):
    #all_nodes.append((u, d['displayname'], d['tt_id'], d['description']))
#    all_nodes.append((u, d['username'], d['displayname'], d['description']))
    
#all_nodes_df = pd.DataFrame(all_nodes, columns=['user', 'username', 'user_id', 'description'])
#all_nodes_df = pd.DataFrame(all_nodes, columns=['user_id', 'username', 'displayname', 'description'])
#all_nodes_df.to_csv(DATA_PATH_NT+'all_nodes.csv', index=False)

In [26]:
list(G.nodes(data=True))[:10]

[('seguezaqueu',
  {'displayname': 'Zaqueu Castelamary 🏴🏳(SDV ? )',
   'description': 'nan',
   'tt_id': 1247224168850034694}),
 ('zaqueusdvPT',
  {'displayname': 'Segue de Volta Esquerdistas ?',
   'description': 'nan',
   'tt_id': 1573595167}),
 ('OSul_noticias',
  {'displayname': 'Jornal O Sul', 'description': 'nan', 'tt_id': 3245657679}),
 ('farah_julioo',
  {'displayname': 'Julio Patriota38🇧🇷🇧🇷🇧🇷',
   'description': 'nan',
   'tt_id': 1164340987071750145}),
 ('thetunnes',
  {'displayname': 'Tunnes',
   'description': '21y |  🇧🇷 || Jogador de Valorant 💢 | Web Developer 👨\u200d💻',
   'tt_id': 1360024018246631425}),
 ('joaodoria45',
  {'displayname': 'joaodoria45', 'description': 'nan', 'tt_id': 3405608584}),
 ('eriicksonsilva',
  {'displayname': 'Erickson',
   'description': '25, BR\nFalo merda sobre o: @vascodagama @manutd @okcthunder',
   'tt_id': 51587901}),
 ('Elmosol',
  {'displayname': 'Elmo Souza Oliveira',
   'description': 'nan',
   'tt_id': 82350206}),
 ('silviacbap',
  {'

In [19]:
list(G.nodes(data=True))[543333]

('dimerstt',
 {'displayname': 'Laila Dimerstt use pff2',
  'description': 'nan',
  'tt_id': 1341947278165204992})

### User Weekly Participation

In [None]:
from IPython.display import clear_output, display

nodes_list = nodes['Id'].values.tolist()
user_count_list = []

for usrname in nodes_list:
    count = 0
    for week in week_list:
        clear_output(wait=True)
        print(f'User: {usrname} - week: {week}')
        week_nodes = pd.read_parquet(DATA_PATH_NT+f'{week}/{week}_node_list.parquet')
        if usrname in week_nodes['Id'].values.tolist():
            count+=1
    user_count_list.append(count)

## Community Detection

In [None]:
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community.modularity_max import greedy_modularity_communities
from networkx.algorithms.community.label_propagation import asyn_lpa_communities

comm_grivan = list(girvan_newman(G))

In [None]:
comm_modularity = list(greedy_modularity_communities(G, weight='weight'))

In [None]:
comm_label_prop =  list(asyn_lpa_communities(G, weight='weight', seed=None))