# __All Period CPI SNA__

In [12]:
import pandas as pd
import networkx as nx
import os, datetime

In [21]:
DATA_PATH_TT = 'data/tweets/'
DATA_PATH_NT = 'data/networks/'

nodes = pd.read_csv(DATA_PATH_NT+'all_nodes.csv')
nodes.shape

(536263, 3)

In [10]:
def create_networks(tweets_df):
    nodes = []
    edges = []
    for u, rt, qrt, ir, mu in tweets_df[['user', 'retweetedTweet', 'quotedTweet', 'inReplyToUser', 'mentionedUsers']].values:
        user = eval(u)
        retweeted = eval(rt)
        quoted = eval(qrt)
        in_reply = eval(ir)
        mentioned = eval(mu)
        nodes.append((user['username'], user['displayname'], user['description']))
        if retweeted:
            if user['username'] != retweeted['user']['username']:
                edges.append((user['username'], retweeted['user']['username'], 'retweet'))
                nodes.append((retweeted['user']['username'], retweeted['user']['displayname'], retweeted['user']['description']))
        if quoted:
            if user['username'] != quoted['user']['username']:
                edges.append((user['username'], quoted['user']['username'], 'quoted_rt'))
                nodes.append((quoted['user']['username'], quoted['user']['displayname'], quoted['user']['description']))
        if in_reply:
            if user['username'] != in_reply['username']:
                edges.append((user['username'], in_reply['username'], 'reply'))
                nodes.append((in_reply['username'], in_reply['displayname'], in_reply['description']))
        if mentioned:
            for r_user in mentioned:
                if user['username'] != r_user['username']:
                    edges.append((user['username'], r_user['username'], 'mention'))
                    nodes.append((r_user['username'], r_user['displayname'], r_user['description']))
                
    nodes_df = pd.DataFrame(nodes, columns=['Id', 'DisplayName', 'Description'])
    edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'LinkType'])
    nodes_df.drop_duplicates(subset=['Id'], keep='last', ignore_index=True, inplace=True)
    
    return nodes_df, edges_df

def df_2_networkx(nodes, edges):
    w_edges = []
    G = nx.DiGraph()
    for usr, disp_n, desc in nodes.values:
        G.add_node(str(usr), displayname=str(disp_n), description=str(desc if desc else ''))
    edges_tuples = [tuple(x) for x in edges.to_numpy()]
    edges_count = pd.Series(edges_tuples).value_counts()
    for i in range(edges_count.shape[0]):
        w_edges.append(edges_count.index[i]+(edges_count.values[i],))
    for edge in w_edges:
        G.add_edge(str(edge[0]), str(edge[1]), interac_type=str(edge[2]), weight=int(edge[3]))
        
    G_quoted = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'quoted_rt'])
    G_rtweet = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'retweeted'])
    G_reply = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'reply'])
    G_mention = nx.DiGraph([(u, v, e) for u,v,e in G.edges(data=True) if e['interac_type'] == 'mention'])
    
    return G, G_quoted, G_rtweet, G_reply, G_mention

In [5]:
week_list = [week_dir for week_dir in os.listdir(DATA_PATH_TT) if os.path.isdir(DATA_PATH_TT+week_dir) and not week_dir.endswith('.ipynb_checkpoints')]

week_list.sort()
week_list.remove('week_pr_01')
week_list.insert(week_list.index('week_12')+1, 'week_pr_01')
week_list.remove('week_pr_02')
week_list.insert(week_list.index('week_pr_01')+1, 'week_pr_02')
week_list.remove('week_pr_03')
week_list.insert(week_list.index('week_17')+1, 'week_pr_03')

print(week_list)

['week_01', 'week_02', 'week_03', 'week_04', 'week_05', 'week_06', 'week_07', 'week_08', 'week_09', 'week_10', 'week_11', 'week_12', 'week_pr_01', 'week_pr_02', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17', 'week_pr_03', 'week_18', 'week_19', 'week_20', 'week_21']


In [13]:
edges_list = []

for week in week_list:
    print(f'Starting {week}...')
    f_name = [f for f in os.listdir(DATA_PATH_TT+week) if f.startswith('merged_week')][0]
    week_tt_df = pd.read_parquet(DATA_PATH_TT+week+'/'+f_name)
    nodes, edges = create_networks(week_tt_df)
    edges_list.append(edges)

Starting week_01...
Starting week_02...
Starting week_03...
Starting week_04...
Starting week_05...
Starting week_06...
Starting week_07...
Starting week_08...
Starting week_09...
Starting week_10...
Starting week_11...
Starting week_12...
Starting week_pr_01...
Starting week_pr_02...
Starting week_13...
Starting week_14...
Starting week_15...
Starting week_16...
Starting week_17...
Starting week_pr_03...
Starting week_18...
Starting week_19...
Starting week_20...
Starting week_21...


In [14]:
len(edges_list)

24

In [20]:
edges = pd.concat(edges_list, axis=0)
edges.shape

(3734184, 3)

In [22]:
G, G_quoted, G_rtweet, G_reply, G_mention = df_2_networkx(nodes, edges)

In [23]:
len(G.nodes()), len(G.edges())

(536263, 1674363)

In [24]:
len(G_quoted.nodes()), len(G_quoted.edges())

(110722, 343803)

In [26]:
len(G_reply.nodes()), len(G_reply.edges())

(187029, 608957)

In [27]:
len(G_mention.nodes()), len(G_mention.edges())

(193304, 721603)

In [32]:
G.remove_nodes_from(list(nx.isolates(G)))
G_quoted.remove_nodes_from(list(nx.isolates(G_quoted)))
G_reply.remove_nodes_from(list(nx.isolates(G_reply)))
G_mention.remove_nodes_from(list(nx.isolates(G_mention)))

In [33]:
len(G.nodes()), len(G.edges())

(320143, 1674363)

In [34]:
len(G_quoted.nodes()), len(G_quoted.edges())

(110722, 343803)

In [35]:
len(G_reply.nodes()), len(G_reply.edges())

(187029, 608957)

In [36]:
len(G_mention.nodes()), len(G_mention.edges())

(193304, 721603)

In [37]:
nx.write_gexf(G, DATA_PATH_NT+'complete_network.gexf')
nx.write_gexf(G_quoted, DATA_PATH_NT+'quoted_network.gexf')
nx.write_gexf(G_reply, DATA_PATH_NT+'reply_network.gexf')
nx.write_gexf(G_mention, DATA_PATH_NT+'mention_network.gexf')

In [39]:
sorted(G.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('jairbolsonaro', 33768),
 ('OmarAzizSenador', 23661),
 ('randolfeap', 18808),
 ('renancalheiros', 16845),
 ('taoquei1', 16065),
 ('CarlaZambelli38', 13475),
 ('kimpaim', 11258),
 ('MarcosRogerio', 11118),
 ('SenadoFederal', 10401),
 ('Rconstantino', 10011),
 ('BolsonaroSP', 9243),
 ('ArthurLira_', 9141),
 ('senadorhumberto', 8949),
 ('folha', 8769),
 ('UOLNoticias', 8050),
 ('LulaOficial', 7983),
 ('CNNBrasil', 7512),
 ('STF_oficial', 7294),
 ('Sen_Alessandro', 6791),
 ('CarlosBolsonaro', 6698)]

In [40]:
sorted(G_quoted.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('taoquei1', 3856),
 ('MidiaNINJA', 3494),
 ('Metropoles', 3472),
 ('CarlaZambelli38', 3302),
 ('kimpaim', 2936),
 ('jairbolsonaro', 2707),
 ('folha', 2449),
 ('Rconstantino', 2238),
 ('randolfeap', 1872),
 ('UOLNoticias', 1862),
 ('Estadao', 1751),
 ('ALLuSapelli', 1702),
 ('BolsonaroSP', 1687),
 ('GuilhermeBoulos', 1652),
 ('MarceloFreixo', 1638),
 ('Biakicis', 1617),
 ('bolsomito_2', 1530),
 ('jairmearrependi', 1512),
 ('g1', 1503),
 ('desmentindobozo', 1486)]

In [42]:
sorted(G_reply.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('jairbolsonaro', 8752),
 ('taoquei1', 8349),
 ('OmarAzizSenador', 8210),
 ('CarlaZambelli38', 6278),
 ('randolfeap', 5412),
 ('Rconstantino', 5263),
 ('kimpaim', 5094),
 ('SenadoFederal', 4397),
 ('BolsonaroSP', 4263),
 ('folha', 3819),
 ('senadorhumberto', 3624),
 ('o_antagonista', 3548),
 ('MarcosRogerio', 3514),
 ('UOLNoticias', 3503),
 ('CNNBrasil', 3206),
 ('CarlosBolsonaro', 3072),
 ('LaurinhaIronic', 2926),
 ('renancalheiros', 2887),
 ('profpaulamarisa', 2831),
 ('Jouberth19', 2814)]

In [43]:
sorted(G_mention.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('jairbolsonaro', 22309),
 ('OmarAzizSenador', 14478),
 ('renancalheiros', 13646),
 ('randolfeap', 11524),
 ('ArthurLira_', 6434),
 ('MarcosRogerio', 6365),
 ('LulaOficial', 5581),
 ('SenadoFederal', 5434),
 ('STF_oficial', 5289),
 ('rpsenador', 4992),
 ('SimoneTebetms', 4519),
 ('FlavioBolsonaro', 4502),
 ('senadorhumberto', 4433),
 ('YouTube', 4183),
 ('Sen_Alessandro', 4162),
 ('ottoalencar', 4050),
 ('CarlaZambelli38', 3895),
 ('taoquei1', 3860),
 ('ContaratoSenado', 3467),
 ('BolsonaroSP', 3293)]

### User Weekly Participation

In [None]:
from IPython.display import clear_output, display

nodes_list = nodes['Id'].values.tolist()
user_count_list = []

for usrname in nodes_list:
    count = 0
    for week in week_list:
        clear_output(wait=True)
        print(f'User: {usrname} - week: {week}')
        week_nodes = pd.read_parquet(DATA_PATH_NT+f'{week}/{week}_node_list.parquet')
        if usrname in week_nodes['Id'].values.tolist():
            count+=1
    user_count_list.append(count)

User: vicentelou - week: week_03


## Community Detection

In [44]:
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community.modularity_max import greedy_modularity_communities
from networkx.algorithms.community.label_propagation import asyn_lpa_communities

comm_grivan = list(girvan_newman(G))

KeyboardInterrupt: 

In [45]:
comm_modularity = list(greedy_modularity_communities(G, weight='weight'))

KeyboardInterrupt: 

In [None]:
comm_label_prop =  list(asyn_lpa_communities(G, weight='weight', seed=None))