# __All Period CPI SNA__

In [1]:
import pandas as pd
import networkx as nx
import os, datetime

In [3]:
# Folder scanned for the per-week tweet sub-folders (each holding a merged_week* parquet file).
DATA_PATH_TT = 'data/tweets/'
# Folder holding the network artefacts: all_nodes.csv, the per-week node lists and the GEXF exports.
DATA_PATH_NT = 'data/networks/'

# Previously exported node table.
# NOTE(review): no active cell in this notebook reads nodes_ (only the commented-out df_2_networkx call does).
nodes_ = pd.read_csv(DATA_PATH_NT+'all_nodes.csv')

In [15]:
def create_networks(tweets_df):
    """Build the node and edge tables of the interaction network for a set of tweets.

    Each tweet contributes its author as a node and at most one kind of
    interaction, chosen by priority: retweet, then quote, then reply, then
    mentions (one edge per mentioned account). Interactions of an account
    with itself are ignored.

    Args:
        tweets_df: DataFrame with the columns 'user', 'content',
            'retweetedTweet', 'quotedTweet', 'inReplyToUser' and
            'mentionedUsers'. Every column except 'content' must hold a string
            that ``eval`` can turn into a dict / list of dicts / None.

    Returns:
        A ``(nodes_df, edges_df)`` tuple.
        ``nodes_df`` has columns ['Id', 'DisplayName', 'Description', 'TtId']
        with one row per distinct 'TtId' (the last occurrence is kept).
        ``edges_df`` has columns ['Source', 'Target', 'LinkType',
        'TweetContent'] with one row per interaction; 'LinkType' is one of
        'retweet', 'quoted_rt', 'reply', 'mention'.
    """
    nodes = []
    edges = []

    def _node_row(account):
        # Flatten an account dict into the column order of nodes_df.
        return (account['username'], account['displayname'], account['description'], account['id'])

    def _link(author, target, link_type, text):
        # Record one directed interaction and its target account; self-links are skipped.
        if author['username'] == target['username']:
            return
        edges.append((author['username'], target['username'], link_type, text))
        nodes.append(_node_row(target))

    wanted = ['user', 'content', 'retweetedTweet', 'quotedTweet', 'inReplyToUser', 'mentionedUsers']
    for u, tc, rt, qrt, ir, mu in tweets_df[wanted].values:
        # NOTE(security): eval executes whatever expression is stored in the cell,
        # so this must only be run on trusted dumps. ast.literal_eval would be
        # safer, but presumably the stored reprs contain non-literal values
        # (e.g. datetime.datetime(...)) -- TODO confirm before switching.
        user = eval(u)
        retweeted = eval(rt)
        quoted = eval(qrt)
        in_reply = eval(ir)
        mentioned = eval(mu)
        tweet_content = str(tc)

        nodes.append(_node_row(user))
        if retweeted:
            _link(user, retweeted['user'], 'retweet', tweet_content)
        elif quoted:
            _link(user, quoted['user'], 'quoted_rt', tweet_content)
        elif in_reply:
            _link(user, in_reply, 'reply', tweet_content)
        elif mentioned:
            for r_user in mentioned:
                _link(user, r_user, 'mention', tweet_content)

    nodes_df = pd.DataFrame(nodes, columns=['Id', 'DisplayName', 'Description', 'TtId'])
    edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'LinkType', 'TweetContent'])
    # The same account shows up once per tweet/interaction; keep its latest profile data.
    nodes_df.drop_duplicates(subset=['TtId'], keep='last', ignore_index=True, inplace=True)

    return nodes_df, edges_df

def df_2_networkx(nodes, edges):
    """Build the attributed interaction graph and one sub-graph per interaction type.

    Args:
        nodes: DataFrame whose four columns are, in order, the username, the
            display name, the description and the Twitter id of each account.
        edges: DataFrame whose first three columns are, in order, the source
            username, the target username and the link type ('retweet',
            'quoted_rt', 'reply' or 'mention'). Any further columns (such as
            the tweet text) are ignored.

    Returns:
        ``(G, G_quoted, G_rtweet, G_reply, G_mention)``: the full directed
        graph followed by the directed graphs restricted to quote, retweet,
        reply and mention edges. Edges carry 'interac_type' and 'weight' (the
        number of rows with that source, target and link type). Only ``G``
        carries node attributes.

    Note:
        A DiGraph stores a single edge per ordered pair, so when two accounts
        are linked by several interaction types only the last one added
        survives in ``G`` (and therefore in the sub-graphs).
    """
    G = nx.DiGraph()
    for usr, disp_n, desc, tt_id in nodes.values:
        G.add_node(str(usr), displayname=str(disp_n), description=str(desc if desc else ''), tt_id=tt_id)

    # Count occurrences of (source, target, link type) only. Using the whole row
    # would pull the tweet text into the key, so the 4th field handed to int()
    # below would be text instead of the count.
    edge_keys = [tuple(row[:3]) for row in edges.to_numpy()]
    edge_counts = pd.Series(edge_keys, dtype=object).value_counts()
    for (source, target, link_type), count in zip(edge_counts.index, edge_counts.values):
        G.add_edge(str(source), str(target), interac_type=str(link_type), weight=int(count))

    def _edges_of_type(link_type):
        # Directed graph made of the edges of G labelled with link_type.
        return nx.DiGraph([(u, v, e) for u, v, e in G.edges(data=True) if e['interac_type'] == link_type])

    G_quoted = _edges_of_type('quoted_rt')
    # create_networks labels retweets 'retweet'; filtering on 'retweeted' always gave an empty graph.
    G_rtweet = _edges_of_type('retweet')
    G_reply = _edges_of_type('reply')
    G_mention = _edges_of_type('mention')

    return G, G_quoted, G_rtweet, G_reply, G_mention

def def_edges_weight(edges):
    """Collapse repeated (Source, Target) rows into weighted edges.

    Args:
        edges: DataFrame with at least the columns 'Source' and 'Target'.

    Returns:
        A list of ``(source, target, weight)`` tuples, one per distinct pair,
        where ``weight`` is the number of rows with that pair. Pairs are
        ordered from most to least frequent (``value_counts`` ordering).
    """
    # Count once; the index holds the (source, target) pairs and the values their frequencies.
    pair_counts = edges[['Source', 'Target']].value_counts()
    return [
        (source, target, weight)
        for (source, target), weight in zip(pair_counts.index, pair_counts.values)
    ]

def df_2_networkx_2(edges):
    """Build weighted directed graphs from an edge table, overall and per link type.

    Args:
        edges: DataFrame with the columns 'Source', 'Target' and 'LinkType'
            ('retweet', 'quoted_rt', 'reply' or 'mention').

    Returns:
        ``(G, G_quoted, G_rt, G_reply, G_mention)``: the graph over all rows,
        followed by the graphs restricted to quote, retweet, reply and mention
        rows. Each edge's 'weight' is the number of rows with that
        (Source, Target) pair in the corresponding subset. Nodes carry no
        attributes.
    """
    def _weighted_graph(edge_rows):
        # One directed edge per distinct (Source, Target) pair, weighted by its row count.
        graph = nx.DiGraph()
        graph.add_weighted_edges_from(def_edges_weight(edge_rows), weight='weight')
        return graph

    def _rows_of_type(link_type):
        # Source/Target columns of the rows carrying the given link type.
        return edges[edges['LinkType'] == link_type][['Source', 'Target']]

    G = _weighted_graph(edges[['Source', 'Target']])
    G_quoted = _weighted_graph(_rows_of_type('quoted_rt'))
    # create_networks labels retweets 'retweet'; filtering on 'retweeted' always gave an empty graph.
    G_rt = _weighted_graph(_rows_of_type('retweet'))
    G_reply = _weighted_graph(_rows_of_type('reply'))
    G_mention = _weighted_graph(_rows_of_type('mention'))
    return G, G_quoted, G_rt, G_reply, G_mention

In [4]:
# Week folders found under the tweets directory, in name order; Jupyter's checkpoint folder is skipped.
week_list = sorted(
    entry for entry in os.listdir(DATA_PATH_TT)
    if os.path.isdir(DATA_PATH_TT+entry) and not entry.endswith('.ipynb_checkpoints')
)

# Name order puts every 'week_pr_*' folder after the numbered weeks, so each one is
# moved to sit directly after its anchor week. Pairs are (week to move, anchor) and
# are applied in this order.
for moved_week, anchor_week in (
    ('week_pr_01', 'week_12'),
    ('week_pr_02', 'week_pr_01'),
    ('week_pr_03', 'week_17'),
    ('week_pr_04', 'week_21'),
):
    week_list.remove(moved_week)
    week_list.insert(week_list.index(anchor_week)+1, moved_week)
#week_list.remove('week_22')

print(week_list)

['week_01', 'week_02', 'week_03', 'week_04', 'week_05', 'week_06', 'week_07', 'week_08', 'week_09', 'week_10', 'week_11', 'week_12', 'week_pr_01', 'week_pr_02', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17', 'week_pr_03', 'week_18', 'week_19', 'week_20', 'week_21', 'week_pr_04', 'week_22']


In [5]:
# One edge table per week, in week_list order.
edges_list = []
#nodes_list = []

for week in week_list:
    print(f'Starting {week}...')
    # First file in the week folder whose name starts with 'merged_week';
    # raises IndexError when the folder has no such file.
    f_name = [f for f in os.listdir(DATA_PATH_TT+week) if f.startswith('merged_week')][0]
    week_tt_df = pd.read_parquet(DATA_PATH_TT+week+'/'+f_name)
    # Only the edges are accumulated; `nodes` is overwritten on every pass, so after
    # the loop it holds the node table of the last week only.
    nodes, edges = create_networks(week_tt_df)
    #nodes_list.append(nodes)
    edges_list.append(edges)

Starting week_01...
Starting week_02...
Starting week_03...
Starting week_04...
Starting week_05...
Starting week_06...
Starting week_07...
Starting week_08...
Starting week_09...
Starting week_10...
Starting week_11...
Starting week_12...
Starting week_pr_01...
Starting week_pr_02...
Starting week_13...
Starting week_14...
Starting week_15...
Starting week_16...
Starting week_17...
Starting week_pr_03...
Starting week_18...
Starting week_19...
Starting week_20...
Starting week_21...
Starting week_pr_04...
Starting week_22...


In [6]:
len(edges_list)

26

In [7]:
# Stack the weekly edge tables row-wise. Each weekly frame keeps its own row labels,
# so index values repeat across weeks (the index is not reset here).
edges_ = pd.concat(edges_list, axis=0)
# NOTE(review): the row count recorded for this cell differs from the one recorded when
# the parquet file is read back further down -- the outputs look like they come from
# different runs; confirm.
edges_.shape

(1895008, 4)

In [8]:
edges_.to_parquet('data/networks/complete_network_edges.parquet', index=False)

In [9]:
edges_.sample(5)

Unnamed: 0,Source,Target,LinkType,TweetContent
64885,BRASIL47682154,MarcosRogerio,reply,@MarcosRogerio tirando o foco Renan sabia de t...
2357,baroninho94,cirogomes,reply,@cirogomes Saiu correndo com o rabo no meio da...
36338,denizevicente,HBO_Brasil,reply,@HBO_Brasil As melhores? Estão transmitindo a ...
68636,Leo_Saramago,NatuzaNery,mention,"@MarinaSilva - Um manda, o outro obedece. 442...."
39171,adrianerikchave,nikolas_dm,mention,Farei minha contribuição também junto com o @n...


In [9]:
#nodes_ = pd.concat(nodes_list, axis=0)
#nodes_.drop_duplicates(subset=['TtId'], keep='last', ignore_index=True, inplace=True)
#nodes_.shape

In [10]:
#nodes_.tail()

In [11]:
#G, G_quoted, G_rtweet, G_reply, G_mention = df_2_networkx(nodes_, edges_)

In [13]:
edges = pd.read_parquet('data/networks/complete_network_edges.parquet')
edges.shape

(1878862, 4)

In [10]:
edges.sample(5)

Unnamed: 0,Source,Target,LinkType,TweetContent
545964,Costa59Beto,gazeta,mention,Randolfe: A GAZELA SALTITANTE ‘Impeachment de ...
152250,assumpcao_fabio,TSEjusbr,reply,@TSEjusbr Nivelando por baixo... #VotoImpresso...
162413,vinnybrandt,arrobajulimaa,reply,@arrobajulimaa @Estadao estou muito surpreso: ...
1404668,opropriorobert,BrazilFight,reply,@BrazilFight @MarcosCruzeiro5 #BarrosoNaCadeia...
1402226,ThyaneGG,Rconstantino,reply,@Rconstantino Quero ver ao vivo e a cores #Bar...


In [16]:
G, G_quoted, G_rtweet, G_reply, G_mention = df_2_networkx_2(edges)

In [17]:
list(G.nodes(data=True))[:5]

[('ptribeirao', {}),
 ('AAMachadoRP', {}),
 ('erregege1', {}),
 ('googlenews', {}),
 ('noemi_nk', {})]

In [18]:
len(G.nodes()), len(G.edges())

(301981, 1298516)

In [19]:
len(G_quoted.nodes()), len(G_quoted.edges())

(113357, 368795)

In [20]:
len(G_reply.nodes()), len(G_reply.edges())

(206971, 749304)

In [21]:
len(G_mention.nodes()), len(G_mention.edges())

(88205, 231094)

In [22]:
# Drop zero-degree nodes from each graph.
# NOTE(review): the node/edge counts recorded before and after this cell are identical,
# so nothing was removed in the recorded run. G_rtweet is not processed here.
G.remove_nodes_from(list(nx.isolates(G)))
G_quoted.remove_nodes_from(list(nx.isolates(G_quoted)))
G_reply.remove_nodes_from(list(nx.isolates(G_reply)))
G_mention.remove_nodes_from(list(nx.isolates(G_mention)))

In [23]:
len(G.nodes()), len(G.edges())

(301981, 1298516)

In [24]:
len(G_quoted.nodes()), len(G_quoted.edges())

(113357, 368795)

In [25]:
len(G_reply.nodes()), len(G_reply.edges())

(206971, 749304)

In [26]:
len(G_mention.nodes()), len(G_mention.edges())

(88205, 231094)

In [27]:
# Export the full, quote, reply and mention graphs as GEXF files in the networks folder.
# NOTE(review): G_rtweet is not exported.
nx.write_gexf(G, DATA_PATH_NT+'complete_network.gexf')
nx.write_gexf(G_quoted, DATA_PATH_NT+'quoted_network.gexf')
nx.write_gexf(G_reply, DATA_PATH_NT+'reply_network.gexf')
nx.write_gexf(G_mention, DATA_PATH_NT+'mention_network.gexf')

In [28]:
list(G.edges(data=True))[:10]

[('ptribeirao', 'AAMachadoRP', {'weight': 12281}),
 ('erregege1', 'googlenews', {'weight': 358}),
 ('erregege1', 'randolfeap', {'weight': 1}),
 ('noemi_nk', 'o_antagonista', {'weight': 332}),
 ('noemi_nk', 'diogomainardi', {'weight': 131}),
 ('noemi_nk', 'claudioedantas', {'weight': 87}),
 ('noemi_nk', 'Diego1Amorim', {'weight': 32}),
 ('noemi_nk', 'RevistaCrusoe', {'weight': 28}),
 ('noemi_nk', 'VemPraRua_br', {'weight': 14}),
 ('noemi_nk', 'blogdojosias', {'weight': 13})]

In [29]:
sorted(G.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('jairbolsonaro', 21895),
 ('OmarAzizSenador', 18923),
 ('randolfeap', 15265),
 ('taoquei1', 15027),
 ('CarlaZambelli38', 12095),
 ('renancalheiros', 10693),
 ('kimpaim', 10201),
 ('MarcosRogerio', 9335),
 ('Rconstantino', 9215),
 ('SenadoFederal', 8659),
 ('BolsonaroSP', 7826),
 ('folha', 7811),
 ('senadorhumberto', 7724),
 ('UOLNoticias', 7384),
 ('CNNBrasil', 6488),
 ('ArthurLira_', 6471),
 ('Metropoles', 6267),
 ('MidiaNINJA', 5837),
 ('CarlosBolsonaro', 5638),
 ('Sen_Alessandro', 5612)]

In [30]:
sorted(G_quoted.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('taoquei1', 4486),
 ('Metropoles', 3823),
 ('CarlaZambelli38', 3814),
 ('MidiaNINJA', 3742),
 ('kimpaim', 3604),
 ('jairbolsonaro', 3293),
 ('folha', 2701),
 ('Rconstantino', 2528),
 ('randolfeap', 2233),
 ('UOLNoticias', 2134),
 ('BolsonaroSP', 1927),
 ('Estadao', 1920),
 ('ALLuSapelli', 1873),
 ('MarceloFreixo', 1830),
 ('Biakicis', 1826),
 ('GuilhermeBoulos', 1794),
 ('desmentindobozo', 1716),
 ('bolsomito_2', 1660),
 ('jairmearrependi', 1630),
 ('g1', 1590)]

In [31]:
sorted(G_reply.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('taoquei1', 11353),
 ('jairbolsonaro', 10170),
 ('OmarAzizSenador', 9951),
 ('CarlaZambelli38', 8081),
 ('kimpaim', 6931),
 ('Rconstantino', 6868),
 ('randolfeap', 6661),
 ('SenadoFederal', 5795),
 ('BolsonaroSP', 5398),
 ('folha', 4982),
 ('UOLNoticias', 4726),
 ('senadorhumberto', 4695),
 ('o_antagonista', 4605),
 ('MarcosRogerio', 4392),
 ('CNNBrasil', 4112),
 ('CarlosBolsonaro', 3867),
 ('Jouberth19', 3801),
 ('profpaulamarisa', 3690),
 ('LaurinhaIronic', 3676),
 ('MarceloFreixo', 3546)]

In [32]:
sorted(G_mention.in_degree, key=lambda x: x[1], reverse=True)[:20]

[('jairbolsonaro', 11039),
 ('OmarAzizSenador', 10056),
 ('randolfeap', 8279),
 ('renancalheiros', 7554),
 ('MarcosRogerio', 4387),
 ('ArthurLira_', 3763),
 ('YouTube', 3639),
 ('SenadoFederal', 3124),
 ('SimoneTebetms', 3023),
 ('Sen_Alessandro', 2931),
 ('ottoalencar', 2698),
 ('senadorhumberto', 2602),
 ('LulaOficial', 2478),
 ('STF_oficial', 2379),
 ('ContaratoSenado', 2349),
 ('FlavioBolsonaro', 2247),
 ('elizianegama', 2051),
 ('rpsenador', 1982),
 ('EduGiraoOficial', 1843),
 ('SenadorRogerio', 1770)]

In [32]:
#all_nodes = []

#for u, d in list(G.nodes(data=True)):
    #all_nodes.append((u, d['displayname'], d['tt_id'], d['description']))
#    all_nodes.append((u, d['username'], d['displayname'], d['description']))
    
#all_nodes_df = pd.DataFrame(all_nodes, columns=['user', 'username', 'user_id', 'description'])
#all_nodes_df = pd.DataFrame(all_nodes, columns=['user_id', 'username', 'displayname', 'description'])
#all_nodes_df.to_csv(DATA_PATH_NT+'all_nodes.csv', index=False)

### User Weekly Participation

In [None]:
from IPython.display import clear_output, display

# NOTE(review): `nodes` is whatever the last assignment left behind -- in the visible
# cells that is the node table of the final week from the loading loop; confirm this
# is the intended user list.
nodes_list = nodes['Id'].values.tolist()

# Read each week's node list exactly once and keep its ids as a set. The previous
# version re-read every weekly parquet file for every user and searched a list.
weekly_ids = []
for week in week_list:
    clear_output(wait=True)
    print(f'Loading node list - week: {week}')
    week_nodes = pd.read_parquet(DATA_PATH_NT+f'{week}/{week}_node_list.parquet')
    weekly_ids.append(set(week_nodes['Id'].values.tolist()))

# For each user (same order as nodes_list), the number of weeks whose node list contains them.
user_count_list = [
    sum(1 for ids in weekly_ids if usrname in ids)
    for usrname in nodes_list
]

## Community Detection

In [2]:
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community.modularity_max import greedy_modularity_communities
from networkx.algorithms.community.label_propagation import asyn_lpa_communities

#comm_grivan = list(girvan_newman(G))

In [6]:
G = nx.read_gexf(DATA_PATH_NT+'complete_network.gexf')
G_quoted = nx.read_gexf(DATA_PATH_NT+'quoted_network.gexf')
G_reply = nx.read_gexf(DATA_PATH_NT+'reply_network.gexf')
G_mention = nx.read_gexf(DATA_PATH_NT+'mention_network.gexf')

In [7]:
# Greedy modularity communities of the reply graph, using the 'weight' edge attribute.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so in that run
# comm_modularity was never assigned and the len() cell after it would fail.
comm_modularity = list(greedy_modularity_communities(G_reply, weight='weight'))

KeyboardInterrupt: 

In [None]:
len(comm_modularity)

In [None]:
comm_label_prop =  list(asyn_lpa_communities(G, weight='weight', seed=None))

In [None]:
# comm_grivan is never assigned: the girvan_newman run next to the community-detection
# imports is commented out, so evaluating this cell raised NameError. Re-enable it
# together with that assignment.
#len(comm_grivan)

In [None]:
len(comm_label_prop)