# README

**Note**

The `data.json` file has the following structure:

- It is a dictionary and the keys are the initial target tags
- The values for each of these keys are lists of posts data dictionaries
- Each dictionary has information about the post and a list of tags

# Preparing Data

## Loading Previous Data

In [1]:
# link to access the data.json file
share_url = 'https://drive.google.com/open?id=1L0vboR9Y7u7VH6gwf78AlLCFh7p-iZ0D'

# link do download the data.json file
download_url = 'https://drive.google.com/uc?export=download&id=' + '1L0vboR9Y7u7VH6gwf78AlLCFh7p-iZ0D'

In [2]:
!wget "https://drive.google.com/uc?export=download&id=1L0vboR9Y7u7VH6gwf78AlLCFh7p-iZ0D" -nc -q -O "data.json"

In [3]:
# reading the data.json file

import json

file = open('data.json')

data_json = json.load(file)

## Creating List of Edges

In [4]:
# NOTE This cell is merely for tests

import re

pattern = '^[a-zA-Z0-9]+$'

tags_test = [
    'alsdnbvcliud', 
    '541dvfv', 
    '6546385', 
    'kdfvbdbdjfkvbsdfvgbdfbdkdfvhbdfvfdbvdfbfgnbfg', 
    'a', 
    'df.dfb', 
    '(', 
    'dfbvd,', 
    'adf$', 
    'vsdf#'
]

[tag for tag in tags_test if re.match(pattern, tag) and len(tag) < 25 and len(tag) > 1]

['alsdnbvcliud', '541dvfv', '6546385']

In [5]:
import re
    
def validate_tag(tag):

    MAX_LEN = 25
    MIN_LEN = 1

    pattern = '^[a-zA-Z0-9]+$'
    
    if re.match(pattern, tag) and len(tag) < MAX_LEN and len(tag) > MIN_LEN :
        return True
    else :
        return False

In [6]:
# trying a limitation in the number of posts
POSTS_MAX = 100

# this list contains just edges from initial target (keys) tags to related post tags
edges_list_keys = []

# this list contains all edges between pairs of tags from the same post
edges_list_all = []

# populating the lists of edges
for person, posts in data_json.items() :
    
    for post in posts[:POSTS_MAX] :
        
        post_tags = post['tags']
        
        post_tags = [tag for tag in post_tags if validate_tag(tag)]
        
        post_tags_drop_person = [tag for tag in post_tags if not tag == person]
        
        for tag in post_tags_drop_person :
            
            edge_keys = (person, tag)
            
            edges_list_keys.append( edge_keys )
        
        for tag in post_tags :
            
            tag_index = post_tags.index(tag)
            
            post_tags_slice = post_tags[tag_index+1:]
            
            # post_tags_drop_tag = [tagg for tagg in post_tags if not tagg == tag]
            
            for tagg in post_tags_slice :
                
                edge_all_pre = (tag, tagg)
                
                edge_all = ( min(edge_all_pre) , max(edge_all_pre) )
                
                edges_list_all.append( edge_all )

In [7]:
len(edges_list_keys)

10518

In [8]:
len(edges_list_all)

123209

In [9]:
edges_list_all[:10]

[('acasacaiu', 'comunismo'),
 ('acasacaiu', 'comunista'),
 ('acasacaiu', 'esquerda'),
 ('acasacaiu', 'esquerdista'),
 ('acasacaiu', 'lula'),
 ('acasacaiu', 'lulalivre'),
 ('comunismo', 'comunista'),
 ('comunismo', 'esquerda'),
 ('comunismo', 'esquerdista'),
 ('comunismo', 'lula')]

In [10]:
edges_list_keys[:10]

[('lula', 'acasacaiu'),
 ('lula', 'comunismo'),
 ('lula', 'comunista'),
 ('lula', 'esquerda'),
 ('lula', 'esquerdista'),
 ('lula', 'lulalivre'),
 ('lula', 'omecanismo'),
 ('lula', 'themecanism'),
 ('lula', 'bresil'),
 ('lula', 'brasil')]

# Handling List of All Edges

## Initial Graph

In [11]:
import networkx as nx

In [12]:
G = nx.from_edgelist(edges_list_all)

In [13]:
list(G.nodes)[:10]

['larenga',
 'amomuitotudoisso',
 'ladrao',
 'sozlesme',
 'emdefesadademocracia',
 'jeanwillys',
 'dem',
 'chiro',
 'evolution',
 'golpistaspagarao']

In [14]:
list(G.edges())[:10]

[('larenga', 'guasones'),
 ('larenga', 'patofontanet'),
 ('larenga', 'calamaro'),
 ('larenga', 'elbordo'),
 ('larenga', 'frasesdelrock'),
 ('larenga', 'cerati'),
 ('larenga', 'notevagustar'),
 ('larenga', 'sumo'),
 ('larenga', 'lavela'),
 ('larenga', 'discosderock')]

In [15]:
len(G.nodes)

2796

In [16]:
len(G.edges)

44477

In [17]:
# percentage from graph edges to list of edges
100 * len(G.edges)/len(edges_list_all)

36.09882394954914

**Note**

What should we do with duplicates edges which disappear when added to the graph? They could be counted as a wight parameter.

## Grouping and Counting Edges

In [18]:
import pandas as pd

In [19]:
edges_df = pd.DataFrame(edges_list_all, columns=['source', 'target'])

In [20]:
edges_df.head()

Unnamed: 0,source,target
0,acasacaiu,comunismo
1,acasacaiu,comunista
2,acasacaiu,esquerda
3,acasacaiu,esquerdista
4,acasacaiu,lula


In [21]:
# edges_df.to_csv('edges_list_all.csv')

In [22]:
edges_df['tuple'] = pd.Series(zip(edges_df.source, edges_df.target))

In [23]:
edges_df.head()

Unnamed: 0,source,target,tuple
0,acasacaiu,comunismo,"(acasacaiu, comunismo)"
1,acasacaiu,comunista,"(acasacaiu, comunista)"
2,acasacaiu,esquerda,"(acasacaiu, esquerda)"
3,acasacaiu,esquerdista,"(acasacaiu, esquerdista)"
4,acasacaiu,lula,"(acasacaiu, lula)"


In [24]:
edges_grouped = edges_df.groupby('tuple').count()

In [25]:
edges_grouped.sample(5)

Unnamed: 0_level_0,source,target
tuple,Unnamed: 1_level_1,Unnamed: 2_level_1
"(crucilandia, fotonoespelho)",1,1
"(jairbolsonaro, telegram)",1,1
"(flaviobolsonaro, stf)",2,2
"(pdt, tempo)",1,1
"(dilma, misionverdad)",1,1


**Note**

We can add the count for each connection between tags as a parameter of the edge.

Let's improve the dataframe fot this task.

In [26]:
edges_grouped.drop(columns='target', inplace=True, errors='ignore')

In [27]:
edges_grouped.columns=['weight']

In [28]:
edges_grouped.reset_index(inplace=True)

In [29]:
edges_grouped.sample(5)

Unnamed: 0,tuple,weight
40233,"(napoli, vintage)",1
28992,"(glenngreenwald, theintercept)",2
36069,"(liomessi, teambarca)",1
1196,"(alesilva, lavajatoeuapoio)",1
13762,"(ciro, familytime)",9


In [30]:
edges_grouped.shape

(44477, 2)

In [31]:
edges_grouped['source'] = edges_grouped.tuple.str[0]

In [32]:
edges_grouped['target'] = edges_grouped.tuple.str[1]

In [33]:
edges_grouped = edges_grouped.drop(columns='tuple')

In [34]:
edges_grouped.sample(5)

Unnamed: 0,weight,source,target
29289,1,goal,pesmobile
18147,1,delight,fcbarcelona
10494,6,brasilia,esquerda
40108,1,musiclover,pionnerdj
11662,1,callejon,forzanapolisempre


In [35]:
# edges_grouped.to_csv('edges_counted.csv')

**Note**

Now, let's finally create the graph.

## Creating New Graph

In [36]:
G = nx.from_pandas_edgelist(edges_grouped, edge_attr=True)

In [37]:
list(G.nodes)[:10]

['larenga',
 'amomuitotudoisso',
 'leo',
 'sozlesme',
 'emdefesadademocracia',
 'jeanwillys',
 'dem',
 'chiro',
 'evolution',
 'golpistaspagarao']

In [38]:
list(G.edges(data=True))[:10]

[('larenga', 'patofontanet', {'weight': 5}),
 ('larenga', 'calamaro', {'weight': 1}),
 ('larenga', 'elbordo', {'weight': 5}),
 ('larenga', 'losredondos', {'weight': 6}),
 ('larenga', 'guasones', {'weight': 5}),
 ('larenga', 'notevagustar', {'weight': 1}),
 ('larenga', 'sumo', {'weight': 4}),
 ('larenga', 'lavela', {'weight': 5}),
 ('larenga', 'discosderock', {'weight': 1}),
 ('larenga', 'attaque77', {'weight': 1})]

In [39]:
len(G.nodes)

2796

In [40]:
len(G.edges)

44477

**Note**

We have the same number of nodes and edges, but now with the weight.

In [41]:
# the same percetual as before, but now with the grouped dataframe
100 * len(G.edges)/edges_grouped.shape[0]

100.0

In [42]:
nx.write_graphml(G, "edges_counted_" + str(POSTS_MAX) + ".graphml")

**Note**

Lets's take a closer look.

Let's check the most important nodes.

## Inspecting Edges

In [43]:
edges_grouped.sort_values(by='weight', ascending=False).head(10)

Unnamed: 0,weight,source,target
21904,198,elenao,lulalivre
37306,175,lulalivre,manueladavila
21726,162,eleicoes2018,manueladavila
30728,153,haddad,lulalivre
36755,138,lula,lulalivre
6972,127,bolsonaro,brasil
7445,115,bolsonaro,moro
21854,113,elenao,haddad
21725,109,eleicoes2018,lulalivre
6938,108,bolsonaro,bolsonaro2018


In [44]:
# defining masks to select data

mask_source_lulalivre = edges_grouped.source == 'lulalivre'
mask_source_lulapresopolitico = edges_grouped.source == 'lulapresopolitico'

mask_target_lulalivre = edges_grouped.target == 'lulalivre'
mask_target_lulapresopolitico = edges_grouped.target == 'lulapresopolitico'

In [45]:
edges_grouped[mask_source_lulalivre & mask_target_lulapresopolitico]

Unnamed: 0,weight,source,target
37298,54,lulalivre,lulapresopolitico


In [46]:
edges_grouped[mask_target_lulalivre & mask_source_lulapresopolitico]

Unnamed: 0,weight,source,target


**Note**

No pair of tags is duplicated.

## Inspecting Weights

In [47]:
edges_grouped.weight.sort_values(ascending=False).sample(15)

15764    2
7533     1
23660    1
40915    2
39090    5
3295     1
43344    1
20283    1
36390    1
30238    1
39303    2
21110    1
38083    1
39350    1
37542    5
Name: weight, dtype: int64

In [48]:
weight_counts = edges_grouped.weight.value_counts().sort_index(ascending=False)

In [49]:
weight_counts.head(10)

198    1
175    1
162    1
153    1
138    1
127    1
115    1
113    1
109    1
108    1
Name: weight, dtype: int64

In [50]:
weight_counts.tail(15)

15       69
14       56
13       70
12      330
11       70
10      154
9       186
8       470
7       426
6       780
5       914
4      2085
3      3079
2      6460
1     28090
Name: weight, dtype: int64

**Note**

Most ot the edges are insignificant and can be dropped to a better visual analysis.

## Dropgging Insignificant Edges

In [51]:
TRESHOLD = 5

mask_insignificant = edges_grouped.weight.apply(lambda x : x <= TRESHOLD)

In [52]:
edges_grouped_dropped = edges_grouped[~mask_insignificant]

In [53]:
edges_grouped_dropped.weight.value_counts().sort_index(ascending=False).head(10)

198    1
175    1
162    1
153    1
138    1
127    1
115    1
113    1
109    1
108    1
Name: weight, dtype: int64

In [54]:
edges_grouped_dropped.weight.value_counts().sort_index(ascending=False).tail(15)

20    100
19     29
18     58
17     28
16    214
15     69
14     56
13     70
12    330
11     70
10    154
9     186
8     470
7     426
6     780
Name: weight, dtype: int64

In [55]:
# recreating the graph
G_dropped = nx.from_pandas_edgelist(edges_grouped_dropped, edge_attr=True)

## Selfloop Edges

In [56]:
list(G_dropped.selfloop_edges(data=True))[:10]

[('bolsominionsarrependidos', 'bolsominionsarrependidos'),
 ('portugal', 'portugal'),
 ('familia', 'familia'),
 ('guedes', 'guedes'),
 ('pacoteanticrimeja', 'pacoteanticrimeja'),
 ('conservadores', 'conservadores'),
 ('pt', 'pt'),
 ('moro', 'moro'),
 ('lulapresopolitico', 'lulapresopolitico'),
 ('bolsonaro', 'bolsonaro')]

In [57]:
len(list(G_dropped.selfloop_edges(data=True)))

35

In [58]:
nx.write_graphml(G_dropped, "edges_counted_" + str(POSTS_MAX) + "_dropped.graphml")

## Plotting Graph

In [59]:
import matplotlib.pyplot as plt

In [60]:
%%time

# turn to False to disable a long time operation
if False :

    nx.draw(G)

    plt.show()

CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 22.4 µs


# Handling List of Key Edges

## Creating Keys Graph

It could be interesting also examine the graph of edges between the initial tags and all others.

In [61]:
edges_list_keys[:10]

[('lula', 'acasacaiu'),
 ('lula', 'comunismo'),
 ('lula', 'comunista'),
 ('lula', 'esquerda'),
 ('lula', 'esquerdista'),
 ('lula', 'lulalivre'),
 ('lula', 'omecanismo'),
 ('lula', 'themecanism'),
 ('lula', 'bresil'),
 ('lula', 'brasil')]

In [62]:
g = nx.from_edgelist(edges_list_keys)

In [63]:
len(g.nodes)

2799

In [64]:
len(g.edges)

3817

In [65]:
# percentage from graph edges to list of edges
100 * len(g.edges)/len(edges_list_keys)

36.29016923369462

**Note**

The same problem occourred with the percentage.

## Grouping and Counting Keys Edges

In [66]:
edges_df_keys = pd.DataFrame(edges_list_keys, columns=['source', 'target'])

In [67]:
edges_df_keys.sample(5)

Unnamed: 0,source,target
6047,ciro,leomessi
3715,bolsonaro,mexico
9719,haddad,pmsp
1146,lula,tipografia
10468,haddad,lulalivre


In [68]:
# edges_df_keys.to_csv('edges_list_keys.csv')

In [69]:
edges_df_keys['tuple'] = pd.Series(zip(edges_df_keys.source, edges_df_keys.target))

In [70]:
edges_df_keys.sample(5)

Unnamed: 0,source,target,tuple
8339,haddad,elesim,"(haddad, elesim)"
6166,ciro,ciromessi,"(ciro, ciromessi)"
3197,bolsonaro,euapoioalavajato,"(bolsonaro, euapoioalavajato)"
4622,dilma,tercainsana18anos,"(dilma, tercainsana18anos)"
9706,haddad,obrasilfelizdenovo,"(haddad, obrasilfelizdenovo)"


In [71]:
edges_grouped_keys = edges_df_keys.groupby('tuple').count()

In [72]:
edges_grouped_keys.sample(5)

Unnamed: 0_level_0,source,target
tuple,Unnamed: 1_level_1,Unnamed: 2_level_1
"(guedes, joaofelix)",1,1
"(ciro, lulanobeldapaz)",1,1
"(ciro, frenchie)",1,1
"(bolsonaro, portedearma)",1,1
"(lula, 54)",1,1


In [73]:
edges_grouped_keys.drop(columns='target', inplace=True, errors='ignore')

In [74]:
edges_grouped_keys.columns=['weight']

In [75]:
edges_grouped_keys.reset_index(inplace=True)

In [76]:
edges_grouped_keys.sample(5)

Unnamed: 0,tuple,weight
536,"(bolsonaro, repost)",1
207,"(bolsonaro, ditadura)",2
2896,"(lula, gay)",1
3032,"(lula, lulusantosoficial)",1
1268,"(ciro, sons)",3


In [77]:
edges_grouped_keys['source'] = edges_grouped_keys.tuple.str[0]

In [78]:
edges_grouped_keys['target'] = edges_grouped_keys.tuple.str[1]

In [79]:
edges_grouped_keys.shape

(3828, 4)

In [80]:
edges_grouped_keys = edges_grouped_keys.drop(columns='tuple')

In [81]:
edges_grouped_keys.sample(5)

Unnamed: 0,weight,source,target
3170,5,lula,presidente
3404,1,moro,cesarebeccaria
190,22,bolsonaro,direita
72,2,bolsonaro,bolsonaromerepresenta
477,2,bolsonaro,patriotas


In [82]:
# edges_grouped_keys.to_csv('edges_counted_keys.csv')

## Creating New Keys Graph

In [83]:
g = nx.from_pandas_edgelist(edges_grouped_keys, edge_attr=True)

In [84]:
list(g.nodes)[:10]

['larenga',
 'amomuitotudoisso',
 'ladrao',
 'sozlesme',
 'emdefesadademocracia',
 'jeanwillys',
 'dem',
 'evolution',
 'condenarsemprovasecrime',
 'documentario']

In [85]:
list(g.edges(data=True))[:10]

[('larenga', 'ciro', {'weight': 6}),
 ('amomuitotudoisso', 'guedes', {'weight': 16}),
 ('ladrao', 'dilma', {'weight': 1}),
 ('sozlesme', 'ciro', {'weight': 1}),
 ('emdefesadademocracia', 'lula', {'weight': 1}),
 ('emdefesadademocracia', 'haddad', {'weight': 1}),
 ('jeanwillys', 'dilma', {'weight': 4}),
 ('dem', 'bolsonaro', {'weight': 1}),
 ('evolution', 'ciro', {'weight': 1}),
 ('condenarsemprovasecrime', 'lula', {'weight': 1})]

In [86]:
len(g.nodes)

2799

In [87]:
len(g.edges)

3817

In [88]:
# the same percetual as before, but now with the grouped dataframe
100 * len(g.edges)/edges_grouped_keys.shape[0]

99.71264367816092

In [89]:
nx.write_graphml(g, "edges_counted_keys_" + str(POSTS_MAX) + ".graphml")

## Inspecting Keys Edges

In [90]:
edges_grouped_keys.sample(10)

Unnamed: 0,weight,source,target
3472,1,moro,doutrinaespirita
1649,4,dilma,repost
1985,1,guedes,instasoccer
3442,1,moro,deliciousfood
3817,2,moro,vazajatotheintercept
1124,1,ciro,network
2609,1,haddad,saporrasincero
241,1,bolsonaro,euapoiolavajato
3718,1,moro,pintura
364,4,bolsonaro,juizsergiomoro


In [91]:
edges_grouped_keys.isnull().sum()

weight    0
source    0
target    0
dtype: int64

In [92]:
edges_grouped_keys.weight.sort_values().unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  41,  42,  43,  44,
        46,  48,  49,  51,  54,  55,  62,  86, 113])

In [93]:
# checking for empty tags
edges_grouped_keys.source.apply( lambda x : x is '' ).sum()

0

In [94]:
# checking for empty tags
edges_grouped_keys.target.apply( lambda x : x is '' ).sum()

0

In [95]:
# checking for self loop edges
list(g.selfloop_edges())

[]

In [96]:
# checking for swapped key tags

key_tags = edges_grouped_keys.source.unique().tolist()

mask_key_tags = edges_grouped_keys.target.isin(key_tags)

edges_grouped_keys[mask_key_tags]

Unnamed: 0,weight,source,target
187,2,bolsonaro,dilma
322,2,bolsonaro,haddad
381,7,bolsonaro,lula
430,21,bolsonaro,moro
704,7,ciro,bolsonaro
787,1,ciro,dilma
923,5,ciro,haddad
1051,4,ciro,lula
1110,4,ciro,moro
1382,36,dilma,bolsonaro


**Note**

The number of swapped key tags is low. These ones can be handled in Gephi later by merge operations.

## Plotting Keys Graph

In [97]:
import matplotlib.pyplot as plt

In [98]:
%%time

# just to disable a long time operation
if False :
    
    nx.draw(g)

    plt.show()

CPU times: user 10 µs, sys: 1 µs, total: 11 µs
Wall time: 21.5 µs


# Node Weights

In [99]:
node_weights = {}

for person, posts in data_json.items() :
    
    for post in posts[:POSTS_MAX] :
        
        post_tags = post['tags']
        
        post_tags = [tag for tag in post_tags if validate_tag(tag)]
        
        for tag in post_tags :
            
            if tag in node_weights : 
                node_weights[tag] = node_weights[tag] + 1
            else :
                node_weights[tag] = 1

In [100]:
list(G.nodes(data=True))[:10]

[('larenga', {}),
 ('amomuitotudoisso', {}),
 ('leo', {}),
 ('sozlesme', {}),
 ('emdefesadademocracia', {}),
 ('jeanwillys', {}),
 ('dem', {}),
 ('chiro', {}),
 ('evolution', {}),
 ('golpistaspagarao', {})]

In [101]:
list(g.nodes(data=True))[:10]

[('larenga', {}),
 ('amomuitotudoisso', {}),
 ('ladrao', {}),
 ('sozlesme', {}),
 ('emdefesadademocracia', {}),
 ('jeanwillys', {}),
 ('dem', {}),
 ('evolution', {}),
 ('condenarsemprovasecrime', {}),
 ('documentario', {})]

In [102]:
len(G.nodes)

2796

In [104]:
len(G_dropped.nodes)

441

In [103]:
len(g.nodes)

2799

In [116]:
nx.set_node_attributes(G, node_weights, 'weight')

In [117]:
list(G.nodes(data=True))[:10]

[('larenga', {'weight': 6}),
 ('amomuitotudoisso', {'weight': 16}),
 ('leo', {'weight': 11}),
 ('sozlesme', {'weight': 1}),
 ('emdefesadademocracia', {'weight': 2}),
 ('jeanwillys', {'weight': 4}),
 ('dem', {'weight': 1}),
 ('chiro', {'weight': 1}),
 ('evolution', {'weight': 1}),
 ('golpistaspagarao', {'weight': 1})]

In [122]:
nx.write_graphml(G, "edges_counted_" + str(POSTS_MAX) + "_nw.graphml")

In [118]:
nx.set_node_attributes(G_dropped, node_weights, 'weight')

In [119]:
list(G_dropped.nodes(data=True))[:10]

[('antoroccuzzo', {'weight': 12}),
 ('larenga', {'weight': 6}),
 ('bolsominionsarrependidos', {'weight': 21}),
 ('amomuitotudoisso', {'weight': 16}),
 ('leo', {'weight': 11}),
 ('digitalpainting', {'weight': 3}),
 ('fcbarcelona', {'weight': 8}),
 ('goiania', {'weight': 8}),
 ('galeria', {'weight': 3}),
 ('grevegeral', {'weight': 6})]

In [123]:
nx.write_graphml(G_dropped, "edges_counted_" + str(POSTS_MAX) + "_dropped_nw.graphml")

In [120]:
nx.set_node_attributes(g, node_weights, 'weight')

In [121]:
list(G.nodes(data=True))[:10]

[('larenga', {'weight': 6}),
 ('amomuitotudoisso', {'weight': 16}),
 ('leo', {'weight': 11}),
 ('sozlesme', {'weight': 1}),
 ('emdefesadademocracia', {'weight': 2}),
 ('jeanwillys', {'weight': 4}),
 ('dem', {'weight': 1}),
 ('chiro', {'weight': 1}),
 ('evolution', {'weight': 1}),
 ('golpistaspagarao', {'weight': 1})]

In [124]:
nx.write_graphml(g, "edges_counted_keys" + str(POSTS_MAX) + "_nw.graphml")

# Discarded