In [52]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm_notebook

In [2]:
df = pd.read_csv("data/mentions-month.gzip", compression='gzip')
df

Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,MentionType,MentionSourceName,MentionIdentifier,SentenceID,Actor1CharOffset,Actor2CharOffset,ActionCharOffset,InRawText,Confidence,MentionDocLen,MentionDocTone,MentionDocTranslationInfo,Extras
0,803346281,20181117110000,20181117110000,1,dw.com,https://www.dw.com/pt-br/cia-conclui-que-pr%C3...,38,10466,-1,10499,1,100,11578,-3.069926,srclc:por;eng:GT-POR 1.0,
1,803296669,20181117043000,20181117110000,1,dw.com,https://www.dw.com/pt-br/cia-conclui-que-pr%C3...,28,7486,-1,7514,1,100,11578,-3.069926,srclc:por;eng:GT-POR 1.0,
2,803346663,20181117110000,20181117110000,1,dw.com,https://www.dw.com/pt-br/cia-conclui-que-pr%C3...,35,9769,9747,9721,1,100,11578,-3.069926,srclc:por;eng:GT-POR 1.0,
3,803281180,20181117024500,20181117110000,1,dw.com,https://www.dw.com/pt-br/cia-conclui-que-pr%C3...,36,10017,10090,10078,1,100,11578,-3.069926,srclc:por;eng:GT-POR 1.0,
4,796045333,20181020014500,20181020180000,1,dw.com,https://www.dw.com/pt-br/o-peso-da-rejei%C3%A7...,23,6401,-1,6424,1,100,10251,-0.256739,srclc:por;eng:GT-POR 1.0,
5,796154309,20181020160000,20181020201500,1,dw.com,https://www.dw.com/pt-002/ar%C3%A1bia-saudita-...,21,4746,4608,4795,1,100,7639,-3.951890,srclc:por;eng:GT-POR 1.0,
6,796154322,20181020160000,20181020201500,1,dw.com,https://www.dw.com/pt-002/ar%C3%A1bia-saudita-...,34,7164,-1,7128,1,100,7639,-3.951890,srclc:por;eng:GT-POR 1.0,
7,796262881,20181021074500,20181021074500,1,dw.com,https://www.dw.com/es/al-wazir-cada-voto-verde...,28,-1,7740,7717,1,100,7722,0.075131,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
8,796263317,20181021074500,20181021074500,1,dw.com,https://www.dw.com/es/al-wazir-cada-voto-verde...,21,6044,-1,6008,1,100,7722,0.075131,srclc:spa;eng:Moses 2.1.1 / MosesCore Europarl...,
9,797945242,20181027110000,20181027110000,1,dw.com,https://www.dw.com/id/jerman-turki-perancis-da...,35,8247,8317,8267,1,100,12564,-5.898416,srclc:ind;eng:GT-IND 1.0,


In [3]:
# sanity check : we only have articles (MentionType 1)
df[df.MentionType!=1]

Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,MentionType,MentionSourceName,MentionIdentifier,SentenceID,Actor1CharOffset,Actor2CharOffset,ActionCharOffset,InRawText,Confidence,MentionDocLen,MentionDocTone,MentionDocTranslationInfo,Extras


In [4]:
# distribution of nb of articles by sources
nb_articles = df.groupby("MentionSourceName").size()
nb_articles.describe()

count       50.000000
mean     14994.540000
std      10704.597264
min       6754.000000
25%       7575.250000
50%      10240.000000
75%      19197.000000
max      57083.000000
dtype: float64

In [5]:
sources = set(df.MentionSourceName)

In [53]:
domains = { s.split(".")[-1] for s in sources }
len(domains)

17

In [47]:
threshold = 300
nodes = []
edges = []
new_sources = sources

for source1 in tqdm_notebook(sources):
    nodes.append({ 'id' : source1, 'size' : int(nb_articles[source1]), 'group' : source1.split(".")[-1] }) 
    events1 = set(df[df.MentionSourceName==source1].GLOBALEVENTID)
    new_sources = new_sources - {source1}
    
    for source2 in new_sources:
        if source1==source2:
            continue
            
        events2 = set(df[df.MentionSourceName==source2].GLOBALEVENTID)
        w = len(events1 & events2)
        
        if w > threshold:
            edges.append({'source': source1, 'target': source2, 'weight': w})  

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [48]:
nodes

[{'id': 'wkrb13.com', 'size': 10117, 'group': 'com'},
 {'id': 'dailymail.co.uk', 'size': 21132, 'group': 'uk'},
 {'id': 'iheart.com', 'size': 57083, 'group': 'com'},
 {'id': 'ziarelive.ro', 'size': 28148, 'group': 'ro'},
 {'id': 'special.tass.ru', 'size': 9551, 'group': 'ru'},
 {'id': 'globo.com', 'size': 16610, 'group': 'com'},
 {'id': 'focus-news.net', 'size': 7151, 'group': 'net'},
 {'id': 'wafa.ps', 'size': 41062, 'group': 'ps'},
 {'id': 'thenews.com.pk', 'size': 8715, 'group': 'pk'},
 {'id': 'time.mk', 'size': 20794, 'group': 'mk'},
 {'id': 'xinhuanet.com', 'size': 8538, 'group': 'com'},
 {'id': 'yahoo.com', 'size': 30293, 'group': 'com'},
 {'id': 'thehindu.com', 'size': 7060, 'group': 'com'},
 {'id': 'indiatimes.com', 'size': 19869, 'group': 'com'},
 {'id': 'eleconomista.es', 'size': 7282, 'group': 'es'},
 {'id': 'msn.com', 'size': 21790, 'group': 'com'},
 {'id': 'regnum.ru', 'size': 10383, 'group': 'ru'},
 {'id': 'english.wafa.ps', 'size': 17181, 'group': 'ps'},
 {'id': 'washing

In [49]:
len(edges)

105

In [50]:
network = {'nodes': nodes, 'edges': edges}

In [51]:
with open("data/network-month-"+str(threshold)+".json", "w") as f:
    json.dump(network, f, indent=2)

In [67]:
cluster_centers = {}
h = 800
k = 400
r = 200
for d,i in zip(domains, np.linspace(0, 2*np.pi, len(domains))):
    cluster_centers[d] = {"x": r*np.cos(i) + h, "y":r*np.sin(i) + k}

In [68]:
cluster_centers

{'pk': {'x': 1000.0, 'y': 400.0},
 'ps': {'x': 984.7759065022574, 'y': 476.53668647301794},
 'uk': {'x': 941.4213562373095, 'y': 541.4213562373095},
 'mx': {'x': 876.536686473018, 'y': 584.7759065022574},
 'es': {'x': 800.0, 'y': 600.0},
 'nz': {'x': 723.463313526982, 'y': 584.7759065022574},
 'ru': {'x': 658.5786437626905, 'y': 541.4213562373095},
 'bg': {'x': 615.2240934977426, 'y': 476.536686473018},
 'com': {'x': 600.0, 'y': 400.0},
 'ro': {'x': 615.2240934977426, 'y': 323.46331352698206},
 'net': {'x': 658.5786437626905, 'y': 258.5786437626905},
 'cz': {'x': 723.4633135269819, 'y': 215.2240934977427},
 'it': {'x': 800.0, 'y': 200.0},
 'cu': {'x': 876.536686473018, 'y': 215.22409349774267},
 'br': {'x': 941.4213562373095, 'y': 258.5786437626905},
 'kr': {'x': 984.7759065022573, 'y': 323.4633135269819},
 'mk': {'x': 1000.0, 'y': 399.99999999999994}}