# Data preprocessing

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt 
import seaborn as sns
from pandas.api.types import union_categoricals

### Read dataset

In [2]:
data = pd.read_csv('../data/processed_data/processed_data_2015_onward.csv')

In [3]:
data.head(5)

Unnamed: 0,EventDate,Actor1CountryCode,Actor2CountryCode,GoldsteinScale_Summed,AvgTone_Summed,NumMentions_averaged,NumArticles_averaged
0,2015-01-31,AFG,PAK,8.5,2.205358,4.125,4.125
1,2015-01-31,AFG,USA,25.4,-25.458712,7.933333,7.933333
2,2015-01-31,AFR,BDI,-2.0,0.706714,5.0,5.0
3,2015-01-31,AFR,USA,2.8,5.889282,10.0,10.0
4,2015-01-31,AFR,ZAF,-30.0,-17.795758,4.0,4.0


### Alphabetical order transofrmation

In [4]:
# Remove tuples that contain the same two countries but in different order by averaging the values from both tuples (ex. AFG -PAK and PAK - AFG)

In [5]:
for i in range(data.shape[0]):
    if data.iloc[i, 2] < data.iloc[i, 1]:
        c1 = data.iloc[i, 2]
        data.iloc[i, 2] = data.iloc[i, 1]
        data.iloc[i, 1] = c1

KeyboardInterrupt: 

In [None]:
data = data.groupby(['EventDate', 'Actor1CountryCode',	'Actor2CountryCode'])[['GoldsteinScale_Summed',	'AvgTone_Summed', 'NumMentions_averaged', 'NumArticles_averaged']].mean().reset_index()

In [None]:
data.head(5)

### Familiarization

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.shape

# Graph building

In [None]:
# extract nodes from data - the unique country codes
a1 = pd.Categorical(data['Actor1CountryCode'])
a2 = pd.Categorical(data['Actor2CountryCode'])
nodes = union_categoricals([a1, a2]).categories.to_list()
nodes

In [None]:
g = nx.MultiGraph()

In [None]:
g.nodes

In [None]:
g.edges

In [None]:
for i in range(data.shape[0]):
    if data.iloc[i,0] == '2015-01-31':
        g.add_edge(data.iloc[i,1], data.iloc[i,2], data.iloc[i,3])
        g.add_edge(data.iloc[i,1], data.iloc[i,2], data.iloc[i,4])
        g.add_edge(data.iloc[i,1], data.iloc[i,2], data.iloc[i,5])
        g.add_edge(data.iloc[i,1], data.iloc[i,2], data.iloc[i,6])
    else:
        break 

In [None]:
g.edges

In [None]:
for (u,v,d) in g.edges:
    print((u,v,d))

In [None]:
elarge=[(u,v) for (u,v,d) in g.edges if d >0.5]
esmall=[(u,v) for (u,v,d) in g.edges if d <=0.5]

plt.figure(figsize=(30, 50))
pos=nx.spring_layout(g) # positions for all nodes

# nodes
nx.draw_networkx_nodes(g,pos,node_size=300)

# edges
nx.draw_networkx_edges(g,pos,edgelist=elarge,
                    width=6)
nx.draw_networkx_edges(g,pos,edgelist=esmall,
                    width=6,alpha=0.5,edge_color='b',style='dashed')

# labels
nx.draw_networkx_labels(g,pos,font_size=7,font_family='sans-serif')

plt.axis('off')
plt.title('January 2015')
plt.savefig("weighted_graph.png") # save as png
plt.show() # display

### Building graph with attributes on edges

In [None]:
g = nx.MultiGraph()

In [None]:
for i in range(data.shape[0]):
    if data.iloc[i,0] == '2015-01-31':
        g.add_edge(data.iloc[i,1], data.iloc[i,2], glt_sc = data.iloc[i,3], avg_tone = data.iloc[i,4], num_mentions= data.iloc[i,5], num_articles= data.iloc[i,6])
    else:
        break 

In [None]:
g.edges()

In [None]:
for (u,v,d) in g.edges(data=True):
    print((u,v,d))