## Notebook Content

In this notebook we create the third dataset (`dataset3.pkl`), which contains graph-level and node-level features derived from the graph, together with the political party of each node.

In [1]:
import pickle

import networkx as nx
import pandas as pd

import feature_engineering

Load the graph

In [2]:
# nx.read_gpickle no longer exists in NetworkX 3.0; a gpickle file is an
# ordinary pickle, so it is loaded with the standard library instead.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('data/graph_61', 'rb') as graph_file:
    G = pickle.load(graph_file)

In [3]:
# A graph holds each node exactly once, so its node count already is the
# number of unique nodes.
print("The graph has", G.number_of_nodes(), " unique nodes.")

The graph has 24564  unique nodes.


### 1. Add the following graph features
1. Graph clustering coefficient
2. Graph transitivity
3. Graph average degree centrality
4. Graph connectivity
5. Graph strongly connected subgraph

###### For the nodes of the strongly connected graph, also add (all other nodes get 0):
6. Graph periphery
7. Graph center

In [4]:
# Placeholder frame for the graph-level features; it is populated by the
# feature-engineering step that follows.
graph_features = pd.DataFrame()

In [None]:
# Compute the graph-level features; the helper also returns the graph that the
# notebook refers to as the strongly connected one (G_strongly).
# A fresh, empty frame is passed on every run so that re-executing this cell
# never feeds previously computed columns back into the helper.
# NOTE(review): the saved output reports graph_average_degree_centrality =
# 12281.5, which equals the mean of 0..24563 (the graph has 24564 nodes) and is
# not a plausible average of the per-node degree centralities shown further
# down (all far below 1) -- check the averaging inside
# feature_engineering.add_graph_characteristics.
graph_features, G_strongly = feature_engineering.add_graph_characteristics(G, pd.DataFrame())
graph_features

In [12]:
# Persist both results so later cells can reload them instead of recomputing.
# nx.write_gpickle no longer exists in NetworkX 3.0; it wrote an ordinary
# pickle, so the graph is dumped with the standard library (same file format).
with open('data/graph61_strongly', 'wb') as graph_file:
    pickle.dump(G_strongly, graph_file, protocol=pickle.HIGHEST_PROTOCOL)
graph_features.to_pickle('data/graph_features.pkl')

In [14]:
# Sanity check: how many rows are flagged as belonging to the graph periphery.
graph_features['belongs_to_graph_periphery'].value_counts()

0    24539
1       25
Name: belongs_to_graph_periphery, dtype: int64

In [15]:
# Sanity check: how many rows are flagged as belonging to the graph center.
graph_features['belongs_to_graph_center'].value_counts()

0    24563
1        1
Name: belongs_to_graph_center, dtype: int64

### 2. Add the following node features
1. Node degree
2. Node clustering coefficient
3. Node eccentricity
4. Node degree centrality
5. Node closeness centrality
6. Node betweenness centrality
7. Node pagerank
8. Node hits

In [2]:
# Checkpoint: reload the graph, the strongly connected graph and the
# graph-level features from disk so this section can start from a fresh kernel.
# nx.read_gpickle no longer exists in NetworkX 3.0; gpickle files are ordinary
# pickles, so they are loaded with the standard library instead.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('data/graph_61', 'rb') as graph_file:
    G = pickle.load(graph_file)
with open('data/graph61_strongly', 'rb') as graph_file:
    G_strongly = pickle.load(graph_file)
graph_features = pd.read_pickle('data/graph_features.pkl')
graph_features

Unnamed: 0,graph_cc,graph_transitivity,graph_average_degree_centrality,is_connected,belongs_to_strongly_connected,belongs_to_graph_periphery,belongs_to_graph_center
0,0.198505,0.046533,12281.5,0,1,0,0
1,0.198505,0.046533,12281.5,0,1,0,0
2,0.198505,0.046533,12281.5,0,1,0,0
3,0.198505,0.046533,12281.5,0,1,0,0
4,0.198505,0.046533,12281.5,0,1,0,0
...,...,...,...,...,...,...,...
24559,0.198505,0.046533,12281.5,0,1,0,0
24560,0.198505,0.046533,12281.5,0,1,0,0
24561,0.198505,0.046533,12281.5,0,1,0,0
24562,0.198505,0.046533,12281.5,0,1,0,0


In [3]:
# Compute the node-level features from the full graph, the graph-level feature
# table and the strongly connected graph; the result is displayed below.
graph_node_features = feature_engineering.add_node_characteristics(
    G,
    graph_features,
    G_strongly,
)
graph_node_features

in eccentricity
in closeness_centrality
in betweenness_centrality


  A = nx.adjacency_matrix(G, nodelist=list(G), dtype=float)


Unnamed: 0,graph_cc,graph_transitivity,graph_average_degree_centrality,is_connected,belongs_to_strongly_connected,belongs_to_graph_periphery,belongs_to_graph_center,node_degree,node_cc,graph_eccentricity,node_degree_centrality,node_closeness_centrality,node_betweenness_centrality,node_pageRank,node_hub,node_authority
0,0.198505,0.046533,12281.5,0,1,0,0,8,0.321429,9,0.000326,0.326405,1.068907e-06,0.000018,3.457544e-05,3.457544e-05
1,0.198505,0.046533,12281.5,0,1,0,0,726,0.071876,9,0.029557,0.411162,3.379304e-03,0.001237,1.145977e-03,1.145977e-03
2,0.198505,0.046533,12281.5,0,1,0,0,27,0.108262,10,0.001099,0.315918,1.449296e-06,0.000049,2.901689e-05,2.901689e-05
3,0.198505,0.046533,12281.5,0,1,0,0,2,0.000000,11,0.000081,0.263466,8.053312e-08,0.000009,4.879332e-07,4.879332e-07
4,0.198505,0.046533,12281.5,0,1,0,0,350,0.070556,10,0.014249,0.358946,1.663929e-03,0.000757,1.104268e-04,1.104268e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24559,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,10,0.000041,0.307084,0.000000e+00,0.000006,1.058974e-05,1.058974e-05
24560,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,10,0.000041,0.256873,0.000000e+00,0.000006,3.064330e-07,3.064330e-07
24561,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,11,0.000041,0.240722,0.000000e+00,0.000006,6.473794e-08,6.473794e-08
24562,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,11,0.000041,0.240722,0.000000e+00,0.000006,6.473794e-08,6.473794e-08


In [4]:
# Checkpoint the combined graph- and node-level feature table to disk.
graph_node_features.to_pickle('data/graph_node_features.pkl')

In [4]:
# Reload the feature table checkpoint so the remaining cells can run without
# recomputing the node features.
graph_node_features = pd.read_pickle('data/graph_node_features.pkl')
graph_node_features

Unnamed: 0,graph_cc,graph_transitivity,graph_average_degree_centrality,is_connected,belongs_to_strongly_connected,belongs_to_graph_periphery,belongs_to_graph_center,node_degree,node_cc,graph_eccentricity,node_degree_centrality,node_closeness_centrality,node_betweenness_centrality,node_pageRank,node_hub,node_authority
0,0.198505,0.046533,12281.5,0,1,0,0,8,0.321429,9,0.000326,0.326405,1.068907e-06,0.000018,3.457544e-05,3.457544e-05
1,0.198505,0.046533,12281.5,0,1,0,0,726,0.071876,9,0.029557,0.411162,3.379304e-03,0.001237,1.145977e-03,1.145977e-03
2,0.198505,0.046533,12281.5,0,1,0,0,27,0.108262,10,0.001099,0.315918,1.449296e-06,0.000049,2.901689e-05,2.901689e-05
3,0.198505,0.046533,12281.5,0,1,0,0,2,0.000000,11,0.000081,0.263466,8.053312e-08,0.000009,4.879332e-07,4.879332e-07
4,0.198505,0.046533,12281.5,0,1,0,0,350,0.070556,10,0.014249,0.358946,1.663929e-03,0.000757,1.104268e-04,1.104268e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24559,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,10,0.000041,0.307084,0.000000e+00,0.000006,1.058974e-05,1.058974e-05
24560,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,10,0.000041,0.256873,0.000000e+00,0.000006,3.064330e-07,3.064330e-07
24561,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,11,0.000041,0.240722,0.000000e+00,0.000006,6.473794e-08,6.473794e-08
24562,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,11,0.000041,0.240722,0.000000e+00,0.000006,6.473794e-08,6.473794e-08


Add the political party of each node

In [2]:
# Load the preprocessed node attributes; the table shown below includes the
# `party` column that is used as the label.
party = pd.read_pickle('data/node_attributes_preprocessed.pkl')
party

Unnamed: 0,followers,following,total_tweets,lists,twitter_age,verified,party
0,166,158,1547,0,3061,0,left
1,11593,3234,133817,39,1463,0,left
3,390,423,11046,6,999,0,middle
4,4428,1406,40478,19,517,0,right
5,1506,729,165786,2,1779,0,right
...,...,...,...,...,...,...,...
24597,154,158,4034,0,325,0,left
24598,490,853,2123,5,835,0,left
24600,520,520,16610,1,2610,0,neutral
24601,3,42,14,0,355,0,middle


In [3]:
# Keep only the party label, read straight from the stored attributes rather
# than from the `party` variable: `party = party['party']` raised KeyError when
# this cell was executed a second time, because `party` was by then already
# the Series and no longer the attribute table.
party = pd.read_pickle('data/node_attributes_preprocessed.pkl')['party']
party

0           left
1           left
3         middle
4          right
5          right
          ...   
24597       left
24598       left
24600    neutral
24601     middle
24602       left
Name: party, Length: 23634, dtype: object

We keep only the nodes that appear in the graph and whose political party is known.

In [9]:
# Inner join on the index: only rows present in both the feature table and the
# party labels survive. Any existing 'party' column is dropped first so the
# cell is idempotent -- re-running it no longer yields party_x / party_y.
# NOTE(review): this assumes the feature table's index holds the same node ids
# as the attribute table's index -- confirm in feature_engineering.
graph_node_features = (
    graph_node_features
    .drop(columns='party', errors='ignore')
    .merge(party, how='inner', left_index=True, right_index=True)
)
graph_node_features

Unnamed: 0,graph_cc,graph_transitivity,graph_average_degree_centrality,is_connected,belongs_to_strongly_connected,belongs_to_graph_periphery,belongs_to_graph_center,node_degree,node_cc,graph_eccentricity,node_degree_centrality,node_closeness_centrality,node_betweenness_centrality,node_pageRank,node_hub,node_authority,party
0,0.198505,0.046533,12281.5,0,1,0,0,8,0.321429,9,0.000326,0.326405,1.068907e-06,0.000018,3.457544e-05,3.457544e-05,left
1,0.198505,0.046533,12281.5,0,1,0,0,726,0.071876,9,0.029557,0.411162,3.379304e-03,0.001237,1.145977e-03,1.145977e-03,left
3,0.198505,0.046533,12281.5,0,1,0,0,2,0.000000,11,0.000081,0.263466,8.053312e-08,0.000009,4.879332e-07,4.879332e-07,middle
4,0.198505,0.046533,12281.5,0,1,0,0,350,0.070556,10,0.014249,0.358946,1.663929e-03,0.000757,1.104268e-04,1.104268e-04,right
5,0.198505,0.046533,12281.5,0,1,0,0,7,0.047619,11,0.000285,0.273592,4.066568e-05,0.000024,1.148707e-06,1.148707e-06,right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24559,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,10,0.000041,0.307084,0.000000e+00,0.000006,1.058974e-05,1.058974e-05,left
24560,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,10,0.000041,0.256873,0.000000e+00,0.000006,3.064330e-07,3.064330e-07,neutral
24561,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,11,0.000041,0.240722,0.000000e+00,0.000006,6.473794e-08,6.473794e-08,right
24562,0.198505,0.046533,12281.5,0,1,0,0,1,0.000000,11,0.000041,0.240722,0.000000e+00,0.000006,6.473794e-08,6.473794e-08,right


In [11]:
# Write the final feature table (graph features, node features and party
# label) as the third dataset.
graph_node_features.to_pickle('data/dataset3.pkl')