In [12]:
import json
import math
import pickle
from itertools import combinations

import networkx as nx
import numpy
import pandas as pd
from networkx.readwrite.json_graph import node_link_data
from scipy.stats import pearsonr

from causality.inference.search import IC
# star import kept last so that whatever it exports shadows exactly as before
from causality.inference.independence_tests import *

In [2]:
# Load the statistics table and tidy it in one pass:
#   * tab-separated, first line is the header, first column becomes the index
#   * the four lines right after the header are skipped
#   * "," is the thousands separator and "-" marks a missing value
#   * the ISO code column and any column that is entirely empty are dropped
df = (
    pd.read_csv(
        "data/wdvp_stats.tsv",
        sep="\t",
        header=0,
        skiprows=range(1, 5),
        index_col=0,
        thousands=',',
        na_values=["-"],
    )
    .drop(columns="ISO Country code")
    .dropna(axis="columns", how="all")
)

In [3]:
# Preview the first rows to check that the table parsed as expected
# (missing values show up as NaN).
df.head()

Unnamed: 0_level_0,population,surface area (Km2),GINI index,happy planet index,human development index,world happiness report score,sustainable economic development assessment (SEDA),GDP (billions PPP),GDP per capita (PPP),GDP growth (annual %),...,regulatory quality,rule of law,control of corruption,judicial effectiveness score,government integrity score,property rights score,tax burden score,overall economic freedom score,financial freedom score,women MPs (% of all MPs)
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,36000000,652230,,20.2,0.498,2.66,,64.1,1919.0,1.5,...,-1.3,-1.57,-1.52,28.2,26.2,17.9,91.8,51.3,10.0,27.7
Albania,2900000,27398,29.0,36.8,0.785,4.64,53.1,34.2,11840.0,2.6,...,0.2,-0.4,-0.42,25.4,39.9,54.1,85.1,64.5,70.0,27.9
Algeria,41000000,2381740,35.3,33.3,0.754,5.25,45.8,612.5,15027.0,3.7,...,-1.2,-0.86,-0.61,35.2,29.0,27.8,74.0,44.7,30.0,25.8
Andorra,77000,468,,,0.858,,,,,,...,1.2,1.6,1.24,,,,,,,32.1
Angola,30000000,1246700,42.7,,0.581,,28.4,187.3,6844.0,3.0,...,-1.0,-1.1,-1.41,25.4,18.9,36.0,82.4,48.6,40.0,38.2


In [4]:
# Columns that take part in the causal search. The names are used verbatim as
# DataFrame column labels, so the doubled and trailing spaces some of them
# carry are intentional and must not be "cleaned up".
variables = [
    # well-being and development indices
    "GINI index",
    "happy planet index",
    "human development index",
    "world happiness report score",
    "sustainable economic development assessment (SEDA)",
    # economy
    "GDP per capita (PPP)",
    "GDP growth (annual %)",
    # health and education
    "health expenditure  % of GDP",
    "health expenditure  per person",
    "education expenditure % of GDP",
    "education expenditure  per person ",
    "school life expectancy (YEARS)",
    # labour market and public spending
    "unemployment (%)",
    "government spending score",
    "government expenditure (% of GDP)",
    # rights and governance
    "political rights score ",
    "civil liberties score ",
    "political stability & absence of violence",
    "government effectiveness",
    "regulatory quality",
    "rule of law",
    "control of corruption",
    "judicial effectiveness score",
    "government integrity score",
    "property rights score",
    # economic freedom
    "tax burden score",
    "overall economic freedom score",
    "financial freedom score",
    # representation
    "women MPs (% of all MPs)"
]

# every variable is declared continuous: 'c' is 'continuous'
variable_types = dict.fromkeys(variables, "c")

# Run the IC search. Missing values are replaced by the column mean in the
# copy handed to the search; `df` itself is left untouched.
ic_algorithm = IC(RobustRegressionTest)
causal_graph = ic_algorithm.search(
    df.fillna(value=df.mean()),
    variable_types,
)

In [37]:
# process and format the data for display
#
# D gets exactly one edge per unordered pair of variables. Every edge carries
# the pair's Pearson correlation and p-value; pairs that the causal search
# linked additionally carry the orientation/marking found by the search.
D = nx.DiGraph()
D.add_nodes_from(causal_graph.nodes())

# Translate the search result into {(source, target): attributes}.
# A directed edge is stored pointing at the node listed in data["arrows"];
# when both endpoints are listed the edge is stored as (n2, n1) and flagged
# with both_arrows=True.
causal_edges = {}
for n1, n2, data in causal_graph.edges(data=True):
    arrows = data["arrows"]
    genuine = data["marked"]
    if not arrows:
        causal_edges[(n1, n2)] = dict(genuine=genuine, directed=False, both_arrows=None)
    elif n1 in arrows and n2 in arrows:
        causal_edges[(n2, n1)] = dict(genuine=genuine, directed=True, both_arrows=True)
    elif n1 in arrows:
        causal_edges[(n2, n1)] = dict(genuine=genuine, directed=True, both_arrows=False)
    elif n2 in arrows:
        causal_edges[(n1, n2)] = dict(genuine=genuine, directed=True, both_arrows=False)

for n1, n2 in combinations(D.nodes(), 2):
    # `df` still contains NaNs (only the copy passed to the search was
    # imputed) and pearsonr does not skip them, so correlate the rows where
    # both variables are observed instead of poisoning the result with NaN.
    observed = df[[n1, n2]].dropna()
    if len(observed) >= 2:
        r, pval = pearsonr(observed[n1], observed[n2])
    else:
        # pearsonr needs at least two observations
        r = pval = float("nan")

    # NOTE: the attribute key stays "r2" for whatever consumes the graph,
    # but the value is Pearson's r (signed, not squared).
    stats = dict(r2=r, pval=pval)

    forward = causal_edges.get((n1, n2))
    backward = causal_edges.get((n2, n1))
    if forward:
        D.add_edge(n1, n2, **forward, causal=True, **stats)
    elif backward:
        D.add_edge(n2, n1, **backward, causal=True, **stats)
    else:  # no causal edge between this pair
        D.add_edge(n1, n2, genuine=None, directed=False, causal=False, both_arrows=None, **stats)

pos = nx.circular_layout(D)

In [38]:
# serialize and write
#
# Two pickle files are produced: the graph converted to its node-link dict
# form, and the node positions of the circular layout. pickle.dump streams
# straight into the open file instead of building the byte string first.
with open("data/graph", "wb") as f:
    pickle.dump(node_link_data(D), f)

with open("data/pos", "wb") as f:
    pickle.dump(pos, f)