In [1]:
import math
import copy
import json
import numpy
import pandas as pd
import networkx as nx
from itertools import combinations
from scipy.stats import pearsonr
from causality.inference.search import IC
from causality.inference.independence_tests import *

In [2]:
# load the data
# The TSV marks missing values with "-" and uses "," as a thousands separator.
# File line 0 is the header, file lines 1-4 are skipped, and the first column
# becomes the index.
df = pd.read_csv("data/wdvp_stats.tsv", 
                 sep="\t", 
                 header=0, 
                 skiprows=range(1, 5), 
                 index_col=0, 
                 thousands=',',
                 na_values=["-"])
df.drop("ISO Country code", axis=1, inplace=True)
# drop columns that contain no data at all
df.dropna(axis=1, how="all", inplace=True)
# Impute the remaining gaps with each column's mean. numeric_only=True keeps
# this working on pandas >= 2.0, where DataFrame.mean() no longer silently
# skips non-numeric columns and raises instead.
df.fillna(df.mean(numeric_only=True), inplace=True)

In [3]:
# preview the first rows of the loaded frame (missing values already mean-filled)
df.head()

Unnamed: 0_level_0,population,surface area (Km2),GINI index,happy planet index,human development index,world happiness report score,sustainable economic development assessment (SEDA),GDP (billions PPP),GDP per capita (PPP),GDP growth (annual %),...,regulatory quality,rule of law,control of corruption,judicial effectiveness score,government integrity score,property rights score,tax burden score,overall economic freedom score,financial freedom score,women MPs (% of all MPs)
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,36000000,652230,39.128378,20.2,0.498,2.66,51.883333,64.1,1919.0,1.5,...,-1.3,-1.57,-1.52,28.2,26.2,17.9,91.8,51.3,10.0,27.7
Albania,2900000,27398,29.0,36.8,0.785,4.64,53.1,34.2,11840.0,2.6,...,0.2,-0.4,-0.42,25.4,39.9,54.1,85.1,64.5,70.0,27.9
Algeria,41000000,2381740,35.3,33.3,0.754,5.25,45.8,612.5,15027.0,3.7,...,-1.2,-0.86,-0.61,35.2,29.0,27.8,74.0,44.7,30.0,25.8
Andorra,77000,468,39.128378,26.49562,0.858,5.496232,51.883333,649.0375,20061.176796,2.414674,...,1.2,1.6,1.24,46.635519,41.86776,51.243169,76.603911,60.855618,48.379888,32.1
Angola,30000000,1246700,42.7,26.49562,0.581,5.496232,28.4,187.3,6844.0,3.0,...,-1.0,-1.1,-1.41,25.4,18.9,36.0,82.4,48.6,40.0,38.2


In [4]:
# Columns that take part in the causal search.
# NOTE(review): the doubled and trailing spaces in some names are presumably
# copied verbatim from the TSV header -- do not "tidy" them without checking.
variables = [
    "GINI index",
    "happy planet index",
    "human development index",
    "world happiness report score",
    "sustainable economic development assessment (SEDA)",
    "GDP per capita (PPP)",
    "GDP growth (annual %)",
    "health expenditure  % of GDP",
    "health expenditure  per person",
    "education expenditure % of GDP",
    "education expenditure  per person ",
    "school life expectancy (YEARS)",
    "unemployment (%)",
    "government spending score",
    "government expenditure (% of GDP)",
    "political rights score ",
    "civil liberties score ",
    "political stability & absence of violence",
    "government effectiveness",
    "regulatory quality",
    "rule of law",
    "control of corruption",
    "judicial effectiveness score",
    "government integrity score",
    "property rights score",
    "tax burden score",
    "overall economic freedom score",
    "financial freedom score",
    "women MPs (% of all MPs)"
]

# every variable is tagged 'c' (continuous)
variable_types = dict.fromkeys(variables, "c")

# run the IC search using the robust-regression independence test
ic_algorithm = IC(RobustRegressionTest)
causal_graph = ic_algorithm.search(df, variable_types)

In [5]:
# annotate the graph
D = nx.DiGraph()
D.add_nodes_from(causal_graph.nodes())

# Map (tail, head) -> annotations for every edge found by the search.
# A node listed in an edge's "arrows" attribute is treated as the head of the
# edge; an edge with no arrows is stored undirected under its original order.
causal_edges = {}
for n1, n2, attrs in causal_graph.edges(data=True):
    heads = attrs["arrows"]
    if not heads:
        key, directed, both = (n1, n2), False, None
    elif n1 in heads:
        # arrow into n1; it may additionally point into n2 (both ends marked)
        key, directed, both = (n2, n1), True, n2 in heads
    elif n2 in heads:
        key, directed, both = (n1, n2), True, False
    else:
        # arrows names neither endpoint: nothing is recorded for this edge
        continue
    causal_edges[key] = dict(genuine=attrs["marked"], directed=directed, both_arrows=both)

def normalize(d):
    """Return ``d`` centred on its mean and scaled by its standard deviation.

    ``d`` must provide ``mean()`` and ``std()`` and support element-wise
    subtraction and division (this notebook passes NumPy arrays). No guard is
    applied for a zero standard deviation.
    """
    centered = d - d.mean()
    return centered / d.std()

# Connect every unordered pair of nodes. Each edge carries the Pearson r and
# p-value of the two columns; pairs found by the causal search additionally
# carry that search's annotations and keep its direction.
for n1, n2 in combinations(D.nodes(), 2):
    r, pval = pearsonr(normalize(df[n1].values), normalize(df[n2].values))
    stats = dict(causal=True, r=r, pval=pval)
    forward = causal_edges.get((n1, n2))
    backward = causal_edges.get((n2, n1))
    if forward:
        D.add_edge(n1, n2, **{**forward, **stats})
    elif backward:
        D.add_edge(n2, n1, **{**backward, **stats})
    else:  # no causal edge
        D.add_edge(n1, n2, **dict(genuine=None, directed=False, causal=False, both_arrows=None, r=r, pval=pval))

# place the nodes on a circle
pos = nx.circular_layout(D)

In [26]:
# convert graph data to bokeh format
def dist(l1, l2):
    """Return the Euclidean distance between two 2-D points.

    Each argument must unpack into exactly two coordinates ``(x, y)``.
    """
    (x1, y1), (x2, y2) = l1, l2
    dy = y2 - y1
    dx = x2 - x1
    return (dy**2 + dx**2)**0.5

def bezier(l1, l2, b):
    """Sample a curved path from ``l1`` towards ``l2``.

    For each parameter ``s = i / STEPS`` (``i = 0 .. STEPS - 1``) the point is
    ``(1 - s)**t * l1 + s**t * l2`` with ``t = b * (1 + dist(l1, l2))``.
    Because ``s`` never reaches 1, the ``l2`` endpoint itself is not sampled.

    Parameters:
        l1, l2: 2-D points ``(x, y)``.
        b: multiplier applied to the exponent ``t``.

    Returns:
        ``(xs, ys)``: two lists of ``STEPS`` coordinates each.
    """
    (x1, y1), (x2, y2) = l1, l2
    exponent = b * (1 + dist(l1, l2))
    xs, ys = [], []
    for i in range(STEPS):
        s = i/STEPS
        w1 = (1-s)**exponent
        w2 = s**exponent
        xs.append(w1*x1 + w2*x2)
        ys.append(w1*y1 + w2*y2)
    return xs, ys

def colormap(num):
    """Return ``num`` hex colour strings, cycling through a fixed palette.

    The five palette colours repeat in order for as long as needed, so any
    non-negative ``num`` yields exactly ``num`` entries (the previous
    ``colors * 100`` approach silently capped the result at 500). A ``num``
    of zero or less yields an empty list.
    """
    colors = ["#f7c031", "#ef4837", "#91b5bb", "#526354", "#fecacb"]
    return [colors[i % len(colors)] for i in range(num)]

def nearest_offset(xs, ys, centroid):
    """Return how many trailing curve points lie close to ``centroid``.

    The points ``(xs[k], ys[k])`` are scanned from the last one backwards.
    The result is the 0-based position (counted from the end) of the first
    point whose distance to ``centroid`` exceeds ``RADIUS - 0.075``. If no
    point is that far away, the position of the last point examined is
    returned. For empty input the result is 0 (previously this raised,
    because the loop variable was never bound).
    """
    i = 0  # default for empty input: nothing to trim
    for i, (x, y) in enumerate(zip(xs[::-1], ys[::-1])):
        if dist(centroid, (x, y)) > RADIUS-0.075:
            break
    return i

# RADIUS feeds the trim threshold in nearest_offset (RADIUS - 0.075);
# STEPS is the number of points bezier samples per curve.
RADIUS = 0.15
STEPS = 500

# Edge categories, indexed in the order listed here.
EDGE_INDEX2LABEL = dict(enumerate([
    "correlation",
    "undirected causal",
    "directed causal",
    "genuine causal",
]))
EDGE_LABEL2INDEX = {label: index for index, label in EDGE_INDEX2LABEL.items()}

# Drawing attributes per edge category.
EDGE_STYLE = {
    "correlation": dict(color="#ffffff", width=2, alpha=0.3),
    "undirected causal": dict(color="#000000", width=2, alpha=0.3),
    "directed causal": dict(color="#000000", width=3, alpha=0.6),
    "genuine causal": dict(color="#000000", width=3, alpha=1),
}

# data for nodes: one entry per laid-out node, all columns in layout order
nodes = {
    "index": list(pos),
    "label": list(pos),
    "color": colormap(len(pos)),
    "x": [xy[0] for xy in pos.values()],
    "y": [xy[1] for xy in pos.values()],
}

# data for edges: a column-oriented template with one empty list per field
empty_edges = {
    column: []
    for column in (
        "start", "end", "xs", "ys", "color", "width",
        "alpha", "r", "pval", "type", "type_name",
        "b_arrow", "e_arrow",
    )
}

# working copy that the edge loop fills; the template itself stays empty
graph_edges = copy.deepcopy(empty_edges)

# Fill graph_edges with one row per edge of D: endpoints, the sampled curve,
# correlation statistics, arrow flags and the drawing style of its category.
for n1, n2, d in D.edges(data=True):
    l1, l2 = pos[n1], pos[n2]
    xs, ys = bezier(l1, l2, 1)
    # Trim the same number of points from both ends of the curve. The stop
    # index is computed explicitly because xs[k:-k] is empty when k == 0
    # (-0 == 0), which would drop the whole curve instead of trimming nothing.
    offset = nearest_offset(xs, ys, l2)
    stop = len(xs) - offset
    graph_edges["start"].append(n1)
    graph_edges["end"].append(n2)
    graph_edges["xs"].append(xs[offset:stop])
    graph_edges["ys"].append(ys[offset:stop])
    graph_edges["r"].append(d["r"])
    graph_edges["pval"].append(d["pval"])

    # classify the edge, from weakest to strongest claim
    if not d["causal"]:
        kind = "correlation"
    elif not d["directed"]:
        kind = "undirected causal"
    elif not d["genuine"]:
        kind = "directed causal"
    else:
        kind = "genuine causal"

    # arrow flags: end arrow for any directed edge, begin arrow when both ends are marked
    graph_edges["e_arrow"].append(1 if d["directed"] else 0)
    graph_edges["b_arrow"].append(1 if d["both_arrows"] else 0)

    # copy the category's style attributes (color / width / alpha) into their columns
    for attr, value in EDGE_STYLE[kind].items():
        graph_edges[attr].append(value)

    graph_edges["type"].append(EDGE_LABEL2INDEX[kind])
    graph_edges["type_name"].append(kind)

# Split the edge table by category. Row positions are computed once and then
# applied to every column so the columns stay aligned.
edge_kinds = graph_edges["type_name"]
correlation_rows = [i for i, t in enumerate(edge_kinds) if t == "correlation"]
causal_rows = [i for i, t in enumerate(edge_kinds) if "causal" in t]

correlation_edges = copy.deepcopy(empty_edges)
# NOTE: this rebinds causal_edges, which the graph-annotation cell uses as a
# {(n1, n2): attrs} mapping; re-run that cell in full before repeating its loop.
causal_edges = copy.deepcopy(empty_edges)
# combined table: correlation rows first, causal rows after
edges = copy.deepcopy(empty_edges)
for k, column in graph_edges.items():
    correlation_edges[k] = [column[i] for i in correlation_rows]
    causal_edges[k] = [column[i] for i in causal_rows]
    edges[k] = correlation_edges[k] + causal_edges[k]

# layout coordinates converted to plain lists for serialisation
pos_json = {name: list(xy) for name, xy in pos.items()}

In [27]:
# serialize and write: each table becomes a `name = <json>` assignment in data.py
exports = (
    ("nodes", nodes),
    ("edges", edges),
    ("correlation_edges", correlation_edges),
    ("pos", pos_json),
)
with open("data.py", "w") as f:
    for name, payload in exports:
        f.write(name + " = " + json.dumps(payload, indent=4) + "\n")