# Lasso for Causal BBN Structure Learning

In [3]:
import pandas as pd
import json

df = pd.read_csv('../data/data-binary.csv')

with open('../data/data-binary-complete.json', 'r') as f:
    meta = json.load(f)

## Code

In [86]:
from sklearn.linear_model import LogisticRegression
from itertools import combinations, chain
import operator
from functools import reduce
from typing import Tuple, Dict, List, Any

def get_n_way(X_cols: List[str], n_way=3):
    combs = (combinations(X_cols, n + 1) for n in range(n_way))
    combs = chain(*combs)
    combs = list(combs)
    return combs

def get_data(df_path: str, X_cols: List[str], y_col: str, n_way=3):
    def to_col_name(interaction):
        if len(interaction) == 1:
            return interaction[0]
        else:
            return '_'.join(interaction)
    
    def get_interaction(interaction):
        def multiply(r):
            vals = [r[col] for col in interaction]
            return reduce(operator.mul, vals, 1)
        
        return data.apply(multiply, axis=1)
        
    data = pd.read_csv(df_path)
    interactions = get_n_way(X_cols, n_way=n_way)
    
    d = {to_col_name(interaction): get_interaction(interaction) for interaction in interactions}
    d = {**d, **{y_col: data[y_col]}}
    
    df = pd.DataFrame(d)
    return df

def do_regression(X_cols: List[str], y_col: str, df: pd.DataFrame, solver='liblinear', penalty='l1', C=0.2) -> pd.DataFrame:
    X = df[X_cols]
    y = df[y_col]
    
    model = LogisticRegression(penalty=penalty, solver=solver, C=C)
    model.fit(X, y)
    
    return model

def extract_model_params(independent_cols: List[str], y_col: str, model: LogisticRegression):
    intercept = {'__intercept': model.intercept_[0]}
    indeps = {c: v for c, v in zip(independent_cols, model.coef_[0])}
    y = {'__dependent': y_col}
    
    d = {**y, **intercept}
    d = {**d, **indeps}
    
    return d

def to_robustness_indication(params: pd.DataFrame, ignore_neg_gt=-0.1, ignore_pos_lt=0.1):
    def is_robust(v):
        if v < ignore_neg_gt:
            return 0
        if v < ignore_pos_lt:
            return 0
        return 1

    return params[[c for c in params if c not in ['__intercept', '__dependent']]].applymap(is_robust)

def get_robust_stats(robust: pd.DataFrame, robust_threshold=0.9):
    s = robust.sum()
    p = s / robust.shape[0]
    i = s.index
    
    df = pd.DataFrame([{'name': name, 'count': count, 'percent': pct} for name, count, pct in zip(i, s, p)])
    df = df.sort_values(['count', 'percent', 'name'], ascending=[False, False, True])
    df = df[df['percent'] >= robust_threshold]
    return df
    
def do_robust_regression(X_cols: List[str], y_col: str, df_path: str, n_way=3, 
                         ignore_neg_gt=-0.1, ignore_pos_lt=0.1, 
                         n_regressions=10, solver='liblinear', penalty='l1', C=0.2,
                         robust_threshold=0.9):
    data = get_data(df_path, X_cols, y_col, n_way=n_way)
    frames = (data.sample(frac=0.9) for _ in range(n_regressions))
    
    independent_cols = [c for c in data.columns if c != y_col]
    models = (do_regression(independent_cols, y_col, data, solver=solver, penalty=penalty, C=C) for df in frames)
    
    params = pd.DataFrame((extract_model_params(independent_cols, y_col, m) for m in models))
    robust = to_robustness_indication(params, ignore_neg_gt, ignore_pos_lt)
    robust_stats = get_robust_stats(robust)
    
    return params, robust, robust_stats

## e ~ .

In [87]:
df_path = '../data/data-binary.csv'
params, robust, robust_stats = do_robust_regression(['a', 'b', 'c', 'd'], 'e', df_path)
params

Unnamed: 0,__dependent,__intercept,a,b,c,d,a_b,a_c,a_d,b_c,b_d,c_d,a_b_c,a_b_d,a_c_d,b_c_d
0,e,-2.244279,0.0,0.0,0.0,4.314648,0.025278,0.0,0.0,0.0,0.052642,0.0,0.0,0.093992,0.0,0.0
1,e,-2.24448,0.0,0.0,0.0,4.31505,0.026401,0.0,0.0,0.0,0.064747,0.0,0.0,0.078542,0.0,0.0
2,e,-2.244263,0.0,0.0,0.0,4.314676,0.025653,0.0,0.0,0.0,0.051094,0.0,0.0,0.095482,0.0,0.0
3,e,-2.244242,0.0,0.0,0.0,4.314848,0.025485,0.0,0.0,0.0,0.054271,0.0,0.0,0.092176,0.0,0.0
4,e,-2.244357,0.0,0.0,0.0,4.31505,0.025295,0.0,0.0,0.0,0.0537,0.0,0.0,0.092416,0.0,0.0
5,e,-2.244088,0.0,0.0,0.0,4.314678,0.024774,0.0,0.0,0.0,0.049405,0.0,0.0,0.09898,0.0,0.0
6,e,-2.244253,0.0,0.0,0.0,4.314708,0.02523,0.0,0.0,0.0,0.050886,0.0,0.0,0.0977,0.0,0.0
7,e,-2.244182,0.0,0.0,0.0,4.314848,0.02515,0.0,0.0,0.0,0.04818,0.0,0.0,0.100256,0.0,0.0
8,e,-2.244322,0.0,0.0,0.0,4.315006,0.025562,0.0,0.0,0.0,0.049578,0.0,0.0,0.097306,0.0,0.0
9,e,-2.244306,0.0,0.0,0.0,4.314872,0.025334,0.0,0.0,0.0,0.049083,0.0,0.0,0.098991,0.0,0.0


In [84]:
robust

Unnamed: 0,a,b,c,d,a_b,a_c,a_d,b_c,b_d,c_d,a_b_c,a_b_d,a_c_d,b_c_d
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [88]:
robust_stats

Unnamed: 0,name,count,percent
3,d,10,1.0


## d ~ .

In [89]:
params, robust, robust_stats = do_robust_regression(['a', 'b', 'c'], 'd', df_path)
params

Unnamed: 0,__dependent,__intercept,a,b,c,a_b,a_c,b_c,a_b_c
0,d,-1.247897,-0.110852,-0.100672,0.001244,0.059395,0.0,-0.042367,0.0
1,d,-1.248644,-0.110107,-0.097802,0.001486,0.057181,0.0,-0.043667,0.0
2,d,-1.247902,-0.11086,-0.100243,0.00117,0.059165,0.0,-0.042432,0.0
3,d,-1.248073,-0.110699,-0.099921,0.001235,0.058804,0.0,-0.042634,0.0
4,d,-1.248097,-0.110684,-0.099811,0.001339,0.058852,0.0,-0.042765,0.0
5,d,-1.248204,-0.110604,-0.098599,0.001382,0.058015,0.0,-0.043399,0.0
6,d,-1.248707,-0.110058,-0.097552,0.001535,0.057061,0.0,-0.043882,0.0
7,d,-1.247969,-0.110791,-0.100256,0.001298,0.059077,0.0,-0.042592,0.0
8,d,-1.248101,-0.11067,-0.099787,0.001289,0.058838,0.0,-0.042742,0.0
9,d,-1.248053,-0.110743,-0.10007,0.001334,0.059006,0.0,-0.04276,0.0


In [90]:
robust

Unnamed: 0,a,b,c,a_b,a_c,b_c,a_b_c
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0


In [91]:
robust_stats

Unnamed: 0,name,count,percent
