# Lasso for Causal BBN Structure Learning

In [1]:
import pandas as pd
import json

df = pd.read_csv('../data/gmu-covid.csv')

with open('../data/gmu-covid.json', 'r') as f:
    meta = json.load(f)

In [2]:
df.shape

(461, 37)

## Code

In [3]:
from sklearn.linear_model import LogisticRegression
from itertools import combinations, chain
import operator
from functools import reduce
from typing import Tuple, Dict, List, Any

def get_n_way(X_cols: List[str], n_way=3):
    combs = (combinations(X_cols, n + 1) for n in range(n_way))
    combs = chain(*combs)
    combs = list(combs)
    return combs

def get_data(df_path: str, X_cols: List[str], y_col: str, n_way=3):
    def to_col_name(interaction):
        if len(interaction) == 1:
            return interaction[0]
        else:
            return '_'.join(interaction)
    
    def get_interaction(interaction):
        def multiply(r):
            vals = [r[col] for col in interaction]
            return reduce(operator.mul, vals, 1)
        
        return data.apply(multiply, axis=1)
        
    data = pd.read_csv(df_path)
    interactions = get_n_way(X_cols, n_way=n_way)
    
    d = {to_col_name(interaction): get_interaction(interaction) for interaction in interactions}
    d = {**d, **{y_col: data[y_col]}}
    
    df = pd.DataFrame(d)
    return df

def do_regression(X_cols: List[str], y_col: str, df: pd.DataFrame, solver='liblinear', penalty='l1', C=0.2) -> pd.DataFrame:
    X = df[X_cols]
    y = df[y_col]
    
    model = LogisticRegression(penalty=penalty, solver=solver, C=C)
    model.fit(X, y)
    
    return model

def extract_model_params(independent_cols: List[str], y_col: str, model: LogisticRegression):
    intercept = {'__intercept': model.intercept_[0]}
    indeps = {c: v for c, v in zip(independent_cols, model.coef_[0])}
    y = {'__dependent': y_col}
    
    d = {**y, **intercept}
    d = {**d, **indeps}
    
    return d

def to_robustness_indication(params: pd.DataFrame, ignore_neg_gt=-0.1, ignore_pos_lt=0.1):
    def is_robust(v):
        if v < ignore_neg_gt:
            return 0
        if v < ignore_pos_lt:
            return 0
        return 1

    return params[[c for c in params if c not in ['__intercept', '__dependent']]].applymap(is_robust)

def get_robust_stats(robust: pd.DataFrame, robust_threshold=0.9):
    s = robust.sum()
    p = s / robust.shape[0]
    i = s.index
    
    df = pd.DataFrame([{'name': name, 'count': count, 'percent': pct} for name, count, pct in zip(i, s, p)])
    df = df.sort_values(['count', 'percent', 'name'], ascending=[False, False, True])
    df = df[df['percent'] >= robust_threshold]
    return df
    
def do_robust_regression(X_cols: List[str], y_col: str, df_path: str, n_way=3, 
                         ignore_neg_gt=-0.1, ignore_pos_lt=0.1, 
                         n_regressions=10, solver='liblinear', penalty='l1', C=0.2,
                         robust_threshold=0.9):
    data = get_data(df_path, X_cols, y_col, n_way=n_way)
    frames = (data.sample(frac=0.9) for _ in range(n_regressions))
    
    independent_cols = [c for c in data.columns if c != y_col]
    models = (do_regression(independent_cols, y_col, data, solver=solver, penalty=penalty, C=C) for df in frames)
    
    params = pd.DataFrame((extract_model_params(independent_cols, y_col, m) for m in models))
    robust = to_robustness_indication(params, ignore_neg_gt, ignore_pos_lt)
    robust_stats = get_robust_stats(robust)
    
    return params, robust, robust_stats

## Ordering map

In [14]:
ordering_map = {}

col_ordering = list(reversed(meta['ordering']))
for i, arr in enumerate(col_ordering):
    for col in arr:
        indeps = list(chain(*col_ordering[i+1:]))
        ordering_map[col] = indeps

## 1. Fever ~ .

In [17]:
','.join(ordering_map['Fever'])

'Bluish,Numbness,SlurredSpeech,RedRash,UnexplainedRash,Wheezing,LossBalance,Pinkeye,Shivering,LossSmell,Confusion,LossTaste,JointPain,AbdominalPain,Shortnessofbreath,Vomiting,ExcessSweat,Chestpain,LossAppetite,Difficultybreathing,Diarrhea,Chills,MuscleAches,Fatigue,Headaches,Sorethroat,Runnynose,Cough,TestPositive,IgnoreRespSymp,IgnoreGastroSymp,IgnoreNeuroSymp,IgnoreInflamSymp,RaceWhite,GenderFemale,Age30orMore'

In [18]:
df_path = '../data/gmu-covid.csv'
params, robust, robust_stats = do_robust_regression(ordering_map['Fever'], 'Fever', df_path)
params

Unnamed: 0,__dependent,__intercept,Bluish,Numbness,SlurredSpeech,RedRash,UnexplainedRash,Wheezing,LossBalance,Pinkeye,...,IgnoreNeuroSymp_IgnoreInflamSymp_RaceWhite,IgnoreNeuroSymp_IgnoreInflamSymp_GenderFemale,IgnoreNeuroSymp_IgnoreInflamSymp_Age30orMore,IgnoreNeuroSymp_RaceWhite_GenderFemale,IgnoreNeuroSymp_RaceWhite_Age30orMore,IgnoreNeuroSymp_GenderFemale_Age30orMore,IgnoreInflamSymp_RaceWhite_GenderFemale,IgnoreInflamSymp_RaceWhite_Age30orMore,IgnoreInflamSymp_GenderFemale_Age30orMore,RaceWhite_GenderFemale_Age30orMore
0,Fever,-0.701641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Fever,-0.695946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Fever,-0.688846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Fever,-0.700976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Fever,-0.69545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Fever,-0.692849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Fever,-0.69248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Fever,-0.700915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Fever,-0.699575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Fever,-0.693683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
robust

Unnamed: 0,Bluish,Numbness,SlurredSpeech,RedRash,UnexplainedRash,Wheezing,LossBalance,Pinkeye,Shivering,LossSmell,...,IgnoreNeuroSymp_IgnoreInflamSymp_RaceWhite,IgnoreNeuroSymp_IgnoreInflamSymp_GenderFemale,IgnoreNeuroSymp_IgnoreInflamSymp_Age30orMore,IgnoreNeuroSymp_RaceWhite_GenderFemale,IgnoreNeuroSymp_RaceWhite_Age30orMore,IgnoreNeuroSymp_GenderFemale_Age30orMore,IgnoreInflamSymp_RaceWhite_GenderFemale,IgnoreInflamSymp_RaceWhite_Age30orMore,IgnoreInflamSymp_GenderFemale_Age30orMore,RaceWhite_GenderFemale_Age30orMore
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
robust_stats

Unnamed: 0,name,count,percent
17,Chestpain,10,1.0
511,Chestpain_GenderFemale,10,1.0
561,Chills_MuscleAches,10,1.0
27,Cough,10,1.0
19,Difficultybreathing,10,1.0
648,IgnoreRespSymp_RaceWhite,10,1.0
401,JointPain_Headaches,10,1.0
627,Runnynose_RaceWhite,10,1.0
25,Sorethroat,10,1.0
228,Wheezing_RaceWhite,10,1.0


## 2. Chestpain ~ .

In [21]:
params, robust, robust_stats = do_robust_regression(ordering_map['Chestpain'], 'Chestpain', df_path)
params

Unnamed: 0,__dependent,__intercept,LossAppetite,Difficultybreathing,Diarrhea,Chills,MuscleAches,Fatigue,Headaches,Sorethroat,...,IgnoreNeuroSymp_IgnoreInflamSymp_RaceWhite,IgnoreNeuroSymp_IgnoreInflamSymp_GenderFemale,IgnoreNeuroSymp_IgnoreInflamSymp_Age30orMore,IgnoreNeuroSymp_RaceWhite_GenderFemale,IgnoreNeuroSymp_RaceWhite_Age30orMore,IgnoreNeuroSymp_GenderFemale_Age30orMore,IgnoreInflamSymp_RaceWhite_GenderFemale,IgnoreInflamSymp_RaceWhite_Age30orMore,IgnoreInflamSymp_GenderFemale_Age30orMore,RaceWhite_GenderFemale_Age30orMore
0,Chestpain,-2.183663,0.0,0.0,0.0,0.0,0.115364,0.042307,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Chestpain,-2.182812,0.0,0.0,0.0,0.0,0.116035,0.043282,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Chestpain,-2.183574,0.0,0.0,0.0,0.0,0.116768,0.043544,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Chestpain,-2.182622,0.0,0.0,0.0,0.0,0.115836,0.044773,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Chestpain,-2.183489,0.0,0.0,0.0,0.0,0.118805,0.043402,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Chestpain,-2.183372,0.0,0.0,0.0,0.0,0.118768,0.044038,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Chestpain,-2.182833,0.0,0.0,0.0,0.0,0.120785,0.043153,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Chestpain,-2.182856,0.0,0.0,0.0,0.0,0.11685,0.044516,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Chestpain,-2.183414,0.0,0.0,0.0,0.0,0.118589,0.042117,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Chestpain,-2.18352,0.0,0.0,0.0,0.0,0.120314,0.042565,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
robust

Unnamed: 0,LossAppetite,Difficultybreathing,Diarrhea,Chills,MuscleAches,Fatigue,Headaches,Sorethroat,Runnynose,Cough,...,IgnoreNeuroSymp_IgnoreInflamSymp_RaceWhite,IgnoreNeuroSymp_IgnoreInflamSymp_GenderFemale,IgnoreNeuroSymp_IgnoreInflamSymp_Age30orMore,IgnoreNeuroSymp_RaceWhite_GenderFemale,IgnoreNeuroSymp_RaceWhite_Age30orMore,IgnoreNeuroSymp_GenderFemale_Age30orMore,IgnoreInflamSymp_RaceWhite_GenderFemale,IgnoreInflamSymp_RaceWhite_Age30orMore,IgnoreInflamSymp_GenderFemale_Age30orMore,RaceWhite_GenderFemale_Age30orMore
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
robust_stats

Unnamed: 0,name,count,percent
349,Difficultybreathing_Fatigue_Headaches,10,1.0
336,Difficultybreathing_MuscleAches_Fatigue,10,1.0
337,Difficultybreathing_MuscleAches_Headaches,10,1.0
341,Difficultybreathing_MuscleAches_TestPositive,10,1.0
743,Fatigue_TestPositive_RaceWhite,10,1.0
208,LossAppetite_Chills_TestPositive,10,1.0
19,LossAppetite_Diarrhea,10,1.0
233,LossAppetite_Fatigue_TestPositive,10,1.0
4,MuscleAches,10,1.0


## 2. Cough ~ .

In [24]:
params, robust, robust_stats = do_robust_regression(ordering_map['Cough'], 'Cough', df_path)
robust_stats

Unnamed: 0,name,count,percent
1,IgnoreRespSymp,10,1.0
20,IgnoreRespSymp_Age30orMore,10,1.0
12,TestPositive_RaceWhite,10,1.0


## 2. Difficultybreathing ~ .

In [25]:
params, robust, robust_stats = do_robust_regression(ordering_map['Difficultybreathing'], 'Difficultybreathing', df_path)
robust_stats

Unnamed: 0,name,count,percent
31,Chills_MuscleAches,10,1.0
293,Chills_Runnynose_RaceWhite,10,1.0
288,Chills_Runnynose_TestPositive,10,1.0
279,Chills_Sorethroat_TestPositive,10,1.0
225,Diarrhea_IgnoreRespSymp_Age30orMore,10,1.0
151,Diarrhea_MuscleAches_Headaches,10,1.0
155,Diarrhea_MuscleAches_TestPositive,10,1.0
21,Diarrhea_Runnynose,10,1.0
11,IgnoreNeuroSymp,10,1.0
129,IgnoreNeuroSymp_Age30orMore,10,1.0


## 2. Sorethroat ~ .

In [26]:
params, robust, robust_stats = do_robust_regression(ordering_map['Sorethroat'], 'Sorethroat', df_path)
robust_stats

Unnamed: 0,name,count,percent
1,Cough,10,1.0
10,Runnynose_Cough,10,1.0
55,Runnynose_Cough_TestPositive,10,1.0
17,Runnynose_GenderFemale,10,1.0
12,Runnynose_IgnoreRespSymp,10,1.0
16,Runnynose_RaceWhite,10,1.0
67,Runnynose_TestPositive_RaceWhite,10,1.0


## 3. MuscleAches ~

In [27]:
params, robust, robust_stats = do_robust_regression(ordering_map['MuscleAches'], 'MuscleAches', df_path)
robust_stats

Unnamed: 0,name,count,percent
62,Cough_Age30orMore,10,1.0
302,Cough_IgnoreRespSymp_IgnoreInflamSymp,10,1.0
55,Cough_TestPositive,10,1.0
0,Fatigue,10,1.0
99,Fatigue_Headaches_RaceWhite,10,1.0
92,Fatigue_Headaches_Runnynose,10,1.0
94,Fatigue_Headaches_TestPositive,10,1.0
14,Fatigue_Sorethroat,10,1.0
103,Fatigue_Sorethroat_Cough,10,1.0


## 3. IgnoreNeuroSymp ~ .

In [28]:
params, robust, robust_stats = do_robust_regression(ordering_map['IgnoreNeuroSymp'], 'IgnoreNeuroSymp', df_path)
robust_stats

Unnamed: 0,name,count,percent


## 4. Fatigue ~ .

In [29]:
params, robust, robust_stats = do_robust_regression(ordering_map['Fatigue'], 'Fatigue', df_path)
robust_stats

Unnamed: 0,name,count,percent
49,Cough_Age30orMore,10,1.0
241,Cough_GenderFemale_Age30orMore,10,1.0
218,Cough_TestPositive_RaceWhite,10,1.0
0,Headaches,10,1.0
20,Headaches_RaceWhite,10,1.0
12,Headaches_Sorethroat,10,1.0
79,Headaches_Sorethroat_Cough,10,1.0
80,Headaches_Sorethroat_TestPositive,10,1.0
54,TestPositive_RaceWhite,10,1.0


## 5. Headaches ~ .

In [30]:
params, robust, robust_stats = do_robust_regression(ordering_map['Headaches'], 'Headaches', df_path)
robust_stats

Unnamed: 0,name,count,percent
151,Cough_TestPositive_RaceWhite,10,1.0
6,IgnoreNeuroSymp,10,1.0
21,Runnynose_Cough,10,1.0
123,Runnynose_TestPositive_RaceWhite,10,1.0
0,Sorethroat,10,1.0
19,Sorethroat_GenderFemale,10,1.0
13,Sorethroat_TestPositive,10,1.0
3,TestPositive,10,1.0
