In [16]:
import pandas as pd
import json
from itertools import chain

def get_ordering_map(meta):
    ordering_map = {}

    col_ordering = list(reversed(meta['ordering']))
    for i, arr in enumerate(col_ordering):
        for col in arr:
            indeps = list(chain(*col_ordering[i+1:]))
            ordering_map[col] = indeps
    return ordering_map

def get_start_nodes(meta):
    ordering = meta['ordering']
    return ordering[-1]

df_path = '../data/gmu-covid.csv'
df = pd.read_csv(df_path)

with open('../data/gmu-covid.json', 'r') as f:
    meta = json.load(f)
    
ordering_map = get_ordering_map(meta)
start_nodes = get_start_nodes(meta)

['TestPositive']

In [30]:
from sklearn.linear_model import LogisticRegression
from itertools import combinations, chain
import operator
from functools import reduce
from typing import Tuple, Dict, List, Any

def get_n_way(X_cols: List[str], n_way=3):
    combs = (combinations(X_cols, n + 1) for n in range(n_way))
    combs = chain(*combs)
    combs = list(combs)
    return combs

def get_data(df_path: str, X_cols: List[str], y_col: str, n_way=3):
    def to_col_name(interaction):
        if len(interaction) == 1:
            return interaction[0]
        else:
            return '!'.join(interaction)
    
    def get_interaction(interaction):
        def multiply(r):
            vals = [r[col] for col in interaction]
            return reduce(operator.mul, vals, 1)
        
        return data.apply(multiply, axis=1)
        
    data = pd.read_csv(df_path)
    interactions = get_n_way(X_cols, n_way=n_way)
    
    d = {to_col_name(interaction): get_interaction(interaction) for interaction in interactions}
    d = {**d, **{y_col: data[y_col]}}
    
    df = pd.DataFrame(d)
    return df

def do_regression(X_cols: List[str], y_col: str, df: pd.DataFrame, solver='liblinear', penalty='l1', C=0.2) -> pd.DataFrame:
    X = df[X_cols]
    y = df[y_col]
    
    model = LogisticRegression(penalty=penalty, solver=solver, C=C)
    model.fit(X, y)
    
    return model

def extract_model_params(independent_cols: List[str], y_col: str, model: LogisticRegression):
    intercept = {'__intercept': model.intercept_[0]}
    indeps = {c: v for c, v in zip(independent_cols, model.coef_[0])}
    y = {'__dependent': y_col}
    
    d = {**y, **intercept}
    d = {**d, **indeps}
    
    return d

def to_robustness_indication(params: pd.DataFrame, ignore_neg_gt=-0.1, ignore_pos_lt=0.1):
    def is_robust(v):
        if v < ignore_neg_gt:
            return 0
        if v < ignore_pos_lt:
            return 0
        return 1

    return params[[c for c in params if c not in ['__intercept', '__dependent']]].applymap(is_robust)

def get_robust_stats(robust: pd.DataFrame, robust_threshold=0.9):
    s = robust.sum()
    p = s / robust.shape[0]
    i = s.index
    
    df = pd.DataFrame([{'name': name, 'count': count, 'percent': pct} for name, count, pct in zip(i, s, p)])
    df = df.sort_values(['count', 'percent', 'name'], ascending=[False, False, True])
    df = df[df['percent'] >= robust_threshold]
    return df
    
def do_robust_regression(X_cols: List[str], y_col: str, df_path: str, n_way=3, 
                         ignore_neg_gt=-0.1, ignore_pos_lt=0.1, 
                         n_regressions=10, solver='liblinear', penalty='l1', C=0.2,
                         robust_threshold=0.9):
    data = get_data(df_path, X_cols, y_col, n_way=n_way)
    frames = (data.sample(frac=0.9) for _ in range(n_regressions))
    
    independent_cols = [c for c in data.columns if c != y_col]
    models = (do_regression(independent_cols, y_col, data, solver=solver, penalty=penalty, C=C) for df in frames)
    
    params = pd.DataFrame((extract_model_params(independent_cols, y_col, m) for m in models))
    robust = to_robustness_indication(params, ignore_neg_gt, ignore_pos_lt)
    robust_stats = get_robust_stats(robust)
    
    relationships = {
        'child': y_col,
        'parents': list(robust_stats['name'])
    }
    
    return relationships

In [35]:
def do_learn(nodes: List[str], seen: Dict[str, List[str]], ordering_map: Dict[str, List[str]]):
    next_nodes = []
    
    for y_col in nodes:
        if y_col in seen:
            continue
        
        rels = do_robust_regression(ordering_map[y_col], y_col, df_path)
        seen[y_col] = rels['parents']
        print(f'{len(seen)} / {len(ordering_map)} | {y_col}')
        
        component_parents = list(set(chain(*[pa.split('!') for pa in rels['parents']])))
        next_nodes.extend(component_parents)
        
    next_nodes = list(set(next_nodes))
    next_nodes = [n for n in next_nodes if n not in seen]
    next_nodes = [n for n in next_nodes if len(ordering_map[n]) > 0]
    
    if len(next_nodes) > 0:
        do_learn(next_nodes, seen, ordering_map)
        
def start_learn(nodes: List[str], ordering_map: Dict[str, List[str]]):
    seen = {}
    do_learn(nodes, seen, ordering_map)
    return seen

rels = start_learn(start_nodes, ordering_map)
rels

1 / 37 | TestPositive
2 / 37 | LossAppetite
3 / 37 | Runnynose
4 / 37 | Chills
5 / 37 | Headaches
6 / 37 | Fever
7 / 37 | Chestpain
8 / 37 | MuscleAches
9 / 37 | Fatigue
10 / 37 | IgnoreRespSymp
11 / 37 | Cough
12 / 37 | LossTaste
13 / 37 | AbdominalPain
14 / 37 | Sorethroat
15 / 37 | Diarrhea
16 / 37 | IgnoreNeuroSymp
17 / 37 | JointPain
18 / 37 | IgnoreInflamSymp
19 / 37 | Difficultybreathing
20 / 37 | Wheezing
21 / 37 | ExcessSweat
22 / 37 | Shortnessofbreath
23 / 37 | IgnoreGastroSymp
24 / 37 | Shivering
25 / 37 | LossBalance


{'TestPositive': ['Chestpain!Chills!MuscleAches',
  'Chestpain!Fatigue!RaceWhite',
  'Chills!Headaches!RaceWhite',
  'Cough!IgnoreRespSymp!RaceWhite',
  'Fever!Chestpain!Fatigue',
  'Fever!Headaches',
  'Fever!LossTaste!Cough',
  'Fever!MuscleAches!RaceWhite',
  'Headaches!Cough!RaceWhite',
  'Headaches!Runnynose!Cough',
  'LossAppetite!Cough!RaceWhite',
  'LossTaste!Headaches!RaceWhite',
  'LossTaste!Runnynose!Cough',
  'Chestpain!Chills',
  'Chestpain!Chills!Fatigue'],
 'LossAppetite': ['Chills!MuscleAches!Sorethroat',
  'Cough!GenderFemale',
  'Diarrhea!Fatigue!Sorethroat',
  'Difficultybreathing!Chills!Runnynose',
  'Difficultybreathing!MuscleAches!Fatigue',
  'Fatigue!Cough',
  'Fatigue!RaceWhite',
  'MuscleAches!Headaches!Runnynose',
  'MuscleAches!Sorethroat'],
 'Runnynose': ['Cough', 'Cough!RaceWhite', 'RaceWhite'],
 'Chills': ['Fatigue!Sorethroat!Cough',
  'Headaches!Sorethroat!Runnynose',
  'MuscleAches!Fatigue',
  'MuscleAches!Fatigue!Cough',
  'MuscleAches!Fatigue!Headaches