In [1]:
import pandas as pd
import numpy as np

v2c = {
    'Infectious_and_parasitic_diseases': 'bs01lr',
    'Neoplasms': 'bs02lr',
    'Endocrine,nutritional_and_metabolic diseases_and_immunity_disorders': 'bs03lr',
    'Diseases_of_the_blood_and_bloodforming_organs': 'bs04lr',
    'Mental_disorders': 'bs05lr',
    'Diseases_of_the_nervous_system_and_sense_organs': 'bs06lr',
    'Diseases_of_the_circulatory_system': 'bs07lr',
    'Diseases_of_the_respiratory_system': 'bs08lr',
    'Diseases_of_the_digestive_system': 'bs09lr',
    'Diseases_of_the_genitourinary_system': 'bs10lr',
    'Complications_of_pregnancy,_childbirth,_and_the_puerperium': 'bs11lr',
    'Diseases_of_the_skin_and_subcutaneous_tissue': 'bs12lr',
    'Diseases_of_the_musculoskeletal_system_and_connective_tissue': 'bs13lr',
    'Congenital_anomalies': 'bs14lr',
    'Symptoms,_signs,_and_ill_defined_conditions': 'bs16lr',
    'Injury_and_poisoning': 'bs17lr',
    'E_codes': 'bs18lr',
    'V_codes': 'bs19lr',
    'RareDX': 'rare_disease'
}

c2v = {v: k for k, v in v2c.items()}

df = pd.read_csv('s3://jeeforce-artifacts/alemi/diabetes/BC+with+Time.csv', low_memory=False) \
    .assign(
        body_sys1=lambda d: d['BC1'].apply(lambda v: v2c[v]), 
        body_sys2=lambda d: d['BC2'].apply(lambda v: v2c[v])) \
    .rename(columns={'patnum': 'patient_number', 'SumDayDiff': 'sum_day_diff', 'AvgDaydiff': 'avg_day_diff'}) \
    .drop(columns=['BC1', 'BC2'])

df.shape

(171, 5)

In [2]:
df

Unnamed: 0,patient_number,sum_day_diff,avg_day_diff,body_sys1,body_sys2
0,27020,2780593,102,bs01lr,bs02lr
1,51702,-14513656,-280,bs01lr,bs03lr
2,16303,2894519,177,bs01lr,bs04lr
3,52414,-10176298,-194,bs01lr,bs05lr
4,53127,3378359,63,bs01lr,bs06lr
...,...,...,...,...,...
166,96140,-16061590,-167,bs17lr,bs19lr
167,69589,-1486110,-21,bs17lr,rare_disease
168,35582,-9989873,-280,bs18lr,bs19lr
169,30637,-1870223,-61,bs18lr,rare_disease


In [3]:
import itertools
import numpy as np

def get_val(v1, v2):
    temp = df[(df['body_sys1']==v1) & (df['body_sys2']==v2)]
    
    if temp.shape[0] == 0:
        temp = df[(df['body_sys1']==v2) & (df['body_sys2']==v1)]
    
    if temp.shape[0] > 0:
        return -temp.iloc[0]['patient_number']
    return np.nan

body_systems = sorted(list(set(list(df['body_sys1'].unique()) + list(df['body_sys2'].unique()))))
keep_cols = [c for c in body_systems if not c in {}]

mat_df = pd.DataFrame([[get_val(v1, v2) for v2 in body_systems] for v1 in body_systems], columns=body_systems, index=body_systems)
mat_df = mat_df.loc[keep_cols][keep_cols]
M = mat_df.fillna(0.0) + np.abs(np.diag(mat_df.sum(axis=1)))
M

Unnamed: 0,bs01lr,bs02lr,bs03lr,bs04lr,bs05lr,bs06lr,bs07lr,bs08lr,bs09lr,bs10lr,bs11lr,bs12lr,bs13lr,bs14lr,bs16lr,bs17lr,bs18lr,bs19lr,rare_disease
bs01lr,692552.0,-27020.0,-51702.0,-16303.0,-52414.0,-53127.0,-52858.0,-37774.0,-44323.0,-33429.0,-113.0,-38652.0,-57908.0,-3745.0,-63071.0,-25853.0,-11071.0,-76568.0,-46621.0
bs02lr,-27020.0,994111.0,-86810.0,-27200.0,-65981.0,-80568.0,-93031.0,-53708.0,-66709.0,-52518.0,-97.0,-48197.0,-76350.0,-4967.0,-91019.0,-28141.0,-12440.0,-114789.0,-64566.0
bs03lr,-51702.0,-86810.0,2181563.0,-47950.0,-167953.0,-194687.0,-236654.0,-117721.0,-141082.0,-116451.0,-228.0,-87079.0,-195702.0,-8868.0,-219019.0,-61571.0,-25020.0,-300589.0,-122477.0
bs04lr,-16303.0,-27200.0,-47950.0,565950.0,-36497.0,-42545.0,-51539.0,-31160.0,-37550.0,-33803.0,-65.0,-23065.0,-42502.0,-2983.0,-50861.0,-16983.0,-8822.0,-60176.0,-35946.0
bs05lr,-52414.0,-65981.0,-167953.0,-36497.0,1884335.0,-162343.0,-163237.0,-103589.0,-123098.0,-86540.0,-327.0,-76953.0,-177852.0,-7772.0,-194586.0,-67171.0,-25430.0,-256695.0,-115897.0
bs06lr,-53127.0,-80568.0,-194687.0,-42545.0,-162343.0,2058172.0,-195007.0,-110543.0,-131298.0,-103788.0,-273.0,-85287.0,-188133.0,-9002.0,-203875.0,-66693.0,-26238.0,-278491.0,-126274.0
bs07lr,-52858.0,-93031.0,-236654.0,-51539.0,-163237.0,-195007.0,2177507.0,-119374.0,-142671.0,-120746.0,-126.0,-86519.0,-187485.0,-9037.0,-215471.0,-58476.0,-25149.0,-298265.0,-121862.0
bs08lr,-37774.0,-53708.0,-117721.0,-31160.0,-103589.0,-110543.0,-119374.0,1349481.0,-89307.0,-67969.0,-228.0,-57219.0,-114924.0,-6168.0,-131469.0,-45117.0,-18933.0,-162866.0,-81412.0
bs09lr,-44323.0,-66709.0,-141082.0,-37550.0,-123098.0,-131298.0,-142671.0,-89307.0,1586386.0,-80716.0,-209.0,-65667.0,-138272.0,-7440.0,-153819.0,-51958.0,-21170.0,-194234.0,-96863.0
bs10lr,-33429.0,-52518.0,-116451.0,-33803.0,-86540.0,-103788.0,-120746.0,-67969.0,-80716.0,1256193.0,-284.0,-51875.0,-102545.0,-6396.0,-119564.0,-36211.0,-16792.0,-150458.0,-76108.0


In [4]:
def get_before(r):
    v1 = r['body_sys1']
    v2 = r['body_sys2']
    d = r['avg_day_diff']
    if d > 0:
        return v1
    else:
        return v2
    
def get_after(r):
    v1 = r['body_sys1']
    v2 = r['body_sys2']
    d = r['avg_day_diff']
    if d > 0:
        return v2
    else:
        return v1
    
ba_df = df.assign(
    before=lambda d: d.apply(get_before, axis=1),
    after=lambda d: d.apply(get_after, axis=1))

before = ba_df.assign(n=1).groupby(['before'])['n'].sum()
after = ba_df.assign(n=1).groupby(['after'])['n'].sum()
after = pd.Series(list(after.values) + [0], index=list(after.index) + ['bs07lr'])

before, after, len(before), len(after)

(before
 bs01lr          15
 bs02lr          11
 bs03lr          17
 bs04lr           7
 bs05lr          16
 bs06lr           9
 bs07lr          18
 bs08lr          12
 bs09lr          14
 bs10lr           8
 bs11lr           1
 bs12lr           6
 bs13lr          13
 bs14lr           3
 bs16lr           5
 bs17lr           2
 bs18lr           1
 bs19lr          10
 rare_disease     3
 Name: n, dtype: int64,
 bs01lr           3
 bs02lr           7
 bs03lr           1
 bs04lr          11
 bs05lr           2
 bs06lr           9
 bs08lr           6
 bs09lr           4
 bs10lr          10
 bs11lr          17
 bs12lr          12
 bs13lr           5
 bs14lr          15
 bs16lr          13
 bs17lr          16
 bs18lr          17
 bs19lr           8
 rare_disease    15
 bs07lr           0
 dtype: int64,
 19,
 19)

In [5]:
b = 1 + 0.5 * (before - after)
b

bs01lr           7.0
bs02lr           3.0
bs03lr           9.0
bs04lr          -1.0
bs05lr           8.0
bs06lr           1.0
bs07lr          10.0
bs08lr           4.0
bs09lr           6.0
bs10lr           0.0
bs11lr          -7.0
bs12lr          -2.0
bs13lr           5.0
bs14lr          -5.0
bs16lr          -3.0
bs17lr          -6.0
bs18lr          -7.0
bs19lr           2.0
rare_disease    -5.0
dtype: float64

In [6]:
C = 2 * np.eye(M.shape[0]) + M
C

Unnamed: 0,bs01lr,bs02lr,bs03lr,bs04lr,bs05lr,bs06lr,bs07lr,bs08lr,bs09lr,bs10lr,bs11lr,bs12lr,bs13lr,bs14lr,bs16lr,bs17lr,bs18lr,bs19lr,rare_disease
bs01lr,692554.0,-27020.0,-51702.0,-16303.0,-52414.0,-53127.0,-52858.0,-37774.0,-44323.0,-33429.0,-113.0,-38652.0,-57908.0,-3745.0,-63071.0,-25853.0,-11071.0,-76568.0,-46621.0
bs02lr,-27020.0,994113.0,-86810.0,-27200.0,-65981.0,-80568.0,-93031.0,-53708.0,-66709.0,-52518.0,-97.0,-48197.0,-76350.0,-4967.0,-91019.0,-28141.0,-12440.0,-114789.0,-64566.0
bs03lr,-51702.0,-86810.0,2181565.0,-47950.0,-167953.0,-194687.0,-236654.0,-117721.0,-141082.0,-116451.0,-228.0,-87079.0,-195702.0,-8868.0,-219019.0,-61571.0,-25020.0,-300589.0,-122477.0
bs04lr,-16303.0,-27200.0,-47950.0,565952.0,-36497.0,-42545.0,-51539.0,-31160.0,-37550.0,-33803.0,-65.0,-23065.0,-42502.0,-2983.0,-50861.0,-16983.0,-8822.0,-60176.0,-35946.0
bs05lr,-52414.0,-65981.0,-167953.0,-36497.0,1884337.0,-162343.0,-163237.0,-103589.0,-123098.0,-86540.0,-327.0,-76953.0,-177852.0,-7772.0,-194586.0,-67171.0,-25430.0,-256695.0,-115897.0
bs06lr,-53127.0,-80568.0,-194687.0,-42545.0,-162343.0,2058174.0,-195007.0,-110543.0,-131298.0,-103788.0,-273.0,-85287.0,-188133.0,-9002.0,-203875.0,-66693.0,-26238.0,-278491.0,-126274.0
bs07lr,-52858.0,-93031.0,-236654.0,-51539.0,-163237.0,-195007.0,2177509.0,-119374.0,-142671.0,-120746.0,-126.0,-86519.0,-187485.0,-9037.0,-215471.0,-58476.0,-25149.0,-298265.0,-121862.0
bs08lr,-37774.0,-53708.0,-117721.0,-31160.0,-103589.0,-110543.0,-119374.0,1349483.0,-89307.0,-67969.0,-228.0,-57219.0,-114924.0,-6168.0,-131469.0,-45117.0,-18933.0,-162866.0,-81412.0
bs09lr,-44323.0,-66709.0,-141082.0,-37550.0,-123098.0,-131298.0,-142671.0,-89307.0,1586388.0,-80716.0,-209.0,-65667.0,-138272.0,-7440.0,-153819.0,-51958.0,-21170.0,-194234.0,-96863.0
bs10lr,-33429.0,-52518.0,-116451.0,-33803.0,-86540.0,-103788.0,-120746.0,-67969.0,-80716.0,1256195.0,-284.0,-51875.0,-102545.0,-6396.0,-119564.0,-36211.0,-16792.0,-150458.0,-76108.0


In [7]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

model = LinearRegression()
model.fit(C, b)

LinearRegression()

In [8]:
coef = pd.Series(model.coef_, index=C.columns).sort_values(ascending=False)
coef_df = coef \
    .to_frame(name='coefficient') \
    .reset_index() \
    .rename(columns={'index': 'code'}) \
    .assign(name=lambda d: d['code'].apply(lambda v: c2v[v]))[['name', 'code', 'coefficient']]
coef_df

Unnamed: 0,name,code,coefficient
0,Infectious_and_parasitic_diseases,bs01lr,0.000116
1,Diseases_of_the_circulatory_system,bs07lr,0.000112
2,"Endocrine,nutritional_and_metabolic diseases_a...",bs03lr,0.000112
3,Mental_disorders,bs05lr,0.000112
4,Diseases_of_the_digestive_system,bs09lr,0.000111
5,Diseases_of_the_respiratory_system,bs08lr,0.00011
6,Neoplasms,bs02lr,0.00011
7,Diseases_of_the_musculoskeletal_system_and_con...,bs13lr,0.00011
8,V_codes,bs19lr,0.000109
9,Diseases_of_the_nervous_system_and_sense_organs,bs06lr,0.000108


In [9]:
b_df = b \
    .to_frame(name='before_after') \
    .reset_index().rename(columns={'index': 'code'}) \
    .assign(name=lambda d: d['code'].apply(lambda v: c2v[v]))[['name', 'code', 'before_after']]
b_df

Unnamed: 0,name,code,before_after
0,Infectious_and_parasitic_diseases,bs01lr,7.0
1,Neoplasms,bs02lr,3.0
2,"Endocrine,nutritional_and_metabolic diseases_a...",bs03lr,9.0
3,Diseases_of_the_blood_and_bloodforming_organs,bs04lr,-1.0
4,Mental_disorders,bs05lr,8.0
5,Diseases_of_the_nervous_system_and_sense_organs,bs06lr,1.0
6,Diseases_of_the_circulatory_system,bs07lr,10.0
7,Diseases_of_the_respiratory_system,bs08lr,4.0
8,Diseases_of_the_digestive_system,bs09lr,6.0
9,Diseases_of_the_genitourinary_system,bs10lr,0.0


In [10]:
M.to_csv('s3://jeeforce-artifacts/alemi/diabetes/M.csv', index=False)

In [11]:
C.to_csv('s3://jeeforce-artifacts/alemi/diabetes/C.csv', index=False)

In [12]:
b_df.to_csv('s3://jeeforce-artifacts/alemi/diabetes/b.csv', index=False)

In [13]:
coef_df.to_csv('s3://jeeforce-artifacts/alemi/diabetes/ordering.csv', index=False)