# Updated risk score weights

In [1]:
import pandas as pd 
import pickle
import torch
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as pl
from sklearn.metrics.pairwise import cosine_similarity

def to_pickle(df, f):
    """Pickle ``df`` and write it to the file path ``f``.

    The file is opened in binary write mode, so an existing file at ``f``
    is overwritten. Nothing is returned.
    """
    with open(f, 'wb') as handle:
        pickle.dump(df, handle)

def open_pickle(f):
    """Read the file at path ``f`` and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so only call this on
    files from a trusted source.
    """
    with open(f, 'rb') as handle:
        return pickle.load(handle)

In [2]:
# Node table of the knowledge graph. The file is tab-separated even though
# it carries a .csv extension.
phekg_node_df = pd.read_csv(
    "/n/holylfs06/LABS/mzitnik_lab/Lab/ruthjohnson/kg_paper_revision/connected_node_v3_df.csv",
    sep='\t',
)

# node_id has the form "<code>:<suffix>" (e.g. "LP228314-3:lnc"); keep only
# the text before the first ':' so nodes can be joined on the bare code.
node_id_parts = phekg_node_df['node_id'].str.split(':', n=1, expand=True)
phekg_node_df['original_code'] = node_id_parts[0]

In [3]:
# Every per-vocabulary lookup table is stored in the same directory under
# the name phekg_<tag>_dict.pkl.
PHEKG_DICT_DIR = "/n/holylfs06/LABS/mzitnik_lab/Lab/ruthjohnson/kg_paper_revision/benchmarks/data"


def load_phekg_dict(tag):
    """Unpickle ``phekg_<tag>_dict.pkl`` from ``PHEKG_DICT_DIR``."""
    return open_pickle(f"{PHEKG_DICT_DIR}/phekg_{tag}_dict.pkl")


# One dict per vocabulary, keyed by code. The ICD9/ATC/LNC dicts are used
# later as code -> embedding vector; presumably the others have the same
# layout (TODO confirm).
phekg_cui_dict = load_phekg_dict("cui")
phekg_icd_dict = load_phekg_dict("icd")
phekg_icd9_dict = load_phekg_dict("icd9")
phekg_atc_dict = load_phekg_dict("atc")
phekg_snomed_dict = load_phekg_dict("snomed")
phekg_phecode_dict = load_phekg_dict("phecode")
phekg_rxnorm_dict = load_phekg_dict("rxnorm")
phekg_lnc_dict = load_phekg_dict("lnc")
phekg_cpt_dict = load_phekg_dict("cpt")

# Registry of all tables, keyed by vocabulary name.
phekg_all_dict = dict(
    UMLS_CUI=phekg_cui_dict,
    ICD10CM=phekg_icd_dict,
    ICD9CM=phekg_icd9_dict,
    SNOMEDCT_US=phekg_snomed_dict,
    RXNORM=phekg_rxnorm_dict,
    ATC=phekg_atc_dict,
    PHECODE=phekg_phecode_dict,
    LNC=phekg_lnc_dict,
    CPT=phekg_cpt_dict,
)

In [4]:
# Part table (presumably the LOINC Part file -- TODO confirm); only rows
# whose PartTypeName is COMPONENT are of interest.
part_df = pd.read_csv("/n/home01/ruthjohnson/kg_paper/umap/Part.csv")
is_component = part_df['PartTypeName'] == 'COMPONENT'
system_part_list = part_df.loc[is_component, 'PartNumber'].values

# Codes of the graph nodes that appear among those part numbers. Despite
# the "_inds" name this list holds code strings, not positional indices.
node_is_component = phekg_node_df['original_code'].isin(system_part_list)
system_part_inds = phekg_node_df.loc[node_is_component, 'original_code'].tolist()

In [7]:
target_code = '428'

# Fail early with a readable message if the target has no ICD9 entry.
if target_code not in phekg_icd9_dict:
    raise KeyError(f"target_code {target_code!r} not found in phekg_icd9_dict")
target_vec = phekg_icd9_dict[target_code]

# Set membership is O(1); scanning the original list for every LNC key was
# quadratic in the number of codes.
system_part_set = set(system_part_inds)


def _score_against_target(reference_vec, embedding_dict, ntype, keep=None):
    """Cosine similarity between ``reference_vec`` and entries of ``embedding_dict``.

    Parameters
    ----------
    reference_vec : 1-D vector the entries are compared against.
    embedding_dict : mapping of code -> vector.
    ntype : vocabulary label recorded alongside every returned code.
    keep : optional predicate on the code; codes for which it is falsy are
        skipped. ``None`` keeps every code.

    Returns
    -------
    list of ``(code, ntype, similarity)`` tuples in dict iteration order.
    """
    codes = [code for code in embedding_dict if keep is None or keep(code)]
    if not codes:
        # cosine_similarity rejects an empty second argument.
        return []
    # One batched call instead of one call per pair; values can differ from
    # the per-pair form only at floating-point rounding level.
    sims = cosine_similarity([reference_vec], [embedding_dict[code] for code in codes])[0]
    return [(code, ntype, sim) for code, sim in zip(codes, sims)]


scored = (
    # ICD9: every code
    _score_against_target(target_vec, phekg_icd9_dict, 'ICD9CM')
    # ATC: 5-character codes only (ATC level 4)
    + _score_against_target(target_vec, phekg_atc_dict, 'ATC', keep=lambda code: len(code) == 5)
    # LNC: only codes selected from the part table (system_part_inds)
    + _score_against_target(target_vec, phekg_lnc_dict, 'LNC', keep=lambda code: code in system_part_set)
)

key_list = [row[0] for row in scored]
ntype_list = [row[1] for row in scored]
sim_list = [row[2] for row in scored]

sim_df = pd.DataFrame({'original_code': key_list, 'ntype': ntype_list, 'cos': sim_list})

# Join on (code, vocabulary) rather than code alone so that a code string
# shared by two vocabularies cannot attach a score to the wrong node.
node_cols = [col for col in phekg_node_df.columns if col != 'original_code']
sim_df = sim_df.merge(
    phekg_node_df.loc[phekg_node_df['ntype'].isin(['LNC', 'ATC', 'ICD9CM'])],
    on=['original_code', 'ntype'],
    how='inner',
)
# Same column layout as a merge on original_code alone would produce.
sim_df = sim_df[['original_code', 'cos'] + node_cols]

In [6]:
# Sanity check: the code being scored must be a key of the ICD9 dict.
# Uses target_code rather than a repeated literal so the check follows the
# target if it is changed.
target_code in phekg_icd9_dict

True

In [8]:
# Export the 4000 rows with the highest cosine similarity to the target.
# The file is tab-separated despite its .csv name, and the DataFrame index
# is written as the first (unnamed) column.
top_weights = sim_df.sort_values(by='cos', ascending=False).head(4000)
top_weights.to_csv("heart_failure_weights.csv", sep='\t')

In [10]:
# Inspect the other end of the ranking: the last 20 rows when sorted by
# descending cosine similarity, i.e. the least similar codes.
ranked_desc = sim_df.sort_values(by='cos', ascending=False)
ranked_desc.tail(20)

Unnamed: 0,original_code,cos,node_id,node_name,ntype,node_index
20583,LP228314-3,-0.937814,LP228314-3:lnc,Doxepin cutoff,LNC,70982
13192,LP14832-7,-0.938873,LP14832-7:lnc,Orthopoxvirus,LNC,62748
14703,LP228451-3,-0.939239,LP228451-3:lnc,Flunitrazepam cutoff,LNC,64452
16312,LP14655-2,-0.939814,LP14655-2:lnc,Collagen type 4,LNC,66239
13890,LP14716-2,-0.941035,LP14716-2:lnc,Doxepin+Nordoxepin,LNC,63578
17526,LP229649-1,-0.941041,LP229649-1:lnc,Sodium salicylate leukotriene release,LNC,67730
19403,LP285931-4,-0.941388,LP285931-4:lnc,Doxepin/Creatinine,LNC,69671
20872,LP189757-0,-0.9416,LP189757-0:lnc,RARA gene rearrangements,LNC,71285
12228,LP17967-8,-0.941611,LP17967-8:lnc,Silicone,LNC,61622
16570,LP140599-4,-0.942505,LP140599-4:lnc,Instructions,LNC,66511
