In [None]:
import pandas as pd 
from itertools import combinations
from collections import Counter

# read in relevant tables (each CSV is read exactly once)
# people:     one row per person; supplies year/month/day of birth
# deaths:     death records; supplies death_DATE per person_id
# conditions: condition occurrences keyed by person_id and condition_concept_id
# concepts:   lookup from concept_id to concept_name
people = pd.read_csv("EHRShot/sampled_person.csv")
deaths = pd.read_csv("EHRShot/sampled_death.csv")
conditions = pd.read_csv("EHRShot/sampled_condition_occurrence.csv")
concepts = pd.read_csv("EHRShot/concept.csv")


In [4]:
# establish AGE constant for filtering
# Age threshold in years: the cohort below keeps deaths with age_at_death
# strictly less than this value.
AGE = 30

1) Find people who died under 30

In [5]:
# investigate and clean death data
# NOTE: only printed output (info) and the last expression (dtypes) are shown;
# head() and shape are evaluated here but not rendered.
deaths.head()
deaths.shape # 213 rows, 11 columns
deaths.info()
# columns either all null or all filled
# will check death_type_concept and death_cause_concept after filtering
# convert death_DATE (loaded as object dtype) to datetime so it can be
# subtracted from the birth date when computing age at death
deaths['death_DATE'] = pd.to_datetime(deaths['death_DATE'])
deaths.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213 entries, 0 to 212
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   person_id                213 non-null    int64  
 1   death_DATE               213 non-null    object 
 2   death_DATETIME           0 non-null      float64
 3   death_type_concept_id    213 non-null    int64  
 4   cause_concept_id         213 non-null    int64  
 5   cause_source_value       0 non-null      float64
 6   cause_source_concept_id  213 non-null    int64  
 7   trace_id                 0 non-null      float64
 8   unit_id                  0 non-null      float64
 9   load_table_id            213 non-null    object 
 10  _death_date_external     0 non-null      float64
dtypes: float64(5), int64(4), object(2)
memory usage: 18.4+ KB


person_id                           int64
death_DATE                 datetime64[ns]
death_DATETIME                    float64
death_type_concept_id               int64
cause_concept_id                    int64
cause_source_value                float64
cause_source_concept_id             int64
trace_id                          float64
unit_id                           float64
load_table_id                      object
_death_date_external              float64
dtype: object

In [6]:
# Build a single datetime birth date from the separate year/month/day columns.
# pd.to_datetime assembles a date from a frame whose columns are named
# 'year', 'month' and 'day', so rename the birth columns to match first.
birth_parts = people[['year_of_birth', 'month_of_birth', 'day_of_birth']].rename(
    columns={
        'year_of_birth': 'year',
        'month_of_birth': 'month',
        'day_of_birth': 'day',
    }
)
people['birth_complete'] = pd.to_datetime(birth_parts)
people.head()
people.dtypes

person_id                               int64
gender_concept_id                       int64
year_of_birth                           int64
month_of_birth                          int64
day_of_birth                            int64
birth_DATETIME                         object
race_concept_id                         int64
ethnicity_concept_id                    int64
location_id                           float64
provider_id                           float64
care_site_id                          float64
person_source_value                   float64
gender_source_value                    object
gender_source_concept_id                int64
race_source_value                      object
race_source_concept_id                  int64
ethnicity_source_value                 object
ethnicity_source_concept_id             int64
trace_id                              float64
unit_id                               float64
load_table_id                          object
birth_complete                 dat

In [7]:
# Attach each person's birth date to their death record. The left join keeps
# every death row even when no matching person is found.
birth_lookup = people[['person_id', 'birth_complete']]
deaths_w_birth = pd.merge(deaths, birth_lookup, how='left', on='person_id')

# Age at death in years: whole days between birth and death, divided by the
# average year length (365.25 days).
lifespan = deaths_w_birth['death_DATE'] - deaths_w_birth['birth_complete']
deaths_w_birth['age_at_death'] = lifespan.dt.days / 365.25
deaths_w_birth.head()

Unnamed: 0,person_id,death_DATE,death_DATETIME,death_type_concept_id,cause_concept_id,cause_source_value,cause_source_concept_id,trace_id,unit_id,load_table_id,_death_date_external,birth_complete,age_at_death
0,115969035,1999-06-05,,32817,0,,0,,,shc_patient,,1956-07-21,42.872005
1,115971656,2003-03-02,,32817,0,,0,,,shc_patient,,1946-07-20,56.616016
2,115972370,2005-09-02,,32817,0,,0,,,shc_patient,,1951-03-01,54.507871
3,115969393,2006-02-04,,32817,0,,0,,,shc_patient,,1926-03-22,79.874059
4,115972883,2008-05-19,,32817,0,,0,,,shc_patient,,1976-03-16,32.175222


In [8]:
# select people who died before reaching AGE years
# (strict `<`: an age_at_death of exactly AGE is excluded)
young_deaths = deaths_w_birth[deaths_w_birth['age_at_death'] < AGE]
young_deaths.shape

(8, 13)

In [9]:
# investigate death_type and cause_of_death concepts
# Collect the ids actually present in the cohort instead of hard-coding them,
# so the lookup below stays correct if AGE or the input data changes.
death_type_ids = set(young_deaths['death_type_concept_id'])  # observed: {32817}
cause_ids = set(young_deaths['cause_concept_id'])  # observed: {0}
# there is only one id in each
cohort_concept_ids = sorted(death_type_ids | cause_ids)
# look the ids up in the concept table; both observed ids are placeholders
# ("EHR" type concept and "No matching concept")
concepts[concepts['concept_id'].isin(cohort_concept_ids)]

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_DATE,valid_end_DATE,invalid_reason,load_table_id,load_row_id
2575844,32817,EHR,Type Concept,Type Concept,Type Concept,S,OMOP4976890,2020-08-20,2099-12-31,,athena_vocab,
5311560,0,No matching concept,Metadata,,Undefined,,No matching concept,1970-01-01,2099-12-31,,athena_vocab,


2) Filter condition occurrences to only those that occurred in people who died under 30

In [12]:
# Keep only the condition rows belonging to people in the young-death cohort.
young_person_ids = set(young_deaths['person_id'])
mask = conditions['person_id'].isin(young_person_ids)
conditions_w_deaths = conditions[mask].copy()  # copy so later edits do not touch `conditions`

# Compare the number of distinct condition concepts before and after filtering.
n_concepts_all = len(set(conditions['condition_concept_id']))
n_concepts_young = len(set(conditions_w_deaths['condition_concept_id']))
print(f"Conditions total: {n_concepts_all}\nConditions with young deaths: {n_concepts_young}")

Conditions total: 6376
Conditions with young deaths: 615


In [11]:
# Attach the concept name to every condition row. The left join keeps each
# condition row even if its concept id has no match in the concept table.
concept_names = concepts[['concept_id', 'concept_name']]
condition_concepts = conditions_w_deaths.merge(
    concept_names,
    how='left',
    left_on='condition_concept_id',
    right_on='concept_id',
)
# show the distinct condition names found in the cohort
set(condition_concepts['concept_name'])

{'Abdominal aortic aneurysm',
 'Abdominal distension, gaseous',
 'Abdominal pain',
 'Abnormal breathing',
 'Abnormal finding on evaluation procedure',
 'Abnormal findings on diagnostic imaging of lung',
 'Abnormal glucose level',
 'Abnormal granulation tissue',
 'Abnormal level of blood mineral',
 'Abnormality of systemic vein',
 'Abrasion and/or friction burn of multiple sites',
 'Abscess of abdominal wall',
 'Abscess of lung',
 'Accidental poisoning',
 'Acidosis',
 'Acne',
 'Acne vulgaris',
 'Acquired absence of organ',
 'Acquired absence of spleen',
 'Acquired deformity of rib',
 'Acquired hypothyroidism',
 'Acute deep vein thrombosis of lower limb',
 'Acute deep venous thrombosis of popliteal vein',
 'Acute endocarditis',
 'Acute gastritis',
 'Acute graft-versus-host disease',
 'Acute leukemia',
 'Acute lymphoid leukemia',
 'Acute lymphoid leukemia in remission',
 'Acute lymphoid leukemia relapse',
 'Acute myeloid leukemia, disease',
 'Acute pain',
 'Acute posthemorrhagic anemia',


In [None]:
# check condition_concepts for cleaning purposes
condition_concepts.shape # 9827 rows of 21 columns
condition_concepts.info()
# person_id and condition_concept_id are fully populated (9827 non-null each).
# Other columns do contain nulls (e.g. condition_end_DATE, provider_id,
# visit_occurrence_id), and the date columns are object dtype, not datetime.
# NOTE(review): condition_concept_id is reported as object rather than int64 —
# confirm this is expected, since it is used as the join key to concept_id.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   condition_occurrence_id        9827 non-null   int64  
 1   person_id                      9827 non-null   int64  
 2   condition_concept_id           9827 non-null   object 
 3   condition_start_DATE           9827 non-null   object 
 4   condition_start_DATETIME       9827 non-null   object 
 5   condition_end_DATE             206 non-null    object 
 6   condition_end_DATETIME         206 non-null    object 
 7   condition_type_concept_id      9827 non-null   int64  
 8   stop_reason                    0 non-null      float64
 9   provider_id                    7170 non-null   float64
 10  visit_occurrence_id            9410 non-null   float64
 11  visit_detail_id                0 non-null      float64
 12  condition_source_value         9827 non-null   o

In [26]:
# Vertex list: one row per distinct condition name, ordered alphabetically,
# written to vertices.csv without the index column.
distinct_names = condition_concepts[['concept_name']].drop_duplicates()
vertex_set = distinct_names.reset_index(drop=True).sort_values(by='concept_name')
vertex_set.to_csv('vertices.csv', index=False)

3) Build undirected edge list of co-occurrences

In [29]:
# keep only person_id and concept_name, drop duplicates
unique_concepts = condition_concepts[['person_id','concept_name']].drop_duplicates()

# group concepts per person (list of unique concepts)
person_groups = unique_concepts.groupby('person_id')['concept_name'].agg(list)

# count unordered pairs across people: each pair is counted once per person
# who has both conditions.
# The loop variables deliberately avoid the names `concepts` and
# `unique_concepts`: reusing them would overwrite the concept table and the
# frame above with plain lists, breaking any cell that is re-run afterwards.
pair_counts = Counter()
for person_concept_names in person_groups:
    # sort so the same unordered pair is always emitted in the same (a, b) order
    ordered_names = sorted(set(person_concept_names))
    # combinations() yields nothing for fewer than two names, so no guard is needed
    pair_counts.update(combinations(ordered_names, 2))

# convert counter to df, sort by number of people sharing the pair (descending)
edges = (pd.DataFrame(((a, b, c) for (a, b), c in pair_counts.items()), columns=['concept_A', 'concept_B', 'n_people'])
         .sort_values('n_people', ascending=False)
         .reset_index(drop=True))

print(f'Computed {len(edges)} edges from {len(person_groups)} people')
edges.head()

Computed 66653 edges from 8 people


Unnamed: 0,concept_A,concept_B,n_people
0,Dyspnea,Pleural effusion,6
1,Abdominal pain,Cardiac arrhythmia,6
2,Abdominal pain,Pleural effusion,5
3,Abdominal pain,Nausea,5
4,Cardiac arrhythmia,Dyspnea,5


In [30]:
# save the edge list (concept_A, concept_B, n_people) to edges.csv,
# omitting the DataFrame index column
edges.to_csv('edges.csv', index = False)