In [1]:
import pandas as pd
import math

In [2]:
df_unfilt = pd.read_csv('PLM_final_data.csv')

In [3]:
df = df_unfilt.filter(items = ['id','deceased','country','primaryConditionName','secondaryConditionNames','firstDiagnosisDate','sex','state','userAge'])

In [4]:
df['secondaryConditionNames'] = df['secondaryConditionNames'].apply(eval)

In [5]:
for i in range(0, len(df)):
    pc = df['primaryConditionName'][i]
    if type(pc) != 'float64':
        df['secondaryConditionNames'][i].insert(0, pc)

In [6]:
df = df.rename(columns = {'secondaryConditionNames':'condition_names'}).drop(columns = ['primaryConditionName'])

In [13]:
def row_1D(series):
    d_list = []
    for x in series:
        d_list.append(x)
    return d_list

In [14]:
conditionsDict = {}
for i in range(0, len(df)):
    conditionsDict[df['id'][i]] = row_1D(df['condition_names'][i])

In [17]:
patientlist = []
diseaselist = []
for i in conditionsDict:
    for x in conditionsDict[i]:
        patientlist.append(i)
        diseaselist.append(x)
data = {'id':patientlist, 'disease':diseaselist}

In [18]:
edges = pd.DataFrame(data)

In [19]:
edges.to_csv('edges.csv')

In [24]:
df.head()

Unnamed: 0,id,deceased,country,condition_names,firstDiagnosisDate,sex,state,userAge,Age_Group
0,552284,False,United States,"[epilepsy, broken shoulder, broken ankle, brok...",1974.0,F,FL,60.0,55-70
1,298121,False,Belgium,"[fibromyalgia, epilepsy, gallstones, insomnia,...",2012.0,M,Vlaams Brabant,63.0,55-70
2,740546,False,United States,"[systemic lupus erythematosus, osteoarthritis,...",1991.0,F,NH,52.0,35-54
3,253861,False,,"[epilepsy, diabetes type 1]",1979.0,M,,54.0,35-54
4,431106,False,United States,"[primary lateral sclerosis, common variable im...",2017.0,M,WA,57.0,55-70


In [25]:
demographics = df.drop(columns = ['condition_names'])

In [26]:
expanded_df = edges.merge(demographics, on = 'id')

In [29]:
val = expanded_df.dropna()

In [30]:
age_groups = [0, 18, 35, 55, 70, 120]

age_group_names = ['<18', '18-34', '35-54', '55-70', '>70']

df['Age_Group'] = pd.cut(df['userAge'], bins = age_groups, labels = age_group_names)

In [31]:
state_counts = df.state.value_counts()

south_states = ['NC','SC','GA','AL','FL','TN','KY','AR','MS','LA','TX','OK','VA']
south_count = 0
for state in south_states:
    count = state_counts[state]
    south_count += count
    print(state, count)
print(south_count)

NC 206
SC 112
GA 237
AL 95
FL 452
TN 146
KY 113
AR 77
MS 42
LA 81
TX 464
OK 104
VA 178
2307


In [32]:
west_states = ['WA','CA','OR','AZ','NV','CO','NM','ID','MT','UT','WY','AK','HI']
west_count = 0
for state in west_states:
    count = state_counts[state]
    west_count += count
    print(state, count)
print(west_count)

WA 174
CA 609
OR 120
AZ 159
NV 75
CO 141
NM 42
ID 56
MT 30
UT 49
WY 17
AK 21
HI 20
1513


In [33]:
northeast_states = ['DE','RI','MA','PA','NY','NJ','ME','MD','CT','VT','NH']
northeast_count = 0
for state in northeast_states:
    count = state_counts[state]
    northeast_count += count
    print(state, count)
print(northeast_count)

DE 15
RI 25
MA 219
PA 301
NY 312
NJ 168
ME 44
MD 91
CT 59
VT 19
NH 36
1289


In [34]:
midwest_states = ['KS','MO','IL','IN','NE','SD','ND','MN','WI','MI','OH','IA']
midwest_count = 0
for state in midwest_states:
    count = state_counts[state]
    midwest_count += count
    print(state, count)
print(midwest_count)

KS 60
MO 152
IL 240
IN 188
NE 53
SD 12
ND 8
MN 121
WI 109
MI 250
OH 272
IA 54
1519


In [35]:
tot_prevalence = val['disease'].value_counts()/val['id'].nunique()

In [36]:
m_prevalence = val[val['sex'] == 'M']['disease'].value_counts()/val[val['sex'] == 'M']['id'].nunique()

In [37]:
f_prevalence = val[val['sex'] == 'F']['disease'].value_counts()/val[val['sex'] == 'F']['id'].nunique()

In [38]:
disease_list = tot_prevalence.index.tolist()

In [39]:
m_comorbidities = []
f_comorbidities = []
tot_comorbidities = []
for disease in disease_list:
    try:
        m_comorbidities.append(m_prevalence[disease])
    except:
        m_comorbidities.append(0)
    try:
        f_comorbidities.append(f_prevalence[disease])
    except:
        f_comorbidities.append(0)
    tot_comorbidities.append(tot_prevalence[disease])
data = {'disease':disease_list, 'male_comorbidity':m_comorbidities, 'female_comorbidity':f_comorbidities, 'overall_comorbidity':tot_comorbidities}
comorbidities_df = pd.DataFrame(data)

In [40]:
comorbidities_df.head(20)

Unnamed: 0,disease,male_comorbidity,female_comorbidity,overall_comorbidity
0,epilepsy,1.000701,1.001636,1.001374
1,migraine,0.086255,0.197928,0.166667
2,generalized anxiety disorder,0.086255,0.1494,0.131724
3,major depressive disorder,0.085554,0.137132,0.122693
4,high blood pressure (hypertension),0.079243,0.070065,0.072634
5,fibromyalgia,0.01683,0.092148,0.071064
6,traumatic brain injury,0.078541,0.05916,0.064586
7,hypothyroidism,0.023142,0.073064,0.059089
8,post-traumatic stress disorder,0.032258,0.05807,0.050844
9,asthma,0.021038,0.051799,0.043188


In [41]:
df.to_csv('filtered_PLM.csv')