In [3]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os

#Connecting the driver
from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [5]:
# Read Raw Dataset'
root_dir = '/content/drive/MyDrive/DAP/python_scripts'
df = pd.read_excel(os.path.join(root_dir,'raw_data.xlsx'))

In [6]:
data = df.fillna(method='ffill')

In [7]:
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall
...,...,...,...
1861,UMLS:C0233472_affect labile,45.0,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,UMLS:C0233472_affect labile,45.0,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0871754_frail


In [8]:
data['Disease']

0       UMLS:C0020538_hypertensive disease
1       UMLS:C0020538_hypertensive disease
2       UMLS:C0020538_hypertensive disease
3       UMLS:C0020538_hypertensive disease
4       UMLS:C0020538_hypertensive disease
                       ...                
1861           UMLS:C0233472_affect labile
1862           UMLS:C0233472_affect labile
1863         UMLS:C0011127_decubitus ulcer
1864         UMLS:C0011127_decubitus ulcer
1865         UMLS:C0011127_decubitus ulcer
Name: Disease, Length: 1866, dtype: object

In [9]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [10]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():

    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [11]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'depression mental': ['feeling suicidal',
              'suicidal',
              'hallucinations auditory',
              'feel

In [12]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': 3363.0,
 'diabetes': 1421.0,
 'depression mental': 1337.0,
 'depressive disorder': 1337.0,
 'coronary arteriosclerosis': 1284.0,
 'coronary heart disease': 1284.0,
 'pneumonia': 1029.0,
 'failure heart congestive': 963.0,
 'accident\xa0cerebrovascular': 885.0,
 'asthma': 835.0,
 'myocardial infarction': 759.0,
 'hypercholesterolemia': 685.0,
 'infection': 630.0,
 'infection urinary tract': 597.0,
 'anemia': 544.0,
 'chronic obstructive airway disease': 524.0,
 'dementia': 504.0,
 'insufficiency renal': 445.0,
 'confusion': 408.0,
 'degenerative\xa0polyarthritis': 405.0,
 'hypothyroidism': 398.0,
 'anxiety state': 390.0,
 'malignant neoplasms': 354.0,
 'primary malignant neoplasm': 354.0,
 'acquired\xa0immuno-deficiency syndrome': 350.0,
 'HIV': 350.0,
 'hiv infections': 350.0,
 'cellulitis': 341.0,
 'gastroesophageal reflux disease': 325.0,
 'septicemia': 311.0,
 'systemic infection': 311.0,
 'sepsis (invertebrate)': 311.0,
 'deep vein thrombosis': 310.0,
 'deh

In [13]:
# Save cleaned data as CSV
f = open('cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [14]:
# Read Cleaned Data as DF
df = pd.read_csv('cleaned_data.csv', encoding='ISO-8859-1')
df.columns = ['disease', 'symptom', 'occurence_count']
df.head()

Unnamed: 0,disease,symptom,occurence_count
0,hypertensive disease,shortness of breath,3363.0
1,hypertensive disease,dizziness,3363.0
2,hypertensive disease,asthenia,3363.0
3,hypertensive disease,fall,3363.0
4,hypertensive disease,syncope,3363.0


In [15]:
# Remove any rows with empty values
df.replace(float('nan'), np.nan, inplace=True)
df.dropna(inplace=True)

In [16]:
alzheimer_df = df[df['disease'] == "Alzheimer's disease"]

In [17]:
alzheimer_df

Unnamed: 0,disease,symptom,occurence_count
1441,Alzheimer's disease,drool,101.0
1442,Alzheimer's disease,agitation,101.0
1443,Alzheimer's disease,nightmare,101.0
1444,Alzheimer's disease,rhonchus,101.0
1445,Alzheimer's disease,consciousness clear,101.0
1446,Alzheimer's disease,pin-point pupils,101.0
1447,Alzheimer's disease,bedridden,101.0
1448,Alzheimer's disease,bedridden,101.0
1449,Alzheimer's disease,frail,101.0
1450,Alzheimer's disease,tremor resting,101.0


In [18]:
from sklearn import preprocessing

In [19]:
n_unique = len(df['symptom'].unique())
n_unique

404

In [20]:
df.dtypes

disease             object
symptom             object
occurence_count    float64
dtype: object

In [21]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd


X = df[['symptom']]
y = df['disease']


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
onehot_encoder = OneHotEncoder(sparse=False)
X_onehot_encoded = onehot_encoder.fit_transform(X)


df_encoded = pd.DataFrame(X_onehot_encoded, columns=onehot_encoder.get_feature_names_out(['symptom']))
df_result = pd.concat([df[['disease']], df_encoded], axis=1)

# Group by 'disease' and apply the maximum function to get binary indicators (1 if symptom is present, 0 otherwise)
df_result = df_result.groupby('disease').max().reset_index()
df_result




Unnamed: 0,disease,symptom_Heberden's node,symptom_Murphy's sign,symptom_Stahli's line,symptom_abdomen acute,symptom_abdominal bloating,symptom_abdominal tenderness,symptom_abnormal sensation,symptom_abnormally hard consistency,symptom_abortion,...,symptom_vision blurred,symptom_vomiting,symptom_weepiness,symptom_weight gain,symptom_welt,symptom_wheelchair bound,symptom_wheezing,symptom_withdraw,symptom_worry,symptom_yellow sputum
0,Alzheimer's disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,HIV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PneumocystisÂ cariniiÂ pneumonia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,accidentÂ cerebrovascular,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,acquiredÂ immuno-deficiency syndrome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,tonic-clonic seizures,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145,transient ischemic attack,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,tricuspid valve insufficiency,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,ulcer peptic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df_result.columns = df_result.columns.str.replace('symptom_', '')


In [23]:
#df = df_result.drop('nan', axis=1)
df =df.dropna()

In [24]:
df_result.to_csv('Transformed_data.csv', index=False)