Importing the necessary libraries.

In [1]:
import csv
import numpy as np
import pandas as pd

Loading the cleaned dataset.

In [2]:
# Location of the cleaned dataset, relative to the notebook.
CSV_FILEPATH = 'dataset/dataset_clean.csv'
# Read the file and, in the same step, turn every double space found inside
# a string cell into a single space.
data = pd.read_csv(CSV_FILEPATH).replace({"  ": " "}, regex=True)
data

Unnamed: 0,Disease,Symptom,Occurence
0,hypertensive disease,pain chest,3363.0
1,hypertensive disease,shortness of breath,3363.0
2,hypertensive disease,dizziness,3363.0
3,hypertensive disease,asthenia,3363.0
4,hypertensive disease,fall,3363.0
...,...,...,...
2125,affect labile,bedridden,45.0
2126,affect labile,prostatism,45.0
2127,decubitus ulcer,systolic murmur,42.0
2128,decubitus ulcer,frail,42.0


### Creating Symptoms ID data frame

Extract unique values from the `Symptom` column.

In [3]:
# Distinct symptom names, kept in the order they first appear in the dataset.
symptoms = data.loc[:, 'Symptom'].unique()
symptoms

array(['pain chest', 'shortness of breath', 'dizziness', 'asthenia',
       'fall', 'syncope', 'vertigo', 'sweat', 'sweating increased',
       'palpitation', 'nausea', 'angina pectoris', 'pressure chest',
       'polyuria', 'polydypsia', 'orthopnea', 'rale', 'unresponsiveness',
       'mental status changes', 'vomiting', 'labored breathing',
       'feeling suicidal', 'suicidal', 'hallucinations auditory',
       'feeling hopeless', 'weepiness', 'sleeplessness',
       'motor retardation', 'irritable mood', 'blackout',
       'mood depressed', 'hallucinations visual', 'worry', 'agitation',
       'tremor', 'intoxication', 'verbal auditory hallucinations',
       'energy increased', 'difficulty', 'nightmare',
       'unable to concentrate', 'homelessness', 'hypokinesia',
       'dyspnea on exertion', 'chest tightness', 'cough', 'fever',
       'decreased translucency', 'productive cough', 'pleuritic pain',
       'yellow sputum', 'breath sounds decreased', 'chill', 'rhonchus',
       '

Create an empty data frame for symptoms.

In [4]:
# Empty frame with one column for the symptom id (`syd`) and one for its name.
sym_df = pd.DataFrame(
    columns=['syd', 'symptom'],
)

Iterate through the list of symptoms and add an entry to the symptoms data frame.

In [5]:
# Build the whole id table with a single constructor call instead of growing it
# row by row: `DataFrame.append` was removed in pandas 2.0, and every call copied
# the frame again. Ids start at 1 and follow the order of `symptoms`.
# This also avoids rebinding the builtin name `id` in the notebook namespace.
sym_df = pd.DataFrame({
    'syd': range(1, len(symptoms) + 1),
    'symptom': symptoms,
})
sym_df

Unnamed: 0,syd,symptom
0,1,pain chest
1,2,shortness of breath
2,3,dizziness
3,4,asthenia
4,5,fall
...,...,...
400,401,air fluid level
401,402,catching breath
402,403,large-for-dates fetus
403,404,immobile


Export the symptom data frame as a `CSV` file.

In [6]:
# Destination of the symptom id table.
CSV_FILEPATH = 'dataset/symptoms_id.csv'
# Write the table without the row index, so the file holds only `syd` and `symptom`.
sym_df.to_csv(
    CSV_FILEPATH,
    index=False,
)

### Creating Diseases ID data frame

Extract unique values from the `Disease` column.

In [7]:
# Distinct disease names, kept in the order they first appear in the dataset.
diseases = data.loc[:, 'Disease'].unique()
diseases

array(['hypertensive disease', 'diabetes', 'depression mental',
       'depressive disorder', 'coronary arteriosclerosis',
       'coronary heart disease', 'pneumonia', 'failure heart congestive',
       'accident cerebrovascular', 'asthma', 'myocardial infarction',
       'hypercholesterolemia', 'infection', 'infection urinary tract',
       'anemia', 'chronic obstructive airway disease', 'dementia',
       'insufficiency renal', 'confusion', 'degenerative polyarthritis',
       'hypothyroidism', 'anxiety state', 'malignant neoplasms',
       'primary malignant neoplasm',
       'acquired immuno-deficiency syndrome', 'HIV', 'hiv infections',
       'cellulitis', 'gastroesophageal reflux disease', 'septicemia',
       'systemic infection', 'sepsis (invertebrate)',
       'deep vein thrombosis', 'dehydration', 'neoplasm',
       'embolism pulmonary', 'epilepsy', 'cardiomyopathy',
       'chronic kidney failure', 'carcinoma', 'hepatitis C',
       'peripheral vascular disease', 'psychoti

Create an empty data frame for diseases.

In [8]:
# Empty frame with one column for the disease id (`did`) and one for its name.
dia_df = pd.DataFrame(
    columns=['did', 'disease'],
)

Iterate through the list of diseases and add an entry to the diseases data frame.

In [9]:
# Build the whole id table with a single constructor call instead of growing it
# row by row: `DataFrame.append` was removed in pandas 2.0, and every call copied
# the frame again. Ids start at 1 and follow the order of `diseases`.
# This also avoids rebinding the builtin name `id` in the notebook namespace.
dia_df = pd.DataFrame({
    'did': range(1, len(diseases) + 1),
    'disease': diseases,
})
dia_df

Unnamed: 0,did,disease
0,1,hypertensive disease
1,2,diabetes
2,3,depression mental
3,4,depressive disorder
4,5,coronary arteriosclerosis
...,...,...
143,144,ileus
144,145,adhesion
145,146,delusion
146,147,affect labile


Export the diseases data frame as a `CSV` file.

In [10]:
# Destination of the disease id table.
CSV_FILEPATH = 'dataset/diseases_id.csv'
# Write the table without the row index, so the file holds only `did` and `disease`.
dia_df.to_csv(
    CSV_FILEPATH,
    index=False,
)

### Creating the sparse matrix

Append the symptoms id to the original data frame.

In [11]:
# Left-join on the symptom name so every row of `data` is kept and gains its `syd`,
# then drop both name columns, which the id now stands in for.
df = (
    pd.merge(data, sym_df, how='left', left_on='Symptom', right_on='symptom')
    .drop(columns=['Symptom', 'symptom'])
)
df


Unnamed: 0,Disease,Occurence,syd
0,hypertensive disease,3363.0,1
1,hypertensive disease,3363.0,2
2,hypertensive disease,3363.0,3
3,hypertensive disease,3363.0,4
4,hypertensive disease,3363.0,5
...,...,...,...
2125,affect labile,45.0,112
2126,affect labile,45.0,130
2127,decubitus ulcer,42.0,211
2128,decubitus ulcer,42.0,300


Append the diseases id to the original data frame.

In [12]:
# Left-join on the disease name so every row of `df` is kept and gains its `did`,
# then drop both name columns, which the id now stands in for.
df = (
    pd.merge(df, dia_df, how='left', left_on='Disease', right_on='disease')
    .drop(columns=['Disease', 'disease'])
)
df

Unnamed: 0,Occurence,syd,did
0,3363.0,1,1
1,3363.0,2,1
2,3363.0,3,1
3,3363.0,4,1
4,3363.0,5,1
...,...,...,...
2125,45.0,112,147
2126,45.0,130,147
2127,42.0,211,148
2128,42.0,300,148


For each entry, calculate the weight of the symptom–disease association based solely on the number of symptoms listed for that disease.

In [13]:
# Weight every symptom-disease pair as 1 / (number of rows sharing that disease id),
# so the weights belonging to one disease add up to 1.
# The group size is broadcast back onto each row in a single pass; the previous
# `iterrows` loop rescanned the whole `did` column once per row and wrote the
# result cell by cell.
df['wei'] = 1 / df.groupby('did')['did'].transform('size')
df

Unnamed: 0,Occurence,syd,did,wei
0,3363.0,1,1,0.076923
1,3363.0,2,1,0.076923
2,3363.0,3,1,0.076923
3,3363.0,4,1,0.076923
4,3363.0,5,1,0.076923
...,...,...,...,...
2125,45.0,112,147,0.111111
2126,45.0,130,147,0.111111
2127,42.0,211,148,0.333333
2128,42.0,300,148,0.333333


Save the sparse matrix as a `CSV` file with the columns set to `syd`, `did`, and `wei`.

In [14]:
# Destination of the count-based sparse matrix.
CSV_FILEPATH = 'dataset/symptoms-disease_sparse-matrix.csv'
# Only the id pair and its weight are written; the row index is left out.
df.to_csv(
    CSV_FILEPATH,
    columns=['syd', 'did', 'wei'],
    index=False,
)

For each entry, revise the weight to take the occurrence count into account.

In [15]:
# Scale each association weight by the occurrence count of its row, as one
# column-wise multiplication instead of a per-row `iterrows` loop.
df['wei'] = df['wei'] * df['Occurence']
# `Occurence` is folded into the weight now, so the column is dropped.
# Consequence: this cell cannot be re-run by itself, because the column it reads
# is gone afterwards; re-run from the merge cells instead.
df = df.drop(columns=['Occurence'])
df

Unnamed: 0,syd,did,wei
0,1,1,258.692308
1,2,1,258.692308
2,3,1,258.692308
3,4,1,258.692308
4,5,1,258.692308
...,...,...,...
2125,112,147,5.000000
2126,130,147,5.000000
2127,211,148,14.000000
2128,300,148,14.000000


Normalize the weights.

In [16]:
# Min-max scaling: the smallest weight maps to 0 and the largest to 1.
wei_min, wei_max = df['wei'].min(), df['wei'].max()
df['wei'] = (df['wei'] - wei_min) / (wei_max - wei_min)
df

Unnamed: 0,syd,did,wei
0,1,1,1.000000
1,2,1,1.000000
2,3,1,1.000000
3,4,1,1.000000
4,5,1,1.000000
...,...,...,...
2125,112,147,0.009114
2126,130,147,0.009114
2127,211,148,0.044266
2128,300,148,0.044266


Save the sparse matrix as another `CSV` file.

In [17]:
# Destination of the occurrence-weighted, normalised sparse matrix.
CSV_FILEPATH = 'dataset/symptoms-disease-weighted-count_sparse-matrix.csv'
# Every remaining column is written; the row index is left out.
df.to_csv(
    CSV_FILEPATH,
    index=False,
)