In [71]:
import pandas as pd
import numpy as np
from pathlib import Path

Import the relevant libraries for data handling and processing.

In [138]:
# Location of the raw disease/symptom CSV.
raw_dataset_csv = '/Users/jroyarekhua/SymptomCheckerHackthon/ml_operations/data/raw/dataset.csv'

# header=None reads every line as data, so the file's own header line
# arrives as row 0 of the frame.
RawSymptomsFrame = pd.read_csv(
    raw_dataset_csv,
    header=None,
    on_bad_lines='skip',
    skip_blank_lines=True,
)
print(RawSymptomsFrame.head(n=3))
print(f'data shape: {RawSymptomsFrame.shape}')

# Label the 18 columns, then discard row 0 and renumber the rows from zero.
symptom_headers = [f'Symptom_{i}' for i in range(1, 18)]
RawSymptomsFrame.columns = ['Disease'] + symptom_headers
RawSymptomsFrame = RawSymptomsFrame.drop(index=0).reset_index(drop=True)


                 0           1                      2                      3   \
0           Disease   Symptom_1              Symptom_2              Symptom_3   
1  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
2  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   

                     4          5          6          7          8   \
0             Symptom_4  Symptom_5  Symptom_6  Symptom_7  Symptom_8   
1   dischromic _patches        NaN        NaN        NaN        NaN   
2                   NaN        NaN        NaN        NaN        NaN   

          9           10          11          12          13          14  \
0  Symptom_9  Symptom_10  Symptom_11  Symptom_12  Symptom_13  Symptom_14   
1        NaN         NaN         NaN         NaN         NaN         NaN   
2        NaN         NaN         NaN         NaN         NaN         NaN   

           15          16          17  
0  Symptom_15  Symptom_16  Symptom_17  
1    

In [139]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() appended a stray "None" line.
RawSymptomsFrame.info()
# Summary statistics for every column.
print(RawSymptomsFrame.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 13  Symptom_13  504 non-null    object
 14  Symptom_14  306 non-null    object
 15  Symptom_15  240 non-null    object
 16  Symptom_16  192 non-null    object
 17  Symptom_17  72 non-null     object
dtypes: object(18)
memory usage: 692.0+ KB
None
                 Disease  Symptom_1  Symptom_

In [143]:
def _tidy_text(values):
    """Return a string Series trimmed, lowercased, and with '_ ' / ' _' collapsed to '_'."""
    trimmed = values.str.strip()
    lowered = trimmed.str.lower()
    gap_after_fixed = lowered.str.replace('_ ', '_')
    return gap_after_fixed.str.replace(' _', '_')


# Work on a copy so the raw frame stays available unchanged.
cleaned_symptoms = RawSymptomsFrame.copy()

# Normalise every object-dtype column in turn.
object_columns = cleaned_symptoms.select_dtypes(include='object').columns
for col in object_columns:
    cleaned_symptoms[col] = _tidy_text(cleaned_symptoms[col])
print(cleaned_symptoms.head())


            Disease  Symptom_1             Symptom_2             Symptom_3  \
0  fungal infection    itching             skin_rash  nodal_skin_eruptions   
1  fungal infection  skin_rash  nodal_skin_eruptions    dischromic_patches   
2  fungal infection    itching  nodal_skin_eruptions    dischromic_patches   
3  fungal infection    itching             skin_rash    dischromic_patches   
4  fungal infection    itching             skin_rash  nodal_skin_eruptions   

            Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0  dischromic_patches       NaN       NaN       NaN       NaN       NaN   
1                 NaN       NaN       NaN       NaN       NaN       NaN   
2                 NaN       NaN       NaN       NaN       NaN       NaN   
3                 NaN       NaN       NaN       NaN       NaN       NaN   
4                 NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symptom_14 Symptom_15  \
0        

Strip surrounding whitespace, convert everything to lowercase by convention, and remove stray spaces next to underscores.

In [144]:
# Keep only the symptom columns.
symptom_cols = cleaned_symptoms.drop(columns=['Disease'])

# Reshape to long form: one row per (source column, symptom value) pair.
# The missing-value filter targets 'Symptoms' (the values). The melt variable
# column 'Symptom' only holds column names and is never null, so filtering on
# it dropped nothing.
symptom_cols_long = (
    symptom_cols
    .melt(var_name='Symptom', value_name='Symptoms')
    .dropna(subset=['Symptoms'])
    .drop(columns=['Symptom'])
    # strip all surrounding whitespace from the values
    .assign(Symptoms=lambda frame: frame['Symptoms'].str.strip())
)
print(symptom_cols_long.value_counts())

Symptoms              
fatigue                   1932
vomiting                  1914
high_fever                1362
loss_of_appetite          1152
nausea                    1146
                          ... 
extra_marital_contacts     108
dischromic_patches         108
dehydration                108
muscle_wasting             108
foul_smell_of urine        102
Name: count, Length: 131, dtype: int64


Look at the value counts of all unique symptoms, sorted by frequency.

In [163]:
# Non-missing disease labels as a new Series. dropna() returns a fresh object,
# so RawSymptomsFrame is clearly left untouched; the previous
# dropna(inplace=True) mutated a column that had been pulled out of the frame.
diseases = RawSymptomsFrame['Disease'].dropna()
print(diseases.value_counts())


Disease
Fungal infection                           120
Hepatitis C                                120
Hepatitis E                                120
Alcoholic hepatitis                        120
Tuberculosis                               120
Common Cold                                120
Pneumonia                                  120
Dimorphic hemmorhoids(piles)               120
Heart attack                               120
Varicose veins                             120
Hypothyroidism                             120
Hyperthyroidism                            120
Hypoglycemia                               120
Osteoarthristis                            120
Arthritis                                  120
(vertigo) Paroymsal  Positional Vertigo    120
Acne                                       120
Urinary tract infection                    120
Psoriasis                                  120
Hepatitis D                                120
Hepatitis B                                120
Aller

Count all unique diseases, sorted by frequency.

In [182]:
# Count rows per disease, ordered by disease name.
disease_count = (
    cleaned_symptoms[['Disease']]
    .copy()
    .value_counts()
    .sort_index()
    .reset_index()
)
disease_count.columns = ['Disease', 'Count']

# Share of all rows for each disease, as a fraction and as a percentage.
disease_count = disease_count.assign(
    Proportion=lambda frame: frame['Count'] / frame['Count'].sum(),
    Percentage=lambda frame: frame['Proportion'] * 100,
)

# Transpose so each disease becomes a column; the 'Disease' row supplies the
# column labels and is then removed from the body of the table.
tabulation = disease_count.T
tabulation.columns = tabulation.loc['Disease']
tabulation = tabulation.drop(index='Disease')

# Row totals, computed before the 'Total' column exists and assigned in row
# order (Count, Proportion, Percentage).
tabulation['Total'] = [
    tabulation.loc[measure].sum()
    for measure in ('Count', 'Proportion', 'Percentage')
]

print(tabulation)
print(tabulation['Total'])


Disease    (vertigo) paroymsal  positional vertigo      acne      aids  \
Count                                         2040      2040      2040   
Proportion                                 0.02439   0.02439   0.02439   
Percentage                                2.439024  2.439024  2.439024   

Disease    alcoholic hepatitis   allergy arthritis bronchial asthma  \
Count                     2040      2040      2040             2040   
Proportion             0.02439   0.02439   0.02439          0.02439   
Percentage            2.439024  2.439024  2.439024         2.439024   

Disease    cervical spondylosis chicken pox chronic cholestasis  ...  \
Count                      2040        2040                2040  ...   
Proportion              0.02439     0.02439             0.02439  ...   
Percentage             2.439024    2.439024            2.439024  ...   

Disease    osteoarthristis paralysis (brain hemorrhage) peptic ulcer diseae  \
Count                 2040                        

Create a frequency table for diseases.

In [274]:
# Rebind instead of calling dropna(inplace=True) through an alias:
# symptom_cols_long still ends up free of missing values after this cell,
# but no frame is mutated in place.
symptom_cols_long = symptom_cols_long.dropna()

# One row per distinct symptom, most frequent first.
symptoms = (symptom_cols_long
            .value_counts()
            .sort_values(ascending=False)
            .reset_index()
)

symptoms.columns = ['Symptoms', 'Count']  # change column names
columns = symptoms['Symptoms']  # store columns for future use

# Share of all symptom mentions, as a fraction and as a percentage.
symptoms['Proportion'] = symptoms['Count'] / symptoms['Count'].sum()
symptoms['Percentage'] = symptoms['Proportion'] * 100

# Grand totals, taken before the transpose while each measure is still a column.
count = symptoms['Count'].sum()
proportion = symptoms['Proportion'].sum()
percentage = symptoms['Percentage'].sum()

# Transpose so each symptom becomes a column, drop the leading 'Symptoms'
# label row, and use the stored labels as the column header.
symptoms = symptoms.T
symptoms = symptoms.drop(index=symptoms.index[:1])
symptoms.columns = columns

print(symptoms.head(n=5))
# count is already a scalar total, so it is printed directly.
print(f'total: {count}  total proportion: {proportion} total percentage: {percentage} ')


Symptoms     fatigue  vomiting high_fever loss_of_appetite    nausea  \
Count           1932      1914       1362             1152      1146   
Proportion  0.052718  0.052227   0.037164         0.031434   0.03127   
Percentage  5.271775  5.222659   3.716437         3.143418  3.127046   

Symptoms    headache abdominal_pain yellowish_skin yellowing_of_eyes  \
Count           1134           1032            912               816   
Proportion  0.030943        0.02816       0.024885          0.022266   
Percentage  3.094303       2.815979        2.48854          2.226588   

Symptoms      chills  ... swollen_blood_vessels spinning_movements  \
Count            798  ...                   108                108   
Proportion  0.021775  ...              0.002947           0.002947   
Percentage  2.177472  ...              0.294695           0.294695   

Symptoms   spotting_urination sunken_eyes blackheads weakness_in_limbs  \
Count                     108         108        108               

Create a frequency table for symptoms.