## Data Extraction & Cleaning

* Normalize all data to lowercase or easier indexing and comparison
* Remove all superfluous whitespace
* Collapse columns into a single columns as appropriate (e.g. Symptome, Precautions)
* Save off new datasets for use in analysis


In [1]:
import pandas as pd
# File paths
base_path = r'C:\Users\bill\repos\genai-medibot\Dataset\\'

dataset_path = fr'{base_path}original\dataset.csv'
description_path = fr'{base_path}original\symptom_Description.csv'
precaution_path = fr'{base_path}original\symptom_precaution.csv'
severity_path = fr'{base_path}original\Symptom-severity.csv'

# Load files
dataset = pd.read_csv(dataset_path)
description = pd.read_csv(description_path)
precaution = pd.read_csv(precaution_path)
severity = pd.read_csv(severity_path)

In [4]:
# Clean zee data
def clean_text(df):
    # Clean column names: lowercase and replace underscores with spaces
    df.columns = df.columns.str.lower().str.replace('_', ' ')
    
    # Clean data values: remove leading/trailing spaces and convert to lowercase
    df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
    # Replace commas with spaces in text content
    df = df.applymap(lambda x: x.replace(',', ' ') if isinstance(x, str) else x)
    # Remove underscores in values
    df = df.applymap(lambda x: x.replace('_', ' ') if isinstance(x, str) else x)
    
    return df

# Clean data (now cleans both columns and values)
dataset_cleaned = clean_text(dataset)
description_cleaned = clean_text(description)
precaution_cleaned = clean_text(precaution)
df_severity_cleaned = clean_text(severity)

# Verify
print(dataset_cleaned.head())
print(description_cleaned.head())
print(precaution_cleaned.head())
print(df_severity_cleaned.head())


            disease  symptom 1             symptom 2             symptom 3  \
0  fungal infection    itching             skin rash  nodal skin eruptions   
1  fungal infection  skin rash  nodal skin eruptions   dischromic  patches   
2  fungal infection    itching  nodal skin eruptions   dischromic  patches   
3  fungal infection    itching             skin rash   dischromic  patches   
4  fungal infection    itching             skin rash  nodal skin eruptions   

             symptom 4 symptom 5 symptom 6 symptom 7 symptom 8 symptom 9  \
0  dischromic  patches       NaN       NaN       NaN       NaN       NaN   
1                  NaN       NaN       NaN       NaN       NaN       NaN   
2                  NaN       NaN       NaN       NaN       NaN       NaN   
3                  NaN       NaN       NaN       NaN       NaN       NaN   
4                  NaN       NaN       NaN       NaN       NaN       NaN   

  symptom 10 symptom 11 symptom 12 symptom 13 symptom 14 symptom 15  \
0  

  df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace(',', ' ') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace('_', ' ') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace(',', ' ') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace('_', ' ') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace(',', ' ') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace('_', ' ') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace(',', ' ') if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.replace('_', ' ') if isinstance(x, str) else x)


## Disease and Symptoms

* "dataset.csv" => disease_symptoms.csv

In [5]:
dataset_cleaned.head(2)

Unnamed: 0,disease,symptom 1,symptom 2,symptom 3,symptom 4,symptom 5,symptom 6,symptom 7,symptom 8,symptom 9,symptom 10,symptom 11,symptom 12,symptom 13,symptom 14,symptom 15,symptom 16,symptom 17
0,fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,


In [6]:
# Get only symptom columns
precaution_cols = [col for col in dataset_cleaned.columns if col.startswith("symptom")]

# Combine symptoms into a single string per row, dropping NaNs and joining with comma
dataset_cleaned["symptoms"] = dataset_cleaned[precaution_cols].apply(
    lambda row: ', '.join(row.dropna().astype(str)), axis=1
)

# Drop the old symptom_* columns
df_disease_symptoms = dataset_cleaned.drop(columns=precaution_cols, inplace=False)

# Drop the duplicate rows
df_disease_symptoms.drop_duplicates(subset=["disease", "symptoms"], inplace=True)

df_disease_symptoms.head(10)

Unnamed: 0,disease,symptoms
0,fungal infection,"itching, skin rash, nodal skin eruptions, disc..."
1,fungal infection,"skin rash, nodal skin eruptions, dischromic p..."
2,fungal infection,"itching, nodal skin eruptions, dischromic pat..."
3,fungal infection,"itching, skin rash, dischromic patches"
4,fungal infection,"itching, skin rash, nodal skin eruptions"
10,allergy,"continuous sneezing, shivering, chills, wateri..."
11,allergy,"shivering, chills, watering from eyes"
12,allergy,"continuous sneezing, chills, watering from eyes"
13,allergy,"continuous sneezing, shivering, watering from ..."
14,allergy,"continuous sneezing, shivering, chills"


## Symptom Description 

* "symptom_Description.csv" => disease_symptom_description.csv

In [7]:
print("Before cleaning:")
description.head()

Before cleaning:


Unnamed: 0,disease,description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [8]:
# Load the file
df_disease_symptom_description = description.copy()

# Lowercase column names
df_disease_symptom_description.columns = df_disease_symptom_description.columns.str.lower().str.strip()

# Lowercase and clean whitespace in all text fields
df_disease_symptom_description = df_disease_symptom_description.applymap(lambda x: ' '.join(str(x).lower().split()) if isinstance(x, str) else x)

# Drop duplicates
df_disease_symptom_description.drop_duplicates(inplace=True)

print ("After cleaning:")
df_disease_symptom_description.head()

After cleaning:


  df_disease_symptom_description = df_disease_symptom_description.applymap(lambda x: ' '.join(str(x).lower().split()) if isinstance(x, str) else x)


Unnamed: 0,disease,description
0,drug reaction,an adverse drug reaction (adr) is an injury ca...
1,malaria,an infectious disease caused by protozoan para...
2,allergy,an allergy is an immune system response to a f...
3,hypothyroidism,"hypothyroidism, also called underactive thyroi..."
4,psoriasis,psoriasis is a common skin disorder that forms...


## Disease and Precaution 

* "symptom_precaution.csv" => disease_precautions.csv

In [9]:
# Get only precaution columns (updated to match cleaned column names)
precaution_cols = [col for col in precaution_cleaned.columns if col.startswith("precaution ")]

# Combine precautions into a single string per row, dropping NaNs and joining with comma
# Wrap the entire list in double quotes
precaution_cleaned["precautions"] = precaution_cleaned[precaution_cols].apply(
    lambda row: '"' + ', '.join(row.dropna().astype(str)) + '"', axis=1
)

# Drop the old precaution columns
df_precaution_cleaned_new = precaution_cleaned.drop(columns=precaution_cols, inplace=False)

df_severity_cleaned.rename(columns={"weight": "severity"}, inplace=True)


df_severity_cleaned.drop_duplicates(inplace=True)

df_precaution_cleaned_new.head(10)

Unnamed: 0,disease,precautions
0,drug reaction,"""stop irritation, consult nearest hospital, st..."
1,malaria,"""consult nearest hospital, avoid oily food, av..."
2,allergy,"""apply calamine, cover area with bandage, use ..."
3,hypothyroidism,"""reduce stress, exercise, eat healthy, get pro..."
4,psoriasis,"""wash hands with warm soapy water, stop bleedi..."
5,gerd,"""avoid fatty spicy food, avoid lying down afte..."
6,chronic cholestasis,"""cold baths, anti itch medicine, consult docto..."
7,hepatitis a,"""consult nearest hospital, wash hands through,..."
8,osteoarthristis,"""acetaminophen, consult nearest hospital, foll..."
9,(vertigo) paroymsal positional vertigo,"""lie down, avoid sudden change in body, avoid ..."


## Symptom Severity  

* "Symptom-severity.csv" => disease_symptom_severity.csv

In [10]:
df_severity_cleaned.to_csv(f"{base_path}disease_symptom_severity.csv", index=False)
df_precaution_cleaned_new.to_csv(f"{base_path}disease_precautions.csv", index=False)
df_disease_symptom_description.to_csv(f"{base_path}disease_symptom_description.csv", index=False, quoting=1)
df_disease_symptoms.to_csv(f"{base_path}disease_symptoms.csv", index=False)