In [1]:
# ================================================================
# BOOTSTRAP CELL FOR Synthetic_Dataset_Generator.ipynb
# Loads dependencies and ensures Drive access
# ================================================================

!pip install pandas numpy scikit-learn matplotlib --quiet

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/drive')

print("âœ… Dataset Generator Bootstrap Loaded Successfully")


Mounted at /content/drive
âœ… Dataset Generator Bootstrap Loaded Successfully


# **Synthetic Dataset Generator for Childhood Disease Classification**  
### VHT Assistant — Scenario 2 (Pneumonia, Malaria, Diarrhoea)
### Hybrid (Rule-Based + Random Noise) Dataset

This notebook generates a medically realistic synthetic dataset for training our
disease classifier. It combines clinical logic based on IMCI guidelines with
controlled randomness, so the dataset:
- Resembles real VHT cases  
- Allows the classifier to learn meaningful patterns  
- Is not perfectly predictable (prevents overfitting)  
- Satisfies exam requirements for a data pipeline  

The output will be saved as:
`datasets/child_disease_dataset.csv`


## **Dataset Logic (How We Generate Realistic Cases)**
****
We generate three main diseases:
1. **Pneumonia**
2. **Malaria**
3. **Diarrhoea / Dehydration**

### 🔹 Pneumonia Patterns
Usually includes:
- Cough = 1
- Fast breathing = 1

May include:
- Fever (sometimes)

Random noise:
- Some diarrhea or vomiting, but rarely

### 🔹 Malaria Patterns
Usually includes:
- Fever = 1
- Vomiting = common

May include:
- Weakness
- Poor feeding
- Sometimes cough (but low probability)

### 🔹 Diarrhoea Patterns
Usually includes:
- Diarrhea = 1

May include:
- Vomiting
- Weakness
- Fever occasionally

### 🧠 Why add random noise?
Real clinical cases are messy. Clean rule-based data makes the classifier behave like a lookup table.  
Noise forces the classifier to *learn patterns*, not memorize.

We will generate **1,000–2,000 synthetic cases** depending on your runtime preference.


In [None]:
import numpy as np          # Used for generating random numbers
import pandas as pd         # Used to store our dataset in table form
import os                   # For creating dataset folder


In [None]:
# Make sure the local output directory is there before anything is written to it
# (exist_ok=True makes re-running this cell harmless).
os.makedirs(name="datasets", exist_ok=True)


In [None]:
def generate_pneumonia_case(rng=None):
    """Generate one synthetic pneumonia case.

    Cough and fast breathing are always present; every other symptom is drawn
    independently with the probabilities hard-coded below.

    Args:
        rng: Optional ``np.random.Generator`` or ``np.random.RandomState`` to
            draw from, for reproducible output. When omitted, NumPy's global
            random state is used, so ``np.random.seed`` still applies.

    Returns:
        list: ``[fever, cough, fast_breathing, diarrhea, vomiting, weakness,
        poor_feeding, convulsions, age, duration, "pneumonia"]``. Symptom
        flags are 0/1, ``age`` is an integer in 1-4 and ``duration`` an
        integer in 1-3.
    """
    # The module-level np.random functions mirror the RandomState API.
    rng = np.random if rng is None else rng
    # Generator calls its integer sampler `integers`; the legacy API calls it `randint`.
    draw_int = rng.integers if isinstance(rng, np.random.Generator) else rng.randint

    # Base pneumonia symptoms
    fever = rng.choice([1, 0], p=[0.7, 0.3])                   # Fever common but not always
    cough = 1                                                  # Must have cough
    fast_breathing = 1                                         # Must have fast breathing

    # Possible additional symptoms (random noise)
    diarrhea = rng.choice([0, 1], p=[0.85, 0.15])
    vomiting = rng.choice([0, 1], p=[0.80, 0.20])
    weakness = rng.choice([0, 1], p=[0.50, 0.50])
    poor_feeding = rng.choice([0, 1], p=[0.60, 0.40])
    convulsions = rng.choice([0, 1], p=[0.95, 0.05])

    age = draw_int(1, 5)                                       # Child between 1-4 years (upper bound exclusive)
    duration = draw_int(1, 4)                                  # Sick 1-3 days (upper bound exclusive)

    return [fever, cough, fast_breathing, diarrhea, vomiting, weakness, poor_feeding, convulsions, age, duration, "pneumonia"]


In [None]:
def generate_malaria_case(rng=None):
    """Generate one synthetic malaria case.

    Fever is always present; every other symptom is drawn independently with
    the probabilities hard-coded below.

    Args:
        rng: Optional ``np.random.Generator`` or ``np.random.RandomState`` to
            draw from, for reproducible output. When omitted, NumPy's global
            random state is used, so ``np.random.seed`` still applies.

    Returns:
        list: ``[fever, cough, fast_breathing, diarrhea, vomiting, weakness,
        poor_feeding, convulsions, age, duration, "malaria"]``. Symptom flags
        are 0/1, ``age`` is an integer in 1-4 and ``duration`` an integer
        in 1-3.
    """
    # The module-level np.random functions mirror the RandomState API.
    rng = np.random if rng is None else rng
    # Generator calls its integer sampler `integers`; the legacy API calls it `randint`.
    draw_int = rng.integers if isinstance(rng, np.random.Generator) else rng.randint

    fever = 1                                                  # Must have fever
    cough = rng.choice([0, 1], p=[0.80, 0.20])                 # Some malaria patients cough
    fast_breathing = rng.choice([0, 1], p=[0.60, 0.40])        # Can occur in severe malaria

    diarrhea = rng.choice([0, 1], p=[0.40, 0.60])              # Common in children
    vomiting = rng.choice([1, 0], p=[0.75, 0.25])              # Very common
    weakness = rng.choice([1, 0], p=[0.70, 0.30])
    poor_feeding = rng.choice([1, 0], p=[0.65, 0.35])
    convulsions = rng.choice([0, 1], p=[0.90, 0.10])           # Sometimes present in severe cases

    age = draw_int(1, 5)                                       # 1-4 (upper bound exclusive)
    duration = draw_int(1, 4)                                  # 1-3 (upper bound exclusive)

    return [fever, cough, fast_breathing, diarrhea, vomiting, weakness, poor_feeding, convulsions, age, duration, "malaria"]


In [None]:
def generate_diarrhea_case(rng=None):
    """Generate one synthetic diarrhea case.

    Diarrhea is always present; every other symptom is drawn independently
    with the probabilities hard-coded below.

    Args:
        rng: Optional ``np.random.Generator`` or ``np.random.RandomState`` to
            draw from, for reproducible output. When omitted, NumPy's global
            random state is used, so ``np.random.seed`` still applies.

    Returns:
        list: ``[fever, cough, fast_breathing, diarrhea, vomiting, weakness,
        poor_feeding, convulsions, age, duration, "diarrhea"]``. Symptom flags
        are 0/1, ``age`` is an integer in 1-4 and ``duration`` an integer
        in 1-3.
    """
    # The module-level np.random functions mirror the RandomState API.
    rng = np.random if rng is None else rng
    # Generator calls its integer sampler `integers`; the legacy API calls it `randint`.
    draw_int = rng.integers if isinstance(rng, np.random.Generator) else rng.randint

    fever = rng.choice([0, 1], p=[0.60, 0.40])                 # Fever sometimes occurs
    cough = rng.choice([0, 1], p=[0.85, 0.15])
    fast_breathing = rng.choice([0, 1], p=[0.90, 0.10])

    diarrhea = 1                                               # Must have diarrhea
    vomiting = rng.choice([0, 1], p=[0.50, 0.50])
    weakness = rng.choice([0, 1], p=[0.60, 0.40])
    poor_feeding = rng.choice([0, 1], p=[0.70, 0.30])
    convulsions = rng.choice([0, 1], p=[0.98, 0.02])

    age = draw_int(1, 5)                                       # 1-4 (upper bound exclusive)
    duration = draw_int(1, 4)                                  # 1-3 (upper bound exclusive)

    return [fever, cough, fast_breathing, diarrhea, vomiting, weakness, poor_feeding, convulsions, age, duration, "diarrhea"]


In [None]:
# Number of samples per disease
N = 600   # adjust as needed (600 per class = 1800 total)

# Fix the global NumPy seed so that re-running the notebook regenerates exactly
# the same dataset (the generator functions draw from np.random by default).
SEED = 42
np.random.seed(SEED)

data = []

# One case of each disease per iteration, so every class contributes exactly N rows.
for _ in range(N):
    data.append(generate_pneumonia_case())
    data.append(generate_malaria_case())
    data.append(generate_diarrhea_case())

# Column order must match the order of the list returned by the generators.
columns = [
    "fever", "cough", "fast_breathing", "diarrhea", "vomiting",
    "weakness", "poor_feeding", "convulsions", "age", "duration", "disease"
]

df = pd.DataFrame(data, columns=columns)

# Sanity check: three balanced classes, one column per generated field.
assert df.shape == (3 * N, len(columns)), "unexpected dataset shape"


In [None]:
# Write the generated table to the local datasets folder, without the index column.
df.to_csv(path_or_buf="datasets/child_disease_dataset.csv", index=False)
print("Dataset saved to datasets/child_disease_dataset.csv")


Dataset saved to datasets/child_disease_dataset.csv


In [None]:
# Read the CSV back from disk so `df` reflects exactly what was written.
df = pd.read_csv(filepath_or_buffer="datasets/child_disease_dataset.csv")


## **Linking the Synthetic Dataset to Google Drive**

In this section, we mount Google Drive so that our generated dataset can be
saved directly into the same folder where all our model notebooks are stored.

Why this is important:
- Colab resets often, but Google Drive does not lose files.
- All model notebooks (NLP, classifier, knowledge graph) can access the same dataset.
- Our final ZIP file will include the dataset in the correct folder.
- It keeps the whole project clean and well-structured for the exam.


In [None]:
import os
from google.colab import drive

# Attach Google Drive to the Colab VM; Colab asks for authorization when needed.
drive.mount('/content/drive')

# Drive folder that will hold the dataset.
# Edit this path if your project lives elsewhere in your Drive.
project_path = "/content/drive/MyDrive/Cognitive_Project/3_Model_Notebooks/datasets/"

# exist_ok=True: do nothing if the folder is already present.
os.makedirs(name=project_path, exist_ok=True)

print("Dataset folder ready at:", project_path)


Mounted at /content/drive
Dataset folder ready at: /content/drive/MyDrive/Cognitive_Project/3_Model_Notebooks/datasets/


In [None]:
# Save the dataset directly to Google Drive.
# os.path.join yields the right file path whether or not project_path ends
# with a slash; plain string concatenation would silently produce a wrong name.
save_path = os.path.join(project_path, "child_disease_dataset.csv")
df.to_csv(save_path, index=False)

print("Dataset saved successfully at:", save_path)


Dataset saved successfully at: /content/drive/MyDrive/Cognitive_Project/3_Model_Notebooks/datasets/child_disease_dataset.csv


In [None]:
# Load the copy stored on Google Drive to confirm it is readable from there.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/Cognitive_Project/3_Model_Notebooks/datasets/child_disease_dataset.csv")


In [None]:
import os

# Dataset folder on Drive; point this at your own Drive path if it differs.
project_path = "/content/drive/MyDrive/Cognitive_Project/3_Model_Notebooks/datasets/"

# Show the folder's contents (displayed as the cell's output).
os.listdir(path=project_path)


['child_disease_dataset.csv']

In [None]:
import os

# Inspect the top level of the mounted Drive.
# NOTE(review): the saved output exposes personal file names; clear it before sharing.
os.listdir(path="/content/drive/MyDrive")


['Colab Notebooks',
 'kasooli_dataset',
 'now add all this in one document (pdf) what is ne... (8).gdoc',
 'now add all this in one document (pdf) what is ne... (7).gdoc',
 'now add all this in one document (pdf) what is ne... (6).gdoc',
 'now add all this in one document (pdf) what is ne... (5).gdoc',
 'now add all this in one document (pdf) what is ne... (4).gdoc',
 'now add all this in one document (pdf) what is ne... (3).gdoc',
 'now add all this in one document (pdf) what is ne... (2).gdoc',
 'now add all this in one document (pdf) what is ne... (1).gdoc',
 'now add all this in one document (pdf) what is ne....gdoc',
 'DL Assignment .gdoc',
 'Cognitive_Project']

In [None]:
# Print the full path of every directory found anywhere under the Drive root.
for parent, subdirs, _filenames in os.walk("/content/drive/MyDrive"):
    for subdir in subdirs:
        print(os.path.join(parent, subdir))


/content/drive/MyDrive/Colab Notebooks
/content/drive/MyDrive/kasooli_dataset
/content/drive/MyDrive/Cognitive_Project
/content/drive/MyDrive/Colab Notebooks/Model Notebooks
/content/drive/MyDrive/Colab Notebooks/Model Notebooks/datasets
/content/drive/MyDrive/kasooli_dataset/MLN
/content/drive/MyDrive/kasooli_dataset/MSV
/content/drive/MyDrive/kasooli_dataset/Healthy
/content/drive/MyDrive/kasooli_dataset/kasooli_dataset
/content/drive/MyDrive/kasooli_dataset/kasooli_dataset/MSV
/content/drive/MyDrive/kasooli_dataset/kasooli_dataset/MLN
/content/drive/MyDrive/kasooli_dataset/kasooli_dataset/Healthy
/content/drive/MyDrive/Cognitive_Project/3_Model_Notebooks
/content/drive/MyDrive/Cognitive_Project/3_Model_Notebooks/datasets
