# 1. Data Loading & Preprocessing

This notebook handles the initial data loading and preprocessing steps for our symptom-to-disease prediction pipeline.

## Objectives
1. Load large dataset efficiently using chunking
2. Validate data quality
3. Normalize disease labels
4. Handle multi-label cases
5. Save processed dataset

## Setup and Dependencies

In [1]:
# Install required packages
!python -m pip install --upgrade pip
%pip install pandas numpy scikit-learn joblib python-dotenv tqdm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Data Loading

We'll load the data in chunks to handle the large file size efficiently.

In [3]:
def load_data_in_chunks(file_path, chunksize=10000):
    """Load large dataset in chunks and combine.
    
    Args:
        file_path (str): Path to the data file
        chunksize (int): Number of rows per chunk
        
    Returns:
        pd.DataFrame: Combined dataset
    """
    # Create data directory if it doesn't exist
    Path("../data/raw").mkdir(parents=True, exist_ok=True)
    
    chunks = []
    # Get total number of lines
    total_rows = sum(1 for _ in open(file_path)) - 1  # Subtract header
    
    # Calculate number of chunks
    num_chunks = (total_rows + chunksize - 1) // chunksize
    
    print(f"Loading {total_rows:,} rows in {num_chunks} chunks...")
    
    for chunk in tqdm(pd.read_csv(file_path, chunksize=chunksize), total=num_chunks):
        chunks.append(chunk)
    
    return pd.concat(chunks, ignore_index=True)

# Load the dataset
DATA_PATH = "../data/raw/disease_symptom_dataset.csv"
try:
    df = load_data_in_chunks(DATA_PATH)
    print(f"\nDataset loaded successfully with shape: {df.shape}")
    
    print("\nDataset columns:")
    print(df.columns.tolist())
    print("\nFirst few rows:")
    print(df.head())
except FileNotFoundError:
    print(f"Please place the dataset at {DATA_PATH}")
except Exception as e:
    print(f"Error loading dataset: {str(e)}")

Loading 246,945 rows in 25 chunks...


100%|██████████| 25/25 [00:04<00:00,  5.06it/s]




Dataset loaded successfully with shape: (246945, 378)

Dataset columns:
['diseases', 'anxiety and nervousness', 'depression', 'shortness of breath', 'depressive or psychotic symptoms', 'sharp chest pain', 'dizziness', 'insomnia', 'abnormal involuntary movements', 'chest tightness', 'palpitations', 'irregular heartbeat', 'breathing fast', 'hoarse voice', 'sore throat', 'difficulty speaking', 'cough', 'nasal congestion', 'throat swelling', 'diminished hearing', 'lump in throat', 'throat feels tight', 'difficulty in swallowing', 'skin swelling', 'retention of urine', 'groin mass', 'leg pain', 'hip pain', 'suprapubic pain', 'blood in stool', 'lack of growth', 'emotional symptoms', 'elbow weakness', 'back weakness', 'pus in sputum', 'symptoms of the scrotum and testes', 'swelling of scrotum', 'pain in testicles', 'flatulence', 'pus draining from ear', 'jaundice', 'mass in scrotum', 'white discharge from eye', 'irritable infant', 'abusing alcohol', 'fainting', 'hostile behavior', 'drug abus

## 2. Data Validation

Check for data quality issues:

In [4]:
def validate_dataset(df):
    """Perform data validation checks.
    
    Args:
        df (pd.DataFrame): Input dataset
        
    Returns:
        dict: Validation results
    """
    results = {
        'total_rows': len(df),
        'duplicates': df.duplicated().sum(),
        'missing_values': df.isnull().sum().sum(),
        'non_binary_columns': []
    }
    
    # Check for non-binary values in symptom columns
    symptom_cols = df.columns[1:]  # All columns except disease
    for col in symptom_cols:
        unique_vals = df[col].unique()
        if not all(val in [0, 1] for val in unique_vals if pd.notna(val)):
            results['non_binary_columns'].append(col)
    
    return results

validation_results = validate_dataset(df)
print("\nValidation Results:")
print(f"Total rows: {validation_results['total_rows']}")
print(f"Duplicate rows: {validation_results['duplicates']}")
print(f"Missing values: {validation_results['missing_values']}")
print(f"Columns with non-binary values: {len(validation_results['non_binary_columns'])}")


Validation Results:
Total rows: 246945
Duplicate rows: 57298
Missing values: 0
Columns with non-binary values: 0


## 3. Disease Label Normalization

In [5]:
def normalize_disease_labels(df):
    """Normalize disease labels and save mapping.
    
    Args:
        df (pd.DataFrame): Input dataset
        
    Returns:
        tuple: (transformed DataFrame, label encoder)
    """
    le = LabelEncoder()
    df_processed = df.copy()
    
    # Ensure column names are lowercase and create disease column if differently named
    df_processed.columns = df_processed.columns.str.lower()
    
    # Check for possible variations of 'disease' column
    disease_column_variants = ['disease', 'diseases', 'diagnosis', 'condition']
    found_column = None
    for variant in disease_column_variants:
        if variant in df_processed.columns:
            found_column = variant
            break
            
    if found_column is None:
        print("Available columns:", df_processed.columns.tolist())
        raise KeyError("Could not find disease column. Please check the column names.")
        
    if found_column != 'disease':
        df_processed['disease'] = df_processed[found_column]
    
    # Fit and transform disease labels
    df_processed['disease'] = le.fit_transform(df_processed['disease'])
    
    # Save label mapping
    # Convert numpy int64 to regular Python int for JSON serialization
    label_mapping = {str(k): int(v) for k, v in zip(le.classes_, le.transform(le.classes_))}
    
    # Create the processed directory in the main project folder
    processed_dir = Path("../data/processed")
    processed_dir.mkdir(parents=True, exist_ok=True)
    
    # Save the mapping file in the project's data/processed directory
    mapping_path = processed_dir / "disease_mapping.json"
    with open(mapping_path, 'w') as f:
        json.dump(label_mapping, f, indent=2)
    
    print(f"Normalized {len(label_mapping)} unique disease labels")
    print(f"Saved mapping to {mapping_path.resolve()}")
    return df_processed, le

df_normalized, label_encoder = normalize_disease_labels(df)

Normalized 773 unique disease labels
Saved mapping to C:\Users\Nirmal\SIH-Projects\zyra\data\processed\disease_mapping.json


## 4. Handle Multi-label Cases

In [6]:
def handle_multi_label_diseases(df):
    """Process rows with multiple disease labels.
    
    Args:
        df (pd.DataFrame): Input dataset
        
    Returns:
        pd.DataFrame: Processed dataset
    """
    df_processed = df.copy()
    
    # Print columns to debug
    print("Available columns:", df_processed.columns.tolist())
    
    # Convert column names to lowercase
    df_processed.columns = df_processed.columns.str.lower()
    
    # Check for various possible column names for disease
    disease_column_variants = ['disease', 'diseases', 'diagnosis', 'condition', 'Disease']
    found_column = None
    for variant in disease_column_variants:
        if variant.lower() in df_processed.columns:
            found_column = variant.lower()
            break
            
    if found_column is None:
        raise KeyError(f"Could not find disease column. Available columns: {df_processed.columns.tolist()}")
    
    # Check for multi-label indicators (e.g., commas or semicolons in disease names)
    multi_label_rows = df_processed[found_column].astype(str).str.contains('[,;]', regex=True)
    
    if multi_label_rows.any():
        print(f"Found {multi_label_rows.sum()} multi-label rows")
        
        # Split multi-label rows into separate rows
        new_rows = []
        for idx, row in df_processed[multi_label_rows].iterrows():
            diseases = [d.strip() for d in str(row[found_column]).split(',')]
            for disease in diseases:
                new_row = row.copy()
                new_row[found_column] = disease
                new_rows.append(new_row)
        
        # Replace original multi-label rows with split rows
        df_processed = pd.concat([
            df_processed[~multi_label_rows],
            pd.DataFrame(new_rows)
        ], ignore_index=True)
        
        # Ensure the column is named 'disease' for consistency
        if found_column != 'disease':
            df_processed['disease'] = df_processed[found_column]
            df_processed = df_processed.drop(columns=[found_column])
    
    return df_processed

# First handle multi-label cases
df_with_split_labels = handle_multi_label_diseases(df)

# Then normalize the labels
df_normalized, label_encoder = normalize_disease_labels(df_with_split_labels)

# Store final processed dataset
df_processed = df_normalized

Available columns: ['diseases', 'anxiety and nervousness', 'depression', 'shortness of breath', 'depressive or psychotic symptoms', 'sharp chest pain', 'dizziness', 'insomnia', 'abnormal involuntary movements', 'chest tightness', 'palpitations', 'irregular heartbeat', 'breathing fast', 'hoarse voice', 'sore throat', 'difficulty speaking', 'cough', 'nasal congestion', 'throat swelling', 'diminished hearing', 'lump in throat', 'throat feels tight', 'difficulty in swallowing', 'skin swelling', 'retention of urine', 'groin mass', 'leg pain', 'hip pain', 'suprapubic pain', 'blood in stool', 'lack of growth', 'emotional symptoms', 'elbow weakness', 'back weakness', 'pus in sputum', 'symptoms of the scrotum and testes', 'swelling of scrotum', 'pain in testicles', 'flatulence', 'pus draining from ear', 'jaundice', 'mass in scrotum', 'white discharge from eye', 'irritable infant', 'abusing alcohol', 'fainting', 'hostile behavior', 'drug abuse', 'sharp abdominal pain', 'feeling ill', 'vomiting',

## 5. Save Processed Dataset

In [7]:
# Create the processed directory in the main project folder
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Save processed dataset
output_path = processed_dir / "processed_data.csv"
df_processed.to_csv(output_path, index=False)
print(f"Saved processed dataset to {output_path.resolve()}")

# Save dataset statistics
stats = {
    'n_samples': len(df_processed),
    'n_features': len(df_processed.columns) - 1,  # Exclude disease column
    'n_classes': len(df_processed['disease'].unique()),
    'memory_usage': df_processed.memory_usage(deep=True).sum() / 1024**2  # MB
}

stats_path = processed_dir / "dataset_stats.json"
with open(stats_path, 'w') as f:
    json.dump(stats, f, indent=2)

print("\nDataset Statistics:")
for k, v in stats.items():
    print(f"{k}: {v}")
print(f"\nSaved statistics to {stats_path.resolve()}")

Saved processed dataset to C:\Users\Nirmal\SIH-Projects\zyra\data\processed\processed_data.csv

Dataset Statistics:
n_samples: 246945
n_features: 378
n_classes: 773
memory_usage: 727.843071937561

Saved statistics to C:\Users\Nirmal\SIH-Projects\zyra\data\processed\dataset_stats.json
