In [1]:
import numpy as np
import pandas as pd

In [2]:
import pandas as pd
import numpy as np

# Text labels accepted for 0/1 indicator columns (compared after strip + lower).
_BINARY_LABELS = {"yes": 1, "y": 1, "true": 1, "no": 0, "n": 0, "false": 0}

# Spelling variants collapsed into one canonical category (keys are lowercase).
_GENDER_ALIASES = {"f": "female", "m": "male"}
_SMOKING_ALIASES = {
    "ever": "former",
    "not current": "former",  # Assume "not current" = former smoker
    "no info": "unknown",
}

# Inclusive (lower, upper) bounds applied with Series.clip, keyed by column name.
_CLIP_BOUNDS = {
    # Age: adults only.
    # NOTE(review): clip rewrites out-of-range values to the bound instead of
    # dropping the row, so any record below 18 is stored as exactly 18 --
    # confirm this is intended.
    "age": (18, 100),
    # BMI: extremes outside 12-60 are treated as unlikely.
    "bmi": (12, 60),
    # HbA1c: medically plausible range 3.5-20%.
    "HbA1c_level": (3.5, 20),
    # Blood glucose: clipped to 70-300 mg/dL (fasting levels are typically
    # 70-125, but non-fasting readings can be higher).
    "blood_glucose_level": (70, 300),
}


def _label_to_flag(value):
    """Translate a yes/no style text label to 1/0; pass anything else through."""
    if isinstance(value, str):
        token = value.strip().lower()
        return _BINARY_LABELS.get(token, token)
    return value


def _normalize_text(series):
    """Strip and lowercase string entries, leaving non-string entries untouched."""
    return series.map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )


def _to_binary_flag(series):
    """Convert an indicator column to integers without failing on gaps or labels."""
    if pd.api.types.is_numeric_dtype(series):
        numbers = series
    else:
        numbers = pd.to_numeric(series.map(_label_to_flag), errors="coerce")
        # Anything that was present but could not be parsed is a data error,
        # not a missing value, so report it instead of silently blanking it.
        rejected = numbers.isna() & series.notna()
        if rejected.any():
            examples = sorted({str(value) for value in series[rejected].unique()})
            raise ValueError(
                f"Column {series.name!r} contains non-binary values: {examples}"
            )
    if numbers.isna().any():
        # Plain int cannot hold missing values; the nullable dtype keeps them
        # as <NA> so they still show up in the missing-value report.
        return numbers.astype("Int64")
    return numbers.astype(int)


def clean_diabetes_data(input_path, output_path) -> pd.DataFrame:
    """Clean diabetes dataset with domain-specific rules.

    Every step is skipped for columns that are absent from the input.

    Steps, in order:
      1. Load the CSV at ``input_path``.
      2. Drop exact duplicate rows.
      3. Convert ``hypertension``, ``heart_disease`` and ``diabetes`` to
         integers, and collapse ``gender`` / ``smoking_history`` spelling
         variants into canonical lowercase categories.
      4. Clip ``age``, ``bmi``, ``HbA1c_level`` and ``blood_glucose_level``
         to the bounds in ``_CLIP_BOUNDS``.
      5. Write the result to ``output_path`` and print a per-column count of
         missing values.

    Args:
        input_path: Location of the raw CSV, passed to ``pandas.read_csv``.
        output_path: Destination for the cleaned CSV (written without index).

    Returns:
        The cleaned DataFrame that was written to ``output_path``.

    Raises:
        ValueError: If an indicator column holds a value that is neither
            numeric nor one of the recognised yes/no labels.
    """
    # ----------------------------
    # 1. Load
    # ----------------------------
    df = pd.read_csv(input_path)

    # ----------------------------
    # 2. Remove Duplicates
    # ----------------------------
    df = df.drop_duplicates()

    # ----------------------------
    # 3. Fix Data Types & Values
    # ----------------------------
    for col in ('hypertension', 'heart_disease', 'diabetes'):
        if col in df.columns:
            df[col] = _to_binary_flag(df[col])

    # Standardize categorical values
    if 'gender' in df.columns:
        df['gender'] = _normalize_text(df['gender']).replace(_GENDER_ALIASES)

    if 'smoking_history' in df.columns:
        df['smoking_history'] = _normalize_text(df['smoking_history']).replace(
            _SMOKING_ALIASES
        )

    # ----------------------------
    # 4. Handle Outliers (Medical Constraints)
    # ----------------------------
    for col, (lower, upper) in _CLIP_BOUNDS.items():
        if col in df.columns:
            df[col] = df[col].clip(lower=lower, upper=upper)

    # ----------------------------
    # 5. Save Cleaned Data
    # ----------------------------
    df.to_csv(output_path, index=False)
    print(f"\nCleaned data saved to {output_path}")
    print("\nMissing values after cleaning:")
    print(df.isnull().sum())

    return df

if __name__ == "__main__":
    # Both paths are relative, so they resolve against the current working directory.
    clean_diabetes_data(
        input_path="diabetes_prediction_dataset.csv",  # Update with your filename
        output_path="cleaned_diabetes_prediction_dataset.csv",
    )


Cleaned data saved to cleaned_diabetes_prediction_dataset.csv

Missing values after cleaning:
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64
