In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib



In [9]:
# 1. Load the data
dataset = pd.read_csv('../../data/raw/dataset.csv')
severity = pd.read_csv('../../data/raw/Symptom-severity.csv')

In [10]:
severity_dict = dict(zip(severity['Symptom'], severity['weight']))

In [11]:
#create symptom encoder
all_symptoms = []
for col in dataset.columns:
    if 'Symptom_' in col:
        all_symptoms.extend(dataset[col].dropna().unique())
unique_symptoms = list(set(all_symptoms) | set(severity['Symptom'].unique()))

In [None]:
def create_symptom_features(row):
    symptoms_dict = {symptom: 0 for symptom in unique_symptoms}  # Step 1: Initialize all symptoms to 0
    
    for col in dataset.columns:
        if 'Symptom_' in col and pd.notna(row[col]):  # Step 2: Check if symptom exists
            symptom = row[col].strip()  
            if symptom in severity_dict:
                symptoms_dict[symptom] = severity_dict[symptom]  # Assign severity score
    return pd.Series(symptoms_dict)

In [13]:
X_preprocessed = dataset.apply(create_symptom_features, axis=1)
print("Fixed Preprocessing - Nonzero values:", (X_preprocessed != 0).sum().sum())


Fixed Preprocessing - Nonzero values: 36330


In [15]:
# encode the target (disease)
le = LabelEncoder()
y = le.fit_transform(dataset['Disease'])

disease_mapping = dict(zip(le.classes_, range(len(le.classes_))))
joblib.dump(disease_mapping, '../../data/processed/disease_mapping.joblib')

['../../data/processed/disease_mapping.joblib']

In [17]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.2, random_state=42, stratify=y
)


In [18]:
# Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, '../../data/processed/scaler.joblib')


['../../data/processed/scaler.joblib']

In [52]:
joblib.dump(le, '../../data/processed/label_encoder.joblib')

['../../data/processed/label_encoder.joblib']

In [53]:
pd.DataFrame(X_train_scaled, columns=X.columns).to_csv('../../data/processed/preprocessed_X_train.csv', index=False)
pd.DataFrame(X_test_scaled, columns=X.columns).to_csv('../../data/processed/preprocessed_X_test.csv', index=False)
pd.Series(y_train).to_csv('../../data/processed/preprocessed_y_train.csv', index=False)
pd.Series(y_test).to_csv('../../data/processed/preprocessed_y_test.csv', index=False)

In [54]:
print("Data preprocessing completed!")
print(f"Number of features: {X.shape[1]}")
print(f"Number of classes: {len(le.classes_)}")

Data preprocessing completed!
Number of features: 262
Number of classes: 41
