In [4]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# 1. Load the data
# Read both raw CSV files in one pass; the tuple order fixes which frame is which.
dataset, severity = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/dataset.csv',
        '../../data/raw/Symptom-severity.csv',
    )
)

In [None]:
# create symptom encoder
# Gather every non-null value that appears in any column whose name contains
# 'Symptom_'.
all_symptoms = []
for col in dataset.columns:
    if 'Symptom_' in col:
        all_symptoms.extend(dataset[col].dropna().unique())

# De-duplicate, then sort. Iterating a bare set of strings follows hash order,
# which Python randomizes per process, so list(set(...)) could yield a different
# feature-column order on every kernel restart. Sorting makes the order (and the
# saved feature_names) reproducible.
# NOTE(review): assumes all symptom values are strings so they are mutually
# comparable -- confirm against the raw CSV.
unique_symptoms = sorted(set(all_symptoms))

In [None]:
def create_symptom_features(row):
    """Turn one record into a 0/1 indicator vector over the known symptoms.

    Parameters
    ----------
    row : pandas.Series
        A single record, as passed by ``DataFrame.apply(..., axis=1)``. Every
        label containing ``'Symptom_'`` is treated as a symptom slot.

    Returns
    -------
    pandas.Series
        One entry per element of the module-level ``unique_symptoms`` (in that
        order), set to 1 when the symptom occurs in ``row`` and 0 otherwise.
        A symptom value that is not in ``unique_symptoms`` is appended as an
        extra entry set to 1.
    """
    # init dict with 0s for all symptoms
    symptoms_dict = {symptom: 0 for symptom in unique_symptoms}

    # Walk the row's own labels instead of the global `dataset.columns`, so the
    # function does not depend on which frame happens to be bound to `dataset`
    # and can be applied to any frame with the same Symptom_* layout.
    for col, value in row.items():
        # Null slots mean "no symptom recorded here" and are skipped.
        if 'Symptom_' in col and pd.notna(value):
            symptoms_dict[value] = 1

    return pd.Series(symptoms_dict)

In [None]:
# transform the data
# Run the row-wise encoder over every record; axis="columns" is the named
# spelling of axis=1 (the function receives one row at a time).
X = dataset.apply(create_symptom_features, axis="columns")
print("Feature matrix shape:", X.shape)

Feature matrix shape: (4920, 131)


In [None]:
# encode the target (disease)
# Fit the encoder on the Disease column, then map that same column to integer
# class ids; `le` is kept so the ids can be mapped back to names later.
le = LabelEncoder().fit(dataset['Disease'])
y = le.transform(dataset['Disease'])

In [None]:
# split data
# 80/20 train/test split, stratified on the encoded label, with a fixed seed
# so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# save processed data
# Bundle the four split pieces together with the feature column labels and the
# encoder's class labels so the saved artifact is self-describing.
processed_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    feature_names=list(X.columns),
    target_names=list(le.classes_),
)

In [None]:
# save the data
# Create the output directory first: joblib.dump does not create missing parent
# directories, so on a checkout without data/processed the dump would fail.
# (joblib and Path are imported in the notebook's top import cell.)
Path('../../data/processed').mkdir(parents=True, exist_ok=True)
joblib.dump(processed_data, '../../data/processed/processed_data.joblib')
joblib.dump(le, '../../data/processed/label_encoder.joblib')

['../data/processed/label_encoder.joblib']

In [13]:
# Final summary: feature count is the column count of X, class count is the
# number of labels the encoder learned.
print(
    "Data preprocessing completed!",
    f"Number of features: {X.shape[1]}",
    f"Number of classes: {len(le.classes_)}",
    sep="\n",
)

Data preprocessing completed!
Number of features: 131
Number of classes: 41
