In [None]:
from pathlib import Path

import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw disease/symptom table, mark missing cells with the literal
# string 'none', and discard rows that are exact duplicates.
df = (
    pd.read_csv("../data/raw/DiseaseAndSymptoms.csv")
    .fillna('none')
    .drop_duplicates()
)

In [None]:
# Every column whose name starts with "Symptom_" is treated as a symptom column.
symptom_cols = [name for name in df.columns if name.startswith("Symptom_")]

# Build a single vocabulary from the values of all symptom columns so that one
# encoder (and therefore one code per value) is shared by every column.
all_symptoms = pd.unique(df[symptom_cols].values.ravel('K'))
symptom_encoder = LabelEncoder()
symptom_encoder.fit(all_symptoms)

# Swap each symptom column for its integer codes.
encoded_symptoms = {name: symptom_encoder.transform(df[name]) for name in symptom_cols}
df = df.assign(**encoded_symptoms)

In [None]:
# Fit a separate encoder on the target column, then overwrite the column with
# the resulting integer labels.
disease_encoder = LabelEncoder()
disease_encoder.fit(df['Disease'])
df['Disease'] = disease_encoder.transform(df['Disease'])

In [None]:
# Persist both fitted encoders to disk next to each other.
models_dir = Path("../src/models")
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(symptom_encoder, models_dir / "symptom_encoder.joblib")
# Same directory as above; the original path contained a doubled separator.
joblib.dump(disease_encoder, models_dir / "disease_encoder.joblib")

In [None]:
# Separate the target column from the feature columns.
y = df["Disease"]
X = df.drop(columns="Disease")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# symptom_indices = list(range(X_train.shape[1]))  # All features are "categorical"

In [None]:
# Oversample the minority classes of the training split only (the test split is
# left untouched).
#
# Every feature here is a label-encoded symptom, i.e. a nominal category whose
# integer code carries no numeric meaning. Plain SMOTE builds synthetic rows by
# interpolating numerically between neighbours, which yields codes that stand for
# unrelated symptoms. SMOTENC is not applicable either, because it requires at
# least one continuous feature. SMOTEN is the variant for purely nominal data and
# only ever emits category values that already occur in the data.
smote = SMOTEN(random_state=42, k_neighbors=3)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [None]:
# Write the processed splits to disk.
processed_dir = Path("../data/processed")
# to_csv does not create missing parent directories, so make sure the output
# directory exists first (no-op when it is already there).
processed_dir.mkdir(parents=True, exist_ok=True)

# Training data: resampled features and resampled target side by side in one file.
train_data = pd.concat(
    [
        pd.DataFrame(X_train_res, columns=X.columns),
        pd.Series(y_train_res, name='Disease'),
    ],
    axis=1,
)
train_data.to_csv(processed_dir / "train_data.csv", index=False)

# Test data: features and target are stored in two separate files.
X_test.to_csv(processed_dir / "X_test.csv", index=False)
pd.Series(y_test, name='Disease').to_csv(processed_dir / "y_test.csv", index=False)