In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

# Read the dataset
df = pd.read_csv("../data/raw/DiseaseAndSymptoms.csv")

# Fill missing values with 'none' and drop duplicates
df.fillna('none', inplace=True)
df.drop_duplicates(inplace=True)

# Get the symptom columns
symptom_cols = [col for col in df.columns if col.startswith("Symptom_")]

# Initialize the OneHotEncoder for symptoms
symptom_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_symptoms = symptom_encoder.fit_transform(df[symptom_cols])

# Create a DataFrame for encoded symptoms
encoded_symptoms_df = pd.DataFrame(encoded_symptoms, columns=symptom_encoder.get_feature_names_out(symptom_cols))

# Concatenate the encoded symptoms with the original DataFrame (without the old symptom columns)
df_encoded = pd.concat([df.drop(columns=symptom_cols), encoded_symptoms_df], axis=1)

# Encode the 'Disease' column using LabelEncoder
disease_encoder = LabelEncoder()
df_encoded['Disease'] = disease_encoder.fit_transform(df_encoded['Disease'])

# Split the dataset into features and target variable
X = df_encoded.drop("Disease", axis=1)
y = df_encoded["Disease"]

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Apply SMOTE for oversampling to handle class imbalance
smote = SMOTE(random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Save the resampled dataset
balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
balanced_df.insert(0, 'Disease', y_resampled)
balanced_df.to_csv("../data/processed/preprocessed_data_v2.csv", index=False)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

# Convert scaled features back to DataFrame for better visualization
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)

# Save scaled data for later use
X_train_scaled_df.to_csv("../data/processed/X_train_scaled_v2.csv", index=False)
X_test_scaled_df.to_csv("../data/processed/X_test_scaled_v2.csv", index=False)

# Initialize models
lr_model = LogisticRegression(solver='liblinear')
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
catboost_model = cb.CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, verbose=0)
lgb_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1)
nb_model = GaussianNB()
svm_model = make_pipeline(StandardScaler(), SVC(kernel='linear', probability=True))

# Create a VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('lr', lr_model),
    ('rf', rf_model),
    ('catboost', catboost_model),
    ('lgb', lgb_model),
    ('nb', nb_model),
    ('svm', svm_model)
], voting='soft')

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'lr__C': [0.1, 1, 10],
    'rf__n_estimators': [100, 200],
    'catboost__iterations': [300, 500],
    'lgb__learning_rate': [0.01, 0.1],
    'svm__svc__C': [0.1, 1, 10],
}

grid_search = GridSearchCV(voting_clf, param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print("Best parameters:", grid_search.best_params_)

# Cross-validation scores
cross_val_scores = cross_val_score(voting_clf, X_train_scaled, y_train, cv=5)
print(f"Cross-validation scores: {cross_val_scores}")
print(f"Mean cross-validation score: {cross_val_scores.mean()}")

# Train the ensemble model on the full training set
voting_clf.fit(X_train_scaled, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Voting Classifier accuracy: {accuracy}")

# Save the confusion matrix for visualization
conf_matrix = confusion_matrix(y_test, y_pred)

# Load disease mapping (JSON file)
with open("../data/processed/disease_mapping.json", "r") as f:
    disease_mapping = json.load(f)

index_to_disease = {v: k for k, v in disease_mapping.items()}

# Plot confusion matrix
plt.figure(figsize=(20, 18))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=[index_to_disease[i] for i in range(len(disease_mapping))],
            yticklabels=[index_to_disease[i] for i in range(len(disease_mapping))])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix with Disease Names")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Save the trained model
joblib.dump(voting_clf, "../src/models/voting_classifier_model_v2.joblib")
print(f"Model saved as ../src/models/voting_classifier_model_v2.joblib")


TypeError: __init__() got an unexpected keyword argument 'sparse'