In [5]:

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import classification_report, f1_score, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Opt in to pandas' future no-silent-downcasting behaviour so that the
# .replace() calls below do not emit the downcasting warning.
pd.set_option('future.no_silent_downcasting', True)

# Read the raw dataset from disk.
data = pd.read_csv('/content/chikungunya.csv')

# Remove redundant columns; errors='ignore' skips any that are not present.
drop_columns = ['Severe Chikungunya', 'Unnamed: 16', 'Unnamed: 17']
data = data.drop(columns=drop_columns, errors='ignore')

# Encode the target, the sex column and every symptom flag as integers.
yes_no_codes = {'yes': 1, 'no': 0}
data['arthritis'] = data['arthritis'].replace(yes_no_codes).astype(int)
data['sex'] = data['sex'].replace({'male': 1, 'female': 0}).astype(int)
binary_cols = ['fever', 'cold', 'joint pains', 'myalgia', 'headache', 'fatigue', 'vomitting',
               'Conjuctivitis', 'Nausea', 'Maculopapular rash', 'Eye Pain', 'Chills', 'Swelling']
# Symptom columns may also hold the strings '1'/'0', so map those as well.
symptom_codes = {**yes_no_codes, '1': 1, '0': 0}
for col in binary_cols:
    encoded = data[col].replace(symptom_codes)
    data[col] = encoded.astype(int)

# Derived features: a per-row count of three key symptoms and the
# product (interaction) of the Swelling and joint-pains flags.
key_symptoms = ['Swelling', 'vomitting', 'joint pains']
data['key_symptom_count'] = data[key_symptoms].sum(axis=1).astype(int)
data['Swelling_joint_pains'] = data['Swelling'].mul(data['joint pains'])

# Separate target from predictors, then hold out 20% as a stratified test set.
y = data['arthritis']
X = data.drop(columns='arthritis')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Resample the training split only with SMOTE; the test split is left as-is.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Restrict both splits to the features treated as significant.
significant_features = ['Swelling', 'vomitting', 'joint pains', 'key_symptom_count', 'Swelling_joint_pains']
X_train_sm_selected = X_train_sm.loc[:, significant_features]
X_test_selected = X_test.loc[:, significant_features]

# Function to train and evaluate hybrid ensemble
def train_evaluate_hybrid(X_train, y_train, X_test, y_test, model_name, class_weights=None):
    """Fit a LogisticRegression + XGBoost ensemble and print/return its test metrics.

    Both models are fitted on the training data. Their positive-class
    probabilities are blended (0.7 * XGBoost + 0.3 * logistic regression) and
    the blended score is thresholded over ``np.arange(0.3, 0.7, 0.05)``; the
    threshold with the highest macro F1 on ``y_test`` is kept.

    Args:
        X_train: Training features.
        y_train: Training labels (binary, 0/1).
        X_test: Evaluation features.
        y_test: Evaluation labels (binary, 0/1).
        model_name: Heading printed above the results.
        class_weights: Optional class weighting passed to LogisticRegression
            as ``class_weight``. A ``{0: w0, 1: w1}`` dict or ``'balanced'``
            is also translated into XGBoost's ``scale_pos_weight`` so that
            both ensemble members are weighted in the same direction.

    Returns:
        Tuple ``(best_macro_f1, auc_score, cv_scores)`` where ``cv_scores`` is
        the array of per-fold macro-F1 scores from ``cross_val_score``.
    """
    # XGBoost takes a single positive/negative weight ratio rather than a
    # per-class dict. Derive it from class_weights so a heavier class 0 is not
    # turned into a heavier class 1 (the previous hard-coded 3.0 did that).
    if isinstance(class_weights, dict):
        scale_pos_weight = class_weights.get(1, 1.0) / class_weights.get(0, 1.0)
    elif isinstance(class_weights, str) and class_weights == 'balanced':
        labels = np.asarray(y_train)
        n_pos = np.count_nonzero(labels == 1)
        n_neg = np.count_nonzero(labels == 0)
        scale_pos_weight = n_neg / n_pos if n_pos else 1.0
    else:
        scale_pos_weight = 1.0

    lr_model = LogisticRegression(C=0.01, solver='lbfgs', max_iter=1000, class_weight=class_weights, random_state=42)
    xgb_model = XGBClassifier(max_depth=5, scale_pos_weight=scale_pos_weight, random_state=42)
    lr_model.fit(X_train, y_train)
    xgb_model.fit(X_train, y_train)

    # Weighted soft vote over the positive-class probabilities.
    lr_prob = lr_model.predict_proba(X_test)[:, 1]
    xgb_prob = xgb_model.predict_proba(X_test)[:, 1]
    ensemble_prob = (0.7 * xgb_prob + 0.3 * lr_prob)

    # NOTE(review): the threshold is tuned against y_test, so the reported
    # macro F1 is an optimistic estimate; consider tuning on a validation split.
    thresholds = np.arange(0.3, 0.7, 0.05)
    # Start below any attainable score so the first threshold is always
    # accepted and best_y_pred can never remain None.
    best_macro_f1 = -1.0
    best_y_pred = None
    for threshold in thresholds:
        y_pred = (ensemble_prob >= threshold).astype(int)
        macro_f1 = f1_score(y_test, y_pred, average='macro')
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_y_pred = y_pred

    auc_score = roc_auc_score(y_test, ensemble_prob)
    # NOTE(review): cross-validation scores only the XGBoost member on the
    # training data, not the blended ensemble.
    cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42), scoring='f1_macro')

    print(f"\n{model_name} Results:")
    print(f"Macro F1: {best_macro_f1:.2f}")
    print(f"AUC: {auc_score:.3f}")
    print(f"CV Mean F1: {cv_scores.mean():.2f}, Std: {cv_scores.std():.2f}")
    print("Classification Report:")
    print(classification_report(y_test, best_y_pred, target_names=['No Arthritis (0)', 'Arthritis (1)'], zero_division=0))
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, best_y_pred)
    print(f"Actual 0: {cm[0,0]} {cm[0,1]}")
    print(f"Actual 1: {cm[1,0]} {cm[1,1]}")
    return best_macro_f1, auc_score, cv_scores

# Method 1: Refined Hybrid Ensemble
# Baseline run on the SMOTE-resampled training rows, selected features only.
macro_f1_base, auc_base, cv_scores_base = train_evaluate_hybrid(
    X_train=X_train_sm_selected,
    y_train=y_train_sm,
    X_test=X_test_selected,
    y_test=y_test,
    model_name="Refined Hybrid Ensemble",
)

# Method 2: LightGBM
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

# Use the same shuffled, seeded splitter that train_evaluate_hybrid uses so the
# "CV Mean F1" reported here is comparable with the hybrid-ensemble rows.
lgb_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Small grid search over tree depth and ensemble size, scored by macro F1.
lgb_model = LGBMClassifier(random_state=42)
param_grid = {'max_depth': [5, 7], 'n_estimators': [100, 500], 'num_leaves': [31], 'learning_rate': [0.1]}
grid_search = GridSearchCV(lgb_model, param_grid, cv=lgb_cv, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train_sm_selected, y_train_sm)
best_lgb = grid_search.best_estimator_

# Evaluate the refitted best estimator on the held-out test split.
y_pred_lgb = best_lgb.predict(X_test_selected)
y_prob_lgb = best_lgb.predict_proba(X_test_selected)[:, 1]
macro_f1_lgb = f1_score(y_test, y_pred_lgb, average='macro')
auc_lgb = roc_auc_score(y_test, y_prob_lgb)
cv_scores_lgb = cross_val_score(best_lgb, X_train_sm_selected, y_train_sm, cv=lgb_cv, scoring='f1_macro')

print("\nLightGBM Results:")
print(f"Macro F1: {macro_f1_lgb:.2f}")
print(f"AUC: {auc_lgb:.3f}")
print(f"CV Mean F1: {cv_scores_lgb.mean():.2f}, Std: {cv_scores_lgb.std():.2f}")
# zero_division=0 matches the report printed by train_evaluate_hybrid.
print(classification_report(y_test, y_pred_lgb, target_names=['No Arthritis (0)', 'Arthritis (1)'], zero_division=0))

# Method 3: Undersampling
# Resample the original (non-SMOTE) training split with RandomUnderSampler,
# then evaluate the hybrid ensemble on the selected features.
rus = RandomUnderSampler(random_state=42)
X_train_us, y_train_us = rus.fit_resample(X_train, y_train)
macro_f1_us, auc_us, cv_scores_us = train_evaluate_hybrid(
    X_train=X_train_us.loc[:, significant_features],
    y_train=y_train_us,
    X_test=X_test_selected,
    y_test=y_test,
    model_name="Hybrid Ensemble with Undersampling",
)

# Method 4: Cost-Sensitive Learning
# Same SMOTE-resampled data as Method 1, with class 0 weighted three times
# as heavily as class 1.
class_weights = {0: 3.0, 1: 1.0}
macro_f1_cs, auc_cs, cv_scores_cs = train_evaluate_hybrid(
    X_train=X_train_sm_selected,
    y_train=y_train_sm,
    X_test=X_test_selected,
    y_test=y_test,
    model_name="Hybrid Ensemble with Cost-Sensitive Learning",
    class_weights=class_weights,
)

# Method 5: Bagging with Random Forest
# Ten bagged copies of a shallow, class-balanced random forest.
rf_base = RandomForestClassifier(n_estimators=100, max_depth=5, class_weight='balanced', random_state=42)
bagging_model = BaggingClassifier(estimator=rf_base, n_estimators=10, random_state=42)
bagging_model.fit(X_train_sm_selected, y_train_sm)

# Held-out test metrics.
y_pred_bagging = bagging_model.predict(X_test_selected)
y_prob_bagging = bagging_model.predict_proba(X_test_selected)[:, 1]
macro_f1_bagging = f1_score(y_test, y_pred_bagging, average='macro')
auc_bagging = roc_auc_score(y_test, y_prob_bagging)

# Use the same shuffled, seeded splitter that train_evaluate_hybrid uses so the
# "CV Mean F1" reported here is comparable with the hybrid-ensemble rows.
bagging_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_bagging = cross_val_score(bagging_model, X_train_sm_selected, y_train_sm, cv=bagging_cv, scoring='f1_macro')

print("\nBagging Classifier Results:")
print(f"Macro F1: {macro_f1_bagging:.2f}")
print(f"AUC: {auc_bagging:.3f}")
print(f"CV Mean F1: {cv_scores_bagging.mean():.2f}, Std: {cv_scores_bagging.std():.2f}")
# zero_division=0 matches the report printed by train_evaluate_hybrid.
print(classification_report(y_test, y_pred_bagging, target_names=['No Arthritis (0)', 'Arthritis (1)'], zero_division=0))

# Method 6: AdaBoost with Decision Tree
# Fifty boosting rounds over a depth-3, class-balanced decision tree.
dt_base = DecisionTreeClassifier(max_depth=3, class_weight='balanced', random_state=42)
adaboost_model = AdaBoostClassifier(estimator=dt_base, n_estimators=50, random_state=42)
adaboost_model.fit(X_train_sm_selected, y_train_sm)

# Held-out test metrics.
y_pred_adaboost = adaboost_model.predict(X_test_selected)
y_prob_adaboost = adaboost_model.predict_proba(X_test_selected)[:, 1]
macro_f1_adaboost = f1_score(y_test, y_pred_adaboost, average='macro')
auc_adaboost = roc_auc_score(y_test, y_prob_adaboost)

# Use the same shuffled, seeded splitter that train_evaluate_hybrid uses so the
# "CV Mean F1" reported here is comparable with the hybrid-ensemble rows.
adaboost_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_adaboost = cross_val_score(adaboost_model, X_train_sm_selected, y_train_sm, cv=adaboost_cv, scoring='f1_macro')

print("\nAdaBoost Classifier Results:")
print(f"Macro F1: {macro_f1_adaboost:.2f}")
print(f"AUC: {auc_adaboost:.3f}")
print(f"CV Mean F1: {cv_scores_adaboost.mean():.2f}, Std: {cv_scores_adaboost.std():.2f}")
# zero_division=0 matches the report printed by train_evaluate_hybrid.
print(classification_report(y_test, y_pred_adaboost, target_names=['No Arthritis (0)', 'Arthritis (1)'], zero_division=0))

# Method 7: Polynomial Features
# Standardise the selected features; the scaler is fitted on the resampled
# training rows only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_sm_selected)
X_train_sm_scaled = scaler.transform(X_train_sm_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Expand to all degree-2 terms (no bias column), again fitting on train only.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train_sm_scaled)
X_train_sm_poly = poly.transform(X_train_sm_scaled)
X_test_poly = poly.transform(X_test_scaled)

macro_f1_poly, auc_poly, cv_scores_poly = train_evaluate_hybrid(
    X_train=X_train_sm_poly,
    y_train=y_train_sm,
    X_test=X_test_poly,
    y_test=y_test,
    model_name="Hybrid Ensemble with Polynomial Features",
)

# Save results
# One (label, macro F1, AUC, per-fold CV scores) record per method, in the
# order the methods were run above.
summary_rows = [
    ('Refined Hybrid Ensemble', macro_f1_base, auc_base, cv_scores_base),
    ('LightGBM', macro_f1_lgb, auc_lgb, cv_scores_lgb),
    ('Hybrid Ensemble (Undersampling)', macro_f1_us, auc_us, cv_scores_us),
    ('Hybrid Ensemble (Cost-Sensitive)', macro_f1_cs, auc_cs, cv_scores_cs),
    ('Bagging Classifier', macro_f1_bagging, auc_bagging, cv_scores_bagging),
    ('AdaBoost Classifier', macro_f1_adaboost, auc_adaboost, cv_scores_adaboost),
    ('Hybrid Ensemble (Polynomial Features)', macro_f1_poly, auc_poly, cv_scores_poly),
]

# Pivot the records into columns; CV scores are reduced to mean and std.
results_df = pd.DataFrame({
    'Model': [label for label, _, _, _ in summary_rows],
    'Macro F1': [macro_f1 for _, macro_f1, _, _ in summary_rows],
    'AUC': [auc for _, _, auc, _ in summary_rows],
    'CV Mean F1': [folds.mean() for _, _, _, folds in summary_rows],
    'CV Std F1': [folds.std() for _, _, _, folds in summary_rows],
})

# Write the table to disk without the index column and echo it to stdout.
results_df.to_csv('/content/multiple_methods_results_updated.csv', index=False)
print("\nResults Saved to '/content/multiple_methods_results_updated.csv':")
print(results_df)

# Correlation heatmap
# Pairwise correlations over every column of the encoded dataset
# (engineered features and the target included).
corr_matrix = data.corr()

# Draw on an explicit figure/axes pair instead of pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix', fontsize=12)

# Save the image, then close the figure so it is not left open.
fig.savefig('/content/correlation_heatmap_updated.png', dpi=300, bbox_inches='tight')
plt.close(fig)



Refined Hybrid Ensemble Results:
Macro F1: 0.51
AUC: 0.537
CV Mean F1: 0.54, Std: 0.02
Classification Report:
                  precision    recall  f1-score   support

No Arthritis (0)       0.43      0.35      0.39        82
   Arthritis (1)       0.60      0.67      0.63       119

        accuracy                           0.54       201
       macro avg       0.51      0.51      0.51       201
    weighted avg       0.53      0.54      0.53       201

Confusion Matrix:
Actual 0: 29 53
Actual 1: 39 80
[LightGBM] [Info] Number of positive: 472, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12
[LightGBM] [Info] Number of data points in the train set: 944, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 377, number of negat