In [None]:
# Upload the dataset through Colab's `files` helper. The cells below read the
# CSV by file name from the working directory, so the uploaded file must keep
# the name they expect. `uploaded` holds whatever `files.upload()` returns.
from google.colab import files
uploaded = files.upload()


In [None]:
# üöÄ Install all required libraries correctly
!pip install -U pandas scikit-learn imbalanced-learn xgboost catboost tensorflow


In [None]:
import pandas as pd
from tabulate import tabulate

# Set the filename to your dataset
DATASET_FILENAME = "Crop Recommendation using Soil Properties and Weather Prediction.csv"
SAMPLE_ROWS = 10

# Columns previewed by this cell. Names must match the CSV header exactly
# (case-sensitive). The "-W" suffixed weather columns are the winter values
# chosen as a representative subset of the weather features.
NECESSARY_FEATURES = [
    'N', 'P', 'K',             # Major nutrients
    'Ph',                      # Soil pH (capital P in the CSV header)
    'Soilcolor',               # Soil colour (one word, capital S)
    'PRECTOTCORR-W',           # Winter precipitation / rainfall
    'T2M_MAX-W',               # Winter max temperature
    'QV2M-W',                  # Winter humidity / moisture content
    'label'                    # The target variable
]

print(f"üîπ Loading dataset and checking feature names: {DATASET_FILENAME}...")

try:
    # Load the dataset
    df = pd.read_csv(DATASET_FILENAME)
    print("‚úÖ Dataset loaded successfully.")

    # Print the real header so a mismatch can be diagnosed from the output.
    actual_columns = df.columns.tolist()
    print("\n‚ö†Ô∏è **Actual Columns Found in Dataset:**")
    print(actual_columns)
    print("----------------------------------------------------------------")

    # Validate every required column in one pass and report ALL missing names
    # together, so the list can be corrected in a single edit instead of one
    # rerun per misspelled column.
    missing_features = [name for name in NECESSARY_FEATURES if name not in actual_columns]
    if missing_features:
        raise KeyError(
            f"The necessary feature(s) {missing_features} were not found in the dataset columns."
        )

    # 1. Filter the DataFrame to include only the necessary columns
    df_selected = df[NECESSARY_FEATURES]

    # 2. Print the first SAMPLE_ROWS rows of the selected columns
    print(f"\nüìã Sample Dataset (First {SAMPLE_ROWS} rows) - Selected Features:")

    print(tabulate(df_selected.head(SAMPLE_ROWS),
                   headers='keys',
                   tablefmt='fancy_grid',
                   showindex=False))

    print(f"\n‚úÖ Columns successfully displayed: {NECESSARY_FEATURES}")

except KeyError as e:
    print(f"\n‚ùå ERROR: Feature mismatch! {e}")
    print("ACTION: You must ensure the `NECESSARY_FEATURES` list EXACTLY matches the names printed above.")
except FileNotFoundError:
    print(f"\n‚ùå ERROR: The file '{DATASET_FILENAME}' was not found.")
except Exception as e:
    # Deliberate best-effort: report and let the notebook keep running.
    print(f"\n‚ùå An unexpected error occurred: {e}")

In [None]:
import pandas as pd

# Name of the CSV expected in the working directory.
DATASET_FILENAME = "Crop Recommendation using Soil Properties and Weather Prediction.csv"

print(f"üîπ Loading dataset: {DATASET_FILENAME}...")

try:
    # Read the raw CSV; any failure is reported by the handlers below.
    df = pd.read_csv(DATASET_FILENAME)
    print("‚úÖ Dataset loaded successfully.")

    # Every header name, features and target alike.
    all_features_list = df.columns.tolist()

    print("\nüìã **All Features (Column Names) Present in the Dataset:**")
    # One numbered line per column, counting from 1.
    for position, column_name in enumerate(all_features_list, start=1):
        print(f"{position}. {column_name}")

except FileNotFoundError:
    print(f"\n‚ùå ERROR: The file '{DATASET_FILENAME}' was not found. Please ensure it is uploaded.")
except Exception as e:
    print(f"\n‚ùå An unexpected error occurred: {e}")

In [None]:
print("üîπ Installing required libraries...")
# We add 'tabulate' for printing clean tables
!pip install pandas scikit-learn "imblearn>=0.11.0" joblib matplotlib seaborn "tensorflow>=2.12" "xgboost>=1.7" "catboost>=1.2" tabulate
print("‚úÖ Installation complete!")

# Import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    VotingClassifier,
    StackingClassifier,
    BaggingClassifier
)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, Input
from tensorflow.keras.utils import to_categorical
# Import standard regularizers AND the base class to build our custom L3
from tensorflow.keras.regularizers import l1, l2, l1_l2, Regularizer
from tabulate import tabulate # For clean table output

print("‚úÖ All libraries imported successfully.")

In [None]:
import pandas as pd
# Load dataset
print("üîπ Loading raw dataset...")
# Make sure "Crop Recommendation using Soil Properties and Weather Prediction.csv" is uploaded to Colab
df = pd.read_csv("Crop Recommendation using Soil Properties and Weather Prediction.csv")
print(f"Dataset shape before preprocessing: {df.shape}")

# Identify categorical (non-numeric) columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("\nüîπ Categorical columns found:", categorical_cols)

# The target is encoded separately, so exclude it from the feature columns to
# encode. Filtering (instead of list.remove) avoids a ValueError when the
# target column is not object-dtype and therefore not in the list.
target_col = 'label'
categorical_cols = [col for col in categorical_cols if col != target_col]

# Encode other categorical features (e.g., soil color); keep each fitted
# encoder so the same mapping can be reapplied later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    print(f"Encoded column: {col}")

# Encode the target (crop name)
le_target = LabelEncoder()
df['label_encoded'] = le_target.fit_transform(df[target_col])

# Split features and target
X = df.drop([target_col, 'label_encoded'], axis=1)
y = df['label_encoded']
print(f"\nüîπ Features (X) shape: {X.shape}")

# NOTE(review): the scaler below is fitted on ALL rows and SMOTE is applied to
# ALL rows, and only afterwards does the next cell split train/test. The test
# split therefore contains synthetic samples and shares scaler statistics with
# the training data, so downstream accuracies are likely optimistic. Fixing it
# means splitting first and fitting the scaler/SMOTE on the training part only,
# which also requires changing the cell that reloads 'balanced_dataset.csv'.

# Scale numeric features
print("\nüîπ Scaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE for balancing
print("\nüîπ Applying SMOTE for class balancing...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
print("‚úÖ Balancing complete!")
print("Samples after SMOTE:", len(y_resampled))

# Rebuild a DataFrame with the original feature names plus the encoded target
df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
df_balanced['label_encoded'] = y_resampled

# Save processed data
df_balanced.to_csv("balanced_dataset.csv", index=False)
print(f"\n‚úÖ Preprocessing complete! Processed dataset saved as 'balanced_dataset.csv' with shape {df_balanced.shape}")

# Save encoders and scaler
joblib.dump(le_target, "label_encoder.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "feature_encoders.pkl")
print("‚úÖ Encoders and scaler saved successfully!")

In [None]:
print("üîπ Loading preprocessed balanced dataset...")
data = pd.read_csv("balanced_dataset.csv")
print(f"‚úÖ Balanced dataset shape: {data.shape}")

# Target is the encoded label column; everything else is a feature.
y = data['label_encoded']
X = data.drop(columns=['label_encoded'])

# Distinct class count, reused by the neural-network cells further down.
num_classes = np.unique(y).size
print(f"Number of unique classes: {num_classes}")

# Stratified 80/20 hold-out with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("‚úÖ Data split complete ‚Äî 80% train, 20% test")
print(f"Train samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

In [None]:
print("üöÄ Training standard ML models... please wait...")

# Define Models
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
# `use_label_encoder` is intentionally not passed: the flag was deprecated in
# the XGBoost 1.x line and is not a recognised parameter in current releases
# (this notebook installs xgboost>=1.7), where it only produces a warning.
xgb_clf = XGBClassifier(eval_metric='mlogloss', random_state=42)
ada_clf = AdaBoostClassifier(n_estimators=150, random_state=42)
cat_clf = CatBoostClassifier(verbose=0, allow_writing_files=False, random_state=42)

# Train Models: fit each estimator on the training split and report progress.
for display_name, estimator in (
    ("Random Forest", rf_clf),
    ("XGBoost", xgb_clf),
    ("AdaBoost", ada_clf),
    ("CatBoost", cat_clf),
):
    estimator.fit(X_train, y_train)
    print(f"‚úÖ {display_name} trained.")

print("\n‚úÖ All base models trained successfully!\n")

In [None]:
print("üìä Evaluating Individual Models...")

# Display name -> fitted estimator, in reporting order.
models = dict(zip(
    ("Random Forest", "XGBoost", "AdaBoost", "CatBoost"),
    (rf_clf, xgb_clf, ada_clf, cat_clf),
))

# Accumulates every model's test accuracy; later cells add to it and the final
# chart reads from it.
model_accuracies = {}

for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    model_accuracies.update({name: acc})
    print(f"üî∏ {name} Accuracy: {acc:.4f}")
    # print(classification_report(y_test, y_pred)) # Uncomment to see full report
    print("-" * 60)

In [None]:
print("\nüß© Training and Evaluating Ensemble Models...")

# --- Hard-voting ensemble over RF, XGBoost and AdaBoost ---
print("üîπ Training Voting Ensemble...")
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('rf', rf_clf),
        ('xgb', xgb_clf),
        ('ada', ada_clf),
    ],
)
voting_clf.fit(X_train, y_train)
voting_pred = voting_clf.predict(X_test)
voting_acc = accuracy_score(y_test, voting_pred)
model_accuracies["Voting Ensemble"] = voting_acc
print(f"‚úÖ Voting Ensemble Accuracy: {voting_acc:.4f}")

# --- Stacking ensemble: RF, XGBoost and CatBoost feed an AdaBoost meta-learner ---
print("\nüîπ Training Stacking Ensemble...")
estimators = [('rf', rf_clf), ('xgb', xgb_clf), ('cat', cat_clf)]
stacking_clf = StackingClassifier(
    final_estimator=AdaBoostClassifier(n_estimators=100, random_state=42),
    estimators=estimators,
)
stacking_clf.fit(X_train, y_train)
stack_pred = stacking_clf.predict(X_test)
stack_acc = accuracy_score(y_test, stack_pred)
model_accuracies["Stacking Ensemble"] = stack_acc
print(f"‚úÖ Stacking Ensemble Accuracy: {stack_acc:.4f}")

In [None]:
print("\nüß† NEW: Starting CNN Optimizer/Activation Experiment...")

# Reshape the tabular splits for Conv1D by appending a trailing axis of
# length 1, and one-hot encode the targets.
X_train_cnn = X_train.values[:, :, np.newaxis]
X_test_cnn = X_test.values[:, :, np.newaxis]
y_train_cnn = to_categorical(y_train, num_classes)
y_test_cnn = to_categorical(y_test, num_classes)

print(f"CNN Train data shape: {X_train_cnn.shape}, CNN Test data shape: {X_test_cnn.shape}")

# Experiment grid and shared training settings.
optimizers_list = ['adam', 'rmsprop']
activations_list = ['relu', 'gelu', 'tanh', 'sigmoid']
epochs_to_run = 50
input_shape = (X_train_cnn.shape[1], 1)

# Per-combination results for the summary table, plus the running best
# optimizer/activation pair that later cells reuse.
cnn_results_list = []
best_accuracy = 0
best_combo = {}
# Helper function to create the model
def create_cnn_model(optimizer_name, activation_name, regularizer=None):
    """Build and compile the notebook's 1D-CNN classifier.

    Reads the module-level `input_shape` and `num_classes`.

    Args:
        optimizer_name: Optimizer passed to `compile` (e.g. 'adam').
        activation_name: Activation used by both Conv1D layers and the hidden
            Dense layer.
        regularizer: Optional kernel regularizer for the hidden Dense layer;
            `None` disables it.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        accuracy metric).
    """
    network = Sequential()
    network.add(Input(shape=input_shape))
    network.add(Conv1D(64, 3, activation=activation_name))
    network.add(Conv1D(32, 3, activation=activation_name))
    network.add(Flatten())
    network.add(Dropout(0.3))
    # Only this hidden dense layer receives the optional regularizer.
    network.add(Dense(64, activation=activation_name, kernel_regularizer=regularizer))
    # Softmax output, one unit per class.
    network.add(Dense(num_classes, activation='softmax'))

    network.compile(
        optimizer=optimizer_name,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Run the experiment loop: train one CNN per optimizer/activation pair and
# record its test accuracy.
for optimizer_name in optimizers_list:
    for activation_name in activations_list:
        combo_name = f"CNN ({optimizer_name} + {activation_name})"
        print(f"\nüöÄ Training {combo_name} for {epochs_to_run} epochs...")

        # Many models are created in this loop; clear Keras' global state
        # first so memory from earlier models is not retained.
        tf.keras.backend.clear_session()
        # Re-seed before every run so each combination starts from the same
        # random state and the comparison is reproducible.
        tf.keras.utils.set_random_seed(42)

        model = create_cnn_model(optimizer_name, activation_name)

        model.fit(X_train_cnn, y_train_cnn,
                  epochs=epochs_to_run,
                  batch_size=32,
                  verbose=0, # Set to 1 to see epoch-by-epoch progress
                  validation_split=0.1)

        loss, accuracy = model.evaluate(X_test_cnn, y_test_cnn, verbose=0)

        print(f"‚úÖ {combo_name} - Test Accuracy: {accuracy:.4f}")

        # Store result for table and main dictionary
        result_entry = {
            "Optimizer": optimizer_name,
            "Activation": activation_name,
            "Accuracy": accuracy
        }
        cnn_results_list.append(result_entry)
        model_accuracies[combo_name] = accuracy

        # Track the best combination. The `not best_combo` guard guarantees a
        # winner is recorded even if no run beats the initial best_accuracy,
        # so the summary print below cannot hit a KeyError.
        # NOTE(review): the winner is picked on test accuracy, so later
        # results reported for the "best" combination are optimistically biased.
        if not best_combo or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_combo = {"optimizer": optimizer_name, "activation": activation_name}

print("\n\n‚úÖ CNN Experiment Complete!")

# --- Generate and Print Results Table ---
results_df = pd.DataFrame(cnn_results_list)
results_df = results_df.sort_values(by="Accuracy", ascending=False)
results_df["Accuracy"] = results_df["Accuracy"].map(lambda x: f"{x:.4f}") # Format for printing

print("\nüìä CNN Optimizer/Activation Accuracy Comparison")
print(tabulate(results_df, headers='keys', tablefmt='pretty', showindex=False))

print(f"\nüèÜ Best Combination Found: {best_combo['optimizer']} + {best_combo['activation']} with {best_accuracy:.4f} accuracy.")

In [None]:
# --- Define Custom L3 Regularizer ---
# We build a class that Keras can use
# It calculates: strength * sum(abs(weights)^3)

@tf.keras.utils.register_keras_serializable() # Lets Keras save/load models that use this class
class L3Regularizer(Regularizer):
    """
    Custom weight penalty built on cubed magnitudes.

    Penalty = strength * sum(abs(w)^3)

    Note this is the sum of cubed absolute weights (the L3 norm raised to the
    third power), not the L3 norm itself.
    """
    def __init__(self, strength=0.0):
        # Coerce to float so the stored value is always a plain float.
        self.strength = float(strength)

    def __call__(self, w):
        # Cube the absolute weights element-wise, sum them, then scale.
        cubed_magnitudes = tf.pow(tf.abs(w), 3)
        return self.strength * tf.reduce_sum(cubed_magnitudes)

    def get_config(self):
        # Constructor arguments Keras needs to rebuild this regularizer.
        return dict(strength=self.strength)
# -----------------------------------

print(f"\nüß† NEW: Starting Regularization Experiment on best model ({best_combo['optimizer']} + {best_combo['activation']})...")

# Define regularization strengths
reg_strength = 0.001 # You can tune this
regularizers_to_test = {
    "Original (No Regularization)": None,
    "L1 Regularization": l1(reg_strength),
    "L2 Regularization": l2(reg_strength),
    "L1+L2 Regularization": l1_l2(reg_strength, reg_strength),
    "Custom L3 Regularization": L3Regularizer(strength=reg_strength) # Add custom L3
}

reg_results_list = []

for reg_name, reg_func in regularizers_to_test.items():
    print(f"\nüöÄ Training model with {reg_name}...")

    # Several models are created in this loop; clear Keras' global state so
    # memory from earlier models is not retained.
    tf.keras.backend.clear_session()
    # Same seed for every variant, so accuracy differences come from the
    # regularizer rather than from a different random initialisation.
    tf.keras.utils.set_random_seed(42)

    # Create the model using the best combo and the specified regularizer
    model = create_cnn_model(
        optimizer_name=best_combo['optimizer'],
        activation_name=best_combo['activation'],
        regularizer=reg_func
    )

    # Train the model
    model.fit(X_train_cnn, y_train_cnn,
              epochs=epochs_to_run,
              batch_size=32,
              verbose=0,
              validation_split=0.1)

    # Evaluate
    loss, accuracy = model.evaluate(X_test_cnn, y_test_cnn, verbose=0)

    print(f"‚úÖ {reg_name} - Test Accuracy: {accuracy:.4f}")

    # Store result for table
    reg_results_list.append({
        "Model": reg_name,
        "Accuracy": accuracy
    })

    # Add to main accuracy dictionary (except for original, which repeats a
    # configuration already recorded by the optimizer/activation experiment)
    if reg_name != "Original (No Regularization)":
        model_accuracies[f"CNN (Best + {reg_name})"] = accuracy

print("\n\n‚úÖ CNN Regularization Experiment Complete!")

# --- Generate and Print Regularization Table ---
reg_results_df = pd.DataFrame(reg_results_list)
reg_results_df["Accuracy"] = reg_results_df["Accuracy"].map(lambda x: f"{x:.4f}") # Format

print("\nüìä CNN Regularization Accuracy Comparison (Original vs L1 vs L2 vs L3)")
print(tabulate(reg_results_df, headers='keys', tablefmt='pretty', showindex=False))

In [None]:
print("\nüß© Building CNN + ML Hybrid Models...")

# We use the best CNN configuration from our experiment as the feature extractor
print(f"Using {best_combo['optimizer']}+{best_combo['activation']} for hybrid base.")
cnn_model_for_hybrid = create_cnn_model(
    optimizer_name=best_combo['optimizer'],
    activation_name=best_combo['activation']
)
# NOTE(review): this base is trained for 10 epochs, not `epochs_to_run`, so it
# is not the same trained network that was scored in the experiment above.
cnn_model_for_hybrid.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=32, verbose=0)

# Extract features from CNN (remove final classification layer): the output of
# the hidden Dense layer becomes the input to the classical models below.
feature_extractor = Sequential(cnn_model_for_hybrid.layers[:-1])
cnn_features_train = feature_extractor.predict(X_train_cnn)
cnn_features_test = feature_extractor.predict(X_test_cnn)
print("‚úÖ CNN features extracted.")

# Define ML models for hybrid use.
# `use_label_encoder` is intentionally not passed to XGBClassifier: the flag
# was deprecated in the XGBoost 1.x line and is not a recognised parameter in
# current releases, where it only produces a warning.
hybrid_models = {
    "CNN + RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "CNN + XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
    "CNN + AdaBoost": AdaBoostClassifier(n_estimators=150, random_state=42),
    "CNN + CatBoost": CatBoostClassifier(verbose=0, allow_writing_files=False, random_state=42)
}

# Train and evaluate hybrid models on the extracted CNN features
for name, model in hybrid_models.items():
    print(f"üöÄ Training {name} ...")
    model.fit(cnn_features_train, y_train)
    y_pred = model.predict(cnn_features_test)
    acc = accuracy_score(y_test, y_pred)
    model_accuracies[name] = acc
    print(f"‚úÖ {name} Accuracy: {acc:.4f}")

# --- Bagging Combinations ---
print("\nüß© Building Bagging Model Combinations...")

# NOTE(review): each entry bags only the single estimator passed as
# `estimator=`. The "RF_on_" / "XGB_on_" parts of the labels do not correspond
# to any second model in the code; consider renaming the labels.
bagging_combos = {
    label: BaggingClassifier(estimator=base_estimator, n_estimators=10, random_state=42)
    for label, base_estimator in (
        ("Bagging_RF_on_XGB", xgb_clf),
        ("Bagging_RF_on_CatBoost", cat_clf),
        ("Bagging_XGB_on_RF", rf_clf),
    )
}

# Fit each bagged model on the original (non-CNN) features and record its
# test accuracy alongside the other models.
for name in bagging_combos:
    bag_model = bagging_combos[name]
    print(f"\nüöÄ Training {name} ...")
    bag_model.fit(X_train, y_train)
    y_pred = bag_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    model_accuracies[name] = acc
    print(f"‚úÖ {name} Accuracy: {acc:.4f}")

print("\n‚úÖ Hybrid and Bagging models complete.")

In [None]:
print("\nüìä Generating Final Combined Model Accuracy Chart...")

# Sort the models by accuracy for better visualization
sorted_accuracies = sorted(model_accuracies.items(), key=lambda item: item[1], reverse=True)
sorted_model_names = [item[0] for item in sorted_accuracies]
sorted_model_scores = [item[1] for item in sorted_accuracies]

# The Paired colormap has a fixed number of entries (`plt.cm.Paired.N`).
# Integer indices past that range are all mapped to the same end colour, so
# wrap the indices to cycle through the palette instead.
bar_colors = plt.cm.Paired(np.arange(len(sorted_model_names)) % plt.cm.Paired.N)

plt.figure(figsize=(24, 12)) # Increased size for all models
bars = plt.bar(sorted_model_names, sorted_model_scores, color=bar_colors)

plt.title("üåæ Final Combined Model Accuracy Comparison (All Models)", fontsize=20)
plt.ylabel("Accuracy Score", fontsize=14)
plt.xlabel("Model", fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=10) # Rotate labels
plt.yticks(fontsize=10)
plt.ylim(0, 1.05) # Leave headroom above 1.0 for the value labels

# Add text labels on top of each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2.0, yval + 0.005, f'{yval:.4f}', ha='center', va='bottom', fontsize=9)

# Fit the layout only after every artist (including the value labels) exists,
# and before the figure is written to disk.
plt.tight_layout()

# Save the final chart
plt.savefig("final_combined_accuracy_comparison.png", dpi=300)
# plt.show() # In Colab, the plot will display automatically

print("\n‚úÖ Final Combined Accuracy Chart Saved: final_combined_accuracy_comparison.png")
print("\nüéâ All tasks complete!")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the original (pre-SMOTE) dataset to inspect the raw class balance.
print("üîπ Loading raw dataset to check imbalance...")
try:
    # Same raw CSV file the preprocessing cell reads
    df_raw = pd.read_csv("Crop Recommendation using Soil Properties and Weather Prediction.csv")
    print(f"Dataset shape: {df_raw.shape}")
except FileNotFoundError:
    print("‚ùå Error: Original dataset file not found. Please ensure 'Crop Recommendation using Soil Properties and Weather Prediction.csv' is uploaded.")
    # Re-raise: nothing below can run without the data.
    raise

# Check class distribution of the target variable ('label')
target_col = 'label'
class_counts = df_raw[target_col].value_counts()
class_proportions = df_raw[target_col].value_counts(normalize=True) * 100

print("\nüìä Class Distribution (Original Dataset):")
print(class_counts)
print("\nPercentage Distribution:")
print(class_proportions.map(lambda x: f"{x:.2f}%"))

# Visualize the distribution, most frequent class first.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; confirm the installed version before switching to hue=/legend=False.
plt.figure(figsize=(12, 6))
sns.countplot(y=df_raw[target_col], order=class_counts.index, palette="viridis")
plt.title(f'Class Distribution of Target Variable: {target_col}')
plt.xlabel('Number of Samples')
plt.ylabel('Crop Label')
plt.show()

# Conclusion based on proportions: ratio of the largest class share to the
# smallest class share.
min_class_prop = class_proportions.min()
max_class_prop = class_proportions.max()
imbalance_ratio = max_class_prop / min_class_prop

print("\n‚úÖ Imbalance Check Complete!")
if imbalance_ratio > 2: # Simple heuristic: largest class is more than 2x the smallest
    # Message typo fixed: "IMBLANCED" -> "IMBALANCED".
    print(f"‚ö†Ô∏è **Result: The dataset is IMBALANCED!** (Largest class is {imbalance_ratio:.2f}x the size of the smallest.)")
else:
    print("üëç **Result: The dataset appears to be relatively balanced.**")

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# --- Preprocessing without SMOTE ---
print("üîπ Loading raw dataset for UNBALANCED training...")
try:
    df_unbalanced = pd.read_csv("Crop Recommendation using Soil Properties and Weather Prediction.csv")
except FileNotFoundError:
    print("‚ùå Error: Original dataset file not found.")
    # Re-raise: nothing below can run without the data.
    raise

# Identify categorical (non-numeric) feature columns. Filtering the target out
# (instead of list.remove) avoids a ValueError when the target column is not
# object-dtype and therefore not in the list.
target_col = 'label'
categorical_cols = [
    col
    for col in df_unbalanced.select_dtypes(include=['object']).columns.tolist()
    if col != target_col
]

# Encode the target, then each categorical feature with its own encoder.
le_target = LabelEncoder()
df_unbalanced['label_encoded'] = le_target.fit_transform(df_unbalanced[target_col])

for col in categorical_cols:
    le = LabelEncoder()
    df_unbalanced[col] = le.fit_transform(df_unbalanced[col])

# Split features and target
X_unbalanced = df_unbalanced.drop([target_col, 'label_encoded'], axis=1)
y_unbalanced = df_unbalanced['label_encoded']

# Scale numeric features (same logic as the balanced pipeline: the scaler is
# fitted on all rows before the split).
scaler_unbalanced = StandardScaler()
X_scaled_unbalanced = scaler_unbalanced.fit_transform(X_unbalanced)
X_scaled_unbalanced = pd.DataFrame(X_scaled_unbalanced, columns=X_unbalanced.columns)

print(f"Features (X_unbalanced) shape: {X_scaled_unbalanced.shape}")

# Split into Train/Test sets (Stratify is critical for imbalanced data)
X_train_unbalanced, X_test_unbalanced, y_train_unbalanced, y_test_unbalanced = train_test_split(
    X_scaled_unbalanced, y_unbalanced, test_size=0.2, random_state=42, stratify=y_unbalanced
)
print("‚úÖ Data split complete ‚Äî UNBALANCED data")
print(f"Train samples: {X_train_unbalanced.shape[0]}, Test samples: {X_test_unbalanced.shape[0]}")

# --- Training Models on Unbalanced Data ---
print("\nüöÄ Training standard ML models on UNBALANCED data...")

# Re-define Models with the same hyper-parameters as the balanced run.
# `use_label_encoder` is intentionally not passed to XGBClassifier: the flag
# was deprecated in the XGBoost 1.x line and is not a recognised parameter in
# current releases, where it only produces a warning.
rf_clf_unb = RandomForestClassifier(n_estimators=200, random_state=42)
xgb_clf_unb = XGBClassifier(eval_metric='mlogloss', random_state=42)
ada_clf_unb = AdaBoostClassifier(n_estimators=150, random_state=42)
cat_clf_unb = CatBoostClassifier(verbose=0, allow_writing_files=False, random_state=42)

# Dictionary to hold unbalanced model accuracies
unbalanced_accuracies = {}

# Train each model and score it on the unbalanced hold-out split.
for model_label, unbalanced_model in (
    ("Random Forest", rf_clf_unb),
    ("XGBoost", xgb_clf_unb),
    ("AdaBoost", ada_clf_unb),
    ("CatBoost", cat_clf_unb),
):
    print(f"... {model_label}")
    unbalanced_model.fit(X_train_unbalanced, y_train_unbalanced)
    unbalanced_accuracies[f"{model_label} (Unbalanced)"] = accuracy_score(
        y_test_unbalanced, unbalanced_model.predict(X_test_unbalanced)
    )

print("\n‚úÖ All unbalanced base models trained successfully!")
for name, acc in unbalanced_accuracies.items():
    print(f"üî∏ {name} Accuracy: {acc:.4f}")

# Merge with the earlier (SMOTE / ensemble / CNN) results held in
# `model_accuracies`. A copy is used so the original dictionary is untouched.
# (No `global` statement is needed: this already runs at module level.)
# NOTE(review): the balanced and unbalanced scores come from different test
# sets (the balanced one is split after SMOTE), so they are not strictly
# comparable.
final_model_accuracies = model_accuracies.copy()
final_model_accuracies.update(unbalanced_accuracies)

In [None]:
# Relies on `final_model_accuracies` (all results merged by the previous cell).

print("\nüèÜ Final Model Comparison: Balanced (SMOTE) vs. Unbalanced Results")

# The four core ML models compared directly
base_models = ["Random Forest", "XGBoost", "AdaBoost", "CatBoost"]

# Placeholder for a score that was never recorded. NaN formats as "nan" and
# propagates through the subtraction, instead of crashing on None or
# pretending the accuracy was 0.
missing_score = float("nan")

# Results for the models trained on the SMOTE-balanced data (stored under the
# plain model name)
smote_results = {
    f"{model_name} (Balanced)": final_model_accuracies.get(model_name, missing_score)
    for model_name in base_models
}

# Results for the models trained on the unbalanced data
unbalanced_results = {
    f"{model_name} (Unbalanced)": final_model_accuracies.get(f"{model_name} (Unbalanced)", missing_score)
    for model_name in base_models
}

# Build one comparison row per base model
comparison_data = []

for model_name in base_models:
    acc_balanced = smote_results[f"{model_name} (Balanced)"]
    acc_unbalanced = unbalanced_results[f"{model_name} (Unbalanced)"]

    # Positive difference means the SMOTE-balanced run scored higher
    difference = acc_balanced - acc_unbalanced

    comparison_data.append({
        "Model": model_name,
        "Accuracy (SMOTE Balanced)": f"{acc_balanced:.4f}",
        "Accuracy (Unbalanced)": f"{acc_unbalanced:.4f}",
        "Difference (B - UB)": f"{difference:+.4f}"
    })

comparison_df = pd.DataFrame(comparison_data)

print("\nüìà Base ML Model Accuracy Comparison")
print(tabulate(comparison_df, headers='keys', tablefmt='fancy_grid', showindex=False))

print("\n---")

# Find the Overall Best Model (Including Ensembles, CNN, Hybrids, and Unbalanced)
best_model_name = max(final_model_accuracies, key=final_model_accuracies.get)
best_accuracy = final_model_accuracies[best_model_name]

# Create a full results table, highest accuracy first
full_results_list = []
for name, acc in sorted(final_model_accuracies.items(), key=lambda item: item[1], reverse=True):
    full_results_list.append({"Model Name": name, "Accuracy": f"{acc:.4f}"})

print("\nü•á **OVERALL PERFORMANCE LEADERBOARD**")
print(f"The highest accuracy was achieved by: **{best_model_name}** with **{best_accuracy:.4f}**")
print("\nAll Model Accuracies:")
print(tabulate(full_results_list, headers='keys', tablefmt='fancy_grid', showindex=False))