# In [None]:
# =============================
# Tree-Based Classification with GridSearchCV
# =============================

# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

# Step 2: Load the Full Training and Test Sets
# Replace with actual file paths
train_df = pd.read_csv('train.csv')   # <-- Full training data WITH labels
test_df = pd.read_csv('test.csv')     # <-- Test set WITHOUT labels

# Step 3: Separate Features and Target
# Replace 'target' with the actual name of your label column
TARGET_COL = 'target'  # <-- Placeholder

X = train_df.drop(columns=[TARGET_COL])
y = train_df[TARGET_COL]

# Fail fast -- before any slow grid search runs -- if the test set lacks a
# feature the models are going to be trained on.
missing_features = X.columns.difference(test_df.columns)
if not missing_features.empty:
    raise ValueError(
        f"Test set is missing feature columns: {missing_features.tolist()}"
    )

# Keep exactly the training features, in training order, so that extra or
# reordered columns in the test file cannot break or skew predict() later.
# NOTE(review): test_df is still assumed to be preprocessed like train_df.
X_test = test_df[X.columns]

# Step 4: Split Training Data into Train/Validation Sets
# 80/20 split, stratified on the label; the fixed seed makes it repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 5: Define Helper Function for Evaluation
def evaluate_model(model, name, *, X_eval=None, y_eval=None, X_pred=None):
    """Print validation metrics for a fitted model and predict the test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label printed in the report header.
        X_eval: Features to score on. Defaults to the module-level ``X_val``.
        y_eval: True labels for ``X_eval``. Defaults to the module-level
            ``y_val``.
        X_pred: Features to produce the returned predictions for. Defaults
            to the module-level ``X_test``.

    Returns:
        The result of ``model.predict(X_pred)``.
    """
    # Fall back to the script's global splits so existing two-argument calls
    # behave exactly as before; explicit arguments allow reuse on other data.
    X_eval = X_val if X_eval is None else X_eval
    y_eval = y_val if y_eval is None else y_eval
    X_pred = X_test if X_pred is None else X_pred

    y_pred = model.predict(X_eval)
    print(f"\n--- {name} ---")
    print("Accuracy:", accuracy_score(y_eval, y_pred))
    # Precision, recall and F1 are requested with average='weighted'.
    print("Precision:", precision_score(y_eval, y_pred, average='weighted'))
    print("Recall:", recall_score(y_eval, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_eval, y_pred, average='weighted'))
    print("Classification Report:\n", classification_report(y_eval, y_pred))
    return model.predict(X_pred)

# Step 6: Define Models and Hyperparameter Grids
# Registry of candidate classifiers. Each entry maps a display name to the
# unfitted estimator ("model") and the hyperparameter grid ("params") that
# GridSearchCV will search exhaustively. Insertion order is the order in
# which the models are tuned.
models_params = {
    "Decision Tree": dict(
        model=DecisionTreeClassifier(random_state=42),
        params=dict(
            max_depth=[3, 5, 10, None],
            min_samples_split=[2, 5, 10],
        ),
    ),
    "Random Forest": dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[5, 10, None],
            min_samples_split=[2, 5],
        ),
    ),
    "XGBoost": dict(
        model=xgb.XGBClassifier(
            random_state=42,
            use_label_encoder=False,
            eval_metric='mlogloss',
        ),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[3, 6],
            learning_rate=[0.05, 0.1],
        ),
    ),
    "LightGBM": dict(
        model=lgb.LGBMClassifier(random_state=42),
        params=dict(
            n_estimators=[100, 200],
            num_leaves=[31, 64],
            learning_rate=[0.05, 0.1],
        ),
    ),
    "Gradient Boosting (sklearn)": dict(
        model=GradientBoostingClassifier(random_state=42),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[3, 5],
            learning_rate=[0.05, 0.1],
        ),
    ),
}

# Step 7: Train, Tune, and Evaluate Each Model
# Per-model results, keyed by display name:
#   best_models       -> best estimator found by the grid search
#   test_predictions  -> that estimator's predictions returned by evaluate_model
best_models, test_predictions = {}, {}

# Search settings shared by every model: 3-fold CV, ranked by weighted F1,
# using all available cores, with progress output enabled.
search_options = dict(cv=3, scoring='f1_weighted', n_jobs=-1, verbose=1)

for name, mp in models_params.items():
    print(f"\n>>> Running GridSearchCV for {name}...")
    clf = GridSearchCV(mp["model"], mp["params"], **search_options)
    clf.fit(X_train, y_train)
    print(f"Best Params for {name}: {clf.best_params_}")

    # Report validation metrics first, then record the winning estimator.
    tuned_model = clf.best_estimator_
    test_predictions[name] = evaluate_model(tuned_model, name)
    best_models[name] = tuned_model