In [None]:
# Install imbalanced-learn
!pip install imbalanced-learn --quiet
!pip install pandas --quiet
!pip install matplotlib --quiet
!pip install seaborn --quiet
!pip install scikit-learn --quiet
!pip install xgboost --quiet
!pip install lightgbm --quiet
!pip install catboost --quiet
!pip install optuna --quiet
print("Installed")

In [None]:
import pandas as pd
import numpy as np

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Libraries
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

print("Libraries imported successfully!")

In [None]:
# Load the pre-saved training / validation / test splits from the working directory.
# The y_* files are expected to hold a single label column; squeeze("columns")
# converts such a frame to a Series but, unlike a bare squeeze(), never collapses
# a one-row file all the way down to a scalar.
X_train_resampled = pd.read_csv('X_train_resampled.csv')
y_train_resampled = pd.read_csv('y_train_resampled.csv').squeeze("columns")
X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv').squeeze("columns")

# Test features (no labels) and the submission skeleton that receives predictions.
X_test = pd.read_csv('X_test.csv')
submission_template = pd.read_csv('submission_template.csv')

# Fail fast on misaligned files: every later cell pairs these objects row by row,
# and a mismatch would otherwise only surface after a long training run.
assert len(X_train_resampled) == len(y_train_resampled), "X_train_resampled / y_train_resampled row counts differ"
assert len(X_val) == len(y_val), "X_val / y_val row counts differ"
assert len(X_test) == len(submission_template), "X_test / submission_template row counts differ"

print("DataFrames loaded from CSV successfully!")


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
import os

import xgboost as xgb

# XGBoost classifier with the selected hyperparameters.
# `use_label_encoder` is intentionally not passed: it has been deprecated since
# XGBoost 1.6 and newer releases only report it as an unused parameter.
# NOTE(review): scale_pos_weight=10 is applied on top of the resampled training
# set; if that set is already class-balanced this up-weights positives twice — confirm.
xgb_model = xgb.XGBClassifier(
    colsample_bytree=0.5,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=1,
    n_estimators=1000,
    subsample=0.7978109153629405,
    scale_pos_weight=10,
    eval_metric='logloss',
    random_state=42
)

# Fit on the resampled training data
xgb_model.fit(X_train_resampled, y_train_resampled)

# Hard labels feed the classification report; positive-class probabilities feed AUC
y_pred = xgb_model.predict(X_val)
y_prob = xgb_model.predict_proba(X_val)[:, 1]

print("XGBoost with Updated Hyperparameters")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Positive-class (isFraud) probability for every test row
test_probabilities = xgb_model.predict_proba(X_test)[:, 1]

submission = submission_template.copy()
submission['isFraud'] = test_probabilities

# to_csv does not create missing parent directories, so make sure 'best/' exists
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_xgb_best.csv', index=False)

print("Probability predictions saved to best/submission_xgb_best.csv")


In [None]:
import os

from sklearn.ensemble import RandomForestClassifier

# Random forest baseline trained on the resampled training data.
rf = RandomForestClassifier(n_estimators=100, class_weight=None, random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Hard labels feed the classification report; positive-class probabilities feed AUC
y_pred = rf.predict(X_val)
y_prob = rf.predict_proba(X_val)[:, 1]

print("Random Forest")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Positive-class (isFraud) probability for every test row
test_probabilities = rf.predict_proba(X_test)[:, 1]

submission = submission_template.copy()
submission['isFraud'] = test_probabilities

# to_csv does not create missing parent directories, so make sure 'best/' exists
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_rfc_best.csv', index=False)

print("Predictions saved to best/submission_rfc_best.csv")

In [None]:
import os

from sklearn.linear_model import LogisticRegression

# Earlier configuration, kept for reference:
# params = {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cholesky', 'class_weight': None, 'max_iter': 1000, 'tol': 1e-4}

# Selected logistic-regression hyperparameters
params = {'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 0.04783940335702436, 'tol': 1.3833218995442702e-05, 'max_iter': 631}

log_reg = LogisticRegression(random_state=42, **params)
log_reg.fit(X_train_resampled, y_train_resampled)

# Validation predictions and evaluation
y_pred = log_reg.predict(X_val)
y_prob = log_reg.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_prob)

# Report the validation metrics in the same form as the other model cells
# (they were previously computed but never displayed).
print("Logistic Regression")
print("Parameters:", params)
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {auc:.4f}")

# predict_proba returns one column per class; column 1 is the positive class (isFraud)
test_probabilities = log_reg.predict_proba(X_test)
submission = submission_template.copy()
submission['isFraud'] = test_probabilities[:, 1]

# to_csv does not create missing parent directories, so make sure 'best/' exists
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_best_lr.csv', index=False)

print("Probability predictions saved to best/submission_best_lr.csv")

In [None]:
import os

from catboost import CatBoostClassifier
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score

# Earlier configuration, kept for reference:
# params={'learning_rate': 0.05, 'iterations': 500, 'depth': 8, 'l2_leaf_reg': 5, 'bagging_temperature': 0.5, 'random_strength': 2, 'scale_pos_weight': 10}

# Selected CatBoost hyperparameters
# NOTE(review): scale_pos_weight is applied on top of the resampled training set;
# if that set is already class-balanced this up-weights positives twice — confirm.
params = {
    'learning_rate': 0.14501102083693077,
    'iterations': 751,
    'depth': 8,
    'l2_leaf_reg': 5.595512756503533,
    'bagging_temperature': 1.0454087496215603,
    'random_strength': 1.008663275116376,
    'scale_pos_weight': 8.064297059289366,
    'random_seed': 42,
    'verbose': 0,
}

catboost_model = CatBoostClassifier(
    **params,
    eval_metric='AUC'
)

# Fit the model on the resampled training data
catboost_model.fit(X_train_resampled, y_train_resampled)

# Hard labels feed the classification report; positive-class probabilities feed AUC
y_pred = catboost_model.predict(X_val)
y_prob = catboost_model.predict_proba(X_val)[:, 1]

print("Parameters:", params)
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}\n")

# predict_proba returns one column per class; column 1 is the positive class (isFraud)
test_probabilities = catboost_model.predict_proba(X_test)

submission = submission_template.copy()
submission['isFraud'] = test_probabilities[:, 1]

# to_csv does not create missing parent directories, so make sure 'best/' exists
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_catboost_best.csv', index=False)
print("Probability predictions saved to best/submission_catboost_best.csv")


In [None]:


import os

from lightgbm import LGBMClassifier
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score

# Earlier configuration, kept for reference:
# params = {'learning_rate': 0.05, 'n_estimators': 500, 'max_depth': 15, 'num_leaves': 50, 'min_child_samples': 10,
#  'subsample': 1.0, 'colsample_bytree': 0.6, 'reg_lambda': 5, 'reg_alpha': 0.1, 'scale_pos_weight': 10}

# Selected LightGBM hyperparameters
# NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only takes
# effect when `subsample_freq` is > 0; it is not set here, so row bagging is
# presumably disabled. Left unchanged so the fitted model stays the same — confirm intent.
params = {'learning_rate': 0.09906417236746058, 'n_estimators': 454, 'max_depth': 10, 'num_leaves': 46,
          'min_child_samples': 47, 'subsample': 0.8592558986910852, 'colsample_bytree': 0.7443545915949523,
          'reg_lambda': 1.7308107963855828, 'reg_alpha': 0.24840082175007047, 'scale_pos_weight': 2.458598065242119}

lgbm_model = LGBMClassifier(
    **params,
    random_state=42
)

# Fit the model on the resampled training data
lgbm_model.fit(X_train_resampled, y_train_resampled)

# Hard labels feed the classification report; positive-class probabilities feed AUC
y_pred = lgbm_model.predict(X_val)
y_prob = lgbm_model.predict_proba(X_val)[:, 1]

print("Parameters:", params)
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}\n")

# Positive-class (isFraud) probability for every test row
test_probabilities = lgbm_model.predict_proba(X_test)[:, 1]

submission = submission_template.copy()
submission['isFraud'] = test_probabilities

# to_csv does not create missing parent directories, so make sure 'best/' exists
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_lgbm_best.csv', index=False)

print("Probability predictions saved to best/submission_lgbm_best.csv")


In [None]:
import logging
import optuna
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Base learners for the stacking ensemble.
# NOTE: despite the variable name, StackingClassifier (when given an integer `cv`)
# fits fresh clones of these estimators on the data passed to its own fit(), so
# only their hyperparameters carry over — the already-fitted state is not reused.
pretrained_base_models = [
    ('lgbm', lgbm_model),          # LightGBM configuration from the cell above
    ('catboost', catboost_model),  # CatBoost configuration from the cell above
]

# Create timestamped folder for saving results
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_folder = f"stacking_simple_bayesian_meta_results_{timestamp}"
os.makedirs(output_folder, exist_ok=True)

# Set up logging to a file inside the results folder.
# force=True replaces any handlers already attached to the root logger; without
# it basicConfig silently does nothing when the root logger is already configured
# (e.g. when this cell is re-run), and records would go to the previous run's
# file instead of this run's folder.
log_file = os.path.join(output_folder, "log.txt")
logging.basicConfig(
    filename=log_file,
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True
)
logger = logging.getLogger()

# Define the Optuna objective function for meta-model tuning
def objective(trial):
    """Optuna objective: mean 3-fold AUC-ROC of the stacking ensemble for one meta-model setup.

    Samples LogisticRegression hyperparameters for the meta-model, wraps
    ``pretrained_base_models`` in a StackingClassifier, scores it with stratified
    3-fold cross-validation on the resampled training data, and writes a test-set
    submission file for the trial into ``output_folder``.

    Args:
        trial: the ``optuna.trial.Trial`` being evaluated.

    Returns:
        The mean AUC-ROC across the three folds.

    Raises:
        optuna.exceptions.TrialPruned: when the sampled solver does not support
            the sampled penalty.
    """
    # Hyperparameter search space for the Logistic Regression meta-model.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    solver = trial.suggest_categorical('solver', ['newton-cholesky', 'sag', 'saga'])
    penalty = trial.suggest_categorical('penalty', ['l2', None, 'l1'])
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)
    tol = trial.suggest_float('tol', 1e-5, 1e-1, log=True)
    max_iter = trial.suggest_int('max_iter', 100, 1000)

    # Penalties each solver accepts (per the scikit-learn LogisticRegression docs).
    # "No penalty" is the Python value None — the value actually sampled above —
    # not the string 'none', so it must be compared as None to avoid pruning
    # valid unpenalized configurations.
    supported_penalties = {
        'newton-cholesky': ('l2', None),
        'sag': ('l2', None),
        'saga': ('l2', None, 'l1'),
    }
    if penalty not in supported_penalties[solver]:
        logger.warning(f"Trial {trial.number}: Invalid combination (solver={solver}, penalty={penalty}). Pruning trial.")
        raise optuna.exceptions.TrialPruned()

    meta_model = LogisticRegression(
        solver=solver,
        penalty=penalty,
        C=C,
        tol=tol,
        max_iter=max_iter,
        random_state=42
    )

    # Stacking classifier over the base models; the inner cv=3 produces the
    # out-of-fold base predictions the meta-model is trained on.
    stacking_clf = StackingClassifier(
        estimators=pretrained_base_models,
        final_estimator=meta_model,
        cv=3
    )
    logger.info(f"Trial {trial.number}: Starting with meta-model parameters: solver={solver}, penalty={penalty}, C={C}, tol={tol}, max_iter={max_iter}")

    # Outer stratified K-fold used to score this configuration
    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_aucs = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_resampled, y_train_resampled), 1):
        logger.info(f"  Fold {fold}: Training StackingClassifier...")
        X_train_fold, X_val_fold = X_train_resampled.iloc[train_idx], X_train_resampled.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_resampled.iloc[train_idx], y_train_resampled.iloc[val_idx]

        # Each fit() call refits the whole stack on this fold's training part
        stacking_clf.fit(X_train_fold, y_train_fold)

        # Score on the held-out part using positive-class probabilities
        y_prob = stacking_clf.predict_proba(X_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, y_prob)
        fold_aucs.append(fold_auc)
        logger.info(f"  Fold {fold}: AUC-ROC = {fold_auc:.4f}")

    # Calculate mean AUC-ROC across folds
    mean_auc = np.mean(fold_aucs)
    logger.info(f"Trial {trial.number}: Mean AUC-ROC across folds = {mean_auc:.4f}")

    # Save predictions on test set for this trial.
    # NOTE(review): stacking_clf here is the model fitted on the LAST fold's
    # training part only, not on the full training set — confirm this is intended.
    test_probabilities = stacking_clf.predict_proba(X_test)[:, 1]
    submission = submission_template.copy()
    submission['isFraud'] = test_probabilities
    submission_file = os.path.join(output_folder, f"submission_trial_{trial.number}.csv")
    submission.to_csv(submission_file, index=False)
    logger.info(f"Trial {trial.number}: Submission saved to {submission_file}")

    # Store results on the trial so they can be exported after the study
    trial.set_user_attr("fold_aucs", fold_aucs)
    trial.set_user_attr("mean_auc", mean_auc)
    trial.set_user_attr("meta_params", {
        'solver': solver,
        'penalty': penalty,
        'C': C,
        'tol': tol,
        'max_iter': max_iter
    })

    return mean_auc

# Run Bayesian Optimization.
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable.
logger.info("Starting Bayesian optimization with Optuna...")
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=10)  # Adjust number of trials as needed

# Only completed trials carry the user attributes written at the end of
# `objective`; pruned (or failed) trials exit before they are set, so reading
# them unconditionally would raise KeyError.
completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
logger.info(f"{len(completed_trials)} of {len(study.trials)} trials completed; the rest were pruned or failed.")

# Save all (completed) trials results
all_trials = [
    {
        "trial_number": trial.number,
        "meta_params": trial.user_attrs["meta_params"],
        "mean_auc": trial.user_attrs["mean_auc"],
        "fold_aucs": trial.user_attrs["fold_aucs"]
    }
    for trial in completed_trials
]

# Explicit columns keep the frame well-formed (and sortable) even when no trial completed
results_df = pd.DataFrame(all_trials, columns=["trial_number", "meta_params", "mean_auc", "fold_aucs"])
results_file = os.path.join(output_folder, "all_trials_results.csv")
results_df.to_csv(results_file, index=False)
logger.info(f"All trial results saved to {results_file}")

# Log and save top 3 models
top_3 = results_df.sort_values("mean_auc", ascending=False).head(3)
logger.info("\nTop 3 Models:\n" + top_3.to_string())

top_3_file = os.path.join(output_folder, "top_3_models.csv")
top_3.to_csv(top_3_file, index=False)
logger.info(f"Top 3 models saved to {top_3_file}")

# Save detailed trial logs
selected_params_file = os.path.join(output_folder, "selected_parameters.txt")
with open(selected_params_file, "w") as f:
    for trial in completed_trials:
        f.write(f"Trial {trial.number}: Meta Params: {trial.user_attrs['meta_params']}, Mean AUC: {trial.user_attrs['mean_auc']}, Fold AUCs: {trial.user_attrs['fold_aucs']}\n")
logger.info(f"Selected parameters and AUC scores saved to {selected_params_file}")

logger.info(f"Optimization complete. Results saved in folder: {output_folder}")
