In [None]:
# Install every third-party package this notebook relies on: resampling
# (imbalanced-learn), data handling and plotting (pandas, matplotlib, seaborn),
# modelling (scikit-learn, xgboost, lightgbm, catboost) and tuning (optuna).
# NOTE(review): versions are not pinned, so results may drift between runs.
!pip install imbalanced-learn --quiet
!pip install pandas --quiet
!pip install matplotlib --quiet
!pip install seaborn --quiet
!pip install scikit-learn --quiet
!pip install xgboost --quiet
!pip install lightgbm --quiet
!pip install catboost --quiet
!pip install optuna --quiet
print("Installed")

In [None]:
import pandas as pd
import numpy as np

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Libraries
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

print("Libraries imported successfully!")

In [None]:
# Feature matrices: resampled training split, validation split and test split,
# all read from CSV files in the working directory.
X_train_resampled = pd.read_csv('X_train_resampled.csv')
X_val = pd.read_csv('X_val.csv')
X_test = pd.read_csv('X_test.csv')

# Targets: squeeze() collapses a one-column frame into a Series
# (presumably each target file holds a single label column -- TODO confirm).
y_train_resampled = pd.read_csv('y_train_resampled.csv').squeeze()
y_val = pd.read_csv('y_val.csv').squeeze()

# Template that every submission file below is copied from.
submission_template = pd.read_csv('submission_template.csv')

print("DataFrames loaded from CSV successfully!")


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
import os

import xgboost as xgb

# XGBoost classifier with the tuned hyperparameters.
xgb_model = xgb.XGBClassifier(
    colsample_bytree=0.5,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=1,
    n_estimators=1000,
    subsample=0.7978109153629405,
    scale_pos_weight=10,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Train on the resampled training data.
xgb_model.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels for the report, positive-class
# probabilities for AUC.
y_pred = xgb_model.predict(X_val)
y_prob = xgb_model.predict_proba(X_val)[:, 1]

print("XGBoost with Updated Hyperparameters")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Positive-class (isFraud) probabilities for the test set.
test_probabilities = xgb_model.predict_proba(X_test)[:, 1]

submission = submission_template.copy()
submission['isFraud'] = test_probabilities

# to_csv does not create missing parent directories; make sure 'best/' exists
# so a fresh run does not fail after the long fit above.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_xgb_best.csv', index=False)

print("Probability predictions saved to submission_xgb_best.csv")


In [None]:
import os

from sklearn.ensemble import RandomForestClassifier

# Random forest baseline trained on the resampled training data.
rf = RandomForestClassifier(n_estimators=100, class_weight=None, random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels for the report, positive-class
# probabilities for AUC.
y_pred = rf.predict(X_val)
y_prob = rf.predict_proba(X_val)[:, 1]

print("Random Forest")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Positive-class (isFraud) probabilities for the test set.
test_probabilities = rf.predict_proba(X_test)[:, 1]

submission = submission_template.copy()
submission['isFraud'] = test_probabilities

# to_csv does not create missing parent directories; make sure 'best/' exists.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_rfc_best.csv', index=False)

print("Predictions saved to submission_rfc_best.csv")

In [None]:
import os

from sklearn.linear_model import LogisticRegression

# Tuned logistic-regression hyperparameters.
params = {'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 0.04783940335702436, 'tol': 1.3833218995442702e-05, 'max_iter': 631}

log_reg = LogisticRegression(random_state=42, **params)
log_reg.fit(X_train_resampled, y_train_resampled)

# Validation predictions and evaluation.
y_pred = log_reg.predict(X_val)
y_prob = log_reg.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_prob)

# Report the validation metrics like the other model cells do; previously they
# were computed and then discarded.
print("Logistic Regression")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {auc:.4f}")

# Column 1 of predict_proba is the positive (isFraud) class.
test_probabilities = log_reg.predict_proba(X_test)
submission = submission_template.copy()
submission['isFraud'] = test_probabilities[:, 1]

# to_csv does not create missing parent directories; make sure 'best/' exists.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_best_lr.csv', index=False)

In [None]:
import os

from catboost import CatBoostClassifier
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score

# Tuned CatBoost hyperparameters (seed fixed, training log silenced).
params = {'learning_rate': 0.14501102083693077, 'iterations': 751, 'depth': 8, 'l2_leaf_reg': 5.595512756503533, 'bagging_temperature': 1.0454087496215603, 'random_strength': 1.008663275116376, 'scale_pos_weight': 8.064297059289366,
           'random_seed': 42, 'verbose': 0}

catboost_model = CatBoostClassifier(
    **params,
    eval_metric='AUC'
)

# Train on the resampled training data.
catboost_model.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels for the report, positive-class
# probabilities for AUC.
y_pred = catboost_model.predict(X_val)
y_prob = catboost_model.predict_proba(X_val)[:, 1]

print("Parameters:", params)
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}\n")

# Full probability matrix for the test set; column 1 is the positive
# (isFraud) class.
test_probabilities = catboost_model.predict_proba(X_test)

submission = submission_template.copy()
submission['isFraud'] = test_probabilities[:, 1]

# to_csv does not create missing parent directories; make sure 'best/' exists.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_catboost_best.csv', index=False)
print("DOne! , saved") 


In [None]:


import os

from lightgbm import LGBMClassifier
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score

# Tuned LightGBM hyperparameters.
params = {'learning_rate': 0.09906417236746058, 'n_estimators': 454, 'max_depth': 10, 'num_leaves': 46, 
          'min_child_samples': 47, 'subsample': 0.8592558986910852, 'colsample_bytree': 0.7443545915949523, 
          'reg_lambda': 1.7308107963855828, 'reg_alpha': 0.24840082175007047, 'scale_pos_weight': 2.458598065242119}

lgbm_model = LGBMClassifier(
    **params,
    random_state=42
)

# Train on the resampled training data.
lgbm_model.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels for the report, positive-class
# probabilities for AUC.
y_pred = lgbm_model.predict(X_val)
y_prob = lgbm_model.predict_proba(X_val)[:, 1]

print("Parameters:", params)
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}\n")

# Positive-class (isFraud) probabilities for the test set.
test_probabilities = lgbm_model.predict_proba(X_test)[:, 1]

submission = submission_template.copy()
submission['isFraud'] = test_probabilities

# to_csv does not create missing parent directories; make sure 'best/' exists.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_lgbm_best.csv', index=False)

print("Probability predictions saved to submission_lgbm_best.csv")


In [None]:
import os

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Soft-voting ensemble (averages predicted probabilities) over
# LightGBM, CatBoost and XGBoost.
voting_clf = VotingClassifier(estimators=[
    ('lgbm', lgbm_model),
    ('catboost', catboost_model),
    ('xgb', xgb_model)
], voting='soft')

# Fit on the resampled training data.
voting_clf.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels for the report, positive-class
# probabilities for AUC.
y_pred = voting_clf.predict(X_val)
y_prob = voting_clf.predict_proba(X_val)[:, 1]

print("Voting Ensemble Model")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Positive-class (isFraud) probabilities from the ensemble for the test set.
test_predictions = voting_clf.predict_proba(X_test)[:, 1]

# Write the ensemble's own predictions. The previous code assigned
# `test_probabilities`, a leftover from an earlier model cell, so the file
# did not contain the voting model's output.
submission = submission_template.copy()
submission['isFraud'] = test_predictions

# to_csv does not create missing parent directories; make sure 'best/' exists.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_voting_1.csv', index=False)
print("Voting model submission file generated successfully!")

In [None]:
import os

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Soft-voting ensemble (averages predicted probabilities) over logistic
# regression, LightGBM, CatBoost and XGBoost.
voting_clf = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('lgbm', lgbm_model),
    ('catboost', catboost_model),
    ('xgb', xgb_model)
], voting='soft')

# Fit on the resampled training data.
voting_clf.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels for the report, positive-class
# probabilities for AUC.
y_pred = voting_clf.predict(X_val)
y_prob = voting_clf.predict_proba(X_val)[:, 1]

print("Voting Ensemble Model")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Positive-class (isFraud) probabilities from the ensemble for the test set.
test_predictions = voting_clf.predict_proba(X_test)[:, 1]

# Write the ensemble's own predictions. The previous code assigned
# `test_probabilities`, a leftover from an earlier model cell, so the file
# did not contain the voting model's output.
submission = submission_template.copy()
submission['isFraud'] = test_predictions

# to_csv does not create missing parent directories; make sure 'best/' exists.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_voting_2.csv', index=False)
print("Voting model submission file generated successfully!")

In [None]:
import os

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Soft-voting ensemble (averages predicted probabilities) over logistic
# regression, random forest, LightGBM and CatBoost.
voting_clf = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('rf', rf),
    ('lgbm', lgbm_model),
    ('catboost', catboost_model)
], voting='soft')

# Fit on the resampled training data.
voting_clf.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels for the report, positive-class
# probabilities for AUC.
y_pred = voting_clf.predict(X_val)
y_prob = voting_clf.predict_proba(X_val)[:, 1]

print("Voting Ensemble Model")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Positive-class (isFraud) probabilities from the ensemble for the test set.
test_predictions = voting_clf.predict_proba(X_test)[:, 1]

# Write the ensemble's own predictions. The previous code assigned
# `test_probabilities`, a leftover from an earlier model cell, so the file
# did not contain the voting model's output.
submission = submission_template.copy()
submission['isFraud'] = test_predictions

# to_csv does not create missing parent directories; make sure 'best/' exists.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_voting_3.csv', index=False)
print("Voting model submission file generated successfully!")

In [None]:
import os

from sklearn.ensemble import  StackingClassifier

# Base learners for the stack. StackingClassifier.fit() trains these
# configurations again on the data it is given, so the earlier fits of
# lgbm_model / catboost_model are not reused.
base_models = [
    ('lgbm', lgbm_model),
    ('catboost', catboost_model),
]

# Meta model that combines the base learners' outputs.
meta_model = LogisticRegression(
    solver='sag',
    penalty='l2',
    C=0.0016820949365318634,
    tol=0.000417900469159281,
    max_iter=900,
    class_weight=None,
    random_state=42
)

# Stacking classifier; the meta model is trained on 3-fold cross-validated
# base-model predictions.
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3
)

# Fit on the resampled training data.
stacking_clf.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels for the report, positive-class
# probabilities for AUC.
y_pred = stacking_clf.predict(X_val)
y_prob = stacking_clf.predict_proba(X_val)[:, 1]

print("Stacking Model")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Positive-class (isFraud) probabilities for the test set.
test_probabilities = stacking_clf.predict_proba(X_test)[:, 1]

submission = submission_template.copy()
submission['isFraud'] = test_probabilities

# to_csv does not create missing parent directories; make sure 'best/' exists.
os.makedirs('best', exist_ok=True)
submission.to_csv('best/submission_stacking_best.csv', index=False)

# The message now names the file that is actually written.
print("Probability predictions saved to submission_stacking_best.csv")

print("Stacking model submission file generated successfully!")


In [None]:
!pip uninstall -y tensorflow tensorflow-gpu tensorflow-cpu
!pip install tensorflow-cpu

import tensorflow as tf

print("TensorFlow Version:", tf.__version__)
print("Num GPUs Available:", len(tf.config.list_physical_devices("GPU")))

# %%
# Disable GPU Usage and Optimize TensorFlow for CPU

import os

# Disable all GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf

# Configure TensorFlow to use all 36 CPU cores efficiently
tf.config.threading.set_intra_op_parallelism_threads(36)
tf.config.threading.set_inter_op_parallelism_threads(36)

# Verify TensorFlow is using CPU
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))

# Import Libraries
import pandas as pd
import numpy as np

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Libraries
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import class_weight

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split

# Suppress warnings for cleaner output
import warnings

warnings.filterwarnings("ignore")

# Machine Learning Libraries
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.regularizers import l2
from sklearn.metrics import classification_report, roc_auc_score

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

print("Libraries imported successfully!")

In [None]:
# Learn the scaling statistics from the resampled training features only ...
scaler = StandardScaler().fit(X_train_resampled)

# ... then apply that same fitted scaler to both the training and the
# validation features.
X_train_scaled = scaler.transform(X_train_resampled)
X_val_scaled = scaler.transform(X_val)


In [None]:
# Supervised autoencoder: an encoder/decoder pair that reconstructs the input,
# plus a sigmoid classification head attached to the bottleneck.


def _dense_block(tensor, units):
    """Apply Dense(units, L2) -> LeakyReLU(0.1) -> BatchNormalization -> Dropout(0.3)."""
    tensor = Dense(units, kernel_regularizer=l2(0.001))(tensor)
    tensor = LeakyReLU(alpha=0.1)(tensor)
    tensor = BatchNormalization()(tensor)
    return Dropout(0.3)(tensor)


# Number of input features.
input_dim = X_train_scaled.shape[1]
input_layer = Input(shape=(input_dim,), name="input_layer")

# Encoder: progressively narrower blocks.
encoded = input_layer
for units in (256, 128, 64, 32):
    encoded = _dense_block(encoded, units)

# Bottleneck: 16-dimensional linear code shared by both heads.
bottleneck = Dense(16, activation="linear", name="bottleneck")(encoded)

# Decoder: mirror image of the encoder.
decoded = bottleneck
for units in (32, 64, 128, 256):
    decoded = _dense_block(decoded, units)

# Reconstruction head: linear output with the same width as the input.
reconstruction_output = Dense(
    input_dim, activation="linear", name="reconstruction"
)(decoded)

# Classification head: single sigmoid unit fed directly by the bottleneck.
classification_output = Dense(
    1, activation="sigmoid", name="classification"
)(bottleneck)

# Two-output model; output order is [reconstruction, classification].
autoencoder_classifier = Model(
    inputs=input_layer,
    outputs=[reconstruction_output, classification_output],
)

# Adam with a small learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Losses, loss weights and metrics are listed in the same order as the outputs:
# MSE (weight 0.3, no metric) for reconstruction, binary cross-entropy
# (weight 0.7, AUC metric) for classification.
autoencoder_classifier.compile(
    optimizer=optimizer,
    loss=["mse", "binary_crossentropy"],
    loss_weights=[0.3, 0.7],
    metrics=[None, [tf.keras.metrics.AUC(name="auc")]],
)

# Display the model summary
autoencoder_classifier.summary()


In [None]:
# compute_sample_weight was called without ever being imported.
from sklearn.utils.class_weight import compute_sample_weight

# Per-sample weights for the classification output. They are derived from
# y_train_resampled -- the labels actually passed to fit() below -- so their
# length matches the training data. The previous code referenced an undefined
# `y_train`.
sample_weights_classification = compute_sample_weight(
    class_weight="balanced", y=y_train_resampled
)

# One weight vector per model output, in output order.
sample_weight_list = [
    np.ones(len(y_train_resampled)),  # reconstruction: every sample weighted equally
    sample_weights_classification,  # classification: class-balanced weights
]

# Validation samples are all weighted equally for both outputs.
sample_weight_val = [
    np.ones(len(y_val)),
    np.ones(len(y_val)),
]

# Early stopping on validation AUC of the classification head. The model's
# second output is named "classification" and its metric "auc", so the logged
# key is "val_classification_auc"; the old key "val_output_2_auc" did not
# match the output's name.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_classification_auc",
    mode="max",
    patience=15,
    restore_best_weights=True,
)

# Save the full model after every epoch.
checkpoint_filepath = "models_checkpoints/model_checkpoint_epoch_{epoch:02d}.keras"

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,  # Save the full model
    monitor="val_classification_auc",
    mode="max",
    save_best_only=False,  # Save the model after every epoch
    verbose=1,
)

# Train: the input doubles as the reconstruction target; the labels are the
# classification target.
history = autoencoder_classifier.fit(
    X_train_scaled,
    [X_train_scaled, y_train_resampled],
    sample_weight=sample_weight_list,
    epochs=3,
    batch_size=64,
    shuffle=True,
    validation_data=(
        X_val_scaled,
        [X_val_scaled, y_val],
        sample_weight_val,  # Include sample weights for validation data
    ),
    callbacks=[early_stopping, model_checkpoint_callback],
    verbose=1,
)


In [None]:
# Validation-set evaluation of the autoencoder classifier.

# predict() returns one array per model output; index 0 is taken as the
# reconstruction and index 1 as the fraud probability.
predictions = autoencoder_classifier.predict(X_val_scaled)
reconstruction_pred, y_val_pred_prob = predictions[0], predictions[1]

# Hard labels at a fixed 0.5 cut-off for the classification report.
y_val_pred = (y_val_pred_prob > 0.5).astype(int)

print("Classification Report on Validation Set (Threshold = 0.5):")
print(classification_report(y_val, y_val_pred))

# AUC is computed on the raw probabilities, not the thresholded labels.
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_val_pred_prob):.4f}")


In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt

# Classes handed to load_model so the saved layers/metrics can be resolved.
custom_objects = {"LeakyReLU": tf.keras.layers.LeakyReLU, "AUC": tf.keras.metrics.AUC}

# Read checkpoints from the same relative directory the training cell writes
# to ("models_checkpoints/model_checkpoint_epoch_XX.keras"), instead of a
# hard-coded absolute path that only exists on the original machine.
model_dir = "models_checkpoints"

# Per-epoch metrics. Only the validation lists are filled here; the training
# lists are kept as placeholders.
epochs = []
train_losses = []
val_losses = []
train_aucs = []
val_aucs = []

# Checkpoint files, ordered by file name.
model_files = sorted(
    f
    for f in os.listdir(model_dir)
    if f.startswith("model_checkpoint_epoch_") and f.endswith(".keras")
)

for model_file in model_files:
    # The epoch number is the last "_"-separated token before the extension.
    epoch_num = int(model_file.split("_")[-1].split(".")[0])
    epochs.append(epoch_num)

    # Load the checkpoint from model_dir.
    model = load_model(
        os.path.join(model_dir, model_file), custom_objects=custom_objects
    )

    # Evaluate on validation data.
    val_metrics = model.evaluate(
        X_val_scaled, [X_val_scaled, y_val], sample_weight=sample_weight_val, verbose=0
    )
    val_losses.append(val_metrics[0])  # Total loss
    val_aucs.append(val_metrics[-1])  # AUC is the last metric

    # Release the loaded model's resources before the next checkpoint.
    tf.keras.backend.clear_session()

# One row per checkpoint, ordered by epoch.
metrics_df = pd.DataFrame(
    {"Epoch": epochs, "Validation Loss": val_losses, "Validation AUC": val_aucs}
).sort_values("Epoch")

# Validation AUC per epoch.
plt.figure(figsize=(12, 6))
plt.plot(metrics_df["Epoch"], metrics_df["Validation AUC"], label="Validation AUC")
plt.title("Model AUC over Epochs")
plt.xlabel("Epoch")
plt.ylabel("AUC")
plt.legend()
plt.show()

# Validation loss per epoch.
plt.figure(figsize=(12, 6))
plt.plot(metrics_df["Epoch"], metrics_df["Validation Loss"], label="Validation Loss")
plt.title("Model Loss over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()


In [None]:
# Score the test set with the autoencoder classifier.

# Remove the identifier column if it is present, then scale with the scaler
# that was fitted on the training features.
X_test = X_test.drop(columns="TransactionID", errors="ignore")
X_test_scaled = scaler.transform(X_test)

# Index 1 of the model's outputs is taken as the fraud probability.
predictions_test = autoencoder_classifier.predict(X_test_scaled)
y_test_pred_prob = predictions_test[1]

# Flatten to 1-D and clamp to [0, 1] so the values are valid probabilities
# for the submission file.
test_predictions = np.clip(y_test_pred_prob.flatten(), 0, 1)


In [None]:
# Build the submission: a copy of the template whose isFraud column holds the
# autoencoder classifier's test probabilities.
submission_autoencoder_classifier = submission_template.assign(
    isFraud=test_predictions
)

# Write it next to the notebook, without the index column.
submission_autoencoder_classifier.to_csv(
    "submission_autoencoder_classifier_updated.csv", index=False
)

print(
    "Updated autoencoder-based classifier submission file with probabilities generated successfully!"
)
