Importing and Loading data

In [None]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, schedules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
import warnings

# Suppress ConvergenceWarning from LinearSVC for cleaner output.
# NOTE(review): warnings.filterwarnings changes the global filter list, so this
# stays in effect for every cell that runs after this one.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# --- 1. Load Data and Prepare X and y (Training) ---
# Read the training CSV and cast every boolean column to 0/1 integers.
df_train = pd.read_csv("train_processed_parv.csv")
bool_cols_train = df_train.select_dtypes(include='bool').columns
df_train[bool_cols_train] = df_train[bool_cols_train].astype(int)

# Features are all columns except the identifier and the target; the target is 'RiskFlag'.
X = df_train.drop(columns=['ProfileID', 'RiskFlag'])
y = df_train['RiskFlag']

# Split for full training (80%) and evaluation (20%), stratified on the target.
# X_test / y_test is the hold-out set the later cells evaluate on.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create the 20% Training Sample (25% of X_train_full = 20% of original data).
# X_temp / y_temp hold the remaining 75% and are not used by any cell visible here.
X_train_20pct, X_temp, y_train_20pct, y_temp = train_test_split(
    X_train_full, y_train_full, test_size=0.75, random_state=42, stratify=y_train_full
)

# Number of feature columns; the neural-network builders use it as the input width.
input_dim = X_train_full.shape[1]

print(f"Total Features: {input_dim}")
print(f"20% Training Sample Size: {len(X_train_20pct)}")
print(f"Full Training Set Size: {len(X_train_full)}")
print(f"Evaluation Test Set Size: {len(X_test)}\n")

# --- 2. Load and Prepare X for Submission (Competition Test Data) ---
# Same boolean -> integer cast as applied to the training frame above.
df_test = pd.read_csv("test_processed_parv.csv")
bool_cols_test = df_test.select_dtypes(include='bool').columns
df_test[bool_cols_test] = df_test[bool_cols_test].astype(int)

# X_submit is the actual competition test data to predict; it has no 'RiskFlag' dropped,
# only the identifier, which is kept separately in profile_ids for the submission files.
# NOTE(review): presumably the remaining columns match X's columns and order -- confirm.
X_submit = df_test.drop(columns=['ProfileID'])
profile_ids = df_test['ProfileID']
print(f"Submission Test Set Size: {len(X_submit)}\n")


Total Features: 26
20% Training Sample Size: 40855
Full Training Set Size: 163421
Evaluation Test Set Size: 40856

Submission Test Set Size: 51070



Common Functions and Neural Networks

In [None]:
# --- 3. Define Common Functions ---

def evaluate_model(model_name, model, X_train, y_train, X_eval, y_eval, is_ann=False):
    """Fit a model, score it on a hold-out set, print a report and return it.

    Accuracy and the classification report use hard class predictions.
    ROC AUC uses continuous scores, because computing it from thresholded
    0/1 labels reduces the ROC curve to a single point and understates the
    model's ranking quality. Binary targets are assumed.

    Args:
        model_name: Label used in the printed headers.
        model: Unfitted (or re-fittable) estimator. With ``is_ann=True`` it is
            treated as a Keras model whose ``predict`` returns sigmoid
            probabilities; otherwise as a scikit-learn style classifier.
        X_train, y_train: Training features and labels passed to ``fit``.
        X_eval, y_eval: Hold-out features and labels used for the metrics.
        is_ann: Selects the Keras training/prediction path (10 epochs,
            batch size 32, silent).

    Returns:
        The same ``model`` object, now fitted.
    """
    print(f"--- Training {model_name} on {len(X_train)} samples... ---")
    start_time = time.time()

    if is_ann:
        model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    else:
        model.fit(X_train, y_train)

    training_time = time.time() - start_time
    print(f"Training Time: {training_time:.2f} seconds")

    # Predict on the hold-out evaluation set, keeping a continuous score
    # (y_score) for ROC AUC next to the hard labels (y_pred).
    if is_ann:
        y_score = model.predict(X_eval, verbose=0).flatten()
        y_pred = (y_score > 0.5).astype(int)
    else:
        y_pred = model.predict(X_eval)
        if hasattr(model, "decision_function"):
            # Signed margin: available on LinearSVC, SVC and LogisticRegression.
            y_score = model.decision_function(X_eval)
        elif hasattr(model, "predict_proba"):
            # Probability of the positive class (second column).
            y_score = model.predict_proba(X_eval)[:, 1]
        else:
            # No continuous output available; fall back to the hard labels.
            y_score = y_pred

    accuracy = accuracy_score(y_eval, y_pred)
    auc = roc_auc_score(y_eval, y_score)

    print(f"Evaluation Accuracy: {accuracy:.4f}")
    print(f"Evaluation ROC AUC: {auc:.4f}")
    print("\nClassification Report (on Evaluation Set):")
    print(classification_report(y_eval, y_pred, zero_division=0))
    print("-" * 50)
    return model # Return the trained model

def create_ann_model(input_dim):
    """Build and compile a small dense network for binary classification.

    Architecture: Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid),
    compiled with Adam (learning rate 0.001), binary cross-entropy loss and
    an accuracy metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled, untrained Keras ``Sequential`` model.
    """
    first_hidden = Dense(64, activation='relu', input_shape=(input_dim,))
    second_hidden = Dense(32, activation='relu')
    sigmoid_output = Dense(1, activation='sigmoid')

    network = Sequential([first_hidden, second_hidden, sigmoid_output])
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

Linear SVM

In [None]:
# --- A. Linear SVM (Training on 20% Sample) ---
linear_svm_20pct = LinearSVC(random_state=42, dual=True, max_iter=10000)
evaluate_model("Linear SVM (20%)", linear_svm_20pct,
               X_train_20pct, y_train_20pct, X_test, y_test)

# --- B. Linear SVM (Training on Full Dataset) ---
final_linear_svm = LinearSVC(random_state=42, dual=True, max_iter=10000)
final_linear_svm = evaluate_model("Linear SVM (Full)", final_linear_svm,
                                  X_train_full, y_train_full, X_test, y_test)

# --- C. Hyperparameter Tuning ---
# Silence UserWarning output during cross-validation.
# NOTE: this filter is global and stays active for all later cells.
warnings.filterwarnings("ignore", category=UserWarning)

# Same base model (and high max_iter) as the untuned runs above.
model_linear_svm = LinearSVC(random_state=42, dual=True, max_iter=10000)

# Only the L2 penalty is searched: with dual=True it supports both the
# 'hinge' and 'squared_hinge' losses, so every combination below is valid.
param_grid_linear_svm = {
    'C': [0.1, 1.0, 10.0],
    'penalty': ['l2'],
    'loss': ['hinge', 'squared_hinge']
}
# Total candidates: 3 * 1 * 2 = 6, each fitted on 3 folds.

# Initialize GridSearchCV (using ROC AUC)
grid_search_linear_svm = GridSearchCV(
    estimator=model_linear_svm,
    param_grid=param_grid_linear_svm,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1
)

print("\n--- Starting Hyperparameter Tuning for Linear SVM (Corrected Grid) ---")
grid_search_linear_svm.fit(X_train_full, y_train_full)

# Get the best estimator and parameters
best_linear_svm = grid_search_linear_svm.best_estimator_
print("\n--- TUNING RESULTS (Linear SVM) ---")
print(f"Best ROC AUC score found: {grid_search_linear_svm.best_score_:.4f}")
print(f"Best parameters: {grid_search_linear_svm.best_params_}")
print("-" * 50)

# Evaluate the final, best model on the holdout test set.
# GridSearchCV has already refit best_estimator_ on X_train_full (refit
# defaults to True); evaluate_model fits it once more before scoring.
final_linear_svm_tuned = evaluate_model("Tuned Linear SVM (Full)", best_linear_svm,
                                  X_train_full, y_train_full, X_test, y_test)

# Prediction on the competition data and the submission file are produced
# by the "Predicting and Testing Linear SVM" cell that follows; the copy of
# that code that used to live here was removed to avoid doing it twice.

--- Training Linear SVM (20%) on 40855 samples... ---
Training Time: 10.30 seconds
Evaluation Accuracy: 0.8837
Evaluation ROC AUC: 0.5002

Classification Report (on Evaluation Set):
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     36105
           1       0.50      0.00      0.00      4751

    accuracy                           0.88     40856
   macro avg       0.69      0.50      0.47     40856
weighted avg       0.84      0.88      0.83     40856

--------------------------------------------------
--- Training Linear SVM (Full) on 163421 samples... ---
Training Time: 74.21 seconds
Evaluation Accuracy: 0.8837
Evaluation ROC AUC: 0.5002

Classification Report (on Evaluation Set):
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     36105
           1       0.67      0.00      0.00      4751

    accuracy                           0.88     40856
   macro avg       0.78      0.50     

Predicting and Testing Linear SVM

In [None]:
# --- Prediction and Submission (Using the Tuned Model) ---
# Hard class labels for the competition rows from the tuned linear SVM.
print("Predicting with Tuned Linear SVM (Full) on Test Data...")
y_pred_linear = final_linear_svm_tuned.predict(X_submit)

# Create and save the submission DataFrame (one row per ProfileID).
submission_df_linear = pd.DataFrame({
    'ProfileID': profile_ids,
    'RiskFlag': y_pred_linear
})
submission_filename_linear = 'submission_linear_svm_tuned.csv'
submission_df_linear.to_csv(submission_filename_linear, index=False)
# Confirmation message, matching the other submission cells in this notebook.
print(f"Generated submission file: {submission_filename_linear}")

Kernelized SVM

In [None]:
# --- A. Kernelized SVM (RBF) (Training on 20% Sample) ---
# Trained on the entire 20% sample (X_train_20pct).
rbf_svm_20pct = SVC(kernel='rbf', gamma='scale', random_state=42)
rbf_svm_20pct = evaluate_model("Kernelized SVM (RBF) (20% full sample)", rbf_svm_20pct,
                               X_train_20pct, y_train_20pct, X_test, y_test)


# --- B. Kernelized SVM (RBF) (Training on Full Dataset) ---
# Trained on the entire full training set (X_train_full).
final_rbf_svm = SVC(kernel='rbf', gamma='scale', random_state=42)
final_rbf_svm = evaluate_model("Kernelized SVM (RBF) (Full Dataset)", final_rbf_svm,
                               X_train_full, y_train_full, X_test, y_test)

# --- C. Hyperparameter Tuning ---
# probability=True is deliberately NOT set: the 'roc_auc' scorer ranks samples
# with SVC.decision_function, so probability estimates are not required, and
# enabling them adds an internal 5-fold calibration to every single fit.
model_rbf_svm = SVC(random_state=42, kernel='rbf')

# Define a small, feasible grid of parameters to search due to high complexity
param_grid_rbf_svm = {
    'C': [1.0, 5.0],
    'gamma': ['scale', 0.1]
}

# Initialize GridSearchCV (using ROC AUC)
print("--- WARNING: Kernelized SVM Tuning is highly time-consuming! ---")
grid_search_rbf_svm = GridSearchCV(
    estimator=model_rbf_svm,
    param_grid=param_grid_rbf_svm,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1 # Use all available cores
)

print("\n--- Starting Hyperparameter Tuning for Kernelized SVM ---")
grid_search_rbf_svm.fit(X_train_full, y_train_full)

# Get the best estimator and parameters
best_rbf_svm = grid_search_rbf_svm.best_estimator_
print("\n--- TUNING RESULTS (Kernelized SVM) ---")
print(f"Best ROC AUC score found: {grid_search_rbf_svm.best_score_:.4f}")
print(f"Best parameters: {grid_search_rbf_svm.best_params_}")
print("-" * 50)

# Evaluate the final, best model on the holdout test set.
# From here on final_rbf_svm refers to the tuned model, which is what the
# submission cell uses.
final_rbf_svm = evaluate_model("Tuned Kernelized SVM (Full)", best_rbf_svm,
                               X_train_full, y_train_full, X_test, y_test)



Predicting and Testing Kernelized SVM

In [None]:
# --- C. Prediction and Submission ---
# final_rbf_svm is whatever the previous cell last assigned to that name
# (the tuned RBF model after a full run).
print("Predicting with Kernelized SVM (Full Dataset) on Test Data...")
y_pred_rbf = final_rbf_svm.predict(X_submit)

# One row per competition profile: identifier plus predicted class label.
submission_df_rbf = pd.DataFrame(
    data={'ProfileID': profile_ids, 'RiskFlag': y_pred_rbf}
)

# Write the CSV without the DataFrame index column.
submission_filename_rbf = 'submission_kernelized_svm.csv'
submission_df_rbf.to_csv(submission_filename_rbf, index=False)
print(f"Generated submission file: {submission_filename_rbf}")

MLP (Neural Networks)

In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

# Helper function to create a Keras model (slightly modified for standalone use)
def create_model_manual(input_dim, neurons, learning_rate):
    """Build and compile a two-hidden-layer binary classifier.

    The first hidden layer has ``neurons`` units and the second has
    ``int(neurons / 2)``; both use ReLU. The output is a single sigmoid unit.
    The model is compiled with Adam at ``learning_rate``, binary
    cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features per sample.
        neurons: Width of the first hidden layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled, untrained Keras ``Sequential`` model.
    """
    layer_stack = [
        Dense(neurons, activation='relu', input_shape=(input_dim,)),
        Dense(int(neurons / 2), activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential(layer_stack)

    net.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return net

# Define the grid of parameters to search (8 candidates):
# every combination of neurons {32, 64} x learning_rate {0.001, 0.01} x batch_size {32, 64}.
tuning_grid = [
    {'neurons': 32, 'learning_rate': 0.001, 'batch_size': 32},
    {'neurons': 64, 'learning_rate': 0.001, 'batch_size': 32},
    {'neurons': 32, 'learning_rate': 0.01, 'batch_size': 32},
    {'neurons': 64, 'learning_rate': 0.01, 'batch_size': 64},
    {'neurons': 32, 'learning_rate': 0.001, 'batch_size': 64},
    {'neurons': 64, 'learning_rate': 0.001, 'batch_size': 64},
    {'neurons': 32, 'learning_rate': 0.01, 'batch_size': 64},
    {'neurons': 64, 'learning_rate': 0.01, 'batch_size': 32}
]

# Initialize KFold for Cross-Validation (CV).
# NOTE(review): plain KFold does not stratify on the target, unlike the
# stratified train/test split used when the data was loaded -- confirm this is intended.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Running best across candidates; -1 is below any possible AUC so the first candidate always wins.
best_auc = -1
best_params = {}
input_dim = X_train_full.shape[1]
epochs = 5 # Use 5 epochs for faster CV

print(f"--- Starting Manual 3-Fold Tuning for MLP ({len(tuning_grid)} Candidates) ---")

# Iterate through every parameter combination
for params in tuning_grid:
    cv_auc_scores = []

    # Perform 3-Fold Cross-Validation
    for train_index, val_index in kfold.split(X_train_full, y_train_full):
        # kfold.split yields positional indices, hence .iloc
        X_train_fold, X_val_fold = X_train_full.iloc[train_index], X_train_full.iloc[val_index]
        y_train_fold, y_val_fold = y_train_full.iloc[train_index], y_train_full.iloc[val_index]

        # 1. Create and compile a fresh model for this fold
        model = create_model_manual(input_dim, params['neurons'], params['learning_rate'])

        # 2. Train model
        model.fit(X_train_fold, y_train_fold,
                  epochs=epochs,
                  batch_size=params['batch_size'],
                  verbose=0)

        # 3. Predict probabilities and calculate ROC AUC
        #    (AUC is computed on the sigmoid outputs, not on thresholded labels)
        y_pred_proba = model.predict(X_val_fold, verbose=0).flatten()
        fold_auc = roc_auc_score(y_val_fold, y_pred_proba)
        cv_auc_scores.append(fold_auc)

    # Calculate average AUC for this parameter set
    mean_auc = np.mean(cv_auc_scores)

    print(f"Candidate {params} | Avg CV ROC AUC: {mean_auc:.4f}")

    # Check for the best parameter combination (strict '>' keeps the earlier candidate on ties)
    if mean_auc > best_auc:
        best_auc = mean_auc
        best_params = params

print("\n--- TUNING RESULTS (MLP/ANN) ---")
print(f"Best ROC AUC score found: {best_auc:.4f}")
print(f"Best parameters: {best_params}")
print("-" * 50)


# --- Retrain FINAL Model with Best Parameters and Full Epochs ---

# 1. Create the final model using best parameters
final_mlp_model = create_model_manual(
    input_dim=input_dim,
    neurons=best_params['neurons'],
    learning_rate=best_params['learning_rate']
)

# 2. Train on the full training set for 10 epochs.
# fit() is called directly here rather than through evaluate_model, which
# hard-codes batch_size=32 for ANNs; this way the tuned batch size is applied.
# No metrics are reported by this step.
final_mlp_model.fit(X_train_full, y_train_full,
                    epochs=10,
                    batch_size=best_params['batch_size'],
                    verbose=0)


--- Starting Manual 3-Fold Tuning for MLP (8 Candidates) ---
Candidate {'neurons': 32, 'learning_rate': 0.001, 'batch_size': 32} | Avg CV ROC AUC: 0.7489
Candidate {'neurons': 64, 'learning_rate': 0.001, 'batch_size': 32} | Avg CV ROC AUC: 0.7498
Candidate {'neurons': 32, 'learning_rate': 0.01, 'batch_size': 32} | Avg CV ROC AUC: 0.7511
Candidate {'neurons': 64, 'learning_rate': 0.01, 'batch_size': 64} | Avg CV ROC AUC: 0.7501
Candidate {'neurons': 32, 'learning_rate': 0.001, 'batch_size': 64} | Avg CV ROC AUC: 0.7495
Candidate {'neurons': 64, 'learning_rate': 0.001, 'batch_size': 64} | Avg CV ROC AUC: 0.7497
Candidate {'neurons': 32, 'learning_rate': 0.01, 'batch_size': 64} | Avg CV ROC AUC: 0.7509
Candidate {'neurons': 64, 'learning_rate': 0.01, 'batch_size': 32} | Avg CV ROC AUC: 0.7497

--- TUNING RESULTS (MLP/ANN) ---
Best ROC AUC score found: 0.7511
Best parameters: {'neurons': 32, 'learning_rate': 0.01, 'batch_size': 32}
--------------------------------------------------

--- FI

Predicting and Testing Neural Networks

In [None]:
# 3. Score the retrained MLP on the hold-out split (X_test, y_test).
print("\n--- FINAL EVALUATION of Tuned MLP (Full, 10 Epochs) ---")

# Sigmoid outputs as a flat vector, then hard labels at the 0.5 cut-off.
y_pred_proba_final = final_mlp_model.predict(X_test, verbose=0).flatten()
y_pred_final = (y_pred_proba_final > 0.5).astype(int)

# Accuracy is computed from the hard labels, ROC AUC from the probabilities.
final_accuracy = accuracy_score(y_test, y_pred_final)
final_auc = roc_auc_score(y_test, y_pred_proba_final)

for metric_line in (
    f"Evaluation Accuracy: {final_accuracy:.4f}",
    f"Evaluation ROC AUC: {final_auc:.4f}",
    "\nClassification Report (on Evaluation Set):",
    classification_report(y_test, y_pred_final, zero_division=0),
):
    print(metric_line)

Logistic Regression

In [None]:


# --- A. Logistic Regression (Training on 20% Sample) ---
# liblinear solver, with max_iter raised to 1000.
log_reg_20pct = LogisticRegression(
    random_state=42, solver='liblinear', max_iter=1000
)
evaluate_model(
    "Logistic Regression (20%)", log_reg_20pct,
    X_train_20pct, y_train_20pct, X_test, y_test
)

# --- B. Logistic Regression (Training on Full Dataset) ---
final_log_reg = LogisticRegression(
    random_state=42, solver='liblinear', max_iter=1000
)
final_log_reg = evaluate_model(
    "Logistic Regression (Full)", final_log_reg,
    X_train_full, y_train_full, X_test, y_test
)

# --- Hyperparameter search ---
# The solver comes from the grid; liblinear supports both l1 and l2,
# so all 2 * 3 * 1 = 6 combinations are valid.
param_grid_log_reg = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1.0, 10.0],
    'solver': ['liblinear']
}
model_log_reg = LogisticRegression(random_state=42, max_iter=1000)

# 3-fold cross-validated grid search ranked by ROC AUC, on all cores.
grid_search_log_reg = GridSearchCV(
    estimator=model_log_reg,
    param_grid=param_grid_log_reg,
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1
)

print("--- Starting Hyperparameter Tuning for Logistic Regression ---")
grid_search_log_reg.fit(X_train_full, y_train_full)

# Report the winning configuration.
best_log_reg = grid_search_log_reg.best_estimator_
print("\n--- TUNING RESULTS (Logistic Regression) ---")
print(f"Best ROC AUC score found: {grid_search_log_reg.best_score_:.4f}")
print(f"Best parameters: {grid_search_log_reg.best_params_}")
print("-" * 50)

# Score the best model on the hold-out set; final_log_reg now refers to
# the tuned model.
final_log_reg = evaluate_model(
    "Tuned Logistic Regression (Full)", best_log_reg,
    X_train_full, y_train_full, X_test, y_test
)



--- Training Logistic Regression (20%) on 40855 samples... ---
Training Time: 0.20 seconds
Evaluation Accuracy: 0.8842
Evaluation ROC AUC: 0.5209

Classification Report (on Evaluation Set):
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     36105
           1       0.52      0.05      0.09      4751

    accuracy                           0.88     40856
   macro avg       0.71      0.52      0.51     40856
weighted avg       0.85      0.88      0.84     40856

--------------------------------------------------
--- Training Logistic Regression (Full) on 163421 samples... ---
Training Time: 1.13 seconds
Evaluation Accuracy: 0.8843
Evaluation ROC AUC: 0.5204

Classification Report (on Evaluation Set):
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     36105
           1       0.53      0.05      0.09      4751

    accuracy                           0.88     40856
   macro avg       0.7

Predicting and Testing Logistic Regression

In [None]:
# --- C. Prediction and Submission ---
# final_log_reg is whatever the previous cell last assigned to that name
# (the tuned logistic regression after a full run).
print("Predicting with Logistic Regression (Full) on Test Data...")
y_pred_log_reg = final_log_reg.predict(X_submit)

# One row per competition profile: identifier plus predicted class label.
submission_df_log_reg = pd.DataFrame(
    data={'ProfileID': profile_ids, 'RiskFlag': y_pred_log_reg}
)

# Write the CSV without the DataFrame index column.
submission_filename_log_reg = 'submission_logistic_regression.csv'
submission_df_log_reg.to_csv(submission_filename_log_reg, index=False)
print(f"Generated submission file: {submission_filename_log_reg}")