In [None]:
Importing and Loading Data

In [None]:
import os
import time
import warnings

import numpy as np
import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Suppress ConvergenceWarning from LinearSVC for cleaner output
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# --- 1. Load Data and Prepare X and y (Training) ---
df_train = pd.read_csv("train_processed_parv.csv")
# Cast boolean columns to 0/1 integers so every model receives purely numeric input.
bool_cols_train = df_train.select_dtypes(include='bool').columns
df_train[bool_cols_train] = df_train[bool_cols_train].astype(int)

# Features are everything except the row identifier and the target column.
X = df_train.drop(columns=['ProfileID', 'RiskFlag'])
y = df_train['RiskFlag']

# Split for full training (80%) and evaluation (20%)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create the 20% Training Sample (25% of X_train_full = 20% of original data)
# X_temp / y_temp hold the remaining 75% of the training split.
X_train_20pct, X_temp, y_train_20pct, y_temp = train_test_split(
    X_train_full, y_train_full, test_size=0.75, random_state=42, stratify=y_train_full
)

input_dim = X_train_full.shape[1]

print(f"Total Features: {input_dim}")
print(f"20% Training Sample Size: {len(X_train_20pct)}")
print(f"Full Training Set Size: {len(X_train_full)}")
print(f"Evaluation Test Set Size: {len(X_test)}\n")

# --- 2. Load and Prepare X for Submission (Competition Test Data) ---
df_test = pd.read_csv("test_processed_parv.csv")
bool_cols_test = df_test.select_dtypes(include='bool').columns
df_test[bool_cols_test] = df_test[bool_cols_test].astype(int)

# X_submit is the actual competition test data to predict
X_submit = df_test.drop(columns=['ProfileID'])
profile_ids = df_test['ProfileID']

# The submission features must match the training features exactly. Models that
# consume the frame as a bare array (e.g. the Keras network) cannot detect a
# different column order, so validate the column set here and reorder to the
# training layout instead of risking silently wrong predictions.
train_feature_cols = list(X.columns)
if list(X_submit.columns) != train_feature_cols:
    missing_cols = [c for c in train_feature_cols if c not in X_submit.columns]
    extra_cols = [c for c in X_submit.columns if c not in X.columns]
    if missing_cols or extra_cols:
        raise ValueError(
            "Submission features do not match training features. "
            f"Missing: {missing_cols}; unexpected: {extra_cols}"
        )
    X_submit = X_submit[train_feature_cols]

print(f"Submission Test Set Size: {len(X_submit)}\n")

Common Functions and Neural Networks

In [None]:
# --- 3. Define Common Functions ---

def evaluate_model(model_name, model, X_train, y_train, X_eval, y_eval, is_ann=False):
    """Fit a model, score it on a hold-out set, and print the metrics.

    Args:
        model_name: Label used in the printed progress lines.
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_eval: Hold-out features used for reporting.
        y_eval: Hold-out labels used for reporting.
        is_ann: When true, ``model`` is driven Keras-style: it is fitted for
            10 epochs with batch size 32, and ``predict`` is expected to
            return scores that are thresholded at 0.5 to obtain labels.

    Returns:
        The same ``model`` object after fitting.
    """
    print(f"--- Training {model_name} on {len(X_train)} samples... ---")
    start_time = time.time()

    if is_ann:
        model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    else:
        model.fit(X_train, y_train)

    training_time = time.time() - start_time
    print(f"Training Time: {training_time:.2f} seconds")

    # Predict on the hold-out evaluation set. Besides hard labels we keep a
    # continuous score per sample: ROC AUC measures ranking quality, and feeding
    # it thresholded 0/1 labels collapses the curve to a single operating point.
    if is_ann:
        y_pred_proba = model.predict(X_eval, verbose=0)
        y_score = y_pred_proba.flatten()
        y_pred = (y_pred_proba > 0.5).astype(int).flatten()
    else:
        y_pred = model.predict(X_eval)
        if hasattr(model, "decision_function"):
            # Signed margin (available on LinearSVC / SVC).
            y_score = model.decision_function(X_eval)
        elif hasattr(model, "predict_proba"):
            # Probability of the second class listed by the estimator.
            y_score = model.predict_proba(X_eval)[:, 1]
        else:
            # No continuous output available; fall back to the hard labels.
            y_score = y_pred

    accuracy = accuracy_score(y_eval, y_pred)
    auc = roc_auc_score(y_eval, y_score)

    print(f"Evaluation Accuracy: {accuracy:.4f}")
    print(f"Evaluation ROC AUC: {auc:.4f}")
    print("\nClassification Report (on Evaluation Set):")
    print(classification_report(y_eval, y_pred, zero_division=0))
    print("-" * 50)
    return model  # Return the trained model

def create_ann_model(input_dim):
    """Build and compile a small feed-forward binary classifier.

    The network stacks two ReLU hidden layers (64 then 32 units) and a single
    sigmoid output unit. It is compiled with Adam (learning rate 0.001),
    binary cross-entropy loss, and accuracy as the tracked metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model.
    """
    first_hidden = Dense(64, activation='relu', input_shape=(input_dim,))
    second_hidden = Dense(32, activation='relu')
    output_unit = Dense(1, activation='sigmoid')

    network = Sequential([first_hidden, second_hidden, output_unit])

    adam = Adam(learning_rate=0.001)
    network.compile(
        optimizer=adam,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

Linear SVM

In [None]:
# --- A. Linear SVM (Training on 20% Sample) ---
# One hyperparameter set for both fits, so the runs differ only in training size.
linear_svc_params = {"random_state": 42, "dual": True, "max_iter": 10000}

linear_svm_20pct = LinearSVC(**linear_svc_params)
evaluate_model(
    "Linear SVM (20%)", linear_svm_20pct,
    X_train_20pct, y_train_20pct, X_test, y_test,
)

# --- B. Linear SVM (Training on Full Dataset) ---
# evaluate_model hands back the estimator it was given, now fitted.
final_linear_svm = evaluate_model(
    "Linear SVM (Full)", LinearSVC(**linear_svc_params),
    X_train_full, y_train_full, X_test, y_test,
)


# --- C. Prediction and Submission ---
print("Predicting with Linear SVM (Full) on Test Data...")
y_pred_linear = final_linear_svm.predict(X_submit)

# Pair each ProfileID with its predicted RiskFlag and write the submission CSV
submission_filename_linear = 'submission_linear_svm.csv'
submission_df_linear = pd.DataFrame(dict(ProfileID=profile_ids, RiskFlag=y_pred_linear))
submission_df_linear.to_csv(submission_filename_linear, index=False)
print(f"Generated submission file: {submission_filename_linear}")

Kernelized SVM

In [None]:
# --- A. Kernelized SVM (RBF) (Training on 20% Sample) ---
# Trains on the whole 20% sample (X_train_20pct).
# Both fits share one hyperparameter set, so the runs differ only in training size.
rbf_svc_params = {"kernel": 'rbf', "gamma": 'scale', "random_state": 42}

rbf_svm_20pct = evaluate_model(
    "Kernelized SVM (RBF) (20% full sample)", SVC(**rbf_svc_params),
    X_train_20pct, y_train_20pct, X_test, y_test,
)


# --- B. Kernelized SVM (RBF) (Training on Full Dataset) ---
# Trains on the whole training split (X_train_full).
final_rbf_svm = evaluate_model(
    "Kernelized SVM (RBF) (Full Dataset)", SVC(**rbf_svc_params),
    X_train_full, y_train_full, X_test, y_test,
)


# --- C. Prediction and Submission ---
print("Predicting with Kernelized SVM (Full Dataset) on Test Data...")
y_pred_rbf = final_rbf_svm.predict(X_submit)

# Pair each ProfileID with its predicted RiskFlag and write the submission CSV
submission_filename_rbf = 'submission_kernelized_svm.csv'
submission_df_rbf = pd.DataFrame(dict(ProfileID=profile_ids, RiskFlag=y_pred_rbf))
submission_df_rbf.to_csv(submission_filename_rbf, index=False)
print(f"Generated submission file: {submission_filename_rbf}")

MLP (Neural Networks)

In [None]:
# --- A. ANN (Training on 20% Sample) ---
# Seed the Python, NumPy and TensorFlow RNGs before building each network so
# weight initialisation and batch shuffling repeat across runs, matching the
# fixed random_state=42 used for the splits and the SVMs.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact reproducibility is required.
set_random_seed(42)
ann_model_20pct = create_ann_model(input_dim)
evaluate_model("ANN (20%)", ann_model_20pct,
               X_train_20pct, y_train_20pct, X_test, y_test, is_ann=True)


# --- B. ANN (Training on Full Dataset) ---
# Re-seed so this model's result does not depend on whether section A ran first.
set_random_seed(42)
final_ann_model = create_ann_model(input_dim)
final_ann_model = evaluate_model("ANN (Full)", final_ann_model,
                                 X_train_full, y_train_full, X_test, y_test, is_ann=True)


# --- C. Prediction and Submission ---
print("Predicting with ANN (Full) on Test Data...")
# Predict probabilities
y_pred_proba_ann = final_ann_model.predict(X_submit, verbose=0)
# Convert probabilities to binary predictions (0 or 1)
y_pred_ann = (y_pred_proba_ann > 0.5).astype(int).flatten()

# Create and save the submission DataFrame
submission_df_ann = pd.DataFrame({
    'ProfileID': profile_ids,
    'RiskFlag': y_pred_ann
})
submission_filename_ann = 'submission_ann.csv'
submission_df_ann.to_csv(submission_filename_ann, index=False)
print(f"Generated submission file: {submission_filename_ann}")