In [23]:
# Execute the project configuration scripts inside this kernel's namespace.
# NOTE(review): several names used later are never defined in this notebook
# (P_TRAIN_DATA, P_TEST_DATA, P_SAMPLE_SUB, ORDINAL_CAT_ORDER,
# CONTINUOUS_FEATURES); presumably they are created by these two scripts --
# confirm against ./config before relying on that.
%run ./config/parameters.py
%run ./config/path.py

In [24]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
import warnings
from collections import defaultdict
from statistics import mean
from sklearn import metrics
from catboost import CatBoostClassifier
warnings.filterwarnings('ignore')

In [25]:
# --- Load raw inputs -------------------------------------------------------
# The P_* path constants are not defined in this notebook; presumably they
# come from ./config/path.py -- confirm.
train_data = pd.read_csv(P_TRAIN_DATA)
test_data = pd.read_csv(P_TEST_DATA)
sample_submission = pd.read_csv(P_SAMPLE_SUB)

# --- Constants -------------------------------------------------------------
# Defined before first use so the target column name lives in one place.
TARGET_VAR = 'Loan_Status'

# --- Target preprocessing --------------------------------------------------
# The integer cast used to be stored only in `hr_data`, which nothing else in
# this notebook reads, so the frame actually used for modelling kept its
# original target dtype. Cast `train_data` itself.
train_data = train_data.astype({TARGET_VAR: 'int'})
# Kept only so any external reference to the old name still resolves.
hr_data = train_data

# Model input columns (no identifier column, no target), in the order they
# are selected from the raw frames.
new_column_order = [
    'Gender', 'Married', 'Dependents', 'Education',
    'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
    'LoanAmount', 'Credit_History', 'Property_Area',
    'Total_Income', 'Loan_Amount_Term'
]

# Independent copy: editing FEATURE_COLUMNS later must not silently change
# new_column_order (they used to be the same list object).
FEATURE_COLUMNS = list(new_column_order)

In [26]:
# Work on independent copies limited to the modelling columns, so later
# feature engineering never writes back into the raw frames.
train_columns = [*FEATURE_COLUMNS, TARGET_VAR]
processed_train_data = train_data[train_columns].copy()
processed_test_data = test_data[FEATURE_COLUMNS].copy()

In [28]:
def add_credit_interaction_features(frame, applicant_income_median):
    """Add the hand-crafted credit-history interaction columns to ``frame``.

    The same function is applied to the train and test frames so both get
    identical feature definitions (previously two copy-pasted cells).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain Gender, Married, Dependents, Education, Self_Employed,
        ApplicantIncome, CoapplicantIncome, Credit_History, Property_Area and
        Loan_Amount_Term. ``Dependents`` is compared against the string
        ``'3+'``, so this must run before it is ordinal-encoded.
        NOTE(review): the numeric codes tested below (Gender == 0,
        Property_Area == 1 / 2, ...) are taken as-is from the original cells;
        their meaning is implied only by the feature names -- confirm against
        the upstream encoding.
    applicant_income_median : float
        Threshold for ``low_income_high_risk``. Pass the TRAIN median for both
        frames so the feature means the same thing at fit and predict time.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, modified in place, with these columns
        appended in order: female_credit_risk, educated_risk_paradox,
        married_credit_risk, urban_advantage, suburban_safe_haven,
        high_dependents_boost, self_employed_risk, Loan_Term_Category,
        medium_term_high_risk, low_income_high_risk, composite_approval_score.
    """
    credit = frame['Credit_History']
    good_credit = credit == 1
    bad_credit = credit == 0
    many_dependents = frame['Dependents'] == '3+'

    # Features 1-7: binary flags pairing one attribute with credit history.
    frame['female_credit_risk'] = ((frame['Gender'] == 0) & good_credit).astype(int)
    frame['educated_risk_paradox'] = ((frame['Education'] == 1) & bad_credit).astype(int)
    frame['married_credit_risk'] = ((frame['Married'] == 1) & bad_credit).astype(int)
    frame['urban_advantage'] = ((frame['Property_Area'] == 2) & good_credit).astype(int)
    frame['suburban_safe_haven'] = ((frame['Property_Area'] == 1) & bad_credit).astype(int)
    frame['high_dependents_boost'] = (many_dependents & bad_credit).astype(int)
    frame['self_employed_risk'] = ((frame['Self_Employed'] == 1) & bad_credit).astype(int)

    # Bucket the loan term: <= 100 -> short, <= 200 -> medium, otherwise long.
    # A missing term fails both comparisons and therefore lands in 'long',
    # exactly as the original row-wise lambda did.
    term = frame['Loan_Amount_Term']
    frame['Loan_Term_Category'] = np.where(
        term <= 100, 'short', np.where(term <= 200, 'medium', 'long')
    )
    medium_term = frame['Loan_Term_Category'] == 'medium'

    # Feature 8: medium-term loan with bad credit history.
    frame['medium_term_high_risk'] = (medium_term & bad_credit).astype(int)

    # Feature 9: below-median applicant income, no co-applicant income and
    # bad credit history.
    frame['low_income_high_risk'] = (
        (frame['ApplicantIncome'] < applicant_income_median)
        & (frame['CoapplicantIncome'] == 0)
        & bad_credit
    ).astype(int)

    # Feature 10: np.select returns the choice of the FIRST condition that
    # matches, so the list order is a priority order (every row with
    # Credit_History == 1 gets 0.83 regardless of the later conditions).
    # Rows matching nothing get 0.75. The constants were labelled "approval
    # probabilities from insights" in the original cell; their derivation is
    # not shown in this notebook.
    frame['composite_approval_score'] = np.select(
        [good_credit, frame['Property_Area'] == 1, many_dependents, medium_term],
        [0.83, 0.85, 0.91, 1.00],
        default=0.75,
    )
    return frame


# The income threshold is learned from the training frame only and reused for
# the test frame, matching how scaling and one-hot encoding are fitted on
# train later in the notebook. (The test cell used to take the test median.)
APPLICANT_INCOME_MEDIAN = processed_train_data['ApplicantIncome'].median()

processed_train_data = add_credit_interaction_features(
    processed_train_data, APPLICANT_INCOME_MEDIAN
)
processed_test_data = add_credit_interaction_features(
    processed_test_data, APPLICANT_INCOME_MEDIAN
)

In [30]:
# Loan_Amount_Term has already been bucketed into Loan_Term_Category, so the
# raw column is removed from both frames (test first, then train).
processed_test_data = processed_test_data.drop(columns='Loan_Amount_Term')
processed_train_data = processed_train_data.drop(columns='Loan_Amount_Term')

In [31]:
# Replace every Dependents level with its position in ORDINAL_CAT_ORDER.
# ORDINAL_CAT_ORDER is not defined in this notebook (presumably it comes from
# ./config/parameters.py -- confirm). Values that are not listed are left
# untouched and must still be convertible by the integer cast below.
for rank, level in enumerate(ORDINAL_CAT_ORDER):
    train_hits = processed_train_data['Dependents'] == level
    processed_train_data.loc[train_hits, 'Dependents'] = rank
    test_hits = processed_test_data['Dependents'] == level
    processed_test_data.loc[test_hits, 'Dependents'] = rank

# The column still has its pre-encoding dtype after the replacement; make it
# a plain integer column in both frames.
dependents_dtype = {'Dependents': 'int'}
processed_train_data = processed_train_data.astype(dependents_dtype)
processed_test_data = processed_test_data.astype(dependents_dtype)

In [32]:
# processed_train_data[BINARY_FEATURES] = processed_train_data[BINARY_FEATURES].astype(int)
# processed_test_data[BINARY_FEATURES] = processed_test_data[BINARY_FEATURES].astype(int)

In [33]:
# processed_train_data[CATEGORICAL_FEATURES] = processed_train_data[CATEGORICAL_FEATURES].astype(int)
# processed_test_data[CATEGORICAL_FEATURES] = processed_test_data[CATEGORICAL_FEATURES].astype(int)

In [34]:
# processed_train_data = processed_train_data.drop(columns=['Loan_Status'], axis=1)

In [35]:
# Z-score every continuous column. Mean and population standard deviation
# (ddof=0) are taken from the training frame and reused unchanged for the
# test frame, so no test-set statistics enter the transform.
# CONTINUOUS_FEATURES is not defined in this notebook (presumably it comes
# from ./config/parameters.py -- confirm).
for column in CONTINUOUS_FEATURES:
    train_values = processed_train_data[column]
    center = train_values.mean()
    spread = train_values.std(ddof=0)

    processed_train_data[column] = (train_values - center) / spread
    processed_test_data[column] = (processed_test_data[column] - center) / spread


In [36]:
# Columns fed to the model, in training order.
# NOTE(review): the one-hot 'Loan_Term_Category_*' columns created in the next
# cell are not listed here, so the model never sees them -- confirm whether
# that is intentional.
SELECTED_FEATURES = [
    # original columns
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Credit_History',
    'Total_Income',
    # engineered interaction flags (Property_Area sits among them to keep the
    # original column order)
    'female_credit_risk',
    'Property_Area',
    'educated_risk_paradox',
    'married_credit_risk',
    'urban_advantage',
    'suburban_safe_haven',
    'high_dependents_boost',
    'self_employed_risk',
    'medium_term_high_risk',
    'low_income_high_risk',
    'composite_approval_score',
]

In [37]:
# One-hot encode the loan-term bucket. The encoder learns its categories from
# the training frame only; with the default handle_unknown setting, a bucket
# that appears only in the test frame makes transform() raise.
term_column = ['Loan_Term_Category']

ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=int)
ohe.fit(processed_train_data[term_column])

train_encoded = ohe.transform(processed_train_data[term_column])
test_encoded = ohe.transform(processed_test_data[term_column])

# drop='first' removes the first category in sorted order, so when train
# contains long/medium/short the remaining columns are
# ['Loan_Term_Category_medium', 'Loan_Term_Category_short'].
encoded_cols = ohe.get_feature_names_out(term_column)

# Wrap the encoded arrays with the source frames' indices so the column-wise
# concat below lines rows up one-to-one.
train_loan_term_df = pd.DataFrame(
    train_encoded, index=processed_train_data.index, columns=encoded_cols
)
test_loan_term_df = pd.DataFrame(
    test_encoded, index=processed_test_data.index, columns=encoded_cols
)

# Swap the string bucket for its indicator columns (appended at the end).
processed_train_data = pd.concat(
    [processed_train_data.drop(columns=term_column), train_loan_term_df], axis=1
)
processed_test_data = pd.concat(
    [processed_test_data.drop(columns=term_column), test_loan_term_df], axis=1
)

In [42]:
# Random search over XGBoost hyper-parameters with stratified k-fold CV.
# For every step one random configuration is drawn, a model is fitted per
# fold, and validation / test probabilities are stored in one column per
# (step, fold) pair named 'XGB_Step_{step}_Fold_{fold}'.

# Define tuning parameters
n_estimators_values = [10, 25, 50, 100, 150, 200, 250, 300]
# NOTE(review): this grid contains eta = 0.0 (a learning rate of zero, so the
# boosting rounds cannot move the prediction). It is left unchanged here so
# the seeded draws -- and therefore the step numbers referenced by the
# ensembling cell -- stay reproducible; consider range(1, 10) on the next
# full re-tune.
eta_values = [v / 10 for v in range(10)]
max_depth_values = [2, 4, 6, 8, 10]
subsample_values = [0.25, 0.50, 0.75, 0.90]
colsample_bytree_values = [0.25, 0.50, 0.75, 0.90]

cv_folds = 10
tuning_iterations = 10
include_orig = True  # not read anywhere in this cell
tuning_results = defaultdict(list)

# Create column names for predictions
col_names = [f'XGB_Step_{step}_Fold_{fold}'
             for step in range(tuning_iterations)
             for fold in range(cv_folds)]
# Initialise with 0.0 (float): the columns receive probabilities, and writing
# floats into integer columns is deprecated incompatible-dtype setting.
test_predictions = pd.DataFrame(0.0, index=processed_test_data.index, columns=col_names)
valid_predictions = pd.DataFrame(0.0, index=processed_train_data.index, columns=col_names)

# One pooled (summed over folds) 2x2 confusion matrix per step
confusion_matrices = []

# IMPORTANT: every random.* call below consumes the seeded stream in a fixed
# order (CV seed -> per step: 5 choices -> per fold: 1 model seed). Reordering
# them changes every configuration drawn.
random.seed(2201020)

# Stratified K-Fold for Cross-Validation
skf_seed = random.randint(0, 2023)
skf = StratifiedKFold(n_splits=cv_folds, random_state=skf_seed, shuffle=True)

# Loop-invariant views of the modelling data
X_all = processed_train_data[SELECTED_FEATURES]
y_all = processed_train_data[TARGET_VAR]
# The model is fitted on plain arrays, so predict on plain arrays as well
# instead of mixing arrays (fit) with DataFrames (predict).
X_test_values = processed_test_data[SELECTED_FEATURES].values

# Model Tuning Loop
for step in range(tuning_iterations):
    n_estimators = random.choice(n_estimators_values)
    eta = random.choice(eta_values)
    max_depth = random.choice(max_depth_values)
    subsample = random.choice(subsample_values)
    colsample_bytree = random.choice(colsample_bytree_values)

    aucs = []
    step_cm = np.zeros((2, 2))  # pooled confusion matrix for this step

    # Fixed random_state + shuffle=True: every step sees the same folds.
    for i, (train_index, val_index) in enumerate(skf.split(X_all, y_all)):
        X_train, X_val = X_all.iloc[train_index], X_all.iloc[val_index]
        y_train, y_val = y_all.iloc[train_index], y_all.iloc[val_index]
        fold_col = f'XGB_Step_{step}_Fold_{i}'

        xgb_seed = random.randint(0, 2023)
        xgb = XGBClassifier(n_estimators=n_estimators, eta=eta, max_depth=max_depth,
                            subsample=subsample, colsample_bytree=colsample_bytree,
                            random_state=xgb_seed).fit(X_train.values, y_train)

        # Probability of the positive class; cast to float64 so the stored
        # columns keep the dtype they had before.
        val_probs = xgb.predict_proba(X_val.values)[:, 1].astype(float)
        val_preds = (val_probs >= 0.5).astype(int)  # binary predictions at 0.5 threshold

        # val_index holds POSITIONS, so write positionally; label-based .loc
        # is only equivalent while the index is the default 0..n-1 range.
        valid_predictions.iloc[val_index, valid_predictions.columns.get_loc(fold_col)] = val_probs

        # Calculate metrics
        fpr, tpr, thresholds = metrics.roc_curve(y_val, val_probs, pos_label=1)
        aucs.append(metrics.auc(fpr, tpr))

        # labels=[0, 1] guarantees a 2x2 result; without it a fold in which
        # only one class occurs yields a 1x1 matrix that would broadcast into
        # every cell of step_cm.
        step_cm += metrics.confusion_matrix(y_val, val_preds, labels=[0, 1])

        test_predictions[fold_col] = xgb.predict_proba(X_test_values)[:, 1].astype(float)

    # Store confusion matrix for this step (sum across all folds)
    confusion_matrices.append(step_cm)

    # Calculate performance metrics from the pooled confusion matrix
    tn, fp, fn, tp = step_cm.ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Storing the tuning results
    tuning_results['step'].append(step)
    tuning_results['auc'].append(mean(aucs))  # unweighted mean of per-fold AUCs
    tuning_results['n_estimators'].append(n_estimators)
    tuning_results['eta'].append(eta)
    tuning_results['max_depth'].append(max_depth)
    tuning_results['subsample'].append(subsample)
    tuning_results['colsample_bytree'].append(colsample_bytree)
    tuning_results['skf_seed'].append(skf_seed)
    # NOTE: only the LAST fold's model seed is recorded; the seeds of the
    # other folds can be recovered only by replaying the random stream.
    tuning_results['xgb_seed'].append(xgb_seed)
    tuning_results['accuracy'].append(accuracy)
    tuning_results['precision'].append(precision)
    tuning_results['recall'].append(recall)
    tuning_results['f1'].append(f1)
    tuning_results['confusion_matrix'].append(str(step_cm))

# Saving Predictions and Tuning Results
valid_predictions.to_csv('XGBoost_Valid_Predictions.csv', index=False)
test_predictions.to_csv('XGBoost_Test_Predictions.csv', index=False)

# Finalizing Tuning Results (best mean AUC first)
tuning_results = pd.DataFrame(tuning_results).sort_values(by='auc', ascending=False)
tuning_results.to_csv('XGBoost_Tuning_Results.csv', index=False)

# Save confusion matrices separately (rows follow step order, not AUC order)
confusion_df = pd.DataFrame({
    'step': range(tuning_iterations),
    'confusion_matrix': [str(cm) for cm in confusion_matrices],
    'true_negative': [cm[0, 0] for cm in confusion_matrices],
    'false_positive': [cm[0, 1] for cm in confusion_matrices],
    'false_negative': [cm[1, 0] for cm in confusion_matrices],
    'true_positive': [cm[1, 1] for cm in confusion_matrices]
})
confusion_df.to_csv('XGBoost_Confusion_Matrices.csv', index=False)

# Display the tuning table, best configuration first
tuning_results

Unnamed: 0,step,auc,n_estimators,eta,max_depth,subsample,colsample_bytree,skf_seed,xgb_seed,accuracy,precision,recall,f1,confusion_matrix
2,2,0.513658,100,0.7,10,0.25,0.5,1054,1517,0.733639,0.833134,0.8506,0.841777,[[ 148. 837.]\n [ 734. 4179.]]
4,4,0.508381,50,0.5,8,0.9,0.5,1054,1731,0.80451,0.832508,0.95807,0.890887,[[ 38. 947.]\n [ 206. 4707.]]
1,1,0.50494,25,0.9,10,0.25,0.25,1054,460,0.760258,0.835346,0.887034,0.860415,[[ 126. 859.]\n [ 555. 4358.]]
9,9,0.502987,250,0.2,6,0.75,0.9,1054,1128,0.802815,0.831624,0.957053,0.88994,[[ 33. 952.]\n [ 211. 4702.]]
7,7,0.502754,200,0.1,10,0.9,0.25,1054,745,0.827569,0.832764,0.992265,0.905545,[[ 6. 979.]\n [ 38. 4875.]]
3,3,0.502272,200,0.5,10,0.75,0.75,1054,1705,0.777043,0.83143,0.918583,0.872836,[[ 70. 915.]\n [ 400. 4513.]]
6,6,0.5,25,0.0,4,0.9,0.5,1054,1921,0.832994,0.832994,1.0,0.908889,[[ 0. 985.]\n [ 0. 4913.]]
5,5,0.498385,200,0.4,2,0.9,0.9,1054,1406,0.829773,0.832794,0.995522,0.906916,[[3.000e+00 9.820e+02]\n [2.200e+01 4.891e+03]]
0,0,0.488566,100,0.2,2,0.5,0.5,1054,983,0.832994,0.832994,1.0,0.908889,[[ 0. 985.]\n [ 0. 4913.]]
8,8,0.479672,25,0.2,6,0.9,0.9,1054,1694,0.832146,0.832966,0.998779,0.908367,[[1.000e+00 9.840e+02]\n [6.000e+00 4.907e+03]]


In [41]:
# # Define tuning parameters
# n_estimators_values = [10, 25, 50, 100, 150, 200, 250, 300]
# eta_values = [v / 10 for v in range(10)]
# max_depth_values = [2, 4, 6, 8, 10]
# subsample_values = [0.25, 0.50, 0.75, 0.90]
# colsample_bytree_values = [0.25, 0.50, 0.75, 0.90]

# cv_folds = 10
# tuning_iterations = 10
# include_orig = True
# tuning_results = defaultdict(list)

# # Create column names for predictions
# col_names = [f'XGB_Step_{step}_Fold_{fold}' 
#              for step in range(tuning_iterations) 
#              for fold in range(cv_folds)]
# test_predictions = pd.DataFrame(0, index = processed_test_data.index, columns = col_names)
# valid_predictions = pd.DataFrame(0, index = processed_train_data.index, columns = col_names)

# random.seed(2201020)

# # Stratified K-Fold for Cross-Validation
# skf_seed = random.randint(0, 2023)
# skf = StratifiedKFold(n_splits = cv_folds, random_state = skf_seed, shuffle = True)

# # Model Tuning Loop
# for step in range(tuning_iterations):
#     n_estimators = random.choice(n_estimators_values)
#     eta = random.choice(eta_values)
#     max_depth = random.choice(max_depth_values)
#     subsample = random.choice(subsample_values)
#     colsample_bytree = random.choice(colsample_bytree_values)
    
#     aucs = []

#     for i, (train_index, val_index) in enumerate(skf.split(processed_train_data[SELECTED_FEATURES], processed_train_data[TARGET_VAR])):
#         X_train, X_val = processed_train_data[SELECTED_FEATURES].iloc[train_index], processed_train_data[SELECTED_FEATURES].iloc[val_index]
#         y_train, y_val = processed_train_data[TARGET_VAR].iloc[train_index], processed_train_data[TARGET_VAR].iloc[val_index]
        
#         xgb_seed = random.randint(0, 2023)
#         xgb = XGBClassifier(n_estimators=n_estimators, eta=eta, max_depth=max_depth, subsample=subsample, colsample_bytree=colsample_bytree, random_state=xgb_seed).fit(X_train.values, y_train)
        
#         val_probs = [probs[1] for probs in xgb.predict_proba(X_val)]
#         valid_predictions.loc[val_index, f'XGB_Step_{step}_Fold_{i}'] = val_probs
        
#         fpr, tpr, thresholds = metrics.roc_curve(y_val, val_probs, pos_label=1)
#         auc = metrics.auc(fpr, tpr)
#         aucs.append(auc)
        
#         test_predictions[f'XGB_Step_{step}_Fold_{i}'] = [probs[1] for probs in xgb.predict_proba(processed_test_data[SELECTED_FEATURES])]
    
#     # Storing the tuning results
#     tuning_results['step'].append(step)
#     tuning_results['auc'].append(mean(aucs))
#     tuning_results['n_estimators'].append(n_estimators)
#     tuning_results['eta'].append(eta)
#     tuning_results['max_depth'].append(max_depth)
#     tuning_results['subsample'].append(subsample)
#     tuning_results['colsample_bytree'].append(colsample_bytree)
#     tuning_results['skf_seed'].append(skf_seed)
#     tuning_results['xgb_seed'].append(xgb_seed)
    
#     # print(f'Step: {step}  AUC: {mean(aucs)}')

# # Saving Predictions and Tuning Results
# valid_predictions.to_csv('XGBoost_Valid_Predictions.csv', index=False)
# test_predictions.to_csv('XGBoost_Test_Predictions.csv', index=False)

# # Finalizing Tuning Results
# tuning_results = pd.DataFrame(tuning_results)
# tuning_results.sort_values(by='auc', ascending=False, inplace=True)
# tuning_results.to_csv('XGBoost_Tuning_Results.csv', index=False)
# tuning_results


In [43]:
# Build the submission by averaging the test-set probabilities of hand-picked
# tuning steps over all of their folds, then thresholding at 0.5.

# Steps chosen manually (not by AUC rank). They refer to step numbers of the
# seeded tuning run above and go stale if that run's draws change.
target_steps = [1, 2, 3]
# Take the fold count from the tuning cell instead of repeating the literal,
# so the column names below cannot drift from the ones actually produced.
num_folds = cv_folds

# Every (step, fold) prediction column belonging to the selected steps
best_cols = [
    f'XGB_Step_{step}_Fold_{fold}'
    for step in target_steps
    for fold in range(num_folds)
]

# Mean positive-class probability across the selected models
cv_probs = test_predictions[best_cols].mean(axis=1)

# Convert probabilities to binary classes (0/1) with 0.5 threshold
cv_classes = (cv_probs >= 0.5).astype(int)

# Prepare submission (assuming 'Loan_Status' expects 0/1)
sub = sample_submission.copy()
# Assign by position after an explicit length check. Assigning the Series
# directly aligns on index labels and would silently produce NaN or dropped
# rows if the submission template and the test frame ever disagreed.
if len(sub) != len(cv_classes):
    raise ValueError(
        f'sample_submission has {len(sub)} rows but there are '
        f'{len(cv_classes)} test predictions'
    )
sub['Loan_Status'] = cv_classes.to_numpy()

# Save with descriptive filename
sub.to_csv('xgb_steps_1_2_3_ensemble_binary.csv', index=False)