## **Phase 4: Model Training and Evaluation**

In [9]:
# Import libraries
import pandas as pd
import numpy as np

# Library to split data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, 
                           balanced_accuracy_score, f1_score, precision_score, recall_score)



import warnings
warnings.filterwarnings('ignore')

In [10]:
# Read the preprocessed EasyVisa table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='preprocessed_easy_visa.csv')
df.head(5)

Unnamed: 0,no_of_employees,prevailing_wage,unit_of_wage,company_age,wage_per_year,wage_employee_ratio,employees_growth_rate_ratio,wage_per_age_ratio,case_status_encoded,education_level_ordinal,...,continent_Europe,continent_North America,continent_Oceania,continent_South America,has_job_experience_encoded,requires_job_training_encoded,full_time_position_encoded,region_target_encoded,prevailing_wage_log,no_of_employees_log
0,7227.0,592.2029,Hour,18.0,1231782.032,170.44,401.5,68432.34,0,1,...,0,0,0,0,0,0,1,0.62,6.383849,8.885718
1,2412.0,83425.65,Year,23.0,83425.65,34.59,104.87,3627.2,1,3,...,0,0,0,0,1,0,1,0.63,11.331711,7.788626
2,7227.0,122996.86,Year,17.0,122996.86,17.02,425.12,7235.11,0,2,...,0,0,0,0,0,1,1,0.62,11.719914,8.885718
3,98.0,83434.03,Year,92.5,83434.03,851.37,1.06,901.99,0,2,...,0,0,0,0,0,0,1,0.62,11.331812,4.59512
4,1082.0,149907.39,Year,20.0,149907.39,138.55,54.1,7495.37,1,3,...,0,0,0,0,1,0,1,0.7,11.917773,6.98749


### **Data Splitting**

Because the target classes are imbalanced, we use stratified splitting so that each split preserves the overall class distribution.

In [16]:
# Feature subset carried over from the preprocessing phase (one name per line)
selected_features = [
    'company_age',
    'wage_per_year',
    'wage_employee_ratio',
    'employees_growth_rate_ratio',
    'wage_per_age_ratio',
    'education_level_ordinal',
    'establishment_period_ordinal',
    'continent_Africa',
    'continent_Asia',
    'continent_Europe',
    'continent_North America',
    'continent_Oceania',
    'continent_South America',
    'has_job_experience_encoded',
    'requires_job_training_encoded',
    'full_time_position_encoded',
    'region_target_encoded',
    'prevailing_wage_log',
    'no_of_employees_log',
]

# Target is the encoded case status; X is every remaining column of df
y = df['case_status_encoded']
X = df.drop(columns=['case_status_encoded'])
X

Unnamed: 0,no_of_employees,prevailing_wage,unit_of_wage,company_age,wage_per_year,wage_employee_ratio,employees_growth_rate_ratio,wage_per_age_ratio,education_level_ordinal,establishment_period_ordinal,...,continent_Europe,continent_North America,continent_Oceania,continent_South America,has_job_experience_encoded,requires_job_training_encoded,full_time_position_encoded,region_target_encoded,prevailing_wage_log,no_of_employees_log
0,7227.0,592.20290,Hour,18.0,1.231782e+06,170.44,401.50,68432.34,1,2,...,0,0,0,0,0,0,1,0.62,6.383849,8.885718
1,2412.0,83425.65000,Year,23.0,8.342565e+04,34.59,104.87,3627.20,3,2,...,0,0,0,0,1,0,1,0.63,11.331711,7.788626
2,7227.0,122996.86000,Year,17.0,1.229969e+05,17.02,425.12,7235.11,2,2,...,0,0,0,0,0,1,1,0.62,11.719914,8.885718
3,98.0,83434.03000,Year,92.5,8.343403e+04,851.37,1.06,901.99,2,5,...,0,0,0,0,0,0,1,0.62,11.331812,4.595120
4,1082.0,149907.39000,Year,20.0,1.499074e+05,138.55,54.10,7495.37,3,2,...,0,0,0,0,1,0,1,0.70,11.917773,6.987490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25442,2601.0,77092.57000,Year,17.0,7.709257e+04,29.64,153.00,4534.86,2,2,...,0,0,0,0,1,1,1,0.70,11.252762,7.864036
25443,3274.0,218315.56125,Year,19.0,2.183156e+05,66.68,172.32,11490.29,1,2,...,0,0,0,0,1,0,1,0.63,12.293697,8.094073
25444,1121.0,146298.85000,Year,92.5,1.462989e+05,130.51,12.12,1581.61,3,5,...,0,0,0,0,1,0,0,0.70,11.893407,7.022868
25445,1918.0,86154.77000,Year,92.5,8.615477e+04,44.92,20.74,931.40,3,5,...,0,0,0,0,1,1,1,0.62,11.363901,7.559559


In [19]:
# Stratified train / validation / test split (EDA found the target classes are imbalanced)
print("=== STRATIFIED DATA SPLITTING ===")
print("EDA identified class imbalance - using stratified splitting to preserve class distribution")

# Keep only the features chosen during preprocessing
X_selected = X[selected_features]
print(f"Selected features shape: {X_selected.shape}")

# First split: hold out 30% of all rows as the test set; the remaining 70% is train+validation.
# 'stratify=y' keeps the class ratio the same in both parts.
# 'random_state=42' ensures reproducibility (same split every time the code is run).
X_temp, X_test, y_temp, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42, stratify=y
)

# Second split: 25% of the remaining 70% becomes the validation set (17.5% of all rows),
# the other 75% becomes the training set (52.5% of all rows).
# Stratify on y_temp so the class ratio is preserved again.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Sanity check: the three parts must partition the selected data exactly
assert len(X_train) + len(X_val) + len(X_test) == len(X_selected), "split sizes do not add up"

print(f"\nData split results:")
print(f"Training set: {X_train.shape} ({(X_train.shape[0]/len(X_selected))*100:.1f}%)")
print(f"Validation set: {X_val.shape} ({(X_val.shape[0]/len(X_selected))*100:.1f}%)")
print(f"Test set: {X_test.shape} ({(X_test.shape[0]/len(X_selected))*100:.1f}%)")

# Check class distribution in each set (should be similar due to stratification)
print(f"\nClass distribution verification:")
print("Training set class distribution:")
print(y_train.value_counts().sort_index())
print("\nValidation set class distribution:")
print(y_val.value_counts().sort_index())
print("\nTest set class distribution:")
print(y_test.value_counts().sort_index())


=== STRATIFIED DATA SPLITTING ===
EDA identified class imbalance - using stratified splitting to preserve class distribution
Selected features shape: (25447, 19)

Data split results:
Training set: (13359, 19) (52.5%)
Validation set: (4453, 19) (17.5%)
Test set: (7635, 19) (30.0%)

Class distribution verification:
Training set quality distribution:
case_status_encoded
0    4434
1    8925
Name: count, dtype: int64

Validation set quality distribution:
case_status_encoded
0    1478
1    2975
Name: count, dtype: int64

Test set quality distribution:
case_status_encoded
0    2534
1    5101
Name: count, dtype: int64


### **Feature Scaling**

Following the EDA recommendations, we apply StandardScaler, since the data has few to no outliers.

In [22]:
# Apply StandardScaler as recommended by EDA.
# Note: the GradientBoosting and XGBoost cells in this notebook fit on the unscaled
# X_train / X_test; the *_scaled frames produced here are separate objects.

print("=== FEATURE SCALING (STANDARD SCALER) ===")
print("EDA recommended StandardScaler for distance-based models")

# Fit scaler on training data only (to avoid data leakage)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,  # keep the original row labels so rows stay aligned with y_train
)

# Transform validation and test sets with the statistics learned from the training set.
# The original index is preserved for the same reason: without it the frames would get a
# fresh 0..n-1 index that no longer matches y_val / y_test in index-aligned operations.
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val),
    columns=X_val.columns,
    index=X_val.index,
)

X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

print("✓ Scaling applied successfully!")
print(f"Training set scaled - Mean: {X_train_scaled.mean().mean():.4f}, Std: {X_train_scaled.std().mean():.4f}")
print(f"Validation set scaled - Mean: {X_val_scaled.mean().mean():.4f}, Std: {X_val_scaled.std().mean():.4f}")
print(f"Test set scaled - Mean: {X_test_scaled.mean().mean():.4f}, Std: {X_test_scaled.std().mean():.4f}")

# Verify scaling worked correctly: averaged over features, the training mean should be
# close to 0 and the training standard deviation close to 1
print(f"\nScaling verification:")
print(f"Training set - Mean ≈ 0: {abs(X_train_scaled.mean().mean()) < 0.01}")
print(f"Training set - Std ≈ 1: {abs(X_train_scaled.std().mean() - 1) < 0.01}")


=== FEATURE SCALING (STANDARD SCALER) ===
EDA recommended StandardScaler for distance-based models
✓ Scaling applied successfully!
Training set scaled - Mean: 0.0000, Std: 1.0000
Validation set scaled - Mean: 0.0019, Std: 1.0124
Test set scaled - Mean: 0.0012, Std: 1.0331

Scaling verification:
Training set - Mean ≈ 0: True
Training set - Std ≈ 1: True


### **Algorithm Selection**

Based on the EDA insights and recommendations, we will use:
- Gradient Boosting Classifier, XGBoost Classifier and LightGBM:
    - *Dataset*: Our data is mostly categorical (including the target), and it is a large dataset containing more than 10,000 samples.
    - *Performance*: Gradient Boosting and XGBoost are well suited to tabular-data tasks such as risk prediction and credit scoring.


### **Model Comparison and Evaluation**

#### **1. GradientBoosting Classifier (EDA Recommendation)**

In [23]:
# Create GradientBoost Model
# random_state fixes the row subsampling (subsample=0.8) so results are reproducible.
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    random_state=42
)

# Train the model (on the unscaled training features)
gb_clf.fit(X_train, y_train)

# Make predictions
y_train_pred_gb = gb_clf.predict(X_train)
y_test_pred_gb = gb_clf.predict(X_test)

# Evaluate performance
# NOTE(review): metrics are computed on the test set; X_val / y_val from the split cell
# are not used here. Consider comparing models on the validation set and keeping the
# test set for the final model only.
train_accuracy_gb = accuracy_score(y_train, y_train_pred_gb)
test_accuracy_gb = accuracy_score(y_test, y_test_pred_gb)

train_balanced_acc_gb = balanced_accuracy_score(y_train, y_train_pred_gb)
test_balanced_acc_gb = balanced_accuracy_score(y_test, y_test_pred_gb)

# Macro averaging weights both classes equally regardless of their frequency
train_f1_gb = f1_score(y_train, y_train_pred_gb, average='macro')
test_f1_gb = f1_score(y_test, y_test_pred_gb, average='macro')

print(f"\nGradientBoosting Performance:")
print(f"Test - Accuracy: {test_accuracy_gb:.3f}, Balanced Acc: {test_balanced_acc_gb:.3f}, Macro F1: {test_f1_gb:.3f}")

# Feature importance analysis
print(f"\nFeature Importance (Top 10):")
feature_importance_gb = pd.DataFrame({
    'feature': X_train.columns,
    'importance': gb_clf.feature_importances_
}).sort_values('importance', ascending=False)

for i, (_, row) in enumerate(feature_importance_gb.head(10).iterrows(), 1):
    print(f"{i:2d}. {row['feature']}: {row['importance']:.3f}")

# Store GradientBoosting results in `gb_results`, a separate name from the XGBoost
# cell's `xgb_results`, so that neither set of results overwrites the other
gb_results = {
    'model': 'GradientBoosting',
    'train_accuracy': train_accuracy_gb,
    'test_accuracy': test_accuracy_gb,
    'train_balanced_acc': train_balanced_acc_gb,
    'test_balanced_acc': test_balanced_acc_gb,
    'train_f1': train_f1_gb,
    'test_f1': test_f1_gb
}

print("GradientBoost model completed!")


XGBoost Performance:
Test - Accuracy: 0.752, Balanced Acc: 0.688, Macro F1: 0.699

Feature Importance (Top 10):
 1. education_level_ordinal: 0.424
 2. has_job_experience_encoded: 0.146
 3. prevailing_wage_log: 0.144
 4. region_target_encoded: 0.060
 5. continent_Europe: 0.049
 6. wage_per_year: 0.032
 7. continent_North America: 0.025
 8. wage_per_age_ratio: 0.023
 9. no_of_employees_log: 0.021
10. wage_employee_ratio: 0.019
GradientBoost model completed!


#### **2. XGBoost Classifier (EDA Recommendation)**

In [25]:
# Create XGBoost model
# The target is binary (classes 0 and 1), so the binary 'logloss' metric is used;
# 'mlogloss' is the multiclass variant and does not match this problem.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    random_state=42,
    eval_metric='logloss',
    n_jobs=-1
)

# Train the model (on the unscaled training features)
xgb_model.fit(X_train, y_train)

# Make predictions
y_train_pred_xgb = xgb_model.predict(X_train)
y_test_pred_xgb = xgb_model.predict(X_test)

# Evaluate performance and calculate metrics
# NOTE(review): metrics are computed on the test set; X_val / y_val from the split cell
# are not used here. Consider comparing models on the validation set and keeping the
# test set for the final model only.
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)

train_balanced_acc_xgb = balanced_accuracy_score(y_train, y_train_pred_xgb)
test_balanced_acc_xgb = balanced_accuracy_score(y_test, y_test_pred_xgb)

# Macro averaging weights both classes equally regardless of their frequency
train_f1_xgb = f1_score(y_train, y_train_pred_xgb, average='macro')
test_f1_xgb = f1_score(y_test, y_test_pred_xgb, average='macro')

print(f"\nXGBoost Performance:")
print(f"Test - Accuracy: {test_accuracy_xgb:.3f}, Balanced Acc: {test_balanced_acc_xgb:.3f}, Macro F1: {test_f1_xgb:.3f}")

# Feature importance analysis
print(f"\nFeature Importance (Top 10):")
feature_importance_xgb = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

for i, (_, row) in enumerate(feature_importance_xgb.head(10).iterrows(), 1):
    print(f"{i:2d}. {row['feature']}: {row['importance']:.3f}")

# Store XGBoost results
xgb_results = {
    'model': 'XGBoost',
    'train_accuracy': train_accuracy_xgb,
    'test_accuracy': test_accuracy_xgb,
    'train_balanced_acc': train_balanced_acc_xgb,
    'test_balanced_acc': test_balanced_acc_xgb,
    'train_f1': train_f1_xgb,
    'test_f1': test_f1_xgb
}

print("XGBoost model completed!")


XGBoost Performance:
Test - Accuracy: 0.726, Balanced Acc: 0.664, Macro F1: 0.671

Feature Importance (Top 10):
 1. education_level_ordinal: 0.203
 2. has_job_experience_encoded: 0.131
 3. continent_Europe: 0.102
 4. full_time_position_encoded: 0.089
 5. prevailing_wage_log: 0.051
 6. continent_North America: 0.048
 7. region_target_encoded: 0.046
 8. continent_South America: 0.045
 9. requires_job_training_encoded: 0.041
10. continent_Oceania: 0.031
XGBoost model completed!


#### **3. LightGBM (EDA Recommendation)**