In [20]:
# ModelBuilding.ipynb

import os
import sys

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

sys.path.append('../src')  # Add the src directory to the system path

from data_preprocessing import load_data, handle_missing_values
from feature_engineering import add_total_income_feature
from model_training import encode_categorical_variables, train_model
from predictions import make_predictions, save_predictions

# Load the preprocessed data
train_data = pd.read_csv('../data/train_cleaned.csv')
test_data = pd.read_csv('../data/test_cleaned.csv')

# Remember where the training rows end *before* either frame is rebound, so the
# split-back below does not depend on the order of the two slicing statements.
n_train_rows = len(train_data)
n_test_rows = len(test_data)

# Combine train and test data for consistent encoding
# NOTE(review): any column present in only one of the two files is filled with
# NaN for the other file's rows by the concat. If the test file has no
# 'Loan_Status', the encoder below presumably sees that placeholder as an extra
# category -- harmless only as long as the column is dropped from the test
# features afterwards; confirm.
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Feature Engineering: Add Total Income feature
combined_data = add_total_income_feature(combined_data)

# The positional split-back is only valid if the helper kept every row.
assert len(combined_data) == n_train_rows + n_test_rows, \
    'feature engineering changed the number of rows; positional split-back would be wrong'

# Encode categorical variables using LabelEncoder
# One fitted encoder per object-typed column is kept in `label_encoders`,
# keyed by column name, so the integer codes can be mapped back later.
label_encoders = {}
categorical_columns = combined_data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    encoder = LabelEncoder()
    combined_data[column] = encoder.fit_transform(combined_data[column])
    label_encoders[column] = encoder

# Separate back into train and test data
# .copy() gives each frame its own data, so later column assignments do not
# raise chained-assignment warnings or write through to `combined_data`.
train_data = combined_data.iloc[:n_train_rows, :].copy()
test_data = combined_data.iloc[n_train_rows:, :].copy()

# Split the data into features and target variable
TARGET_COLUMN = 'Loan_Status'
X = train_data.drop(columns=[TARGET_COLUMN])
y = train_data[TARGET_COLUMN]

# Split the data into training and validation sets
# (80/20 hold-out; the fixed random_state keeps the split reproducible)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
# The scaler is fitted on the training split only; validation and test data
# are transformed with those training statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

test_data_features = test_data.drop(columns=[TARGET_COLUMN])  # Exclude target variable
# The scaler was fitted on X's columns; make sure the test features line up
# one-to-one, in the same order, before transforming them.
assert list(test_data_features.columns) == list(X.columns), \
    'train/test feature columns differ'
test_data_scaled = scaler.transform(test_data_features)

# Model Building: Train the Random Forest Classifier
# train_model is a project helper; it is called with the scaled training and
# validation splits and returns the fitted model plus its validation metrics.
model, accuracy, confusion, classification_rep = train_model(X_train_scaled, y_train, X_val_scaled, y_val)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Classification Report:\n{classification_rep}')

# Save the model
# joblib.dump does not create missing directories, so make sure the target
# folder exists on a fresh checkout.
os.makedirs('../model', exist_ok=True)
joblib.dump(model, '../model/random_forest_model.pkl')
# The model was fitted on label-encoded, standardised features, so the pickle
# alone cannot score new raw data. Persist the fitted preprocessing objects
# next to it (additional files; the model file itself is unchanged).
joblib.dump(scaler, '../model/scaler.pkl')
joblib.dump(label_encoders, '../model/label_encoders.pkl')

# Make predictions on the test set
test_predictions = make_predictions(model, test_data_scaled)

# Prepare the submission file
# The unscaled test features are passed alongside the predictions.
save_predictions(test_predictions, test_data_features, filename='../data/loan_predictions.csv')


Accuracy: 0.7967479674796748
Confusion Matrix:
[[21 22]
 [ 3 77]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.49      0.63        43
           1       0.78      0.96      0.86        80

    accuracy                           0.80       123
   macro avg       0.83      0.73      0.74       123
weighted avg       0.81      0.80      0.78       123

