In [40]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sys
sys.path.append('../src')  # Add the src directory to the system path

from data_preprocessing import load_data, handle_missing_values
from feature_engineering import add_total_income_feature
from model_training import encode_categorical_variables, train_model
from predictions import make_predictions, save_predictions
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the preprocessed data
train_data = pd.read_csv('../data/train_cleaned.csv')
test_data = pd.read_csv('../data/test_cleaned.csv')

# Combine train and test data for consistent encoding
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Feature Engineering: Add Total Income feature
combined_data = add_total_income_feature(combined_data)

# Encode categorical variables using LabelEncoder
label_encoders = {}
for column in combined_data.select_dtypes(include=['object']):
    label_encoders[column] = LabelEncoder()
    combined_data[column] = label_encoders[column].fit_transform(combined_data[column])

# Separate back into train and test data
train_data = combined_data.iloc[:train_data.shape[0], :]
test_data = combined_data.iloc[train_data.shape[0]:, :]

# Split the data into features and target variable
X = train_data.drop(columns=['Loan_Status'])
y = train_data['Loan_Status']

# Split the data into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_data_features = test_data.drop(columns=['Loan_Status'])  # Exclude target variable
test_data_scaled = scaler.transform(test_data_features)

# Model Building: Train the Random Forest Classifier
model, accuracy, confusion, classification_rep = train_model(X_train_scaled, y_train, X_val_scaled, y_val)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Classification Report:\n{classification_rep}')

# Save the model
import joblib
joblib.dump(model, '../model/random_forest_model(1).pkl')

# Make predictions on the test set
test_predictions = make_predictions(model, test_data_scaled)

# Prepare the submission file
save_predictions(test_predictions, test_data_features, filename='../data/loan_predictions.csv')

# Load the CSV files for merging
loan_predictions = pd.read_csv('../data/loan_predictions.csv')
test_cleaned = pd.read_csv('../data/test_cleaned.csv')

# Drop the 'Loan_ID' column from loan_predictions
loan_predictions = loan_predictions.drop(columns=['Loan_ID'])

# Add remaining columns to test_cleaned
merged_data = pd.concat([test_cleaned, loan_predictions], axis=1)

# Save the merged data to model_training.csv
merged_data.to_csv('../data/model_training.csv', index=False)

print("Merged data saved as model_training.csv")


Accuracy: 0.7967479674796748
Confusion Matrix:
[[21 22]
 [ 3 77]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.49      0.63        43
           1       0.78      0.96      0.86        80

    accuracy                           0.80       123
   macro avg       0.83      0.73      0.74       123
weighted avg       0.81      0.80      0.78       123

Merged data saved as model_training.csv


In [50]:
#Buliding the model

import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

# Verify and update file path
file_path = '../data/model_training.csv'
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found at {file_path}")
else:
    print(f"File found at {file_path}")

# Load training data
train_data = pd.read_csv(file_path)

# Identify categorical columns and numeric columns
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
numeric_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Encode categorical variables
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])
    label_encoders[column] = le

# Encode target variable
le_status = LabelEncoder()
train_data['Loan_Status'] = le_status.fit_transform(train_data['Loan_Status'])

# Separate features and target
X = train_data.drop(columns=['Loan_ID', 'Loan_Status'])
y = train_data['Loan_Status']

# Standardize numeric variables
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Output class distribution
print(f"Class distribution in training set: {np.bincount(y)}")

# Save the model
model_save_path = '../model/random_forest_model.pkl'
joblib.dump(model, model_save_path)
print(f"Model saved at {model_save_path}")

# Save the label encoders and scaler
encoders_save_path = '../model/label_encoders.pkl'
joblib.dump(label_encoders, encoders_save_path)
print(f"Label encoders saved at {encoders_save_path}")

scaler_save_path = '../model/scaler.pkl'
joblib.dump(scaler, scaler_save_path)
print(f"Scaler saved at {scaler_save_path}")

# Save the label encoder for the target variable
status_encoder_save_path = '../model/status_encoder.pkl'
joblib.dump(le_status, status_encoder_save_path)
print(f"Status encoder saved at {status_encoder_save_path}")


File found at ../data/model_training.csv
Class distribution in training set: [ 70 297]
Model saved at ../model/random_forest_model.pkl
Label encoders saved at ../model/label_encoders.pkl
Scaler saved at ../model/scaler.pkl
Status encoder saved at ../model/status_encoder.pkl
