In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import joblib

# Load the dataset
file_path = "C:\\Users\\Hiremath\\OneDrive\\Desktop\\loan_prediction.csv"
data = pd.read_csv(file_path)

# Data preprocessing
# Remove Loan_ID column before imputation
data = data.drop(columns=['Loan_ID'])

# Handle 'Dependents' column
data['Dependents'] = data['Dependents'].replace('3+', 3).astype(float)

# Separate target variable
X = data.drop(columns=['Loan_Status'])
y = data['Loan_Status']

# Handling missing values for categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns
X[categorical_columns] = X[categorical_columns].fillna(X[categorical_columns].mode().iloc[0])

# Encoding categorical variables using label encoding
label_encoder = LabelEncoder()
for col in categorical_columns:
    X[col] = label_encoder.fit_transform(X[col])

# Handling missing values for numerical columns
imputer = KNNImputer(n_neighbors=5)
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Model selection and training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Cross-validation to check for overfitting/underfitting
cross_val_scores = cross_val_score(best_model, X_train, y_train, cv=5)
mean_cv_accuracy = np.mean(cross_val_scores)

# Model evaluation on test data
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Save the best model
joblib.dump(best_model, 'loan_approval_model.pkl')

# Print results
print("Mean CV Accuracy:", mean_cv_accuracy)
print("Test Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

# Make predictions on new data
new_data = X_test.iloc[:5]  # Example: Replace with your new data
new_predictions = best_model.predict(new_data)
print("New Data Predictions:", new_predictions)


Mean CV Accuracy: 0.8124922696351268
Test Accuracy: 0.7967479674796748
Classification Report:
               precision    recall  f1-score   support

           N       0.95      0.44      0.60        43
           Y       0.77      0.99      0.86        80

    accuracy                           0.80       123
   macro avg       0.86      0.71      0.73       123
weighted avg       0.83      0.80      0.77       123

New Data Predictions: ['Y' 'Y' 'Y' 'Y' 'Y']
