In [23]:
# Import necessary libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pickle

In [24]:
# Step 1: Load and Prepare the Training Data
# Adjust the path to load your training data file
train_data = pd.read_csv("Dataset/loan_sanction_train.csv")


In [25]:
train_data['LoanAmount'] = train_data['LoanAmount'] / 1000


In [26]:
# Apply one-hot encoding to categorical columns
train_data = pd.get_dummies(train_data, columns=[
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'
])

In [27]:
# Define the feature columns (ensure these match what is used in `app.py`)
features = [
    "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History",
    "Gender_Female", "Gender_Male", "Married_No", "Married_Yes", 
    "Dependents_0", "Dependents_1", "Dependents_2", "Dependents_3+",
    "Education_Graduate", "Education_Not Graduate", "Self_Employed_No", "Self_Employed_Yes",
    "Property_Area_Rural", "Property_Area_Semiurban", "Property_Area_Urban"
]

In [28]:
# Select the features from the data
train_features = train_data[features]
train_labels = train_data['Loan_Status']


In [29]:
# Step 2: Handle Missing Values by Imputing with Mean
imputer = SimpleImputer(strategy='mean')  # Impute missing values with mean
train_features = imputer.fit_transform(train_features)

In [30]:
# Save the imputer for use in app.py
with open('imputer.pkl', 'wb') as f:
    pickle.dump(imputer, f)

In [31]:
# Step 3: Initialize and Fit the Scaler
x_scaler = StandardScaler()
x_scaler.fit(train_features)

In [32]:
# Save the fitted scaler
with open('x_scaler.pkl', 'wb') as f:
    pickle.dump(x_scaler, f)

In [33]:
print("Scaler has been refitted and saved.")

Scaler has been refitted and saved.


In [34]:
  # Step 4: Train the Logistic Regression Model
model = LogisticRegression()
model.fit(train_features, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
# Save the trained model
with open('log_regression.pkl', 'wb') as f:
    pickle.dump(model, f)

In [36]:
print("Model has been trained and saved.")

Model has been trained and saved.


In [37]:
# Step 5: Test the Pipeline with Realistic Sample Input
sample_input = pd.DataFrame({
    "ApplicantIncome": [95000],
    "CoapplicantIncome": [0],
    "LoanAmount": [300000 / 1000],  # scaled to 300 for the model
    "Loan_Amount_Term": [360],
    "Credit_History": [1],
    "Gender_Female": [0],
    "Gender_Male": [1],
    "Married_No": [1],
    "Married_Yes": [0],
    "Dependents_0": [1],
    "Dependents_1": [0],
    "Dependents_2": [0],
    "Dependents_3+": [0],
    "Education_Graduate": [1],
    "Education_Not Graduate": [0],
    "Self_Employed_No": [1],
    "Self_Employed_Yes": [0],
    "Property_Area_Rural": [0],
    "Property_Area_Semiurban": [0],
    "Property_Area_Urban": [1]
})

In [38]:
# Apply imputer and scaler to sample input
sample_input_imputed = imputer.transform(sample_input)
scaled_sample_input = x_scaler.transform(sample_input_imputed)

In [39]:
# Make prediction
sample_prediction = model.predict_proba(scaled_sample_input)
print("Sample input prediction probabilities:", sample_prediction)

Sample input prediction probabilities: [[1.00000000e+000 7.48015097e-103]]
