In [102]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import requests

In [103]:
api_url= "https://debt-api-4301881a2ff8.herokuapp.com/loan/get_all"
res = requests.get(api_url)
train_data = pd.DataFrame(res.json()["data"])
test_data = pd.read_csv('test.csv')

In [104]:
train_data.head()

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,applicantIncome,coapplicant_income,loan_amount,loan_amount_term,credit_history,property_area,loan_status
0,LP001002,Male,No,0,Graduate,No,5849,0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0,141.0,360.0,1.0,Urban,Y


In [105]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   loan_id             614 non-null    object 
 1   gender              601 non-null    object 
 2   married             611 non-null    object 
 3   dependents          599 non-null    object 
 4   education           614 non-null    object 
 5   self_employed       582 non-null    object 
 6   applicantIncome     614 non-null    int64  
 7   coapplicant_income  614 non-null    object 
 8   loan_amount         592 non-null    float64
 9   loan_amount_term    600 non-null    float64
 10  credit_history      564 non-null    float64
 11  property_area       614 non-null    object 
 12  loan_status         614 non-null    object 
dtypes: float64(3), int64(1), object(9)
memory usage: 62.5+ KB


In [106]:
# convert string to float
train_data['coapplicant_income'] = train_data['coapplicant_income'].astype(float)
print(train_data['coapplicant_income'].dtype)

float64


In [107]:
# convert string to float
train_data['applicantIncome'] = train_data['applicantIncome'].astype(float)
print(train_data['coapplicant_income'].dtype)

float64


In [108]:
train_data.dtypes

loan_id                object
gender                 object
married                object
dependents             object
education              object
self_employed          object
applicantIncome       float64
coapplicant_income    float64
loan_amount           float64
loan_amount_term      float64
credit_history        float64
property_area          object
loan_status            object
dtype: object

In [109]:
train_data.isnull().sum()

loan_id                0
gender                13
married                3
dependents            15
education              0
self_employed         32
applicantIncome        0
coapplicant_income     0
loan_amount           22
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

In [110]:
# handle categorical missing data
# train_data['Gender'].mode()[0]
train_data['gender'] = train_data['gender'].fillna(train_data['gender'].mode()[0])
train_data['married'] = train_data['married'].fillna(train_data['married'].mode()[0])
train_data['dependents'] = train_data['dependents'].fillna(train_data['dependents'].mode()[0])
train_data['self_employed'] = train_data['self_employed'].fillna(train_data['self_employed'].mode()[0])

In [111]:
# handle numerical missing data
train_data['loan_amount'] = train_data['loan_amount'].fillna(train_data['loan_amount'].mean())
train_data['loan_amount_term'] = train_data['loan_amount_term'].fillna(train_data['loan_amount_term'].mean())
train_data['credit_history'] = train_data['credit_history'].fillna(train_data['credit_history'].mean())

In [112]:
features = ['gender', 'married', 'dependents', 'education', 'self_employed',
            'applicantIncome', 'coapplicant_income', 'loan_amount', 'loan_amount_term',
            'credit_history', 'property_area']

X = train_data[features]
y = train_data['loan_status']

In [113]:
# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X)

In [114]:
X.head()

Unnamed: 0,applicantIncome,coapplicant_income,loan_amount,loan_amount_term,credit_history,gender_Female,gender_Male,married_No,married_Yes,dependents_0,dependents_1,dependents_2,dependents_3+,education_Graduate,education_Not Graduate,self_employed_No,self_employed_Yes,property_area_Rural,property_area_Semiurban,property_area_Urban
0,5849.0,0.0,146.412162,360.0,1.0,False,True,True,False,True,False,False,False,True,False,True,False,False,False,True
1,4583.0,1508.0,128.0,360.0,1.0,False,True,False,True,False,True,False,False,True,False,True,False,True,False,False
2,3000.0,0.0,66.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,False,True,False,False,True
3,2583.0,2358.0,120.0,360.0,1.0,False,True,False,True,True,False,False,False,False,True,True,False,False,False,True
4,6000.0,0.0,141.0,360.0,1.0,False,True,True,False,True,False,False,False,True,False,True,False,False,False,True


In [115]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [116]:
X_train.head()

Unnamed: 0,applicantIncome,coapplicant_income,loan_amount,loan_amount_term,credit_history,gender_Female,gender_Male,married_No,married_Yes,dependents_0,dependents_1,dependents_2,dependents_3+,education_Graduate,education_Not Graduate,self_employed_No,self_employed_Yes,property_area_Rural,property_area_Semiurban,property_area_Urban
83,6000.0,2250.0,265.0,360.0,0.842199,False,True,False,True,True,False,False,False,True,False,True,False,False,True,False
90,2958.0,2900.0,131.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,True,False,False,True,False
227,6250.0,1695.0,210.0,360.0,1.0,False,True,False,True,False,False,True,False,True,False,True,False,False,True,False
482,2083.0,3150.0,128.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,True,False,False,True,False
464,4166.0,0.0,98.0,360.0,0.0,False,True,True,False,True,False,False,False,True,False,True,False,False,True,False


In [117]:
y_train.head()

83     N
90     Y
227    Y
482    Y
464    N
Name: loan_status, dtype: object

In [118]:
# Create a random forest classifier
Model_RF = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
Model_RF.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = Model_RF.predict(X_val)

In [119]:
# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report for more detailed evaluation
print(classification_report(y_val, y_pred))

Accuracy: 0.7723577235772358
              precision    recall  f1-score   support

           N       0.86      0.42      0.56        43
           Y       0.75      0.96      0.85        80

    accuracy                           0.77       123
   macro avg       0.81      0.69      0.70       123
weighted avg       0.79      0.77      0.75       123



In [120]:
# Now, I can use the trained model to make predictions on the test dataset
X_test = test_data[features]

In [121]:
X_test.isnull().sum()

gender                11
married                0
dependents            10
education              0
self_employed         23
applicantIncome        0
coapplicant_income     0
loan_amount            5
loan_amount_term       6
credit_history        29
property_area          0
dtype: int64

In [122]:
X_test['loan_amount'] = X_test['loan_amount'].fillna(X_test['loan_amount'].mean())
X_test['loan_amount_term'] = X_test['loan_amount_term'].fillna(X_test['loan_amount_term'].mean())
X_test['credit_history'] = X_test['credit_history'].fillna(X_test['credit_history'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['loan_amount'] = X_test['loan_amount'].fillna(X_test['loan_amount'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['loan_amount_term'] = X_test['loan_amount_term'].fillna(X_test['loan_amount_term'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['credit_history']

In [123]:
X_test.isnull().sum()

gender                11
married                0
dependents            10
education              0
self_employed         23
applicantIncome        0
coapplicant_income     0
loan_amount            0
loan_amount_term       0
credit_history         0
property_area          0
dtype: int64

In [124]:
# handle categorical missing data
# train_data['Gender'].mode()[0]
X_test['gender'] = X_test['gender'].fillna(X_test['gender'].mode()[0])
X_test['dependents'] = X_test['dependents'].fillna(X_test['dependents'].mode()[0])
X_test['self_employed'] = X_test['self_employed'].fillna(X_test['self_employed'].mode()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['gender'] = X_test['gender'].fillna(X_test['gender'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['dependents'] = X_test['dependents'].fillna(X_test['dependents'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['self_employed'] = X_test['self_employed'].f

In [125]:
X_test.isnull().sum()

gender                0
married               0
dependents            0
education             0
self_employed         0
applicantIncome       0
coapplicant_income    0
loan_amount           0
loan_amount_term      0
credit_history        0
property_area         0
dtype: int64

In [126]:
X_test = pd.get_dummies(X_test)

In [127]:
X_test.head()

Unnamed: 0,applicantIncome,coapplicant_income,loan_amount,loan_amount_term,credit_history,gender_Female,gender_Male,married_No,married_Yes,dependents_0,dependents_1,dependents_2,dependents_3+,education_Graduate,education_Not Graduate,self_employed_No,self_employed_Yes,property_area_Rural,property_area_Semiurban,property_area_Urban
0,5720,0,110.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,True,False,False,False,True
1,3076,1500,126.0,360.0,1.0,False,True,False,True,False,True,False,False,True,False,True,False,False,False,True
2,5000,1800,208.0,360.0,1.0,False,True,False,True,False,False,True,False,True,False,True,False,False,False,True
3,2340,2546,100.0,360.0,0.825444,False,True,False,True,False,False,True,False,True,False,True,False,False,False,True
4,3276,0,78.0,360.0,1.0,False,True,True,False,True,False,False,False,False,True,True,False,False,False,True


In [128]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   applicantIncome          367 non-null    int64  
 1   coapplicant_income       367 non-null    int64  
 2   loan_amount              367 non-null    float64
 3   loan_amount_term         367 non-null    float64
 4   credit_history           367 non-null    float64
 5   gender_Female            367 non-null    bool   
 6   gender_Male              367 non-null    bool   
 7   married_No               367 non-null    bool   
 8   married_Yes              367 non-null    bool   
 9   dependents_0             367 non-null    bool   
 10  dependents_1             367 non-null    bool   
 11  dependents_2             367 non-null    bool   
 12  dependents_3+            367 non-null    bool   
 13  education_Graduate       367 non-null    bool   
 14  education_Not Graduate   3

In [129]:
# convert string to float
X_test['coapplicant_income'] = X_test['coapplicant_income'].astype(float)
print(X_test['coapplicant_income'].dtype)

float64


In [130]:
# convert string to float
X_test['applicantIncome'] = X_test['applicantIncome'].astype(float)
print(X_test['coapplicant_income'].dtype)

float64


In [131]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   applicantIncome          367 non-null    float64
 1   coapplicant_income       367 non-null    float64
 2   loan_amount              367 non-null    float64
 3   loan_amount_term         367 non-null    float64
 4   credit_history           367 non-null    float64
 5   gender_Female            367 non-null    bool   
 6   gender_Male              367 non-null    bool   
 7   married_No               367 non-null    bool   
 8   married_Yes              367 non-null    bool   
 9   dependents_0             367 non-null    bool   
 10  dependents_1             367 non-null    bool   
 11  dependents_2             367 non-null    bool   
 12  dependents_3+            367 non-null    bool   
 13  education_Graduate       367 non-null    bool   
 14  education_Not Graduate   3

In [132]:
test_predictions = Model_RF.predict(X_test)

# Add the predictions to the test dataset
test_data['Loan_Status_Prediction'] = test_predictions

# Save the results to a new CSV file
test_data.to_csv('test_results.csv', index=False)

In [133]:
# Predict on the validation set
y_val_pred = Model_RF.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2f}')

Validation Accuracy: 0.77


In [134]:
#"loan_id":"LP001002":"gender":"Male","married":"No","dependents":"0","education":"Graduate","self_employed":"No","applicantIncome":5849,"coapplicant_income":"0","loan_amount":null,"loan_amount_term":360.0,"credit_history":1,"property_area":"Urban","loan_status":"Y"}
#"loan_id":"LP001003","gender":"Male","married":"Yes","dependents":"1","education":"Graduate","self_employed":"No","applicantIncome":4583,"coapplicant_income":"1508","loan_amount":128,"loan_amount_term":360.0,"credit_history":1,"property_area":"Rural","loan_status":"N"}
#{"loan_id":"LP001005","gender":"Male","married":"Yes","dependents":"0","education":"Graduate","self_employed":"Yes","applicantIncome":3000,"coapplicant_income":"0","loan_amount":66,"loan_amount_term":360.0,"credit_history":1,"property_area":"Urban","loan_status":"Y"},
#{"loan_id":"LP001006","gender":"Male","married":"Yes","dependents":"0","education":"Not Graduate","self_employed":"No","applicantIncome":2583,"coapplicant_income":"2358","loan_amount":120,"loan_amount_term":360.0,"credit_history":1,"property_area":"Urban","loan_status":"Y"},
#{"loan_id":"LP001014","gender":"Male","married":"Yes","dependents":"3+","education":"Graduate","self_employed":"No","applicantIncome":3036,"coapplicant_income":"2504","loan_amount":158,"loan_amount_term":360.0,"credit_history":0,"property_area":"Semiurban","loan_status":"N"},
#{"loan_id":"LP001018","gender":"Male","married":"Yes","dependents":"2","education":"Graduate","self_employed":"No","applicantIncome":4006,"coapplicant_income":"1526","loan_amount":168,"loan_amount_term":360.0,"credit_history":1,"property_area":"Urban","loan_status":"Y"},
#{"loan_id":"LP001020","gender":"Male","married":"Yes","dependents":"1","education":"Graduate","self_employed":"No","applicantIncome":12841,"coapplicant_income":"10968","loan_amount":349,"loan_amount_term":360.0,"credit_history":1,"property_area":"Semiurban","loan_status":"N"},

In [135]:
# List of features used during training
categorical_features = ['gender', 'married', 'dependents', 'education', 'self_employed', 'property_area']

# Create a dictionary with the data for the single test point 
# 124

single_data = {
    'applicantIncome': 12841,
    'coapplicant_income': 10968,
    'loan_amount': 349,
    'loan_amount_term': 360,
    'credit_history': 1,
    'gender': 'Male',
    'married': 'Yes',
    'dependents': '1',
    'education': 'Graduate',
    'self_employed': 'No',
    'property_area': 'Semiurban'
}
# Convert the dictionary to a DataFrame
test_input = pd.DataFrame([single_data])

# Load the entire test dataset
test_data = pd.read_csv('test.csv')  # Replace 'test.csv' with your actual test dataset filename
test_data['loan_amount'] = test_data['loan_amount'].fillna(test_data['loan_amount'].mean())
test_data['loan_amount_term'] = test_data['loan_amount_term'].fillna(test_data['loan_amount_term'].mean())
test_data['credit_history'] = test_data['credit_history'].fillna(test_data['credit_history'].mean())

# Combine the single test point with the test dataset
combined_data = pd.concat([test_input, test_data], ignore_index=True)

# Convert categorical variables to dummy/indicator variables using the same order as during training
combined_data = pd.get_dummies(combined_data, columns=categorical_features)

# Ensure the order of columns matches the order during training
# Align the columns with the training data to handle potential mismatches
combined_data = combined_data.reindex(columns=X.columns, fill_value=0)

# Separate the single test point from the combined data
test_input = combined_data.iloc[0:1]
# test_data = combined_data.iloc[1:]

# Make the prediction using the trained model on the single test point
prediction_single_data = Model_RF.predict(test_input)

# Make the prediction using the trained model on the entire test dataset
# prediction_test_data = Model_RF.predict(test_data)

# Print the predictions
print(f'Loan Status Prediction for Single Data: {prediction_single_data[0]}')
# print('Loan Status Predictions for Test Data:')
# print(prediction_test_data)

Loan Status Prediction for Single Data: N


In [136]:
prediction_single_data 

array(['N'], dtype=object)

In [137]:
import pickle
# Save the model to a file using pickle
with open('random_forest_model1.pkl', 'wb') as file:
    pickle.dump(Model_RF, file)

In [138]:
# Load the model from the saved file
with open('random_forest_model1.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [139]:
# Now you can use the loaded model to make predictions on a single input
single_data = {
    'applicantIncome': 12841,
    'coapplicant_income': 10968,
    'loan_amount': 349,
    'loan_amount_term':360,
    'credit_history': 1,
    'gender': 'Male',
    'married': 'Yes',
    'dependents': '1',
    'education': 'Graduate',
    'self_employed': 'No',
    'property_area': 'Semiurban'
}

In [140]:
# Convert the input data to a DataFrame
single_data_df = pd.DataFrame([single_data])

# Convert categorical variables to dummy/indicator variables using the same order as during training
single_data_df = pd.get_dummies(single_data_df, columns=categorical_features)

# Ensure the order of columns matches the order during training
single_data_df = single_data_df.reindex(columns=X.columns, fill_value=0)

# Make predictions using the loaded model
prediction = loaded_model.predict(single_data_df)

# Print the prediction
print(f'Loan Status Prediction: {prediction[0]}')

Loan Status Prediction: N


In [141]:
prediction

array(['N'], dtype=object)