In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import h5py

In [2]:
# Load the loan dataset (one directory above the notebook) into a pandas DataFrame
df = pd.read_csv(Path("../mainDS.csv"))

# Preview the first rows to sanity-check the load
df.head()

Unnamed: 0,ID,Loan Amount,Funded Amount,Funded Amount Investor,Term,Batch Enrolled,Interest Rate,Grade,Sub Grade,Home Ownership,...,Recoveries,Collection Recovery Fee,Collection 12 months Medical,Application Type,Last week Pay,Accounts Delinquent,Total Collection Amount,Total Current Balance,Total Revolving Credit Limit,Loan Status
0,65087372,10000,32236,12329.36286,59,BAT2522922,11.135007,B,C4,MORTGAGE,...,2.498291,0.793724,0,INDIVIDUAL,49,0,31,311301,6619,0
1,1450153,3609,11940,12191.99692,59,BAT1586599,12.237563,C,D3,RENT,...,2.377215,0.974821,0,INDIVIDUAL,109,0,53,182610,20885,0
2,1969101,28276,9311,21603.22455,59,BAT2136391,12.545884,F,D4,MORTGAGE,...,4.316277,1.020075,0,INDIVIDUAL,66,0,34,89801,26155,0
3,6651430,11170,6954,17877.15585,59,BAT2428731,16.731201,C,C3,MORTGAGE,...,0.10702,0.749971,0,INDIVIDUAL,39,0,40,9189,60214,0
4,14354669,16890,13226,13539.92667,59,BAT5341619,15.0083,C,D4,MORTGAGE,...,1294.818751,0.368953,0,INDIVIDUAL,18,0,430,126029,22579,0


In [3]:
# Columns to exclude from the rest of the analysis
candidate_columns = ('ID', 'Application Type', 'Batch Enrolled', 'Term')

# Keep only the names still present in the frame, so re-running this cell
# after the columns are already gone does not raise a KeyError
columns_to_drop = [name for name in candidate_columns if name in df.columns]

# Remove them and rebind df to the reduced frame
df = df.drop(columns_to_drop, axis=1)

In [49]:
# Show the largest and smallest interest rate, one per line.
# NOTE: the binning cell further down overwrites 'Interest Rate' with
# category labels, so run this while the column is still numeric.
interest_rate = df['Interest Rate']
print(interest_rate.max(), interest_rate.min(), sep='\n')

27.18234758
5.320005799


In [4]:
# Bin Funded Amount into labelled ranges (intervals are right-inclusive).
# The outer edges are open-ended so that values at or below 1000, or above
# 50000, fall into the '<5000' / '30000+' buckets as the labels promise.
# With finite outer edges pd.cut silently returns NaN for such values,
# because the left-most edge is exclusive and anything outside the listed
# bins is unmatched.
bins1 = [-np.inf, 5000, 10000, 20000, 30000, np.inf]
labels1 = ['<5000', '5000 - 10000', '10000 - 20000', '20000 - 30000', '30000+']
df['Funded Amount'] = pd.cut(df['Funded Amount'], bins=bins1, labels=labels1)

In [5]:
# Bin Funded Amount Investor into labelled ranges (intervals are right-inclusive).
# The outer edges are open-ended so that values at or below 1000, or above
# 50000, fall into the '<5000' / '30000+' buckets as the labels promise.
# With finite outer edges pd.cut silently returns NaN for such values,
# because the left-most edge is exclusive and anything outside the listed
# bins is unmatched.
bins2 = [-np.inf, 5000, 10000, 20000, 30000, np.inf]
labels2 = ['<5000', '5000 - 10000', '10000 - 20000', '20000 - 30000', '30000+']
df['Funded Amount Investor'] = pd.cut(df['Funded Amount Investor'], bins=bins2, labels=labels2)

In [6]:
# Bin Loan Amount into labelled ranges (intervals are right-inclusive).
# The outer edges are open-ended so that values at or below 1000, or above
# 50000, fall into the '<5000' / '30000+' buckets as the labels promise.
# With finite outer edges pd.cut silently returns NaN for such values,
# because the left-most edge is exclusive and anything outside the listed
# bins is unmatched.
bins3 = [-np.inf, 5000, 10000, 20000, 30000, np.inf]
labels3 = ['<5000', '5000 - 10000', '10000 - 20000', '20000 - 30000', '30000+']
df['Loan Amount'] = pd.cut(df['Loan Amount'], bins=bins3, labels=labels3)

In [7]:
# Bin Interest Rate into one-point-wide buckets from 5 to 15, plus a final
# catch-all bucket up to 100 labelled '15+'. Each bucket is named after its
# lower edge; intervals are right-inclusive (pd.cut default).
bins4 = [*range(5, 16), 100]
labels4 = [str(lower_edge) for lower_edge in range(5, 15)] + ['15+']
df['Interest Rate'] = pd.cut(df['Interest Rate'], bins=bins4, labels=labels4)

In [54]:
# Preview the frame after the column drops and binning steps above
df.head()

Unnamed: 0,Loan Amount,Funded Amount,Funded Amount Investor,Interest Rate,Grade,Sub Grade,Home Ownership,Employment Duration (years),Verification Status,Payment Plan,...,Total Received Late Fee,Recoveries,Collection Recovery Fee,Collection 12 months Medical,Last week Pay,Accounts Delinquent,Total Collection Amount,Total Current Balance,Total Revolving Credit Limit,Loan Status
0,5000 - 10000,30000+,10000 - 20000,11,B,C4,MORTGAGE,20,Not Verified,n,...,0.102055,2.498291,0.793724,0,49,0,31,311301,6619,0
1,<5000,10000 - 20000,10000 - 20000,12,C,D3,RENT,4,Source Verified,n,...,0.036181,2.377215,0.974821,0,109,0,53,182610,20885,0
2,20000 - 30000,5000 - 10000,20000 - 30000,12,F,D4,MORTGAGE,10,Source Verified,n,...,18.77866,4.316277,1.020075,0,66,0,34,89801,26155,0
3,10000 - 20000,5000 - 10000,10000 - 20000,15+,C,C3,MORTGAGE,12,Source Verified,n,...,0.044131,0.10702,0.749971,0,39,0,40,9189,60214,0
4,10000 - 20000,10000 - 20000,10000 - 20000,15+,C,D4,MORTGAGE,5,Source Verified,n,...,19.306646,1294.818751,0.368953,0,18,0,430,126029,22579,0


In [8]:
# Splitting data into training and testing datasets
from sklearn.model_selection import train_test_split
df_dummies = pd.get_dummies(df)

X = df_dummies.drop('Loan Status', axis=1)
y = df_dummies['Loan Status'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
# Display the feature matrix (pandas truncates the rendering of large frames)
X

Unnamed: 0,Employment Duration (years),Debit to Income,Delinquency - two years,Inquires - six months,Open Account,Public Record,Revolving Balance,Revolving Utilities,Total Accounts,Total Received Interest,...,Loan Title_home improvement,Loan Title_loan1,Loan Title_pay off bills,Loan Title_payoff,Loan Title_personal,Loan Title_refi,Loan Title_relief,Loan Title_vacation,Initial List Status_f,Initial List Status_w
0,20,16.284758,1,0,13,0,24246,74.932551,7,2929.646315,...,False,False,False,False,False,False,False,False,False,True
1,4,15.412409,0,0,12,0,812,78.297186,13,772.769385,...,False,False,False,False,False,False,False,False,True,False
2,10,28.137619,0,0,14,0,1843,2.073040,20,863.324396,...,False,False,False,False,False,False,False,False,False,True
3,12,18.043730,1,0,7,0,13819,67.467951,12,288.173196,...,False,False,False,False,False,False,False,False,False,True
4,5,17.209886,1,3,13,1,1544,85.250761,22,129.239553,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70361,4,27.743028,0,0,11,0,17635,78.841002,14,2572.262269,...,False,False,False,False,False,False,False,False,True,False
70362,5,30.345992,0,0,13,0,23876,49.880686,21,180.549634,...,False,False,False,False,False,False,False,False,False,True
70363,2,32.129263,0,0,26,0,12837,49.782126,13,1142.184219,...,False,False,False,False,False,False,False,False,True,False
70364,11,34.928045,0,0,10,0,3778,4.683398,8,1754.342481,...,False,False,False,False,False,False,False,False,False,True


In [57]:
# Wrap the held-out features and target in DataFrames
# (y_test, a single selected column, becomes a one-column frame)
X_test_df, y_test_df = pd.DataFrame(X_test), pd.DataFrame(y_test)


In [9]:
# Scaling
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [87]:
# Inspect the scaled test features (rendered as a NumPy array in the output)
X_test_scaled

array([[-0.5250337 ,  1.04573953, -0.40738934, ..., -0.011152  ,
         1.07676793, -1.07676793],
       [-0.33180565, -0.72417347,  0.82914473, ..., -0.011152  ,
        -0.92870522,  0.92870522],
       [ 0.05465046, -0.9807762 , -0.40738934, ..., -0.011152  ,
         1.07676793, -1.07676793],
       ...,
       [ 0.63433463, -0.20976572, -0.40738934, ..., -0.011152  ,
         1.07676793, -1.07676793],
       [ 1.02079073,  0.58306641, -0.40738934, ..., -0.011152  ,
        -0.92870522,  0.92870522],
       [-0.71826175,  0.26209326, -0.40738934, ..., -0.011152  ,
        -0.92870522,  0.92870522]])

In [60]:
# Persist the scaled test features. The scaled data is a plain array with no
# column names, so the CSV header is the default integer column labels.
X_test_scaled_df = pd.DataFrame(data=X_test_scaled)
X_test_scaled_df.to_csv(path_or_buf='X_test_scaled.csv', index=False)

In [162]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model.
#   - random_state=42 fixes the seed used by the stochastic 'saga' solver.
#   - max_iter=500 caps the number of solver iterations.
#   - class_weight weights class 1 nine times as heavily as class 0.
#   - multi_class is deliberately left at its default: the parameter is
#     deprecated in scikit-learn 1.5+ (passing it emits a FutureWarning), and
#     for a two-class target the default resolves to the same one-vs-rest
#     formulation that multi_class='ovr' requested, so the fit is unchanged.
model = LogisticRegression(random_state=42,
                           max_iter=500,
                           class_weight={0: 0.01, 1: 0.09},
                           solver='saga',
                           n_jobs=-1)

# Fit the model using training data
model.fit(X_train_scaled, y_train)



In [163]:
# Predict labels for the scaled test features
predictions = model.predict(X_test_scaled)

# Compute all evaluation artefacts before reporting anything
balanced_accuracy = balanced_accuracy_score(y_test, predictions)
confusion = confusion_matrix(y_test, predictions)
class_rep = classification_report(y_test, predictions)

# Report each artefact under its heading, in a fixed order
for heading, result in (
    ("Balanced Accuracy Score:", balanced_accuracy),
    ("Confusion Matrix:", confusion),
    ("Classification Report:", class_rep),
):
    print(heading, result)

Balanced Accuracy Score: 0.5151591659700994
Confusion Matrix: [[10331  2526]
 [  941   276]]
Classification Report:               precision    recall  f1-score   support

           0       0.92      0.80      0.86     12857
           1       0.10      0.23      0.14      1217

    accuracy                           0.75     14074
   macro avg       0.51      0.52      0.50     14074
weighted avg       0.85      0.75      0.79     14074



In [46]:
# Wrap the held-out split in DataFrames so both parts can be written as CSV
X_test_df = pd.DataFrame(X_test)
y_test_df = pd.DataFrame(y_test)

# Output file names (relative to the working directory)
x_test_filename = "X_test.csv"
y_test_filename = "y_test.csv"

# Write each frame to its file without the index column
for frame, destination in (
    (X_test_df, x_test_filename),
    (y_test_df, y_test_filename),
):
    frame.to_csv(destination, index=False)

# Confirm where the data went
print(f"X test data saved to {x_test_filename}")
print(f"y test data saved to {y_test_filename}")

X test data saved to X_test.csv
y test data saved to y_test.csv


In [160]:
import joblib

In [161]:
# Persist the fitted model to 'model_4.joblib' (path relative to the working
# directory). joblib.dump returns the list of file names it wrote, which is
# what the cell displays.
joblib.dump(model, 'model_4.joblib')

['model_4.joblib']