In [1]:
# Import pandas module
import pandas as pd

# Set display columns to max to allow us to see all columns in results
pd.set_option('display.max_columns', None)

# Read the loan_clean csv file in
loan = pd.read_csv('loan_clean.csv')

In [2]:
# Create a function to encode 'acc_now_delinq' to 1 for delinquent and 0 for non-delinquent (to use as the target/result)

def delinq(delinq_count):
    if delinq_count != 0:
        return 1
    else:
        return 0

# Apply the 'acc_now_delinq' column which shows number of delinquent accounts to the function and assigning it to a new column 'delinq_status'
loan['delinq_status'] = loan.acc_now_delinq.apply(delinq)

In [3]:
# Importing relevant modules
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier



In [4]:
# Create feature matrix (X), i.e. features or factors that would impact prediction of delinquency

# Create list of feature columns with the numerical features
feature_cols = ['annual_inc', 'dti', 'pub_rec_bankruptcies', 'delinq_2yrs', 'mths_since_last_delinq']


In [5]:
# Creating feature matrix (X)
X = loan[feature_cols]

# Creating response vector (y)
y = loan.delinq_status


# Performing Train/Test Split

In [6]:
# Importing relevant modules for the train/test split
from sklearn.model_selection import train_test_split


In [7]:
# Performing the train/test split with the test set set to 30% of dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Creating Oversample set using SMOTE

In [8]:
# Importing SMOTE module
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE oversampling algorithm
smote = SMOTE()

# Apply SMOTE to oversample the minority class
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)


In [9]:
# Define models to evaluate

models = [KNeighborsClassifier(n_neighbors = 1),
          RandomForestClassifier(n_estimators = 500, max_features='sqrt', random_state=123),
          XGBClassifier(max_depth=16, eta=0.1, n_estimators=1750, seed=123)]


In [10]:
# Perform k-fold cross-validation

k_folds = 5

for model in models:
    
    # Create a k-fold cross-validation object
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=123)
    
    # Instantiate the model
    model_instance = model
    
    # Perform cross-validation
    scores = cross_val_score(model_instance, X_train_oversampled, y_train_oversampled, cv=kf)
    
    # Calculate and print average score
    avg_score = scores.mean()
    print(f"Model: {type(model_instance).__name__}, Average Score: {avg_score}")

Model: KNeighborsClassifier, Average Score: 0.9951892326607625
Model: RandomForestClassifier, Average Score: 0.9983386324801288
Model: XGBClassifier, Average Score: 0.9986756332964021
