In [1]:
# Import pandas module
import pandas as pd

# Set display columns to max to allow us to see all columns in results
pd.set_option('display.max_columns', None)

# Read the loan_clean csv file in
loan = pd.read_csv('loan_clean.csv')

In [2]:
# Create a function to encode 'acc_now_delinq' to 1 for delinquent and 0 for non-delinquent (to use as the target/result)

def delinq(delinq_count):
    if delinq_count != 0:
        return 1
    else:
        return 0

# Apply the 'acc_now_delinq' column which shows number of delinquent accounts to the function and assigning it to a new column 'delinq_status'
loan['delinq_status'] = loan.acc_now_delinq.apply(delinq)

In [3]:
# Importing relevant modules
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier



In [4]:
# Create feature matrix (X), i.e. features or factors that would impact prediction of delinquency

# Create list of feature columns with the numerical features
feature_cols = ['annual_inc', 'dti', 'pub_rec_bankruptcies', 'delinq_2yrs', 'mths_since_last_delinq']


In [5]:
# Creating feature matrix (X)
X = loan[feature_cols]

# Creating response vector (y)
y = loan.delinq_status


In [6]:
# Performing the train/test split with the test set set to 30% of dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [7]:
# Importing SMOTE module
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE oversampling algorithm
smote = SMOTE()

# Apply SMOTE to oversample the minority class
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)


In [8]:
# Define range of n_estimators to evaluate

n_estimators_range = [100, 200, 300, 400, 500, 600]


In [9]:
# Perform k-fold cross-validation

kfold = KFold(n_splits=5, shuffle=True, random_state=123)
results = []

for n_estimator in n_estimators_range:
    
    # Initialize Random Forest model with n_estimator
    rf = RandomForestClassifier(n_estimators = n_estimator, max_features='sqrt', random_state=123)
    
    # Perform cross-validation
    cv_scores = cross_val_score(rf, X_train_oversampled, y_train_oversampled, cv=kfold, scoring='accuracy')
    
    # Store mean accuracy
    results.append((n_estimator, cv_scores.mean()))
    
    print(f"{n_estimator} done")
    

100 done
200 done
300 done
400 done
500 done
600 done


In [10]:
# Print results
for n_estimator, mean_score in results:
    print("n_estimators: {}, Mean accuracy: {}".format(n_estimator, mean_score))

n_estimators: 100, Mean accuracy: 0.9983003124979015
n_estimators: 200, Mean accuracy: 0.9983128470651683
n_estimators: 300, Mean accuracy: 0.9983089076315188
n_estimators: 400, Mean accuracy: 0.9983167864994591
n_estimators: 500, Mean accuracy: 0.998317860891562
n_estimators: 600, Mean accuracy: 0.9983178608902795


In [11]:
results

[(100, 0.9983003124979015),
 (200, 0.9983128470651683),
 (300, 0.9983089076315188),
 (400, 0.9983167864994591),
 (500, 0.998317860891562),
 (600, 0.9983178608902795)]

In [12]:
sorted_list = sorted(results, key=lambda x: x[1], reverse=True)

sorted_list


[(500, 0.998317860891562),
 (600, 0.9983178608902795),
 (400, 0.9983167864994591),
 (200, 0.9983128470651683),
 (300, 0.9983089076315188),
 (100, 0.9983003124979015)]