Importing necessary libraries.

In [1]:
import pandas as pd
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

ModuleNotFoundError: No module named 'xgboost'

Loading and viewing the dataset. This dataset contains 39717 rows and 111 columns.

In [2]:
df = pd.read_csv('data/loan.csv')

df

  df = pd.read_csv('data/loan.csv')


Cleaning the dataframe by dropping the columns that only have null values. Only 57 columns remain.

In [4]:
df.dropna(axis=1, how='all', inplace=True)

df

Checking the null count for the remaining columns.

In [6]:
df.isnull().sum()

id                                0
member_id                         0
loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
term                              0
int_rate                          0
installment                       0
grade                             0
sub_grade                         0
emp_title                      2459
emp_length                     1075
home_ownership                    0
annual_inc                        0
verification_status               0
issue_d                           0
loan_status                       0
pymnt_plan                        0
url                               0
desc                          12942
purpose                           0
title                            11
zip_code                          0
addr_state                        0
dti                               0
delinq_2yrs                       0
earliest_cr_line                  0
inq_last_6mths              

Dropping the columns where the majority of rows are null. For example, the column `mths_since_last_delinq` has 25682 rows with null values out of the total 39717 rows.

In [None]:
# dropping these columns either because majority of their rows are null
df.drop(columns=['mths_since_last_record', 
                 'next_pymnt_d', 
                 'mths_since_last_delinq'], inplace=True)

Checking the number of unique values for the remaining columns.

In [7]:
unique_counts = df.nunique()
unique_counts

id                            39717
member_id                     39717
loan_amnt                       885
funded_amnt                    1041
funded_amnt_inv                8205
term                              2
int_rate                        371
installment                   15383
grade                             7
sub_grade                        35
emp_title                     28820
emp_length                       11
home_ownership                    5
annual_inc                     5318
verification_status               3
issue_d                          55
loan_status                       3
pymnt_plan                        1
url                           39717
desc                          26526
purpose                          14
title                         19615
zip_code                        823
addr_state                       50
dti                            2868
delinq_2yrs                      11
earliest_cr_line                526
inq_last_6mths              

Dropping the columns where the value in each row is unique. We also drop the columns that have the same value across all rows. Both cases do not display any pattern that can help predict the credit score of the individual.

In [9]:
columns_to_drop = unique_counts[(unique_counts == df.shape[0]) | (unique_counts == 1)].index
df.drop(columns=columns_to_drop, inplace=True)

Our target feature in this dataset is `grade`. This represents the credit score of the individual. We will convert this feature to numeric format.

In [10]:
label_encoder = LabelEncoder()
df['grade'] = label_encoder.fit_transform(df['grade'])

In [11]:
# feature engineering these columns to make them numeric
df['int_rate_'] = df['int_rate'].str.replace('%', '').astype(float)
df['emp_length_'] = df.emp_length.str[:2].str.strip().replace('<', '0').astype(float)
df['revol_util_'] = df.revol_util.str[:-1].astype(float)

# to convert dates to numeric format, the difference between the earliest date and current date is found
# changing the date columns' data types to datetime
for date_col in ['earliest_cr_line', 'issue_d', 'last_pymnt_d', 'last_credit_pull_d']:
    df[date_col] = pd.to_datetime(df[date_col], format='%b-%y')

# `earliest_cr_line` feature is calculated separately because its earliest date is 1969
elapsed_months = (df['earliest_cr_line'].dt.year - df['earliest_cr_line'].min().year) * 12 + (df['earliest_cr_line'].dt.month - df['earliest_cr_line'].min().month)
df['earliest_cr_line'] = elapsed_months

# calculating for other date features | January 2007 was selected because the dataset is b/w 2007-2011
earliest_ref_d = pd.to_datetime('2007-01-01')

for date_col in ['issue_d', 'last_pymnt_d', 'last_credit_pull_d']:
    elapsed_months = (df[date_col].dt.year - earliest_ref_d.year) * 12 + (df[date_col].dt.month - earliest_ref_d.month)
    df[date_col] = elapsed_months

In [12]:
# these columns are one-hot encoded
df['term_'] = df['term'].map({' 36 months': 0, ' 60 months': 1})
df = pd.get_dummies(df, columns=['home_ownership'])

# converting the dummy boolean columns to 0 and 1s
boolean_cols = df.select_dtypes(include='bool').columns
df[boolean_cols] = df[boolean_cols].astype(int)

# target encoding these columns due to high cardinality (too many categories)
for col in ['purpose', 'zip_code', 'addr_state']:
    df[col] = df[col].map(df.groupby(col)['grade'].mean())

In [13]:
# these columns are ordinally encoded due to their inherent order
df['loan_status_'] = df['loan_status'].map({'Charged Off': 0,
                                            'Current': 1,
                                            'Fully Paid': 2})
df['verification_status_'] = df['verification_status'].map({'Not Verified': 0,
                                                            'Verified': 1,
                                                            'Source Verified': 2})

In [14]:
# dropping the old columns after the above changes
df.drop(columns=['term',
                 'int_rate',
                 'loan_status',
                 'emp_length',
                 'verification_status',
                 'revol_util',
                 'home_ownership_NONE',  # mostly 0 values
                 'out_prncp',  # ditto
                 'home_ownership_OTHER',  # ditto
                 'desc',  # irrelevant
                 'earliest_cr_line', # ditto
                 'title',  # ditto
                 'emp_title',  # too many categories (28820)
                 'sub_grade',  # an extension of the target feature
                ], inplace=True)

# removing columns that end with `_inv` as they are almost duplicates of other columns
df.drop(columns=df.columns[df.columns.str[-4:] == '_inv'], inplace=True)

In [15]:
# normalization is unnecessary since tree-based algorithms can handle large differences in feature scales
# some features still have missing values. however, xgboost can work around them

In [24]:
# splitting the dataset into sets 60-20-20
X = df.drop('grade', axis=1)
y = df['grade']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [2]:
model = XGBClassifier(
    objective='multi:softmax',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=100,
    early_stopping_rounds=10
)

NameError: name 'XGBClassifier' is not defined

In [None]:
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

In [None]:
# evaluation on test set

y_pred = model.predict(X_test)
print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# hyperparameter tuning

# rejoin train and val sets since cross validation is used
X_train = pd.concat([X_train, X_val], ignore_index=True)
y_train = pd.concat([y_train, y_val], ignore_index=True)

model = XGBClassifier(
    objective='multi:softmax',
    eval_metric='logloss',
    use_label_encoder=False
)

param_dist = {
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 6),
    'gamma': uniform(0, 0.5),
    'subsample': uniform(0.6, 0.9),
    'colsample_bytree': uniform(0.6, 0.9),
    'learning_rate': uniform(0.01, 0.2),
    'n_estimators': randint(100, 500)
}

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=42
)

random_search.fit(X_train, y_train)

In [None]:
best_params = random_search.best_params_

print(f'Best parameters: {best_params}')
print(f'Best score: {random_search.best_score_}')

y_pred = random_search.predict(X_test)
print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
best_model = XGBClassifier(
    objective='multi:softmax',
    eval_metric='logloss',
    use_label_encoder=False,
    **best_params
)

In [None]:
best_model.fit(X_train, y_train)

In [None]:
y_pred = best_model.predict(X_test)
print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
best_model.save_model('best_xgboost_model.json')

# loading the model
# xgb.Booster().load_model('my_xgboost_model.json')