Import

In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from ydata_profiling import ProfileReport
import seaborn as sns
from sklearn import metrics

In [None]:
# Load the loan records; the first CSV column becomes the row index.
# low_memory=False parses the file in a single pass so column dtypes are
# inferred from the whole column instead of chunk by chunk.
df = pd.read_csv(
    "loan_data_2007_2014.csv",
    low_memory=False,
    index_col=0,
)
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Pandas Profiling

In [None]:
# df_report = ProfileReport(df, minimal=True)
# df_report.to_file(output_file="1-profiling.html")

In [None]:
# Show every distinct loan status present in the data.
df["loan_status"].unique()

In [None]:
# Keep every loan whose status is not "Current" (presumably because the
# outcome of a running loan is still open -- confirm with the analysis goal).
# .copy() makes the result an independent frame: the cells below mutate it
# in place (drop / fillna / column assignment), which would otherwise raise
# SettingWithCopy warnings on the boolean-indexed selection.
df = df[df["loan_status"] != "Current"].copy()

Drop unused columns

In [None]:
# Column groups that are not used further, in the order they are removed.
# Each group carries the reason given for discarding it.
discard_groups = (
    ["policy_code", "application_type"],  # Constant
    ["id", "member_id", "url"],  # Unique
    ["title", "desc"],  # NLP-related task
    ["funded_amnt_inv"],  # Redundant
    ["pymnt_plan", "zip_code"],  # Highly imbalanced
)

for discarded in discard_groups:
    df.drop(columns=discarded, inplace=True)

In [None]:
# Columns flagged as unsupported for this analysis; removed in a single call.
unsup_cols = [
    "annual_inc_joint", "dti_joint", "verification_status_joint",
    "open_acc_6m", "open_il_6m", "open_il_12m", "open_il_24m",
    "mths_since_rcnt_il", "total_bal_il", "il_util",
    "open_rv_12m", "open_rv_24m",
    "max_bal_bc", "all_util", "inq_fi", "total_cu_tl", "inq_last_12m",
]

df.drop(labels=unsup_cols, axis="columns", inplace=True)

In [None]:
# Number of missing values per column (isna is the alias of isnull).
null_value_count = df.isna().sum()

# Show the per-column totals.
print(null_value_count)

Drop duplicates

In [None]:
# Remove rows that exactly repeat an earlier row (all columns compared),
# keeping the first occurrence; the frame is modified in place.
df.drop_duplicates(inplace=True)

Fillna

In [None]:
# Mark missing employment information with an explicit "N/A" category.
# The fill goes through the DataFrame (column -> fill value mapping) rather
# than `df[col].fillna(..., inplace=True)`: that chained in-place call acts
# on a temporary Series, is deprecated in pandas 2.x and silently does
# nothing once Copy-on-Write is enabled.
df.fillna({"emp_title": "N/A", "emp_length": "N/A"}, inplace=True)

Replace

In [None]:
# Convert the loan term (e.g. " 36 months") to its number of months.
# The digits are extracted and parsed explicitly instead of replacing two
# exact strings: this tolerates stray whitespace, yields a numeric dtype
# without relying on `replace` downcasting an object column (deprecated in
# pandas 2.2), and is a no-op if the cell is re-run on already-numeric data.
# Missing or unparseable terms become NaN.
df["term"] = pd.to_numeric(
    df["term"].astype(str).str.extract(r"(\d+)", expand=False)
)

In [None]:
# Collapse status variants: the "Does not meet the credit policy" prefix is
# stripped and both late buckets are merged into a single "Late" status.
# Statuses not listed here are left unchanged.
status_aliases = {
    "Does not meet the credit policy. Status:Fully Paid": "Fully Paid",
    "Does not meet the credit policy. Status:Charged Off": "Charged Off",
    "Late (31-120 days)": "Late",
    "Late (16-30 days)": "Late",
}
df["loan_status"] = df["loan_status"].replace(status_aliases)

Extract date feature

In [None]:
# Date columns stored as "<Mon>-<yy>" strings (e.g. "Dec-11").
# "issue_d" must stay first: it is the reference for the century fix below.
col_date = [
    "issue_d",
    "earliest_cr_line",
    "last_pymnt_d",
    "next_pymnt_d",
    "last_credit_pull_d",
]

for col in col_date:
    parsed = pd.to_datetime(df[col], format="%b-%y")

    if col == "earliest_cr_line":
        # %y follows the POSIX pivot: 00-68 -> 2000-2068, 69-99 -> 1969-1999.
        # A credit line opened in e.g. 1965 therefore parses as 2065. A
        # borrower's first credit line cannot post-date the loan itself, so
        # any such value is moved back one century. NaT compares False and
        # is left untouched.
        wrong_century = parsed > df["issue_d"]
        parsed = parsed.mask(wrong_century, parsed - pd.DateOffset(years=100))

    df[col] = parsed
    # Month and year as separate numeric features.
    df[col + "_month"] = df[col].dt.month
    df[col + "_year"] = df[col].dt.year

Copy

In [None]:
# Snapshot of the cleaned frame so later experiments can restore it.
# DataFrame.copy() is deep by default.
df_ori = df.copy()

In [None]:
# Columns remaining after the cleaning and date-feature steps above.
df.columns

# EDA

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of the core loan figures.
pairplot_columns = ["loan_amnt", "funded_amnt", "int_rate", "installment"]
sns.pairplot(df[pairplot_columns])
plt.show()

loan amount over time

In [None]:
# Index the loans by issue date so they can be bucketed per calendar month.
df_monthly = df.set_index("issue_d")

# One month-end resampler feeds both aggregates.
loan_amnt_by_month = df_monthly["loan_amnt"].resample("M")
loan_amount_monthly = loan_amnt_by_month.sum()  # total amount issued per month
loan_count_monthly = loan_amnt_by_month.count()  # number of loans per month

# Combine the two monthly series into one DataFrame.
temporal_analysis_df = pd.concat(
    [
        loan_amount_monthly.rename("Loan Amount"),
        loan_count_monthly.rename("Loan Count"),
    ],
    axis=1,
)

In [None]:
# Plot the loan amount over time
# Monthly total loan amount as a line chart.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(temporal_analysis_df["Loan Amount"])
ax.set_title("Loan Amount Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Loan Amount")

# Tick labels in millions, e.g. 250000000 -> "250M".
formatter = mticker.FuncFormatter(lambda x, pos: "{:,.0f}M".format(x * 1e-6))
ax.yaxis.set_major_formatter(formatter)

plt.show()

loan count over time

In [None]:
# Plot the loan count over time
# Monthly number of issued loans as a line chart.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(temporal_analysis_df["Loan Count"])
ax.set_title("Loan Count Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Loan Count")

# Tick labels in thousands, e.g. 12000 -> "12K".
formatter = mticker.FuncFormatter(lambda x, pos: "{:,.0f}K".format(x * 1e-3))
ax.yaxis.set_major_formatter(formatter)

plt.show()

credit history distribution

In [None]:
# Visualize credit history distribution (earliest_cr_line)
# Histogram (30 bins, KDE overlay) of the year each borrower's earliest
# credit line was opened.
fig, ax = plt.subplots(figsize=(8, 6))
first_credit_year = df["earliest_cr_line"].dt.year
sns.histplot(first_credit_year, bins=30, kde=True, ax=ax)
ax.set_title("Credit History Distribution")
ax.set_xlabel("Year")
ax.set_ylabel("Count")
plt.show()

debt-to-income (DTI) distribution

In [None]:
# Histogram of the debt-to-income column with a kernel-density overlay.
sns.histplot(df["dti"], kde=True)

payment rate over time

In [None]:
# Issue date truncated to a monthly period, stored as a new column.
df["issue_d_monthly"] = df["issue_d"].dt.to_period("M")

# Number of loans issued in each month, in chronological order.
loans_per_issue_month = df["issue_d_monthly"].value_counts()
loan_volume = loans_per_issue_month.sort_index()

In [None]:
# Payment rate per month: number of loans whose last payment fell in a month
# divided by the number of loans issued in that same month (loan_volume).
# Months present in only one of the two series yield NaN after alignment.
df["last_pymnt_d_monthly"] = df["last_pymnt_d"].dt.to_period("M")
last_payments_per_month = df["last_pymnt_d_monthly"].value_counts().sort_index()
payment_rate = last_payments_per_month.div(loan_volume)

fig, ax = plt.subplots(figsize=(12, 6))
payment_rate.plot(kind="line", ax=ax)
ax.set_title("Monthly Payment Rate")
ax.set_xlabel("Month")
ax.set_ylabel("Payment Rate")
plt.show()

payment status

In [None]:
# Bar chart of how many loans fall into each loan_status category.
payment_status = df["loan_status"].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
payment_status.plot(kind="bar", ax=ax)
ax.set_title("Payment Status")
ax.set_xlabel("Payment Status")
ax.set_ylabel("Loan Count")
plt.show()

# Machine Learning

Based on the provided dataset, various machine learning tasks can be applied, depending on the specific objective or problem you want to solve. Here are some possible tasks:

1. **Regression**: Predicting a continuous numerical value, such as:
   - Predicting the funded loan amount (`funded_amnt`) or the last payment amount received (`last_pymnt_amnt`).
   - Predicting the monthly installment amount (`installment`).

2. **Classification**: Predicting a categorical value, such as:
   - Predicting the loan status (`loan_status`) to determine if a borrower will default or repay the loan.
   - Predicting the loan grade (`grade` or `sub_grade`), which represents the creditworthiness of the borrower.

3. **Binary Classification**: Similar to classification but with only two classes, such as:
   - Predicting whether a borrower's income was verified (`verification_status`).
   - Predicting whether a borrower's payment plan exists (`pymnt_plan`).

4. **Sequence Generation**: Generating a sequence of values, such as:
   - Predicting the next scheduled payment date (`next_pymnt_d`) based on historical data.

5. **Anomaly Detection**: Identifying unusual or outlier patterns in the data, such as:
   - Detecting borrowers with a significantly higher or lower income (`annual_inc`) compared to others.

6. **Clustering**: Grouping similar borrowers based on their features, such as:
   - Clustering borrowers based on their employment length (`emp_length`) and annual income (`annual_inc`).

7. **Feature Importance**: Identifying the most important features that contribute to a specific target variable, such as:
   - Determining the key factors that affect the loan status (`loan_status`) or interest rate (`int_rate`).

These are just a few examples of the machine learning tasks that can be applied to the given dataset. The choice of task depends on the specific problem you want to solve or the insights you want to extract from the data.

## Predicting monthly installment

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error

# # Select the relevant features and target variable
# features = ['loan_amnt', 'int_rate', 'term']  # Add more relevant features as needed
# target = 'installment'

# # Split the df into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# # Train the linear regression model
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# print('Mean Squared Error:', mse)

## Predicting the loan grade

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report

# # Select the relevant features and target variable
# features = ['loan_amnt', 'int_rate', 'dti']  # Add more relevant features as needed
# target = 'grade'  # or 'sub_grade' for more specific credit rating

# # Preprocess the df if needed (e.g., handle missing values, encode categorical variables)

# # Split the df into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# # Train the logistic regression model
# model = LogisticRegression(multi_class='auto')
# model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# classification_rep = classification_report(y_test, y_pred)

# print('Accuracy:', accuracy)
# print('Classification Report:')
# print(classification_rep)

## Predicting whether a borrower's income was verified

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, confusion_matrix

# # Select the relevant features and target variable
# features = ['loan_amnt', 'dti', 'annual_inc']  # Add more relevant features as needed
# target = 'verification_status'

# # Preprocess the df if needed (e.g., handle missing values, encode categorical variables)
# df_n = df[features + [target]].dropna()

# # Split the df into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df_n[features].dropna(), df_n[target], test_size=0.2, random_state=42)

# # Train the logistic regression model
# model = LogisticRegression()
# model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# confusion = confusion_matrix(y_test, y_pred)

# print('Accuracy:', accuracy)
# print('Confusion Matrix:')
# print(confusion)

## Predicting the next scheduled payment date (`next_pymnt_d`) based on historical data

In [None]:
# from statsmodels.tsa.arima.model import ARIMA

# # Select the relevant time series df
# time_series_df = df[['next_pymnt_d']]

# # Split the df into training and testing sets
# train_size = int(len(time_series_df) * 0.8)
# train_df, test_df = time_series_df[:train_size], time_series_df[train_size:]

# # Train the ARIMA model
# order = (1, 1, 1)  # Order (p, d, q) of the ARIMA model
# model = ARIMA(train_df, order=order)
# model_fit = model.fit()

# # Make predictions on the test set
# start_index = len(train_df)
# end_index = len(time_series_df) - 1
# predictions = model_fit.predict(start=start_index, end=end_index)

# # Convert the predicted values to the appropriate format if needed

# # Print the predicted values
# print('Predicted next scheduled payment dates:')
# print(predictions)


## Detecting borrowers with a significantly higher or lower income (`annual_inc`) compared to others

In [None]:
# import pandas as pd
# from sklearn.ensemble import IsolationForest
# # Select the relevant feature for anomaly detection
# feature = 'annual_inc'

# # Preprocess the data if needed (e.g., handle missing values, normalize the feature)

# # Train the Isolation Forest model
# model = IsolationForest(contamination=0.05)  # Adjust the contamination parameter as needed
# model.fit(df[[feature]])

# # Predict anomalies
# predictions = model.predict(df[[feature]])

# # Identify the indices of anomalies
# anomaly_indices = df.index[predictions == -1]

# # Print the anomalies
# anomalies = df.iloc[anomaly_indices]
# print('Anomalies:')
# print(anomalies)

## Clustering borrowers based on their employment length (`emp_length`) and annual income (`annual_inc`).

In [None]:
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import OneHotEncoder

# # Select the relevant features for clustering
# features = ['emp_length', 'annual_inc']
# columns_to_encode = ['emp_length']

# df_f = df[features].dropna()

# # Create an instance of the OneHotEncoder
# encoder = OneHotEncoder(sparse=False)

# # Fit and transform the selected column(s) using one-hot encoding
# encoded_columns = encoder.fit_transform(df_f[columns_to_encode])

# # Create a DataFrame with the encoded columns
# encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(columns_to_encode))

# # Concatenate the encoded DataFrame with the remaining columns
# df_encoded = pd.concat([encoded_df, df_f['annual_inc']], axis=1)

# # Preprocess the df if needed (e.g., handle missing values, scale the features)
# scaler = MinMaxScaler()
# df_scaled = scaler.fit_transform(df_encoded)

In [None]:
# # Determine the optimal number of clusters using the elbow method
# wcss = []
# for i in range(1, 20):
#     kmeans = KMeans(n_clusters=i, random_state=42)
#     kmeans.fit(df_scaled)
#     wcss.append(kmeans.inertia_)
# plt.plot(range(1, 20), wcss)
# plt.xlabel('Number of Clusters')
# plt.ylabel('WCSS')
# plt.title('Elbow Method')
# plt.show()

In [None]:
# # Train the K-means model with the chosen number of clusters
# k = 12  # Adjust the number of clusters based on the elbow method or domain knowledge
# kmeans = KMeans(n_clusters=k, random_state=42)
# kmeans.fit(df_scaled)

# # Assign cluster labels to the df points
# df_f['cluster_label'] = kmeans.labels_

# # Visualize the clusters (you can modify this based on your features)
# plt.scatter(df_f['emp_length'], df_f['annual_inc'], c=df_f['cluster_label'])
# plt.xlabel('Employment Length')
# plt.ylabel('Annual Income')
# plt.title('Clustering of Borrowers')
# plt.show()

## Determining the key factors that affect the loan status (`loan_status`) or interest rate (`int_rate`)

In [None]:
# from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# from sklearn.inspection import permutation_importance
# df = df_ori.copy(deep=True)

In [None]:
# target = 'loan_status'
# cat_cols = ['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership',
#            'verification_status', 'purpose','addr_state', 'initial_list_status']
# num_cols = list(set(df.columns) - set(cat_cols + [target]))

In [None]:
# # Select the relevant features and target variable
# target = 'loan_status'  # Replace with the actual target variable
# features = num_cols  # Replace with the actual feature names

# # Train a Random Forest model for classification or regression
# model = RandomForestClassifier()

# model.fit(df[features], df[target])

# # Calculate feature importance using permutation importance
# perm_importance = permutation_importance(model, df[features], df[target])

# # Get the feature importance scores and their corresponding feature names
# importance_scores = perm_importance.importances_mean
# feature_names = df[features].columns

# # Create a feature importance dataframe
# feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance_scores})
# feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# # Print the feature importance ranking
# print('Feature Importance:')
# print(feature_importance_df)

# Predict credit risk (final)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    # scikit-learn < 1.0 only exposes the estimator after this opt-in import;
    # on newer versions the import is unnecessary and merely emits a warning.
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Select relevant columns for credit risk prediction
selected_columns = [
    "annual_inc",
    "loan_status",
    "grade",
    "sub_grade",
    "dti",
    "inq_last_6mths",
    "open_acc",
]
# .copy() detaches the selection from the full frame so the column
# assignments below do not operate on a slice (SettingWithCopyWarning).
df = df[selected_columns].copy()

# Convert categorical columns to integer codes. Each column gets its own
# encoder so every mapping stays available for inverse_transform / reporting.
categorical_columns = ["grade", "sub_grade", "loan_status"]
encoders = {}
for col in categorical_columns:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# `le` is kept as the encoder of the target column, which is what the
# previous single shared encoder ended up holding.
le = encoders["loan_status"]

# Split the dataset into features and target variable
X = df.drop(columns="loan_status")
y = df["loan_status"]

# Hold out 20% for testing. The status classes are imbalanced, so the split
# is stratified to keep the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Gradient-boosted trees; this estimator accepts NaN in the features natively.
clf = HistGradientBoostingClassifier(random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Predict the credit risk on the test set
y_pred = clf.predict(X_test)

# Evaluate the model; the report shows the original status names instead of
# the integer codes.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(
    classification_report(
        y_test,
        y_pred,
        labels=list(range(len(le.classes_))),
        target_names=[str(name) for name in le.classes_],
        zero_division=0,
    )
)