# Task 2: Do an exploratory analysis for this dataset
We perform an exploratory data analysis (EDA) on the preprocessed dataframe located in 'data/preprocessed' to get a better understanding of our data.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Plot styling: use seaborn's white-grid theme
sns.set_theme(style='whitegrid')
# Suppress pandas' chained-assignment warning (the option's default is 'warn')
pd.set_option('mode.chained_assignment', None)

## Read Data

In [None]:
# Location of the preprocessed dataframe, relative to this notebook.
# NOTE: unpickling executes arbitrary code, so only load pickle files from a trusted source.
PREPROCESSED_PKL = '../data/preprocessed/accepted_df_preprocessed.pkl'

df = pd.read_pickle(PREPROCESSED_PKL)
print(f"Shape of df: {df.shape}")
# Preview the first rows (head() defaults to 5)
df.head()

## EDA
In the following sections we explore our data set, i.e. we do EDA.

In [None]:
# Get number of Non-Default (0) and Default (1) loans
loan_status_value_count = df['loan_status'].value_counts()
print(loan_status_value_count)

# value_counts() orders its result by frequency, not by status, so the counts must be
# looked up by *label*. `.loc` makes that explicit; plain `series[0]` is ambiguous and
# can fall back to positional access when the index is not an integer index.
n_non_default = loan_status_value_count.loc[0]
n_default = loan_status_value_count.loc[1]

### Loan Distribution

In [None]:
# Plot loan amount distribution, one histogram layer per loan status.
# The titles use raw f-strings (rf"...") because mathtext needs a literal "\_";
# in a non-raw string "\_" is an invalid escape sequence.
plt.figure(figsize=(20, 8))
plt.title(f"Distribution of the loan amount applied for the borrower "
          rf"($n_{{non\_default}} = {n_non_default}, n_{{default}}$ = {n_default})", size=18)
sns.histplot(data=df, x='loan_amnt', bins=30, edgecolor='black', hue='loan_status', multiple='layer')
# Replace seaborn's automatic hue legend with readable labels
plt.legend(title='Loan Status', labels=['Default', 'Non-Default'])
plt.show()

# Plot boxplot of the loan amount, one box per loan status
plt.figure(figsize=(20, 4))
plt.title(f"Boxplot of loan amount grouped by loan "
          rf"status ($n_{{non\_default}} = {n_non_default}, n_{{default}}$ = {n_default})", size=18)
sns.boxplot(data=df, x='loan_amnt', y='loan_status', orient='h', hue='loan_status', saturation=0.6)
plt.show()

# Get descriptive statistics from loan_amnt grouped by status, formatted to two decimals.
# Column-wise Series.map is used instead of DataFrame.applymap, which is deprecated
# since pandas 2.1; this form works on both older and newer pandas versions.
(
    df[['loan_amnt', 'loan_status']]
    .groupby('loan_status')
    .describe()
    .apply(lambda column: column.map('{:.2f}'.format))
)

### Loan Status & LC Grade

In [None]:
# Treat the grade as a categorical variable.
# TODO: Do this step in the preprocessing notebook
df["grade"] = df["grade"].astype("category")

# Two panels side by side: status counts split by grade (left), overall grade counts (right)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
status_ax, grade_ax = axs

# Styling shared by both panels
panel_style = dict(edgecolor="black", palette="tab10", alpha=0.75)

sns.countplot(data=df, x="loan_status", hue="grade", ax=status_ax, **panel_style)
status_ax.set_title("Distribution of the Loan Status grouped by LC Grades", size=18)

sns.histplot(data=df, x="grade", hue="grade", ax=grade_ax, **panel_style)
grade_ax.set_title("Distribution of the LC Grades", size=18)

fig.tight_layout()
plt.show()

### Annual Income and Address State
The plot shows the mean annual income of our borrowers, grouped by the state they live in.

In [None]:
# Get mean of annual income grouped by addr_state, highest mean first.
# Note: despite the "_df" suffix this is a pandas Series indexed by addr_state.
mean_annual_income_addr_state_df = df.groupby('addr_state')['annual_inc'].mean().sort_values(ascending=False)
# Attention: Address State 'IA' has only 7 observations, while the rest has at least 2000.

# Plot settings
plt.figure(figsize=(20, 8))
plt.title("Mean annual income of the borrowers grouped by states", size=18)
ax = sns.barplot(x=mean_annual_income_addr_state_df.index,
                 y=mean_annual_income_addr_state_df.values,
                 palette="tab20",
                 alpha=0.9,
                 edgecolor="black")
# The y values are passed as a bare array, so seaborn cannot derive an axis label
# from them; label both axes explicitly so the figure can be read on its own.
ax.set(xlabel="State (addr_state)", ylabel="Mean annual income (annual_inc)")
plt.show()