# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# render floats in fixed-point notation with two decimal places
def _two_decimals(x):
    return '%.2f' % x

pd.set_option('display.float_format', _two_decimals)

# Import data

In [3]:
# NOTE(review): the saved output of this cell is "ImportError: ignored" — presumably an
# optional dependency needed to read parquet from S3 was missing in that environment; confirm.
data_path = 's3://msbx5420-spr23/team_mba/cleaned_data.parquet'
df = pd.read_parquet(data_path)

ImportError: ignored

# Final Data Processing

In [None]:
# coerce the income and debt-to-income columns to numbers;
# entries that cannot be parsed become NaN
cols = ['annual_inc', 'dti']
df[cols] = df[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# collapse the detailed loan_status values into a handful of coarse groups
_STATUS_GROUPS = (
    ('Current', ('Fully Paid', 'Current', 'In Grace Period')),
    ('Late', ('Late (16-30 days)', 'Late (31-120 days)')),
    ('Defaulted', ('Charged Off', 'Default')),
)


def map_status(status):
    """Return the coarse group for a raw loan status value.

    The groups are 'Current', 'Late' and 'Defaulted'; any value that is not
    listed in one of them (including missing values) maps to 'Unknown'.
    """
    for group, members in _STATUS_GROUPS:
        if status in members:
            return group
    return 'Unknown'
    
# derive the grouped 'status' column from the raw 'loan_status' values
grouped_status = df['loan_status'].map(map_status)
df['status'] = grouped_status

# Descriptive Statistics

In [None]:
# key features summarised here and reused by the grouped summary further down
subset = ['loan_amnt', 'int_rate', 'annual_inc', 'dti']

# descriptive statistics for just those columns
df.loc[:, subset].describe()

In [None]:
# loan amount statistics, one row per grouped loan status
loan_amnt_by_status = df.groupby('status')['loan_amnt']
loan_amnt_by_status.describe()

In [None]:
# interest rate statistics, one row per grouped loan status
int_rate_by_status = df.groupby('status')['int_rate']
int_rate_by_status.describe()

In [None]:
# debt-to-income statistics, one row per grouped loan status
dti_by_status = df.groupby('status')['dti']
dti_by_status.describe()

In [None]:
# annual income statistics, one row per grouped loan status
annual_inc_by_status = df.groupby('status')['annual_inc']
annual_inc_by_status.describe()

In [None]:
# group by home_ownership, then generate statistics for each feature in `subset`
df.groupby('home_ownership')[subset].describe()

# Data Visualizations

In [None]:
# print the dtypes of (at most) the first 40 columns
column_types = df.dtypes
print(column_types.head(40))

In [None]:
def kdeplot(feature):
    """Overlay kernel density estimates of ``feature`` for each loan status group.

    Opens a new 9x4 figure and draws one curve per status group
    ('Current', 'Late', 'Defaulted'), using the rows of the module-level
    ``df`` whose ``'status'`` column equals that group.

    Parameters
    ----------
    feature : str
        Name of the column in ``df`` whose distribution is plotted.
    """
    # status group -> line colour
    palette = {'Current': 'navy', 'Late': 'orange', 'Defaulted': 'green'}

    plt.figure(figsize=(9, 4))
    plt.title("plot {}".format(feature))
    for status, color in palette.items():
        values = df.loc[df['status'] == status, feature].dropna()
        if values.empty:
            # no observations for this group: nothing to estimate a density from
            continue
        sns.kdeplot(values, color=color, label='Status: {}'.format(status))
    # `label=` alone is not guaranteed to produce a legend; draw it explicitly
    # so the three curves can be told apart
    plt.legend()
# one density figure per feature of interest
for feature_name in ('loan_amnt', 'annual_inc', 'dti'):
    kdeplot(feature_name)

In [None]:
# loan amount density by status group, shown on its own
kdeplot(feature='loan_amnt')

In [None]:
# Correlate numeric and boolean columns only. `df` also holds string columns
# (e.g. 'status'), and on pandas >= 2.0 a bare `df.corr()` raises on those
# instead of silently dropping them.
corr = df.select_dtypes(include=['number', 'bool']).corr()
# NOTE(review): vmin=0 maps every negative correlation to the lightest colour — confirm intended.
ax1 = sns.heatmap(corr, cbar=0, linewidths=2,vmax=1, vmin=0, square=True, cmap='Blues')
plt.show()

In [None]:
# interest rate plotted against loan amount
sns.lineplot(x='loan_amnt', y='int_rate', data=df)

In [None]:
# apply seaborn's theme with a larger default figure size
sns.set(rc={'figure.figsize':(11.7,8.27)})

# number of loans per (sub_grade, status) pair, pivoted so each status is a column
loan_counts = df.groupby(['sub_grade', 'status']).size()
loan_counts = loan_counts.unstack()

# horizontal stacked bars: one bar per sub-grade, segmented by status
ax = loan_counts.plot(kind='barh', stacked=True)

ax.set_xlabel("Number of Loans")
# with kind='barh' the y axis carries the index (sub_grade), not the status
ax.set_ylabel("Sub-Grade")
ax.set_title("Total Number of Loans by Loan Status and Sub-Grade")

plt.show()

In [None]:
# number of loans per (sub_grade, status) pair
loan_counts = df.groupby(['sub_grade','status']).size()
# Share of each status within its sub-grade, in percent. Dividing by
# transform('sum') keeps the (sub_grade, status) index unchanged on every
# pandas version; groupby(level=0).apply(...) with a transform-like function
# prepends the group key a second time on pandas >= 2.0, which would break a
# later unstack()/plot of loan_perc.
loan_perc = 100 * loan_counts / loan_counts.groupby(level=0).transform('sum')
# lift the display limits so the full table is printed, with 3-digit precision
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(loan_perc)

In [None]:
# apply seaborn's theme with a larger default figure size
sns.set(rc={'figure.figsize':(11.7,8.27)})

# horizontal stacked bars: one bar per sub-grade, segmented by status share
ax = loan_perc.unstack().plot(kind='barh', stacked=True)

# the plotted values are percentages within each sub-grade, not raw counts
ax.set_xlabel("Percentage of Loans")
ax.set_ylabel("Sub-Grade")
ax.set_title("Percentage of Loans by Loan Status and Sub-Grade")
# anchor the legend just outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
plt.show()

In [None]:
# loans per sub-grade and status, one column per status in sorted order
# NOTE(review): loan_counts is not used by the boxplot in this cell.
loan_counts = (
    df.groupby(['sub_grade','status'])
    .size()
    .unstack()
    .pipe(lambda counts: counts.reindex(sorted(counts.columns), axis=1))
)

# interest rate distribution per sub-grade, with sub-grades in sorted order
grade_order = sorted(df['sub_grade'].unique())
sns.boxplot(data=df, x='sub_grade', y='int_rate', order=grade_order)

plt.xlabel("Sub-Grade")
plt.ylabel("Interest Rate")
plt.title("Interest Rates by Sub-Grade")

plt.show()

In [None]:
# debt-to-income distribution per sub-grade, with sub-grades in sorted order
grade_order = sorted(df['sub_grade'].unique())
sns.boxplot(data=df, x='sub_grade', y='dti', order=grade_order)

plt.xlabel("Sub-Grade")
plt.ylabel("Debt to Income Ratio")
plt.title("Debt to Income Ratio by Sub-Grade")

plt.show()