### EDA - Lending Club Study

### Objectives:

**Importing necessary Modules**:
- Import the modules necessary for Data Manipulation and Visualization.

**Reading dataset**:
- Read the dataset containing loan applicant information.

**Exploring the Dataset**:
- Understand the Structure and various datatypes of the attributes within the dataset.

**Missing value analysis**:
- Identify and analyze missing values in the dataset.

**Analysing categorical and numerical columns**:
- Analyze categorical and numerical columns to understand the statistical properties and relationships within the dataset.

**Univariate Analysis**:
- Conduct univariate analysis to explore the distribution and characteristics of individual variables.

**Outliers**:
- Identify and analyze outliers within the dataset to understand their impact on the analysis.

**Bivariate analysis**:
- Conduct bivariate analysis to explore relationships between different variables and their impact on loan default rates.

In [288]:
#!pip install plotly

In [289]:
# Importing libraries
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning
# -- confirm that is intended.
warnings.filterwarnings("ignore")
# Do not truncate columns or rows when a DataFrame is displayed
pd.options.display.max_columns=None
pd.options.display.max_rows=None
import sketch as sketch
import os
import plotly.express as px 
#os.environ['SKETCH_MAX_COLUMNS']= '55'


In [None]:

# Load the loan dataset from the working directory and show its (rows, columns) shape
df_loan=pd.read_csv("loan.csv")
df_loan.shape

In [None]:
# Preview the first rows of the dataset
df_loan.head()

In [None]:
# Summary of the DataFrame structure (columns, non-null counts, dtypes)
df_loan.info()

In [None]:
# Number of missing values per column
df_loan.isna().sum()

In [None]:
# Share of missing values per column, as a percentage rounded to 2 decimals
df_loan_percentage = df_loan.isna().mean().mul(100).round(2)
df_loan_percentage

## DATA CLEANING

In [295]:
# Drop every column that contains nothing but missing values
df_loan = df_loan.dropna(axis='columns', how='all')

In [None]:

# Number of columns currently in the DataFrame
df_loan.shape[1]

In [None]:
# Percentage of missing values per column (2 decimals)
df_loan.isna().mean().mul(100).round(2)


In [None]:
# Number of missing values per remaining column
df_loan.isna().sum()

In [None]:

# Frequency of each distinct value in the recoveries column
df_loan['recoveries'].value_counts()

In [300]:
# Cleaning: strip the '%' sign from the rate columns and the 'months' suffix
# from the term column so they can be cast to numbers afterwards
df_loan['int_rate'] = df_loan['int_rate'].str.replace('%', '')
df_loan['revol_util'] = df_loan['revol_util'].str.replace('%', '')
df_loan['term'] = df_loan['term'].str.replace('months', '')


In [301]:
# Type conversion
# Numeric columns
df_loan['int_rate'] = df_loan['int_rate'].astype(float)
df_loan['revol_util'] = df_loan['revol_util'].astype(float)
df_loan['recoveries'] = df_loan['recoveries'].astype(float)
df_loan['term'] = df_loan['term'].astype(int)

# Text columns: fill missing values BEFORE casting to str. astype(str) turns
# NaN into the literal string 'nan', after which fillna() has nothing left to
# fill and the placeholder text never gets applied.
df_loan['emp_length'] = df_loan['emp_length'].fillna('Unknown').astype(str)
df_loan['title'] = df_loan['title'].fillna('Unknown').astype(str)
df_loan['desc'] = df_loan['desc'].fillna('').astype(str)
df_loan['issue_d'] = df_loan['issue_d'].astype(str)

In [302]:
# Handle missing values: per-column replacement for any remaining NaN
fill_defaults = {
    'emp_length': 'Unknown',
    'title': 'Unknown',
    'desc': '',
    'chargeoff_within_12_mths': 0,
    'pub_rec_bankruptcies': 0,
}
for column, default in fill_defaults.items():
    df_loan[column] = df_loan[column].fillna(default)

# Keep total payments at 2 decimals
df_loan['total_pymnt'] = df_loan['total_pymnt'].round(2)

In [303]:
# Derived columns: split issue_d on '-' once and keep the first part as the
# month and the second part as the year.
# The vectorized .str accessor replaces two per-row lambdas and yields NaN
# (instead of raising IndexError) for a value that has no '-' separator.
issue_parts = df_loan['issue_d'].str.split('-')
df_loan['issue_month'] = issue_parts.str[0]
df_loan['issue_year'] = issue_parts.str[1]

In [None]:
# Percentage of missing values per column after cleaning (2 decimals)
df_loan.isna().mean().mul(100).round(2)

In [None]:
# (rows, columns) after cleaning
df_loan.shape

In [None]:
# Re-check the DataFrame summary after the type conversions
df_loan.info()

In [None]:
# Frequency of each distinct value in the desc column
df_loan.desc.value_counts()

## Analysis 1

In [None]:
from matplotlib.gridspec import GridSpec

# 2x2 overview: grade, sub-grade, state and loan-amount distributions
fig = plt.figure(constrained_layout=True, figsize=(20, 10))
gs = GridSpec(2, 2, figure=fig)

# Grade counts, grades in sorted order
grade_ax = fig.add_subplot(gs[0, 0])
grade_order = sorted(df_loan['grade'].unique())
sns.countplot(data=df_loan, x='grade', order=grade_order, palette='viridis', ax=grade_ax)
grade_ax.set_title('Distribution of Grades')
grade_ax.set_xlabel('Grade')
grade_ax.set_ylabel('Count')

# Sub-grade counts, sub-grades in sorted order
sub_grade_ax = fig.add_subplot(gs[0, 1])
sub_grade_order = sorted(df_loan['sub_grade'].unique())
sns.countplot(data=df_loan, x='sub_grade', order=sub_grade_order, palette='viridis', ax=sub_grade_ax)
sub_grade_ax.set_title('Distribution of Sub-Grades')
sub_grade_ax.set_xlabel('Sub-Grade')
sub_grade_ax.set_ylabel('Count')

# State counts, in the order returned by value_counts()
state_ax = fig.add_subplot(gs[1, 0])
df_loan['addr_state'].value_counts().plot(kind='bar', ax=state_ax)
state_ax.set_title('Distribution of States')
state_ax.set_xlabel('States')
state_ax.set_ylabel('Count')

# Loan-amount histogram with a KDE overlay
amount_ax = fig.add_subplot(gs[1, 1])
sns.histplot(df_loan['loan_amnt'], kde=True, bins=10, color='blue', ax=amount_ax)
amount_ax.set_title('Distribution of Loan Amounts')
amount_ax.set_xlabel('Loan Amount')
amount_ax.set_ylabel('Frequency')

plt.show()

This visualization can help us understand the distribution of loan amounts in the dataset. We can see that most loans fall in the range of $5,000 to $15,000

In [None]:

# Loans issued per month and per year
fig = plt.figure(constrained_layout=True, figsize=(20, 10))
gs = GridSpec(2, 2, figure=fig)

# Month counts in calendar order. Sorting the month labels as plain strings
# would be alphabetical (Apr, Aug, Dec, ...), which scrambles the time axis.
calendar_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months_present = set(df_loan['issue_month'].unique())
month_order = [month for month in calendar_months if month in months_present]
# Any label that is not a three-letter month abbreviation is appended at the
# end so that no category is silently dropped from the plot.
month_order += sorted(months_present.difference(calendar_months))

month_ax = fig.add_subplot(gs[0, 0])
sns.countplot(data=df_loan, x='issue_month', order=month_order, palette='viridis', ax=month_ax)
month_ax.set_title('Distribution of Months')
month_ax.set_xlabel('Months')
month_ax.set_ylabel('Count')

# Year counts, years in sorted order
year_ax = fig.add_subplot(gs[0, 1])
year_order = sorted(df_loan['issue_year'].unique())
sns.countplot(data=df_loan, x='issue_year', order=year_order, palette='viridis', ax=year_ax)
year_ax.set_title('Distribution of Year')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Count')

plt.show()

This visualization can help us understand how loan issuance is distributed over time. We can see that most loans were issued in 2011.

## Analysis 2

In [None]:
# Number of loans per status (non-null ids counted within each status)
loan_status_counts = df_loan.groupby('loan_status')['id'].count()

status_fig, status_ax = plt.subplots()
status_ax.bar(loan_status_counts.index, loan_status_counts)
status_ax.set_xlabel('Loan Status')
status_ax.set_ylabel('Count')
status_ax.set_title('Number of Loans by Status')
# Display the chart
plt.show()

## Analysis 3

In [None]:
# Three pie charts side by side: loan status, grade and purpose shares
plt.figure(figsize=(20, 8))

ax1 = plt.subplot(1, 3, 1)
df_loan['loan_status'].value_counts().plot.pie(autopct="%1.0f%%", ax=ax1)
ax1.set_title('Loan Status')
ax1.set_xlabel('Loan Status')
ax1.set_ylabel('Percentage')

ax2 = plt.subplot(1, 3, 2)
df_loan['grade'].value_counts().plot.pie(autopct="%1.0f%%", ax=ax2)
ax2.set_title('Grades')
ax2.set_xlabel('Grade')
ax2.set_ylabel('Percentage')

ax3 = plt.subplot(1, 3, 3)
df_loan['purpose'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax3)
ax3.set_title('Loan Purposes')
ax3.set_ylabel('Percentage')

# A single show() renders the figure; the original called it twice.
plt.show()

This visualization can help us understand the proportion of loans that have been fully paid, charged off, or are currently in progress. We can see that the majority of loans have been fully paid.

## Analysis 4

In [None]:
# Sort by grade then sub-grade so the box plots list categories in that order
df_loan = df_loan.sort_values(by=['grade', 'sub_grade'], ascending=True)

# Interest rate per grade
fig = px.box(df_loan, x='grade', y='int_rate', color="grade")
fig.update_layout(title_text="Interest Rate by Grade")
fig.update_xaxes(title_text='Loan Grade')
fig.update_yaxes(title_text='Interest Rate (%)')

# Interest rate per sub-grade
fig1 = px.box(df_loan, x='sub_grade', y='int_rate', color="sub_grade")
fig1.update_layout(title_text="Interest Rate by Sub Grade")
fig1.update_xaxes(title_text='Loan Sub Grade')
fig1.update_yaxes(title_text='Interest Rate (%)')

for figure in (fig, fig1):
    figure.show()


**High-Risk Grades (E, F, G):**

Key Indicators:

High median interest rates, reflecting the high likelihood of default.

Wider IQR and more outliers, signaling a significant variability in risk levels.

Risk Profile:

These grades represent borrowers with poor credit histories or unstable incomes.

Lenders face a high risk of non-payment, which is offset by charging much higher interest rates.

This visualization can help us understand the relationship between loan grade and interest rates. We can see that higher grade loans tend to have lower interest rates. 

## Analysis 5

In [None]:
# Scatter of annual income (x) against loan amount (y)
income_fig, income_ax = plt.subplots()
income_ax.scatter(df_loan['annual_inc'], df_loan['loan_amnt'])
income_ax.set_xlabel('Annual Income')
income_ax.set_ylabel('Loan Amount')
income_ax.set_title('Scatter Plot of Annual Income vs. Loan Amount')
plt.show()

**Weak Correlation:**

The plot suggests no strong relationship between annual income and loan amount. Borrowers with similar incomes might request vastly different loan amounts.

This lack of correlation implies that other factors (e.g., credit score, existing debt, or risk tolerance) are influencing loan amounts.

This visualization can help us understand the relationship between annual income and loan amount. Any positive relationship is weak: individuals with higher incomes tend to receive somewhat larger loans, but the spread of loan amounts at a given income is wide.

In [None]:
# Pairwise correlation between the selected numeric columns
corr_columns = [
    'loan_amnt', 'int_rate', 'dti', 'annual_inc', 'revol_bal', 'total_acc',
    'funded_amnt', 'installment', 'out_prncp', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
    'total_rec_late_fee', 'recoveries',
]
df_loan[corr_columns].corr()

In [None]:
# Preview the first rows of the DataFrame
df_loan.head()

This visualization can help us understand the distribution of loan purposes in the dataset. We can see that the majority of loans are for credit card debt consolidation, followed by other purposes such as home improvement and small business loans.

In [316]:
# A loan is flagged as risky when ANY of the following holds
is_low_grade = df_loan['grade'].isin(['E', 'F', 'G'])   # low grades
is_high_rate = df_loan['int_rate'] > 18                 # interest rate above 18%
is_high_dti = df_loan['dti'] > 30                       # debt-to-income ratio above 30
high_risk_criteria = is_low_grade | is_high_rate | is_high_dti

# Keep only the rows that match the risk mask
risky_loans = df_loan.loc[high_risk_criteria]

In [None]:
# Rows matching the risk mask
risky_loans = df_loan.loc[high_risk_criteria]

# Percentage of risky loans in each loan status
risky_loans_status = risky_loans['loan_status'].value_counts(normalize=True).mul(100)

# Descriptive statistics of the risky subset
risky_loans_summary = risky_loans.describe()

# Display shape, status breakdown and summary together
(risky_loans.shape, risky_loans_status, risky_loans_summary)

## Analysis 6

In [None]:
# Risky-loan profile: grade, sub-grade and state counts, plus an interest-rate
# comparison between risky and non-risky loans.
plt.figure(figsize=(20, 15))

# Grade counts.
# NOTE(review): only grades E-G are drawn; risky loans selected through the
# int_rate or dti condition but graded otherwise are not shown in this panel.
grade_ax = plt.subplot(3, 2, 1)
sns.countplot(data=risky_loans, x='grade', order=['E', 'F', 'G'], palette='Reds', ax=grade_ax)
grade_ax.set_title('Distribution of Loan Grades (Risky Loans)')
grade_ax.set_xlabel('Grade')
grade_ax.set_ylabel('Count')

# Sub-grade counts. `order` must list each category once; the original passed
# the whole column, which repeats every label once per row.
sub_grade_ax = plt.subplot(3, 2, 2)
sub_grade_order = sorted(risky_loans['sub_grade'].unique())
sns.countplot(data=risky_loans, x='sub_grade', order=sub_grade_order, palette='Reds', ax=sub_grade_ax)
sub_grade_ax.set_title('Distribution of Loan Sub Grades (Risky Loans)')
sub_grade_ax.set_xlabel('Sub Grade')
sub_grade_ax.set_ylabel('Count')

# State counts, most frequent state first (unique labels only, as above)
state_ax = plt.subplot(3, 2, 3)
state_order = risky_loans['addr_state'].value_counts().index
sns.countplot(data=risky_loans, x='addr_state', order=state_order, palette='coolwarm', ax=state_ax)
state_ax.set_title('Distribution of Loan by States (Risky Loans)')
state_ax.set_xlabel('States')
state_ax.set_ylabel('Count')
state_ax.tick_params(axis='x', labelrotation=90)

# Interest rate comparison.
# Take explicit copies before adding the label column so it is not written
# into a slice of df_loan (the cause of pandas' SettingWithCopyWarning).
risky_loans = risky_loans.copy()
risky_loans['risk_category'] = 'Risky'
non_risky_loans = df_loan[~high_risk_criteria].copy()
non_risky_loans['risk_category'] = 'Non-Risky'
comparison_data = pd.concat([risky_loans, non_risky_loans])

rate_ax = plt.subplot(3, 2, 4)
sns.boxplot(data=comparison_data, x='risk_category', y='int_rate', palette='coolwarm', ax=rate_ax)
rate_ax.set_title('Interest Rate Comparison: Risky vs Non-Risky Loans')
rate_ax.set_xlabel('Risk Category')
rate_ax.set_ylabel('Interest Rate (%)')

plt.show()



## Analysis 7

In [None]:
# Risky loans broken down by loan status, purpose and state
plt.figure(figsize=(20, 12))

# Loan status counts
risky_loan_status_counts = risky_loans['loan_status'].value_counts()
status_ax = plt.subplot(2, 2, 1)
risky_loan_status_counts.plot(kind='bar', color='darkorange', ax=status_ax)
status_ax.set_title('Loan Status Distribution (Risky Loans)')
status_ax.set_xlabel('Loan Status')
status_ax.set_ylabel('Number of Loans')
status_ax.tick_params(axis='x', labelrotation=10)

# Purpose counts
risky_loan_purpose_counts = risky_loans['purpose'].value_counts()
purpose_ax = plt.subplot(2, 2, 2)
sns.barplot(y=risky_loan_purpose_counts.values, x=risky_loan_purpose_counts.index, palette='Reds', ax=purpose_ax)
purpose_ax.set_title('Risky Loans by Purpose')
purpose_ax.set_ylabel('Number of Loans')
purpose_ax.set_xlabel('Purpose')
purpose_ax.tick_params(axis='x', labelrotation=90)

# State counts. These get their own variable; the original overwrote the
# purpose counts with them, leaving a misleadingly named result behind.
risky_loan_state_counts = risky_loans['addr_state'].value_counts()
state_ax = plt.subplot(2, 2, 3)
sns.barplot(y=risky_loan_state_counts.values, x=risky_loan_state_counts.index, palette='Reds', ax=state_ax)
state_ax.set_title('Risky Loans by State')
state_ax.set_ylabel('Number of Loans')
state_ax.set_xlabel('States')
state_ax.tick_params(axis='x', labelrotation=90)

plt.show()




## Analysis 8

In [320]:
# select_dtypes(include=['int64', 'float64']) returns the NUMERIC columns, so
# the result gets a matching name. `categorical` is kept as an alias so that
# any existing reference to the old name still resolves.
numerical_columns = df_loan.select_dtypes(include=['int64', 'float64']).columns
categorical = numerical_columns

In [None]:
# Correlation heat map of the numeric features, risky loans only
numerical_features = [
    'loan_amnt', 'int_rate', 'dti', 'annual_inc', 'revol_bal', 'total_acc',
    'funded_amnt', 'installment', 'out_prncp', 'total_pymnt',
    'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
]
correlation_matrix = risky_loans[numerical_features].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heat Map of Risky Loans')
plt.show()



**Debt-to-Income Ratio (DTI) and Risk:**

Negative correlation between DTI and annual income (-0.11): 

As income increases, the DTI ratio tends to decrease. A high DTI is a strong indicator of higher risk because it implies the borrower is already carrying a significant debt load relative to their income, which increases the likelihood of loan default.

The weak positive correlation with loan amount (0.15) and revolving balance (0.26) also shows that higher loan amounts and higher balances tend to go hand-in-hand with a higher DTI, which is a risk factor for default.


**Correlation between loan_amnt and other variables:**

The loan amount (loan_amnt) has a moderate positive correlation with the interest rate (int_rate) at 0.30 and a slightly higher positive correlation with annual income (annual_inc) at 0.41.
It also has moderate positive correlations with revolving balance (revol_bal) and total accounts (total_acc).

**Interest rate correlations:**

The interest rate (int_rate) has a low positive correlation with loan amount (loan_amnt), but it has very low correlations with other variables like annual income (annual_inc) and revolving balance (revol_bal).


## Analysis 9

In [None]:
# Frequency heat maps for risky loans: purpose vs grade (left) and
# purpose vs sub-grade (right). Each cell is the number of loans.
plt.figure(figsize=(20, 6))

axGrdHM = plt.subplot(1, 2, 1)
grade_purpose_pivot = risky_loans.pivot_table(index='purpose', columns='grade', aggfunc='size', fill_value=0)
sns.heatmap(grade_purpose_pivot, annot=True, fmt='d', cmap='YlGnBu', ax=axGrdHM)
axGrdHM.set_title('Frequency Heat Map: Grade vs Purpose (Risky Loans)')
axGrdHM.set_xlabel('Grade')
axGrdHM.set_ylabel('Purpose')

axSubGrdHM = plt.subplot(1, 2, 2)
grade_purpose_pivot = risky_loans.pivot_table(index='purpose', columns='sub_grade', aggfunc='size', fill_value=0)
sns.heatmap(grade_purpose_pivot, annot=True, fmt='d', cmap='YlGnBu', ax=axSubGrdHM)
axSubGrdHM.set_title('Frequency Heat Map: Sub Grade vs Purpose (Risky Loans)')
axSubGrdHM.set_xlabel('Sub Grade')
axSubGrdHM.set_ylabel('Purpose')

plt.show()

**Risk Implication:**

Debt consolidation loans are often used by individuals facing financial stress or managing existing debt. Therefore, loan grades D and E, with their high frequencies for debt consolidation, may reflect higher financial distress and a higher risk of default.

Credit card loans also seem to be somewhat common in these grades, further suggesting that borrowers in these grades may have challenges managing credit and could be at risk.

**Risk Insights:**
Debt consolidation is strongly associated with higher-risk borrowers (grades D, E, F, G and sub-grades D4, D5). Borrowers in these groups are likely facing financial struggles, making them higher-risk for loan defaults.

Home improvement and small business loans, while somewhat common across sub-grades, show lower frequencies in riskier grades. These loan purposes may still indicate a level of financial planning or stability, but they are less prominent in the high-risk groups.

Credit card loans show a mixed frequency across grades and sub-grades. This could suggest that some borrowers in lower grades are using loans for credit-related needs, which can indicate over-reliance on credit, adding to the default risk.

In [323]:
# # FICO Score vs Sub-Grades
# plt.figure(figsize=(10, 6))
# sns.boxplot(data=df_loan, x='sub_grade', y='last_fico_range_high', palette='coolwarm', order=sorted(df_loan['sub_grade'].unique()))
# plt.title('FICO Score Distribution by Sub-Grade')
# plt.xlabel('Sub-Grade')
# plt.ylabel('FICO Range (Low)')
# plt.xticks(rotation=45)
# plt.show()

## Analysis 10

In [None]:
# Mean revolving utilization per loan status, by grade (left) and by
# sub-grade (right). Missing combinations are filled with 0.
plt.figure(figsize=(20, 6))

heatmap_panels = [(1, 'grade', 'Grade'), (2, 'sub_grade', 'Sub-Grade')]
for position, level, label in heatmap_panels:
    panel_ax = plt.subplot(1, 2, position)
    mean_util = df_loan.pivot_table(
        index=level,
        columns='loan_status',
        values='revol_util',
        aggfunc='mean',
        fill_value=0,
    )
    sns.heatmap(mean_util, annot=True, fmt='.1f', cmap='Reds', ax=panel_ax)
    panel_ax.set_title(f'Average Revolving Utilization by {label} and Loan Status')
    panel_ax.set_xlabel('Loan Status')
    panel_ax.set_ylabel(label)

plt.show()



## TREND

**As the loan grade decreases (from A to G), the revolving utilization increases for each loan status category.**

There is variation within each grade and sub-grade, with some sub-grades showing significantly higher utilization rates, especially in the "Charged Off" and "Fully Paid" categories.


In [325]:
# select_dtypes(include=['int64', 'float64']) returns the NUMERIC columns, so
# the result gets a matching name. `categorical` is kept as an alias so that
# any existing reference to the old name still resolves.
numerical_columns = df_loan.select_dtypes(include=['int64', 'float64']).columns
categorical = numerical_columns

In [326]:
# for column in categorical:
#     title = "Plot of " + column
#     plt.scatter(df_loan.index, df_loan[column])
#     plt.title(title)
#     plt.show()


## Analysis 11

In [None]:
# Box plot of revolving utilization for each loan status
fig2 = px.box(df_loan, x='loan_status', y='revol_util', color="loan_status")
fig2.update_layout(title_text="Revolving Utilization by Loan Status")
fig2.update_xaxes(title_text='Loan Status')
fig2.update_yaxes(title_text='Revolving Utilization (%)')
fig2.show()



**Fully Paid Loans:**

Median revolving utilization is lower compared to the other statuses.

Indicates that borrowers who fully repay loans tend to have more conservative credit usage.

The interquartile range (IQR) shows a narrower spread, suggesting more consistent credit behavior.

**Charged Off Loans:**

Median utilization is higher than Fully Paid loans, suggesting riskier credit behavior.

The IQR is wider, indicating a higher variability in credit usage among borrowers whose loans are charged off.

Borrowers in this group often have high utilization, hinting at financial stress or over-leverage.

**Current Loans:**

Median utilization is slightly below Charged Off loans but above Fully Paid loans.

Reflects borrowers who are actively paying loans but may have higher revolving credit balances.

IQR suggests a moderate spread, showing variability in usage but less than Charged Off loans.



## Analysis 12

In [None]:
# Histogram (30 bins) of revolving utilization with a KDE overlay
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df_loan, x='revol_util', bins=30, kde=True, color='purple', ax=hist_ax)
hist_ax.set_title('Distribution of Revolving Utilization')
hist_ax.set_xlabel('Revolving Utilization (%)')
hist_ax.set_ylabel('Frequency')
plt.show()

## Analysis 13

In [None]:
# Mean revolving utilization per sub-grade, sub-grades in index order
average_revol_util_by_sub_grade = (
    df_loan.groupby('sub_grade')['revol_util']
    .mean()
    .sort_index()
)

bar_fig, bar_ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x=average_revol_util_by_sub_grade.index,
    y=average_revol_util_by_sub_grade.values,
    palette='viridis',
    ax=bar_ax,
)
bar_ax.set_title('Average Revolving Utilization by Sub-Grade')
bar_ax.set_xlabel('Sub-Grade')
bar_ax.set_ylabel('Average Revolving Utilization (%)')
bar_ax.tick_params(axis='x', labelrotation=45)
plt.show()



Insights:
**Risk Assessment:**

Borrowers with sub-grades in the F and G categories represent high-risk groups due to their high revolving utilization. These borrowers are more likely to default or struggle with repayments.

**Borrower Behavior by Sub-Grade:**

Borrowers in the best sub-grades (the A and B categories) tend to use credit sparingly, making them safer for lenders.

Borrowers in the weakest sub-grades (the E, F and G categories) exhibit riskier behavior with higher utilization, potentially signaling over-leverage or poor financial management.


**Custom Lending Criteria:**

Offer better terms (e.g., lower interest rates) to A and B borrowers to encourage safe credit usage.

Consider stricter repayment schedules or additional guarantees for F and G borrowers.

## Analysis 14

In [None]:
# Heat map of mean revolving utilization for every (sub-grade, loan status)
# pair; combinations without data are filled with 0.
sub_grade_status_pivot = df_loan.pivot_table(
    values='revol_util',
    index='sub_grade',
    columns='loan_status',
    aggfunc='mean',
    fill_value=0,
)

pivot_fig, pivot_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(sub_grade_status_pivot, annot=True, fmt='.1f', cmap='YlOrRd', ax=pivot_ax)
pivot_ax.set_title('Average Revolving Utilization by Sub-Grade and Loan Status')
pivot_ax.set_xlabel('Loan Status')
pivot_ax.set_ylabel('Sub-Grade')
plt.show()

**Risk Differentiation:**

Sub-grades F and G consistently display high utilization across all loan statuses, indicating they are the riskiest borrowers regardless of their loan outcome.

**Loan Status Analysis:**

Charged Off loans have significantly higher utilization compared to fully paid loans across all sub-grades, confirming that revolving utilization is a key risk indicator for defaults.

Current loans are intermediate, showing higher utilization than fully paid loans but lower than charged off loans, reflecting their ongoing risk exposure.

**Credit Behavior by Sub-Grade:**

The best sub-grades (A and B categories) demonstrate strong financial discipline, with consistently low utilization, even for charged-off loans.

Borrowers in sub-grades G2, G3, and G4 are outliers with dangerously high utilization, particularly for charged-off and current loans.

## Analysis 15

In [None]:
# Bar Plot: Loan Status by Employment Length
# DataFrame.plot() opens its own figure when no `ax` is given, so a preceding
# plt.figure() call would only leave an empty figure in the output. The size
# is therefore passed through `figsize` alone.
emp_length_status = df_loan.groupby(['emp_length', 'loan_status']).size().unstack()
emp_status_ax = emp_length_status.plot(
    kind='bar', stacked=True, color=['salmon', 'skyblue', 'green'], figsize=(12, 8)
)
emp_status_ax.set_title('Loan Status by Employment Length')
emp_status_ax.set_xlabel('Employment Length')
emp_status_ax.set_ylabel('Number of Loans')
emp_status_ax.legend(title='Loan Status')
emp_status_ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Heatmap: Employment Length vs Sub-Grade (each cell is a loan count)
plt.figure(figsize=(12, 8))
emp_length_sub_grade_pivot = df_loan.pivot_table(
    index='emp_length',
    columns='sub_grade',
    aggfunc='size',
    fill_value=0
)
sns.heatmap(emp_length_sub_grade_pivot, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Heatmap: Employment Length vs Sub-Grade')
plt.xlabel('Sub-Grade')
plt.ylabel('Employment Length')
plt.show()

# Bar Plot: Loan Status by Verification Status
plt.figure(figsize=(10, 6))
sns.countplot(data=df_loan, x='verification_status', hue='loan_status', palette='Set2')
plt.title('Loan Status by Verification Status')
plt.xlabel('Verification Status')
plt.ylabel('Count')
plt.legend(title='Loan Status')
plt.show()

# Stacked Bar Chart: Verification Status by Loan Grade
verification_grade = df_loan.groupby(['grade', 'verification_status']).size().unstack()
verification_ax = verification_grade.plot(
    kind='bar', stacked=True, color=['red', 'orange', 'lightgreen'], figsize=(12, 8)
)
verification_ax.set_title('Verification Status by Loan Grade')
verification_ax.set_xlabel('Loan Grade')
verification_ax.set_ylabel('Number of Loans')
verification_ax.legend(title='Verification Status')
verification_ax.tick_params(axis='x', labelrotation=0)
plt.show()

## Analysis 16

In [None]:
# Loan status counts for each home-ownership category
ownership_fig, ownership_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_loan, x='home_ownership', hue='loan_status', palette='coolwarm', ax=ownership_ax)
ownership_ax.set_title('Loan Status by Home Ownership')
ownership_ax.set_xlabel('Home Ownership')
ownership_ax.set_ylabel('Count')
ownership_ax.legend(title='Loan Status')
ownership_ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Home-ownership mix within each loan grade (stacked bars)
home_ownership_grade = df_loan.groupby(['grade', 'home_ownership']).size().unstack()
stacked_ax = home_ownership_grade.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 8),
    color=['lightblue', 'lightgreen', 'salmon', 'gray'],
)
stacked_ax.set_title('Home Ownership by Loan Grade')
stacked_ax.set_xlabel('Loan Grade')
stacked_ax.set_ylabel('Number of Loans')
stacked_ax.legend(title='Home Ownership')
stacked_ax.tick_params(axis='x', labelrotation=0)
plt.show()

# Overall share of each home-ownership category
pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
home_ownership_counts = df_loan['home_ownership'].value_counts()
home_ownership_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=90,
    colors=['gold', 'skyblue', 'lightcoral', 'lightgray'],
    ax=pie_ax,
)
pie_ax.set_title('Overall Distribution of Home Ownership')
plt.xticks(rotation=90)
pie_ax.set_ylabel('')
plt.show()

# Interest-rate spread for each home-ownership category
rate_fig, rate_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_loan, x='home_ownership', y='int_rate', palette='pastel', ax=rate_ax)
rate_ax.set_title('Interest Rates by Home Ownership')
rate_ax.set_xlabel('Home Ownership')
rate_ax.set_ylabel('Interest Rate (%)')
rate_ax.tick_params(axis='x', labelrotation=20)
plt.show()

## Analysis 17

In [None]:
 #Bar Plot: Loan Status by Charge-Offs Within 12 Months
plt.figure(figsize=(10, 6))
sns.countplot(data=df_loan, x='chargeoff_within_12_mths', hue='loan_status', palette='Set1')
plt.title('Loan Status by Charge-Offs Within 12 Months')
plt.xlabel('Charge-Offs Within 12 Months')
plt.ylabel('Count')
plt.legend(title='Loan Status')
plt.xticks(rotation=45)
plt.show()

# Box Plot: Delinquency Amount by Loan Grade
plt.figure(figsize=(10, 6))
sns.boxplot(data=df_loan, x='grade', y='delinq_amnt', palette='pastel')
plt.title('Delinquency Amount by Loan Grade')
plt.xlabel('Loan Grade')
plt.ylabel('Delinquency Amount ($)')
plt.show()

# Bar Plot: Loan Status by Public Record Bankruptcies
plt.figure(figsize=(10, 6))
sns.countplot(data=df_loan, x='pub_rec_bankruptcies', hue='loan_status', palette='coolwarm')
plt.title('Loan Status by Public Record Bankruptcies')
plt.xlabel('Public Record Bankruptcies')
plt.ylabel('Count')
plt.legend(title='Loan Status')
plt.xticks(rotation=45)
plt.show()

# Stacked Bar Chart: Tax Liens by Loan Grade
tax_liens_grade = df_loan.groupby(['grade', 'tax_liens']).size().unstack()
tax_liens_grade.plot(kind='bar', stacked=True, figsize=(12, 8), color=['lightcoral', 'lightblue', 'gold'])
plt.title('Tax Liens by Loan Grade')
plt.xlabel('Loan Grade')
plt.ylabel('Number of Loans')
plt.legend(title='Tax Liens')
plt.xticks(rotation=0)
plt.show()

## Analysis 18

In [None]:
# Scatter of loan amount (x) against total late fees received (y)
late_fee_fig, late_fee_ax = plt.subplots()
late_fee_ax.scatter(df_loan['loan_amnt'], df_loan['total_rec_late_fee'], color='red')
late_fee_ax.set_title('Late Fees vs Loan Amount')
late_fee_ax.set_xlabel('Loan Amount')
late_fee_ax.set_ylabel('Late Fees')
plt.show()


## Analysis 20

In [None]:

# Box plot of recoveries per loan status, restricted to loans whose recovery
# amount is above 1000 and at most 20000.
# The original also evaluated recoveries.recoveries.value_counts() here; as a
# bare expression in the middle of the cell its result was discarded, so it
# has been removed.
in_recovery_range = (df_loan['recoveries'] > 1000) & (df_loan['recoveries'] <= 20000)
recoveries = df_loan[in_recovery_range]
sns.boxplot(x='loan_status', y='recoveries', data=recoveries)

plt.title('Recoveries by Loan Status')
plt.show()

In [336]:
# df_loan[['total_rec_prncp', 'total_rec_int']].plot(kind='bar', stacked=True, figsize=(10, 6))
# plt.title('Distribution of Loan Repayments (Principal vs Interest)')
# plt.ylabel('Amount')
# plt.show()

In [337]:
# for col in df_loan.columns:
#     if df_loan[col].dtype=="object":
#         print(df_loan[col].value_counts())
#         print("----"*20)

In [None]:
# Final (rows, columns) of the working DataFrame
df_loan.shape

In [None]:
# Preview the first rows of the working DataFrame
df_loan.head()

In [340]:
#df_loan.addr_state.value_counts()