In [3]:
import numpy as np
import pandas as pd

# Display settings: no limit on the number of columns shown (None), and at most
# 100 rows before pandas truncates the rendered frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

import seaborn as sns
sns.set(style="whitegrid")
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib as matplotlib
%matplotlib inline

In [20]:
# Load the accepted-loans export into memory.
ACCEPTED_LOANS_CSV = 'accepted_2007_to_2018Q4.csv'
df_raw_accepted = pd.read_csv(ACCEPTED_LOANS_CSV)

# Alternative kept for reference: work from a smaller sample file instead.
# sample_accepted = pd.read_csv('sample_accepted.csv')
# df_processed = sample_accepted.copy()

# NOTE: this binds a second name to the same frame (no copy is made), so any
# column assigned on df_processed is also visible through df_raw_accepted.
df_processed = df_raw_accepted

In [21]:
# Add binary labels - "hardship", "charged off + default", "settlement involved"

# Columns carried over into the label-only frame built in the next cell.
label_list = ['label_hardship', 'label_chargedoff_default', 'label_settlement', 'loan_status']
# Raw loan_status values that count as a charge-off / default outcome.
chargedoff_default_list = ['Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off']

# A label is 1 when the corresponding status field is populated, 0 when it is missing.
# notna() recognises every missing-value marker (NaN, None, pd.NA); the previous
# `x is np.nan` identity test only matched the np.nan singleton and was evaluated
# row by row in Python.
df_processed['label_hardship'] = df_processed['hardship_status'].notna().astype('int64')
# 1 when loan_status is one of the charge-off / default statuses listed above.
df_processed['label_chargedoff_default'] = (
    df_processed['loan_status'].isin(chargedoff_default_list).astype('int64')
)
df_processed['label_settlement'] = df_processed['settlement_status'].notna().astype('int64')

In [22]:
# Independent copy holding only the label columns and loan_status, so the
# status relabelling below does not touch df_processed.
df_label = df_processed[label_list].copy()

late_list = ['Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)']
paid_list = ['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid']

# Collapse the detailed statuses into three coarse buckets with one lookup.
# Statuses that appear in none of the lists are passed through unchanged.
status_buckets = {
    **dict.fromkeys(late_list, 'Late'),
    **dict.fromkeys(paid_list, 'Fully Paid'),
    **dict.fromkeys(chargedoff_default_list, 'Charged Off'),
}
df_label['loan_status'] = df_label['loan_status'].map(
    lambda status: status_buckets.get(status, status)
)

# Number of flags (0-3) raised for each loan.
flag_columns = ['label_hardship', 'label_chargedoff_default', 'label_settlement']
df_label['total_label'] = df_label[flag_columns].sum(axis=1)

df_label.sample(10)

Unnamed: 0,label_hardship,label_chargedoff_default,label_settlement,loan_status,total_label
595977,0,0,0,Current,0
389947,0,0,0,Fully Paid,0
2122420,0,1,0,Charged Off,1
893622,0,0,0,Fully Paid,0
687077,0,0,0,Current,0
685632,0,0,0,Fully Paid,0
1050450,0,0,0,Fully Paid,0
1855359,0,0,0,Fully Paid,0
1180508,0,0,0,Fully Paid,0
1355596,1,0,0,Current,1


In [23]:
# Count the loans in every observed combination of the three flags
# (total_label is included as the leading index level).
label_keys = ['total_label', 'label_hardship', 'label_settlement', 'label_chargedoff_default']
df_label.groupby(label_keys)['total_label'].count().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,total_label
total_label,label_hardship,label_settlement,label_chargedoff_default,Unnamed: 4_level_1
0,0,0,0,1674924
1,0,0,1,218028
1,0,1,0,17216
1,1,0,0,152121
2,0,1,1,32416
2,1,0,1,17542
2,1,1,0,147080
3,1,1,1,1374


In [24]:
# Loans flagged with a settlement, counted per coarse loan status.
settlement_rows = df_label[df_label['label_settlement'].eq(1)]
df_settlement = (
    settlement_rows
    .groupby('loan_status')
    .size()
    .reset_index(name='counts')
)
df_settlement

Unnamed: 0,loan_status,counts
0,Charged Off,33790
1,Current,152773
2,Fully Paid,7984
3,Late,3537


In [25]:
# Pull the per-status settlement counts out as scalars.
# Indexing by status and using .get(..., 0) means a status with no matching rows
# yields 0; the previous `.loc[...].iloc[0]` raised IndexError on an empty selection.
settlement_counts = df_settlement.set_index('loan_status')['counts']
settlement_chargedoff = settlement_counts.get('Charged Off', 0)
settlement_late = settlement_counts.get('Late', 0)
settlement_current = settlement_counts.get('Current', 0)
settlement_fully_paid = settlement_counts.get('Fully Paid', 0)

In [26]:
# Loans flagged with a hardship status, counted per coarse loan status.
hardship_rows = df_label[df_label['label_hardship'].eq(1)]
df_hardship = (
    hardship_rows
    .groupby('loan_status')
    .size()
    .reset_index(name='counts')
)
df_hardship

Unnamed: 0,loan_status,counts
0,Charged Off,18916
1,Current,203040
2,Fully Paid,90295
3,Late,5862


In [27]:
# Pull the per-status hardship counts out as scalars.
# Indexing by status and using .get(..., 0) means a status with no matching rows
# yields 0; the previous `.loc[...].iloc[0]` raised IndexError on an empty selection.
hardship_counts = df_hardship.set_index('loan_status')['counts']
hardship_chargedoff = hardship_counts.get('Charged Off', 0)
hardship_late = hardship_counts.get('Late', 0)
hardship_current = hardship_counts.get('Current', 0)
hardship_fully_paid = hardship_counts.get('Fully Paid', 0)

In [28]:
# Loans flagged as charged off / default, counted per coarse loan status.
chargedoff_rows = df_label[df_label['label_chargedoff_default'].eq(1)]
df_chargedoff = (
    chargedoff_rows
    .groupby('loan_status')
    .size()
    .reset_index(name='counts')
)
df_chargedoff

Unnamed: 0,loan_status,counts
0,Charged Off,269360


In [29]:
# Total number of charged-off / default loans as a scalar; 0 when the frame has
# no 'Charged Off' row (the previous `.iloc[0]` raised IndexError in that case).
chargedoff = df_chargedoff.set_index('loan_status')['counts'].get('Charged Off', 0)

## Graphs 

- Reference: Plotly sunburst chart documentation, https://plot.ly/python/sunburst-charts/

In [30]:
# Hand-built sunburst of loan status -> hardship / settlement.
# The values are hard-coded placeholders, not the counts derived from df_label.
#
# "Hardship" and "Settlement" appear under several statuses, so each sector gets
# a unique id. Without `ids`, sectors are keyed by label and the repeated labels
# make the hierarchy ambiguous. `parents` refers to those ids.
# The original `values` list carried a ninth entry with no matching label or
# parent; it is dropped so the four arrays have the same length.
fig = go.Figure(go.Sunburst(
    ids=[
        "Charged Off", "Current", "Late",
        "Charged Off/Hardship", "Current/Hardship", "Late/Hardship",
        "Charged Off/Settlement", "Late/Settlement",
    ],
    labels=["Charged Off", "Current", "Late", "Hardship", "Hardship", "Hardship", "Settlement", "Settlement"],
    parents=["Total", "Total", "Total", "Charged Off", "Current", "Late", "Charged Off", "Late"],
    values=[980, 235, 743, 42, 5, 1, 3, 32],
))
fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))

fig.show()

In [31]:
# import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

# Example dataset from the plotly datasets repository (fetched over the network).
SALES_SUCCESS_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/sales_success.csv'
df = pd.read_csv(SALES_SUCCESS_URL)
print(df.head())

# Hierarchy configuration for the chart, listed from the innermost level to the root.
levels = ['salesperson', 'county', 'region']
# Colour is computed as color_columns[0] / color_columns[1].
color_columns = ['sales', 'calls']
# Column whose sum sets the size of each sector.
value_column = 'calls'

def build_hierarchical_dataframe(df, levels, value_column, color_columns=None):
    """
    Build a hierarchy of levels for Sunburst or Treemap charts.

    Levels are given starting from the bottom to the top of the hierarchy,
    ie the last level corresponds to the root.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must contain every column named in ``levels``,
        ``value_column`` and (when given) ``color_columns``.
    levels : list of str
        Column names, ordered from the deepest level to the one just below
        the root.
    value_column : str
        Column summed within each group to give the node's ``value``.
    color_columns : sequence of two str, optional
        ``(numerator, denominator)`` columns; a node's ``color`` is the ratio
        of their group sums. When omitted, ``color`` is NaN for every node.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``parent``, ``value``, ``color``: one row per group
        at every level, followed by a final ``'total'`` root row whose parent
        is the empty string.
    """
    columns = ['id', 'parent', 'value', 'color']
    has_color = color_columns is not None
    if has_color:
        numerator, denominator = color_columns[0], color_columns[1]

    # Collect one frame per level and concatenate once at the end, instead of
    # growing a frame with the removed DataFrame.append.
    frames = []
    for i, level in enumerate(levels):
        # Group by this level and all of its ancestors so that each node keeps
        # its position in the hierarchy; only numeric columns are summed.
        dfg = df.groupby(levels[i:]).sum(numeric_only=True).reset_index()
        # The top-most level hangs off the synthetic 'total' root.
        parent = dfg[levels[i + 1]] if i < len(levels) - 1 else 'total'
        color = dfg[numerator] / dfg[denominator] if has_color else float('nan')
        frames.append(pd.DataFrame(
            {'id': dfg[level], 'parent': parent, 'value': dfg[value_column], 'color': color},
            columns=columns,
        ))

    # Root row aggregating the whole input.
    total_color = df[numerator].sum() / df[denominator].sum() if has_color else float('nan')
    frames.append(pd.DataFrame(
        [{'id': 'total', 'parent': '', 'value': df[value_column].sum(), 'color': total_color}],
        columns=columns,
    ))
    return pd.concat(frames, ignore_index=True)


# Flatten the sales data into id / parent / value / color rows, and compute the
# overall sales-to-calls ratio used as the midpoint of the colour scale.
df_all_trees = build_hierarchical_dataframe(df, levels, value_column, color_columns)
average_score = df['sales'].sum() / df['calls'].sum()

# Arguments shared by both panels (the colorscale is left at plotly's default).
# NOTE(review): the hover text is labelled "Sales" but `values` is built from
# value_column, which is 'calls' in this notebook - confirm the intended wording.
sunburst_common = dict(
    labels=df_all_trees['id'],
    parents=df_all_trees['parent'],
    values=df_all_trees['value'],
    branchvalues='total',
    marker=dict(colors=df_all_trees['color'], cmid=average_score),
    hovertemplate='<b>%{label} </b> <br> Sales: %{value}<br> Success rate: %{color:.2f}',
)

# One row, two side-by-side "domain" cells (the subplot type sunburst traces need).
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "domain"}, {"type": "domain"}]])

# Left panel: full hierarchy, with an empty trace name.
fig.add_trace(go.Sunburst(name='', **sunburst_common), row=1, col=1)

# Right panel: same data, limited to two levels at a time.
fig.add_trace(go.Sunburst(maxdepth=2, **sunburst_common), row=1, col=2)

fig.update_layout(margin=dict(t=10, b=10, r=10, l=10))
fig.show()

   Unnamed: 0 region   county salesperson  calls  sales
0           0  North   Dallam          JE     35     23
1           1  North   Dallam          ZQ     49     13
2           2  North   Dallam          IJ     20      6
3           3  North  Hartley          WE     39     37
4           4  North  Hartley          PL     42     37
