In [3]:
import pandas as pd
import dataframe_image as dfi

# Load and process the output of the model

In [22]:
# Location of the CSV that is loaded into `raw`.
LOG_OUTPUT_PATH = 'log_output.csv'
raw = pd.read_csv(LOG_OUTPUT_PATH)

In [23]:
# The CSV's columns named '0' and '1' hold the topic id and its probability;
# give them descriptive names.
raw = raw.rename(columns={'0': 'topic_cat', '1': 'probability'})

# Index every row by the numeric sum id + probability (used as a join key
# against frames indexed the same way).
raw = raw.set_index(raw['id'] + raw['probability'])

# Hand-assigned, human-readable label for each topic id (0-14).
derived_topic_summary = {
    0: 'Credit Cards/Consumer Banking',
    1: 'Credit Reporting Communications',
    2: 'Credit Reporting / Debt Collection',
    3: 'Debt Collection Communications',
    4: 'Customer Service',
    5: 'Credit Reporting Identity Theft (5)',
    6: 'Communication General (6)',
    7: 'Credit Reporting Inquiries',
    8: 'Unclear, Navy Federal',
    9: 'Mortgages',
    10: 'Credit Reporting General',
    11: 'Capital One / Credit Reporting',
    12: 'Student Loans',
    13: 'Communication General (13)',
    14: 'Credit Reporting Identity Theft (14)',
}

# Attach the label that corresponds to each row's topic id.
raw['derived_topic'] = raw['topic_cat'].map(derived_topic_summary)

# Get Top Topic for Each Complaint

In [24]:
# Wide table: one row per complaint id, one column per derived topic.
prob_by_topic = raw.pivot(index="id", columns="derived_topic", values="probability")

# Row-wise maximum = the highest topic probability of each complaint.
top_prob = prob_by_topic.max(axis=1).reset_index().rename(columns={0: 'probability'})

# Constant marker column; summing it later counts complaints.
top_prob = top_prob.assign(top_cat=1)

# Index by id + probability so rows can be joined on that numeric key.
top_prob = top_prob.set_index(top_prob['id'] + top_prob['probability'])


In [25]:
# Index-join each complaint's top-probability row to the matching row of `raw`
# (left-hand duplicate column names get the '_y' suffix).
top_rows = top_prob.join(raw, how='left', lsuffix='_y')

# Sum the top_cat marker per (complaint type, derived topic) pair.
counts = (
    top_rows
    .groupby(['complaint_type', 'derived_topic'], as_index=True)['top_cat']
    .sum()
    .reset_index()
)


### Export formatted tables of results

In [62]:

# Number of complaint-type columns rendered into each exported image.
COLUMNS_PER_IMAGE = 9

# Topic x complaint-type table of top-topic counts (missing pairs count as 0).
export_prep = counts.pivot(index="derived_topic", columns="complaint_type", values="top_cat").fillna(0).astype('int32')

# Percentage share of each topic within every complaint type, computed for all
# columns at once. The columns-axis label is dropped so the rendered header
# looks the same as a frame assembled column by column.
export = (export_prep.div(export_prep.sum(axis=0), axis='columns') * 100).rename_axis(columns=None)

# Ceiling division: a trailing group with fewer than COLUMNS_PER_IMAGE columns
# still gets its own image instead of being silently dropped.
n_images = -(-export.shape[1] // COLUMNS_PER_IMAGE)

for chunk_no in range(n_images):
    start = chunk_no * COLUMNS_PER_IMAGE
    chunk = export.iloc[:, start:start + COLUMNS_PER_IMAGE]
    df_styled0 = chunk.style.background_gradient()
    # Images are numbered in descending order. With 18 columns this reproduces
    # the file names written previously (first nine columns -> top_topic2.png,
    # last nine -> top_topic1.png) without hard-coding the column count.
    dfi.export(df_styled0, "top_topic{}.png".format(n_images - chunk_no))

In [28]:
# Number of complaints per complaint type. top_cat is selected explicitly so
# the string column derived_topic is not aggregated as well (pandas >= 2.0
# no longer drops non-numeric columns from groupby sums).
counts.groupby('complaint_type')[['top_cat']].sum()

Unnamed: 0_level_0,top_cat
complaint_type,Unnamed: 1_level_1
Bank account or service,4500
Checking or savings account,9350
Consumer Loan,2885
Credit card,5529
Credit card or prepaid card,15758
Credit reporting,9901
"Credit reporting, credit repair services, or other personal consumer reports",80212
Debt collection,42680
"Money transfer, virtual currency, or money service",4287
Money transfers,452


In [27]:
# Number of complaints per derived topic. top_cat is selected explicitly so
# the string column complaint_type is not aggregated as well (pandas >= 2.0
# no longer drops non-numeric columns from groupby sums).
counts.groupby('derived_topic')[['top_cat']].sum()

Unnamed: 0_level_0,top_cat
derived_topic,Unnamed: 1_level_1
Capital One / Credit Reporting,1609
Communication General (13),32098
Communication General (6),12193
Credit Cards/Consumer Banking,22009
Credit Reporting / Debt Collection,5801
Credit Reporting Communications,9344
Credit Reporting General,6670
Credit Reporting Identity Theft (14),10355
Credit Reporting Identity Theft (5),1396
Credit Reporting Inquiries,3005


# Get All Combinations (Cartesian Product) of Topic and Complaint

In [30]:
# Distinct topic ids and complaint ids present in the model output.
topics = list(raw['topic_cat'].unique())
ids = list(raw['id'].unique())

# Cross every topic with every id (topic varies slowest) and name the columns.
topic_id_pairs = [(topic, complaint_id) for topic in topics for complaint_id in ids]
full_comb_df = pd.DataFrame(topic_id_pairs, columns=['topic_cat', 'id'])

In [31]:
# String renderings of the topic and id columns of both frames; these are the
# building blocks of the composite string keys.
cat_str, id_str = (raw[col].astype('str') for col in ('topic_cat', 'id'))
fc_cat_str, fc_id_str = (full_comb_df[col].astype('str') for col in ('topic_cat', 'id'))

In [32]:
# Composite "<topic>-<id>" keys for both frames.
fc_key = fc_cat_str + '-' + fc_id_str
raw_key = cat_str + '-' + id_str

# Re-index both frames by that key so they can be joined on it.
full_comb_df = full_comb_df.set_index(fc_key)
raw = raw.set_index(raw_key)

In [38]:
# Attach the model probability to every (topic, id) combination; combinations
# with no matching row in `raw` get probability 0.
df = full_comb_df.join(raw.probability, how='left').fillna(0)

# Order each complaint's topics from highest to lowest probability (ties broken
# by topic id) so that rank 0 is the most probable topic. Sorting probability
# ascending would make ranks 0-2 the three *least* probable topics, which
# contradicts the "top 3" selection (ranks < 3) made downstream.
df = df.sort_values(['id', 'probability', 'topic_cat'], ascending=[True, False, True])

# Position of each row within its complaint: 0, 1, 2, ... Computed per id
# rather than by tiling a fixed 0-14 list, so it does not depend on there being
# exactly 15 topics or on every id having the same number of rows.
df['ranks'] = df.groupby('id').cumcount()

# Bring in each complaint's complaint_type, looked up by id.
df = df.join(raw[['id', 'complaint_type']].drop_duplicates().set_index('id'), how='left', on='id')

# Derive Top 3 Ranked Topic Probabilities per Complaint

In [39]:
# Keep rows whose rank is below 3 and whose probability is non-zero.
rank_below_3 = df['ranks'] < 3
nonzero_prob = df['probability'] > 0

# Count the surviving rows per (complaint type, topic id); the count is stored
# in the 'ranks' column.
top_categories_df = (
    df[rank_below_3 & nonzero_prob]
    .groupby(['complaint_type', 'topic_cat'])['ranks']
    .count()
    .reset_index()
)

# Translate topic ids into their readable labels.
top_categories_df['derived_topic'] = top_categories_df['topic_cat'].map(derived_topic_summary)

### Export formatted tables of results

In [77]:
# Topic x complaint-type table of counts (missing pairs count as 0).
export_prep2 = top_categories_df.pivot(index="derived_topic", columns="complaint_type", values="ranks").fillna(0).astype('int32')

# Percentage share of each topic within every complaint type, computed for all
# columns at once. The columns-axis label is dropped so the rendered header
# looks the same as a frame assembled column by column.
export2_pct = (export_prep2.div(export_prep2.sum(axis=0), axis='columns') * 100).rename_axis(columns=None)

# First image: the first nine complaint types. Second image: all remaining
# ones. The second slice starts at 9 (not 10) so the column at position 9 is
# no longer left out of both images.
image_slices = (
    (slice(0, 9), "topic_in_top31.png"),
    (slice(9, None), "topic_in_top32.png"),
)

for col_slice, png_name in image_slices:
    export2 = export2_pct.iloc[:, col_slice]
    if export2.shape[1] == 0:
        # Nothing to render when there are nine or fewer complaint types.
        continue
    df_styled = export2.style.background_gradient()
    dfi.export(df_styled, png_name)