In [1]:
# pip install xlsxwriter

In [1]:
import snowflake.connector
import os
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.hazmat.primitives.asymmetric import dsa
from cryptography.hazmat.primitives import serialization
import pandas as pd
import numpy as np
import copy

In [2]:
# Snowflake connection - DSC
# Key-pair authentication: read the passphrase-protected PEM private key from
# disk and re-serialize it as unencrypted DER/PKCS8 bytes for the connector.
# NOTE(review): the fallback passphrase below is still a hardcoded secret. Set
# SNOWFLAKE_KEY_PASSPHRASE in the environment and remove the fallback when possible.
key_passphrase = os.environ.get('SNOWFLAKE_KEY_PASSPHRASE', 'snowflake')
key_path = os.path.join('/etc/security/snowflake', 'rsa_plbiap01dy.p8')
with open(key_path, "rb") as key:
    p_key = serialization.load_pem_private_key(
        key.read(),
        password=key_passphrase.encode(),
        backend=default_backend())

pkb = p_key.private_bytes(
    encoding=serialization.Encoding.DER,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption())

# Snowflake queries
# Defined before connecting so nothing can fail between connect() and the
# try/finally that guarantees the connection is closed.
query_scoremart = """select * FROM DSC_PLDS_DB.APP_AUTOMATA_PRD.PREVAIL_HOME_ULM_QUOTE_AGG_NB_QUALITY where trans_dt between '2022-01-01' and '2024-06-30' and issue_ind = 'Y'"""
# Open-ended variant (no upper date bound), kept for reference:
# query_scoremart = """select * FROM DSC_PLDS_DB.APP_AUTOMATA_PRD.PREVAIL_HOME_ULM_QUOTE_AGG_NB_QUALITY where trans_dt >= '2022-01-01' and issue_ind = 'Y'"""

ctx = snowflake.connector.connect(
        user='plbiap01dy',
        account='hfsg_prod.us-east-1.privatelink',
        private_key=pkb,
        warehouse='DSC_PLBI_PRD_MFG_WHS',
        role='plbiap01dy_prd_pii_role'
)

# Get data from Snowflake
# Both the cursor and the connection are released even if the query or the
# fetch raises; previously ctx.close() was skipped on any error.
try:
    cs = ctx.cursor()
    try:
        cs.execute(query_scoremart)
        df = cs.fetch_pandas_all()
    finally:
        cs.close()
finally:
    ctx.close()

df.shape

(89400, 94)

In [3]:
# calculate relativities
# Each relativity is a row's expected loss ratio divided by the mean of that
# same ratio over all rows sharing the row's PRIMARYRATINGSTATE.
def _state_mean_lookup(frame, value_col, result_name):
    """Return ``{result_name: {state: mean of value_col}}`` computed on ``frame``."""
    per_state = frame.groupby('PRIMARYRATINGSTATE').agg(**{result_name: (value_col, 'mean')})
    return per_state.to_dict()


# ULM expected loss ratio relativity
all_state_mean_lr_dict = _state_mean_lookup(df, 'EXPECTED_LOSS_RATIO', 'all_state_mean_lr_dict')
state_lr_means = all_state_mean_lr_dict['all_state_mean_lr_dict']
df["all_state_lr_mean"] = df["PRIMARYRATINGSTATE"].map(state_lr_means)
df['ULM_eLRR'] = df['EXPECTED_LOSS_RATIO'] / df['all_state_lr_mean']

# OL expected loss ratio relativity
all_state_mean_ol_lr_dict = _state_mean_lookup(df, 'OL_EXPECTED_LOSS_RATIO', 'all_state_mean_ol_lr_dict')
state_ol_lr_means = all_state_mean_ol_lr_dict['all_state_mean_ol_lr_dict']
df["all_state_ol_lr_mean"] = df["PRIMARYRATINGSTATE"].map(state_ol_lr_means)
df['ULM_OL_eLRR'] = df['OL_EXPECTED_LOSS_RATIO'] / df['all_state_ol_lr_mean']

In [4]:
def summarize_data(dataframe, by_vars, aggdict, writer, outsheet):
    """Aggregate ``dataframe`` by ``by_vars`` and write the result to one Excel sheet.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source rows. The frame is only read, never modified.
    by_vars : list of str
        Column name(s) to group by; they become the leading output columns.
    aggdict : dict
        Maps a column name to the aggregation applied to it (passed to
        ``pivot_table`` as ``aggfunc``).
    writer : pandas.ExcelWriter
        An open writer. The caller is responsible for closing it.
    outsheet : str
        Name of the worksheet to write.

    Returns
    -------
    None
        The summary is written to ``writer`` as a side effect.
    """
    # pivot_table returns a new frame and leaves its input untouched, so the
    # previous deep copy of the full source frame on every call was pure overhead.
    table = dataframe.pivot_table(index=by_vars, aggfunc=aggdict).reset_index()

    # write to excel (index=False: the group keys are already regular columns)
    table.to_excel(writer, sheet_name=outsheet, index=False)

In [5]:
# aggregations
# Column -> aggregate applied within each segment: QCN is counted and
# ULM_eLRR is averaged.
# Currently disabled (add as '<column>': 'mean' to enable):
#   EXPECTED_LOSS_RATIO, OL_EXPECTED_LOSS_RATIO, ULM_OL_eLRR
aggregation_dict = dict(
    QCN='count',
    ULM_eLRR='mean',
)

In [6]:
# insurance score deciles
ins_score = df['INS_SCR_CD']

# A score of exactly 0, or 998 and above, is labelled 'NH/NS'.
df.loc[(ins_score == 0) | (ins_score >= 998), 'cv_decile'] = 'NH/NS'

# (exclusive lower bound, inclusive upper bound, label); lower scores map to
# higher decile numbers.
decile_bands = [
    (0, 557, 'Decile 10'),
    (557, 654, 'Decile 09'),
    (654, 705, 'Decile 08'),
    (705, 735, 'Decile 07'),
    (735, 757, 'Decile 06'),
    (757, 776, 'Decile 05'),
    (776, 793, 'Decile 04'),
    (793, 810, 'Decile 03'),
    (810, 832, 'Decile 02'),
]
for band_low, band_high, band_label in decile_bands:
    in_band = (ins_score > band_low) & (ins_score <= band_high)
    df.loc[in_band, 'cv_decile'] = band_label

# Top band stops just short of the 998+ codes handled above.
df.loc[(ins_score > 832) & (ins_score < 998), 'cv_decile'] = 'Decile 01'
df['cv_decile'].value_counts()

cv_decile
Decile 01    10694
Decile 02     9937
Decile 05     9523
Decile 04     9184
Decile 06     9112
Decile 03     9049
Decile 07     8706
Decile 08     8562
Decile 09     7985
Decile 10     5976
NH/NS          672
Name: count, dtype: int64

In [7]:
# insurance score quintiles
# Each quintile is the union of two adjacent deciles; 'NH/NS' carries over as-is.
quintile_members = [
    ('Quintile 1', ['Decile 01', 'Decile 02']),
    ('Quintile 2', ['Decile 03', 'Decile 04']),
    ('Quintile 3', ['Decile 05', 'Decile 06']),
    ('Quintile 4', ['Decile 07', 'Decile 08']),
    ('Quintile 5', ['Decile 09', 'Decile 10']),
    ('NH/NS', ['NH/NS']),
]
for quintile_label, decile_labels in quintile_members:
    df.loc[df['cv_decile'].isin(decile_labels), 'cv_quintile'] = quintile_label
df['cv_quintile'].value_counts()

cv_quintile
Quintile 1    20631
Quintile 3    18635
Quintile 2    18233
Quintile 4    17268
Quintile 5    13961
NH/NS           672
Name: count, dtype: int64

In [8]:
# coverage A & sq ft
# Cross four coverage-limit bands with four square-footage bands into a single
# segment label. Rows matching no band (e.g. a missing or negative value) keep
# the 'no_coverage' default.
cov_col = 'CV_HIGI_HOMEARD_DWELLING_COV_LMT'
sqft_col = 'CV_HIGI_HOMEARD_CMB_SQFTGRP2_ARD'

df['cov_a_sq_ft'] = 'no_coverage'
# NOTE(review): the literal string 'None' is converted to 0, which places those
# rows in the lowest band rather than in 'no_coverage' - confirm this is intended.
df[cov_col] = pd.to_numeric(df[cov_col].replace('None', 0))
df[sqft_col] = pd.to_numeric(df[sqft_col].replace('None', 0))

cov_lmt = df[cov_col]
sqft = df[sqft_col]

# Coverage bands: lower bound inclusive, upper bound exclusive (Groups A-D).
cov_bands = [
    ('<$300k', cov_lmt < 300000),
    ('$300k-$449k', (cov_lmt >= 300000) & (cov_lmt < 450000)),
    ('$450k-$649k', (cov_lmt >= 450000) & (cov_lmt < 650000)),
    ('$650k+', cov_lmt >= 650000),
]

# Square-footage bands are contiguous: the earlier '<= 1500' / '>= 1501' style
# bounds left gaps, so a non-integer value such as 1500.5 matched no band and
# silently stayed 'no_coverage'. Integer values are bucketed exactly as before.
sqft_bands = [
    ('0-1500', (sqft >= 0) & (sqft <= 1500)),
    ('1501-2000', (sqft > 1500) & (sqft <= 2000)),
    ('2001-2500', (sqft > 2000) & (sqft <= 2500)),
    ('2501+', sqft > 2500),
]

for cov_label, cov_mask in cov_bands:
    for sqft_label, sqft_mask in sqft_bands:
        df.loc[cov_mask & sqft_mask, 'cov_a_sq_ft'] = f'{cov_label} and sqft {sqft_label}'
df['cov_a_sq_ft'].value_counts()

cov_a_sq_ft
<$300k and sqft 0-1500            25079
$300k-$449k and sqft 1501-2000    16551
$300k-$449k and sqft 0-1500       10401
$300k-$449k and sqft 2001-2500     8581
<$300k and sqft 1501-2000          7102
$450k-$649k and sqft 2501+         5439
$450k-$649k and sqft 2001-2500     4983
$450k-$649k and sqft 1501-2000     3740
$650k+ and sqft 2501+              3250
$300k-$449k and sqft 2501+         1832
$450k-$649k and sqft 0-1500         968
$650k+ and sqft 2001-2500           728
<$300k and sqft 2001-2500           436
$650k+ and sqft 1501-2000           256
$650k+ and sqft 0-1500               43
<$300k and sqft 2501+                11
Name: count, dtype: int64

In [9]:
# dwelling age
age_col = 'CV_HIGI_HOMEARD_RMP30_DWELLING_AGE_CAP'
# NOTE(review): the literal string 'None' is converted to 0, which places those
# rows in the '0-5' group - confirm this is intended.
df[age_col] = pd.to_numeric(df[age_col].replace('None', 0))
dwell_age = df[age_col]

# First band includes 0; negative or missing ages match nothing and stay NaN.
df.loc[(dwell_age >= 0) & (dwell_age <= 5), 'dwell_age_grp'] = '0-5'

# (exclusive lower bound, inclusive upper bound, label). Bands are contiguous:
# the earlier '<= 5' / '>= 6' style bounds left gaps, so a non-integer age such
# as 5.5 matched no band and dropped out of the summary. Integer ages are
# grouped exactly as before.
age_bands = [
    (5, 10, '6-10'),
    (10, 15, '11-15'),
    (15, 20, '16-20'),
    (20, 25, '21-25'),
    (25, 30, '26-30'),
    (30, 35, '31-35'),
    (35, 40, '36-40'),
    (40, 50, '41-50'),
    (50, 60, '51-60'),
    (60, 70, '61-70'),
    (70, 80, '71-80'),
    (80, 90, '81-90'),
    (90, 100, '91-100'),
]
for age_low, age_high, age_label in age_bands:
    df.loc[(dwell_age > age_low) & (dwell_age <= age_high), 'dwell_age_grp'] = age_label

df.loc[dwell_age > 100, 'dwell_age_grp'] = '100+'
df['dwell_age_grp'].value_counts()

dwell_age_grp
61-70     11158
41-50     10715
51-60      9801
16-20      8481
21-25      7655
26-30      6368
91-100     5960
71-80      5621
0-5        5258
36-40      4856
31-35      4601
6-10       3523
11-15      3205
81-90      2198
Name: count, dtype: int64

In [10]:
# Total Bath Count
bath_col = 'CV_HIGV_ATM_SA_NBR_BATH_CALC_2'
df[bath_col] = pd.to_numeric(df[bath_col].replace('None', 0))

# Exact counts 1 through 8 get their own label; 9 or more is pooled as '9+'.
# Any other value (e.g. 0) matches nothing and is left unlabelled.
for n_baths in range(1, 9):
    df.loc[df[bath_col] == n_baths, 'ttl_bath_cnt'] = str(n_baths)
df.loc[df[bath_col] >= 9, 'ttl_bath_cnt'] = '9+'
df['ttl_bath_cnt'].value_counts()

ttl_bath_cnt
2     47573
3     22107
1     14231
4      4653
5       671
6       142
7        17
8         4
9+        2
Name: count, dtype: int64

In [11]:
# Roof Condition Derived Num (doesn't need to be done since it's a direct segment)
# Copies each recognised ROOF_COND_DERIV_SCR_NUM value through unchanged; any
# value outside this list is left unlabelled.
roof_score_values = ['-2', '-1', '0', '1', '2', 'not_available', 'unknown']
for roof_value in roof_score_values:
    df.loc[df['ROOF_COND_DERIV_SCR_NUM'] == roof_value, 'roof_cond_deriv_num'] = roof_value
df['roof_cond_deriv_num'].value_counts()

roof_cond_deriv_num
2                26162
1                24872
0                18065
not_available    10023
-1                7195
unknown           2179
-2                 904
Name: count, dtype: int64

In [12]:
# Write one summary sheet per segmentation variable into a single workbook.
# (group-by column, worksheet name)
segment_sheets = [
    ('cv_decile', 'cv_decile'),
    ('cv_quintile', 'cv_quintile'),
    ('cov_a_sq_ft', 'cov_a_sq_ft'),
    ('dwell_age_grp', 'dwell_age_grp'),
    ('ttl_bath_cnt', 'ttl_bath_cnt'),
    ('ROOF_COND_DERIV_SCR_NUM', 'roof_score'),
]

# The context manager closes (and thereby saves) the workbook even if one of
# the summaries raises; previously writer.close() was skipped on any error.
with pd.ExcelWriter('./SM_Home_ULM_2024_2Q_refresh.xlsx', engine='xlsxwriter') as writer:
    for by_var, sheet_name in segment_sheets:
        summarize_data(dataframe=df, by_vars=[by_var], aggdict=aggregation_dict,
                       writer=writer, outsheet=sheet_name)