In [1]:
import snowflake.connector
import os
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.hazmat.primitives.asymmetric import dsa
from cryptography.hazmat.primitives import serialization
import pandas as pd
import numpy as np
import copy

In [2]:
# Snowflake connection - DSC
# Key-pair authentication: read the encrypted PEM private key from disk, decrypt
# it, and re-serialize it as unencrypted PKCS#8 DER bytes, which is what gets
# passed to the connector as `private_key`.
key_path = os.path.join('/etc/security/snowflake', 'rsa_plbiap01dy.p8')

# NOTE(review): the key passphrase was hard-coded in this cell. It can now be
# supplied through the SNOWFLAKE_KEY_PASSPHRASE environment variable; the literal
# fallback is kept only so existing runs keep working and should be removed once
# the variable is set wherever this notebook runs.
key_passphrase = os.environ.get('SNOWFLAKE_KEY_PASSPHRASE', 'snowflake')

with open(key_path, "rb") as key:
    p_key = serialization.load_pem_private_key(
        key.read(),
        password=key_passphrase.encode(),
        backend=default_backend())

pkb = p_key.private_bytes(
    encoding=serialization.Encoding.DER,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption())

ctx = snowflake.connector.connect(
        user='plbiap01dy',
        account='hfsg_prod.us-east-1.privatelink',
        private_key=pkb,
        warehouse='DSC_PLBI_PRD_MFG_WHS',
        role='plbiap01dy_prd_pii_role'
)

# Snowflake queries
query_scoremart = """SELECT * from DSC_PLDS_DB.APP_AUTOMATA_PRD.PREVAIL_AUTO_ULM_QUOTE_POL_AGG_NB_QUALITY where trans_dt between '2022-01-01' and '2024-06-30' and issue_ind = 'Y'"""

# Get data from Snowflake
# Both the cursor and the connection are released in `finally` blocks so that a
# failed query or fetch does not leave the connection open.
try:
    cs = ctx.cursor()
    try:
        cs.execute(query_scoremart)
        df = cs.fetch_pandas_all()
    finally:
        cs.close()
finally:
    ctx.close()

df.shape

(331290, 71)

In [3]:
# calculate relativities
# A relativity is a row's loss ratio divided by the mean of that same loss-ratio
# column over all rows sharing the row's STATE.

# EXPECTED_LOSS_RATIO -> ULM_eLRR
state_lr_means = df.groupby('STATE')['EXPECTED_LOSS_RATIO'].mean()
all_state_mean_lr_dict = {'all_state_mean_lr_dict': state_lr_means.to_dict()}
df["all_state_lr_mean"] = df["STATE"].map(all_state_mean_lr_dict['all_state_mean_lr_dict'])
df['ULM_eLRR'] = df['EXPECTED_LOSS_RATIO'].div(df['all_state_lr_mean'])

# OL_EXPECTED_LOSS_RATIO -> ULM_OL_eLRR
state_ol_lr_means = df.groupby('STATE')['OL_EXPECTED_LOSS_RATIO'].mean()
all_state_mean_ol_lr_dict = {'all_state_mean_ol_lr_dict': state_ol_lr_means.to_dict()}
df["all_state_ol_lr_mean"] = df["STATE"].map(all_state_mean_ol_lr_dict['all_state_mean_ol_lr_dict'])
df['ULM_OL_eLRR'] = df['OL_EXPECTED_LOSS_RATIO'].div(df['all_state_ol_lr_mean'])

In [4]:
def summarize_data(dataframe, by_vars, aggdict, writer, outsheet):
    """Aggregate ``dataframe`` by ``by_vars`` and write the result to one Excel sheet.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source rows. It is only read; it is not modified.
    by_vars : list
        Column name(s) to group by (passed to ``pivot_table`` as ``index``).
    aggdict : dict
        Mapping of column name -> aggregation, passed to ``pivot_table`` as
        ``aggfunc``.
    writer : pandas.ExcelWriter
        Open writer that receives the sheet. This function does not save or
        close it; that is left to the caller.
    outsheet : str
        Name of the worksheet to write.

    Returns
    -------
    None
    """
    # pivot_table builds a new frame and leaves its input untouched, so the
    # input is used directly instead of deep-copying it on every call.
    table = dataframe.pivot_table(index = by_vars, aggfunc = aggdict).reset_index()

    # The grouping columns are regular columns after reset_index, so the
    # positional index is not written out.
    table.to_excel(writer, sheet_name=outsheet, index = False)

In [5]:
# aggregations
# Column -> aggregation applied within each summary group.
# The mean of 'ULM_OL_eLRR' is currently switched off; add ULM_OL_eLRR='mean'
# below to include it in the summaries.
aggregation_dict = dict(
    COMPL_QTE_IND='count',
    ULM_eLRR='mean',
)

In [6]:
# Household Comp
# Translate the raw CF_HOUSEHOLD_COMPOSITION codes into report labels.
# Codes that are not listed here end up as NaN in 'hh_comp'.
hh_comp_labels = {
    'single_car_one_driver': '1 Car, 1 Driver',
    'single_car_more_than_one_driver': '1 Car, 2+ Drivers',
    'multi_car_more_cars_than_drivers': 'Multi Car, Cars > Drivers',
    'multi_car_cars_equals_drivers': 'Multi Car, Cars = Drivers',
    'multi_car_more_drivers_than_cars': 'Multi Car, Cars < Drivers',
}
df['hh_comp'] = df['CF_HOUSEHOLD_COMPOSITION'].map(hh_comp_labels)
df['hh_comp'].value_counts()

hh_comp
1 Car, 1 Driver              156135
Multi Car, Cars = Drivers     73678
1 Car, 2+ Drivers             52415
Multi Car, Cars > Drivers     44646
Multi Car, Cars < Drivers      4416
Name: count, dtype: int64

In [7]:
# Primary Named Insured age
# Bucket PNI_AGE into bands: lower bound inclusive, upper bound exclusive.
# Rows matching no band (e.g. missing ages) keep no label.
pni_age = df['PNI_AGE']
pni_age_bands = [
    (pni_age.lt(50), '<50'),
    (pni_age.between(50, 60, inclusive='left'), '50-59'),
    (pni_age.between(60, 70, inclusive='left'), '60-69'),
    (pni_age.between(70, 80, inclusive='left'), '70-79'),
    (pni_age.ge(80), '80+'),
]
for in_band, band_label in pni_age_bands:
    df.loc[in_band, 'PNI_AGE_grp'] = band_label
df['PNI_AGE_grp'].value_counts()

PNI_AGE_grp
60-69    117410
70-79     98937
50-59     75642
80+       37901
<50        1390
Name: count, dtype: int64

In [8]:
# Account Credit
# Rows whose CF_ACCOUNT_CREDIT is the string 'None' or 'H' are flagged 'No';
# every other row is flagged 'Yes'.
# NOTE(review): real missing values (NaN/None) do not equal the string 'None',
# so they are flagged 'Yes' — confirm that is intended.
no_credit = df['CF_ACCOUNT_CREDIT'].isin(['None', 'H'])
df['acct_credit'] = no_credit.map({True: 'No', False: 'Yes'})
df['acct_credit'].value_counts()

acct_credit
No     243389
Yes     87901
Name: count, dtype: int64

In [9]:
# Advance Quote Days
# Coerce to numeric, then bucket into day bands with both ends inclusive.
# Values below 0 or above 365 are labelled 'exclude'. Values that match no band
# (missing values, or non-integers falling between bands) keep no label.
df['CV_ADV_QUOTE_DAYS_NEW'] = pd.to_numeric(df['CV_ADV_QUOTE_DAYS_NEW'])
adv_days = df['CV_ADV_QUOTE_DAYS_NEW']
adv_day_bands = [
    (adv_days.lt(0) | adv_days.gt(365), 'exclude'),
    (adv_days.eq(0), '0'),
    (adv_days.between(1, 13), '1-13'),
    (adv_days.between(14, 60), '14-60'),
    (adv_days.between(61, 75), '61-75'),
    (adv_days.between(76, 365), '76-365'),
]
for in_band, band_label in adv_day_bands:
    df.loc[in_band, 'adv_qte_days'] = band_label
df['adv_qte_days'].value_counts()

adv_qte_days
14-60      137761
76-365     105922
1-13        69888
61-75       11378
0            4616
exclude      1725
Name: count, dtype: int64

In [10]:
# One worksheet per segmentation variable: (grouping column, worksheet name).
summary_sheets = [
    ('hh_comp', 'hh_comp'),
    ('PNI_AGE_grp', 'PNI_AGE_grp'),
    ('acct_credit', 'acct_credit'),
    ('CF_VEH_TELEMATIC_IND', 'telematics_enrollment'),
    ('adv_qte_days', 'adv_qte_days'),
]

# The context manager closes the workbook when the block exits, including when
# one of the summaries raises, so the writer is never left open.
with pd.ExcelWriter('./SM_Auto_ULM_2024_2Q_refresh.xlsx', engine = 'xlsxwriter') as writer:
    for by_var, sheet_name in summary_sheets:
        summarize_data(dataframe = df, by_vars = [by_var], aggdict = aggregation_dict, writer = writer, outsheet = sheet_name)