# Import packages and data

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from category_encoders import OrdinalEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score

In [2]:
POLICY_DATA = '../data/datastorm_policy_data.csv'
AGENT_DATA = '../data/datastorm_agent_data.csv'
TEST_DATA = '../data/testset.csv'

In [3]:
df_pred = pd.read_csv(TEST_DATA, index_col='map_client_cd')

In [4]:
df_agent = pd.read_csv(AGENT_DATA)

In [5]:
df = pd.read_csv(POLICY_DATA, parse_dates=['next_due_dt', 'termination_dt', 
                                           'main_holder_dob', 'spouse_dob', 
                                           'child1_dob', 'child2_dob', 
                                           'child3_dob', 'child4_dob', 
                                           'child5_dob', 
                                           'run_date', 'commencement_dt'])

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
df = df.merge(df_agent, on='agent_code', how='left')

In [7]:
df = df.drop(columns=['main_holder_occupation_cd', 'product_code'])

# Data Cleaning

## Removing duplicate columns 

In [8]:
df = df.drop_duplicates(subset=['policy_code', 'client_code', 'policy_snapshot_as_on'], keep='first')

## Remove policy snapshot with very small interval

In [9]:
df = df[df.policy_snapshot_as_on != 20190831]

In [10]:
df.head()

Unnamed: 0,policy_term,policy_payment_mode,policy_status,commencement_dt,next_due_dt,termination_dt,termination_reason,main_holder_gender,main_holder_dob,main_holder_entry_age,...,date_joined,status,substatus,termination_date,city_name,cluster_code,supervisor_code,zone_code,region_code,designation
0,20,M,INFORCE,2011-08-21,2019-02-21,NaT,,M,1983-10-01,28,...,2018/02/27,TERMINATED,AUTOTERMINATED,2018/10/31,Colombo 03,LA01002,AG107931,ZONE105,RA262,Advisor
1,15,M,INFORCE,2006-07-14,2019-09-14,NaT,,F,1958-01-01,48,...,1998/10/02,INFORCED,ACTIVE,,Moratuwa,LA01001,AG100875,ZONE110,RA248,Advisor
2,20,Y,INFORCE,2018-12-28,2019-12-28,NaT,,M,1967-11-01,51,...,2014/06/25,INFORCED,ACTIVE,,Batticaloa,LA01004,AG101697,ZONE107,RA231,Team Leader
3,15,Q,INFORCE,2018-11-06,2020-02-06,NaT,,M,1989-10-01,29,...,2014/03/20,INFORCED,ACTIVE,,Ratnapura,LA01002,AG104896,ZONE114,RA269,Advisor
4,15,M,LAPSED,2012-04-28,2015-06-28,NaT,OTHERS,M,1988-05-01,24,...,2001/05/24,INFORCED,ACTIVE,,Wadduwa,LA01002,AG104545,ZONE114,RA257,Advisor


# Create Labels

In [11]:
def create_labels(df, train_snap_date, label_last_date):
    snap_df = df[(df.policy_snapshot_as_on == train_snap_date) 
                    & (df.policy_status == 'INFORCE')][['client_code', 
                                                        'policy_code', 
                                                        'product_name']]
    cust_df = df[(df.policy_snapshot_as_on == train_snap_date)][[
        'client_code']].drop_duplicates().set_index('client_code')

    label_df = df[(df.policy_snapshot_as_on > train_snap_date) 
                     & (df.policy_snapshot_as_on <= label_last_date) 
                     & (df.policy_status == 'INFORCE')][['client_code', 
                                                         'product_name', 
                                                         'policy_code', 
                                                         'policy_snapshot_as_on']]
    
    join_df = label_df.merge(snap_df[['policy_code', 'client_code']], 
                             on=['policy_code', 'client_code'], how='left', 
                             indicator=True)
    join_df['is_prev'] = join_df._merge.map({'left_only':0, 'both':1}).astype(int)
    
    #customers who can be c
    cross_sell_eligible = join_df.groupby(['client_code', 
                                           'policy_snapshot_as_on']).is_prev.sum()
    
    cross_sell_eligible.name = 'cross_sell_eligible'
    cross_sell_eligible = cross_sell_eligible[cross_sell_eligible > 0].reset_index()
    
    joined_cs_eligible = join_df.merge(cross_sell_eligible, 
                                       on=['client_code', 
                                           'policy_snapshot_as_on'], 
                                       how='left').dropna()
    
    labels = joined_cs_eligible[joined_cs_eligible.is_prev == 0][[
        'client_code', 'product_name']].drop_duplicates()
    labels['value'] = 1
    pivot_labels = labels.pivot_table(values='value', index='client_code', 
                                      aggfunc='sum',
                                      columns='product_name').fillna(0).astype(int)

    return cust_df.merge(pivot_labels, on='client_code', how='left').fillna(0)


# Create Client Dataset

In [12]:
def client_dataset(df, snap_date):
    df_snap = df[(df.policy_snapshot_as_on == snap_date)]
    
    df_snap.status = df_snap.status + '_AGENT'
    
    customer_specific_features = ['main_holder_gender', 'main_holder_dob',
                               'main_holder_smoker_flag', 'spouse_gender',
                               'spouse_dob', 'spouse_smoker_flag',
                               'child1_dob', 'child1_gender', 'child2_dob',
                               'child2_gender', 'child3_dob', 'child3_gender', 
                               'child4_dob', 'child4_gender', 'child5_dob', 
                               'child5_gender', 'main_holder_occupation', 
                               'client_code']
    
    df_cust = df_snap[customer_specific_features]

    dob_feat = ['main_holder_dob', 'spouse_dob', 'child1_dob', 'child2_dob',
           'child3_dob', 'child4_dob', 'child5_dob']

    age_feat = []
    for col in dob_feat:
        new_col = col[:-4] + '_age'
        df_cust[new_col] = pd.to_datetime(
            snap_date, format='%Y%m%d').year - df_cust[col].dt.year
        df_cust = df_cust.drop(columns=col)
        age_feat.append(new_col)

    df_cust_numeric = df_cust.groupby('client_code')[age_feat].median()

    df_cust_cat = df_cust.select_dtypes('object').groupby('client_code').first()

    df_cust_fin = df_cust_cat.join(df_cust_numeric)
    
    pivot_cols = ['policy_payment_mode', 'policy_status', 'payment_method', 
                  'status', 'gender']
    
    for col in pivot_cols:
        df_filt = df_snap[['client_code'] + [col]].copy()
        df_filt['value'] = 1
        df_pivot = df_filt.pivot_table(values='value', 
                                       index='client_code', aggfunc='sum', 
                                       columns=col).fillna(0).astype(int)
        df_cust_fin =df_cust_fin.join(
            df_pivot.divide(df_pivot.sum(axis=1), axis=0), how='left')
    
    #policy age
    df_policy_age = df_snap[['client_code', 'policy_snapshot_as_on', 'commencement_dt']]
    
    df_policy_age['policy_age'] = pd.cut(
        ((pd.to_datetime(df_snap.policy_snapshot_as_on, format='%Y%m%d') - df_snap.commencement_dt).dt.days/365),
        bins=[-5,1,5,10,100], labels=['policy_age_1_yr', 'policy_age_5_yr', 'policy_age_10_yr', 'policy_age_40_yr'])
    df_policy_age = df_policy_age.drop(columns=['policy_snapshot_as_on', 'commencement_dt'])
    print(df_policy_age.count())
    df_policy_age['value'] = 1
    df_pivot = df_policy_age.pivot_table(values='value', 
                                           index='client_code', aggfunc='sum', 
                                       columns='policy_age').fillna(0).astype(int)
    df_cust_fin = df_cust_fin.join(
        df_pivot.divide(df_pivot.sum(axis=1), axis=0), how='left')
    
    #next due date
    df_next_due = df_snap[['client_code', 'policy_snapshot_as_on', 'next_due_dt']]
    df_next_due['next_due_age'] = pd.cut(((pd.to_datetime(
        df_snap.policy_snapshot_as_on, format='%Y%m%d') - df_snap.next_due_dt).dt.days/365), 
                                         bins=[-100,-1,-0.2, 0, 0.2, 1, 100], 
                                         labels=['next_due_min_1_yr', 
                                                 'next_due_min_0.2_yr', 
                                                 'next_due_0_yr', 
                                                 'next_due_0.2_yr', 
                                                 'next_due_1_yr', 
                                                 'next_due_100_yr'])

    df_next_due = df_next_due.drop(columns=['policy_snapshot_as_on', 'next_due_dt'])
    df_next_due['value'] = 1
    df_pivot = df_next_due.pivot_table(values='value', 
                                           index='client_code', aggfunc='sum', 
                                           columns='next_due_age').fillna(0).astype(int)
    df_cust_fin = df_cust_fin.join(
        df_pivot.divide(df_pivot.sum(axis=1), axis=0), how='left')
    
    ## Rider information
    rider_sum_cols = ['rider1_sum_assuared', 'rider2_sum_assuared', 
                  'rider3_sum_assuared', 'rider4_sum_assuared', 
                  'rider5_sum_assuared', 'rider6_sum_assuared',
                  'rider7_sum_assuared', 'rider8_sum_assuared', 
                  'rider9_sum_assuared', 'rider10_sum_assuared']

    rider_prem_cols = ['rider1_prem', 'rider2_prem',
           'rider3_prem', 'rider4_prem', 'rider5_prem',
           'rider6_prem', 'rider7_prem', 'rider8_prem',
           'rider9_prem', 'rider10_prem']

    df_rider = df_snap[rider_sum_cols + rider_prem_cols + ['client_code']].copy()

    df_rider['rider_count'] = (df_rider[rider_prem_cols] > 0).sum(axis=1)

    df_rider['rider_prem'] = df_rider[rider_prem_cols].sum(axis=1)

    df_rider['rider_sum_assured'] = df_rider[rider_sum_cols].sum(axis=1)

    df_cust_fin = df_cust_fin.join(
        df_rider.groupby('client_code')['rider_count', 
                                        'rider_prem', 'rider_sum_assured'
                                       ].sum(), how='left')
    
    #Premium information
    df_prem = df_snap[['client_code', 'premium_value', 'total_sum_assuared', 
                       'policy_payment_mode']].copy()
    df_prem['monthly_premium'] = (df_prem.premium_value / 
                                  df_prem.policy_payment_mode.map({'M':1, 
                                                                   'Y':12, 
                                                                   'Q':3, 
                                                                   'H':6, 
                                                                   'S':float('inf')}))
    df_prem = df_prem.drop(columns='policy_payment_mode')
    df_cust_fin = df_cust_fin.join(df_prem.groupby('client_code').sum(), 
                                   how='left')

    return df_cust_fin

# Fill NA values

In [13]:
def fillna_vals(df_cust):
    cat_cols = ['main_holder_gender', 'main_holder_smoker_flag', 'spouse_gender',
       'spouse_smoker_flag', 'child1_gender', 'child2_gender', 'child3_gender',
       'child4_gender', 'child5_gender', 'main_holder_occupation']

    df_cust[cat_cols] = df_cust[cat_cols].fillna('unk')

    zero_fill_cols = ['spouse_age', 'child1_age', 'child2_age',
           'child3_age', 'child4_age', 'child5_age']
    df_cust[zero_fill_cols] = df_cust[zero_fill_cols].fillna(0)
    
    return df_cust

# Categorical Encoding

In [14]:
def cat_encoding(df_cust):
    cat_cols = ['main_holder_gender', 'main_holder_smoker_flag', 'spouse_gender',
           'spouse_smoker_flag', 'child1_gender', 'child2_gender', 'child3_gender',
           'child4_gender', 'child5_gender', 'main_holder_occupation']

    cat_enc = OrdinalEncoder(cols=cat_cols, verbose=False)

    cat_enc.fit(df_cust)

    df_cust = cat_enc.transform(df_cust)
    
    return df_cust

# Create Train Dataset

In [15]:
def create_train_set_rec(df, snap_date, last_label_date):
    df_labels = create_labels(df, snap_date, last_label_date)

    df_cust = client_dataset(df, snap_date)

    df_cust = fillna_vals(df_cust)
    
    df_cust = cat_encoding(df_cust)
    df_cust = df_cust.drop(columns=['child4_age', 
                           'child5_age', 
                           'child5_gender', 
                           'child4_gender'])

    df_cross_sell = df_labels    

    return df_cust.join(df_cross_sell)

In [16]:
def create_train_set(df, snap_date, last_label_date):
    df_labels = create_labels(df, snap_date, last_label_date)

    df_cust = client_dataset(df, snap_date)

    df_cust = fillna_vals(df_cust)
    
    df_cust = cat_encoding(df_cust)
    df_cust = df_cust.drop(columns=['child4_age', 
                           'child5_age', 
                           'child5_gender', 
                           'child4_gender'])

    df_cross_sell = (df_labels.sum(axis=1) > 0).astype(int)
    df_cross_sell.name = 'is_cross_sell'
    

    return df_cust.join(df_cross_sell)

In [17]:
def create_pred_set(df, snap_date):
    df_cust = client_dataset(df, snap_date)

    df_cust = fillna_vals(df_cust)
    
    df_cust = cat_encoding(df_cust)
    df_cust = df_cust.drop(columns=['child4_age', 
                           'child5_age', 
                           'child5_gender', 
                           'child4_gender'])
    
    return df_cust

# Cross sell prediction Model Training

## Train/test datasets

train/test selected so that there are no overlapping time regions

In [1005]:
policy_snaps = sorted(df.policy_snapshot_as_on.unique())

In [1034]:
df_train = pd.concat([create_train_set(df, policy_snaps[0], policy_snaps[6]),
                      create_train_set(df, policy_snaps[1], policy_snaps[7]),
                      create_train_set(df, policy_snaps[2], policy_snaps[8]),
                      create_train_set(df, policy_snaps[3], policy_snaps[9]),
                      create_train_set(df, policy_snaps[4], policy_snaps[10]),
                      create_train_set(df, policy_snaps[5], policy_snaps[11]),
                      create_train_set(df, policy_snaps[6], policy_snaps[12])
                     ])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

client_code    26616
policy_age     26616
dtype: int64
client_code    27091
policy_age     27091
dtype: int64
client_code    27515
policy_age     27515
dtype: int64
client_code    28032
policy_age     28032
dtype: int64
client_code    28501
policy_age     28501
dtype: int64
client_code    29169
policy_age     29169
dtype: int64
client_code    29841
policy_age     29841
dtype: int64


In [1035]:
df_train.columns

Index(['main_holder_gender', 'main_holder_smoker_flag', 'spouse_gender',
       'spouse_smoker_flag', 'child1_gender', 'child2_gender', 'child3_gender',
       'main_holder_occupation', 'main_holder_age', 'spouse_age', 'child1_age',
       'child2_age', 'child3_age', 'H', 'M', 'Q', 'S', 'Y', 'INFORCE',
       'LAPSED', 'TERMINATED', 'CASH', 'CHEQUE', 'INFORCED_AGENT',
       'SUSPENDED_AGENT', 'TERMINATED_AGENT', 'Female', 'Male',
       'policy_age_1_yr', 'policy_age_5_yr', 'policy_age_10_yr',
       'policy_age_40_yr', 'next_due_min_1_yr', 'next_due_min_0.2_yr',
       'next_due_0_yr', 'next_due_0.2_yr', 'next_due_1_yr', 'next_due_100_yr',
       'rider_count', 'rider_prem', 'rider_sum_assured', 'premium_value',
       'total_sum_assuared', 'monthly_premium', 'is_cross_sell'],
      dtype='object')

In [1036]:
df_train.is_cross_sell.value_counts(dropna=False)

0    159246
1      4259
Name: is_cross_sell, dtype: int64

In [1037]:
df_test = create_train_set(df, policy_snaps[12], policy_snaps[18])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


client_code    33490
policy_age     33490
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [1038]:
train_X = df_train.drop(columns='is_cross_sell')
train_y = df_train['is_cross_sell']

test_X = df_test.drop(columns='is_cross_sell')
test_y = df_test['is_cross_sell']

## Model Training

In [1039]:
model = LGBMClassifier(learning_rate=0.1, max_depth=6, min_child_samples=1000, n_estimators=300, num_leaves=3, random_state=2, class_weight='balanced')
model.fit(train_X, train_y)

pred_y = model.predict(test_X)
pred_y_prob = model.predict_proba(test_X)[:,1]

pred_y_train = model.predict(train_X)
pred_y_train_prob = model.predict_proba(train_X)[:,1]

## Model Evaluation

In [1040]:
print(confusion_matrix(test_y, pred_y))

[[20944  5902]
 [  211   356]]


In [1041]:
print(classification_report(train_y, pred_y_train))

              precision    recall  f1-score   support

           0       0.99      0.80      0.89    159246
           1       0.09      0.70      0.15      4259

    accuracy                           0.80    163505
   macro avg       0.54      0.75      0.52    163505
weighted avg       0.97      0.80      0.87    163505



In [1042]:
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       0.99      0.78      0.87     26846
           1       0.06      0.63      0.10       567

    accuracy                           0.78     27413
   macro avg       0.52      0.70      0.49     27413
weighted avg       0.97      0.78      0.86     27413



In [1043]:
print(roc_auc_score(train_y, pred_y_train_prob))

0.8497478197303218


In [1044]:
print(roc_auc_score(test_y, pred_y_prob))

0.796146280023456


# Recommendation prediction model training

In [1045]:
policy_snaps = sorted(df.policy_snapshot_as_on.unique())

In [1130]:
df_train = pd.concat([create_train_set_rec(df, policy_snaps[0], policy_snaps[6]),
                      create_train_set_rec(df, policy_snaps[1], policy_snaps[7]),
                      create_train_set_rec(df, policy_snaps[2], policy_snaps[8]),
                      create_train_set_rec(df, policy_snaps[3], policy_snaps[9]),
                      create_train_set_rec(df, policy_snaps[4], policy_snaps[10]),
                      create_train_set_rec(df, policy_snaps[5], policy_snaps[11]),
                      create_train_set_rec(df, policy_snaps[6], policy_snaps[12])
                     ])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

client_code    26616
policy_age     26616
dtype: int64
client_code    27091
policy_age     27091
dtype: int64
client_code    27515
policy_age     27515
dtype: int64
client_code    28032
policy_age     28032
dtype: int64
client_code    28501
policy_age     28501
dtype: int64
client_code    29169
policy_age     29169
dtype: int64
client_code    29841
policy_age     29841
dtype: int64


In [1131]:
df_train.columns

Index(['main_holder_gender', 'main_holder_smoker_flag', 'spouse_gender',
       'spouse_smoker_flag', 'child1_gender', 'child2_gender', 'child3_gender',
       'main_holder_occupation', 'main_holder_age', 'spouse_age', 'child1_age',
       'child2_age', 'child3_age', 'H', 'M', 'Q', 'S', 'Y', 'INFORCE',
       'LAPSED', 'TERMINATED', 'CASH', 'CHEQUE', 'INFORCED_AGENT',
       'SUSPENDED_AGENT', 'TERMINATED_AGENT', 'Female', 'Male',
       'policy_age_1_yr', 'policy_age_5_yr', 'policy_age_10_yr',
       'policy_age_40_yr', 'next_due_min_1_yr', 'next_due_min_0.2_yr',
       'next_due_0_yr', 'next_due_0.2_yr', 'next_due_1_yr', 'next_due_100_yr',
       'rider_count', 'rider_prem', 'rider_sum_assured', 'premium_value',
       'total_sum_assuared', 'monthly_premium', 'EDUCATION', 'HEALTH',
       'INVESTMENT', 'PROTECTION', 'RETIREMENT'],
      dtype='object')

In [1132]:
df_test = create_train_set_rec(df, policy_snaps[12], policy_snaps[18])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


client_code    33490
policy_age     33490
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [1133]:
label_cols = ['EDUCATION', 'HEALTH',
       'INVESTMENT', 'PROTECTION', 'RETIREMENT']

In [1134]:
train_X = df_train.drop(columns=label_cols)
train_y = df_train[label_cols]

test_X = df_test.drop(columns=label_cols)
test_y = df_test[label_cols]

## Model Training

In [1135]:
model_rec = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced')
# model = LGBMClassifier(learning_rate=0.1, max_depth=6, min_child_samples=1000, n_estimators=300, num_leaves=3, random_state=2, class_weight='balanced')
model_rec.fit(train_X, train_y)

pred_y = model_rec.predict(test_X)
pred_y_prob = model_rec.predict_proba(test_X)

pred_y_train = model_rec.predict(train_X)
pred_y_train_prob = model_rec.predict_proba(train_X)

## Model Evaluation

In [1136]:
pred_class = pd.DataFrame(pred_y_prob[0][:,1])
pred_class['HEALTH'] = pred_y_prob[1][:,1]
pred_class['INVESTMENT'] = pred_y_prob[2][:,1]
pred_class['PROTECTION'] = pred_y_prob[3][:,1]
pred_class['RETIREMENT'] = pred_y_prob[4][:,1]

In [1137]:
pred_class.columns = ['EDUCATION', 'HEALTH', 'INVESTMENT', 'PROTECTION', 'RETIREMENT']

In [1138]:
pred_class

Unnamed: 0,EDUCATION,HEALTH,INVESTMENT,PROTECTION,RETIREMENT
0,0.056909,0.235426,0.216432,0.002034,0.089868
1,0.032200,0.207885,0.178971,0.001970,0.088257
2,0.023886,0.397527,0.228580,0.024635,0.217437
3,0.021387,0.266877,0.437143,0.012756,0.202285
4,0.017716,0.287291,0.406415,0.020229,0.226465
...,...,...,...,...,...
27408,0.082060,0.316706,0.260323,0.002426,0.104207
27409,0.050943,0.281560,0.173310,0.008082,0.192402
27410,0.059480,0.229513,0.174828,0.010439,0.074415
27411,0.048483,0.267479,0.173610,0.009736,0.142501


In [1139]:
accuracy_score(test_y, pred_y)

0.9691022507569401

# Submission set

In [968]:
df_sub = create_pred_set(df, policy_snaps[18])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

In [969]:
df_sub = df_pred.join(df_sub, how='inner')

In [970]:
pred_sub = model.predict_proba(df_sub)[:,1]

In [971]:
df_sub['probability_of_cross_sell'] = pred_sub

In [972]:
df_sub = df_sub[['probability_of_cross_sell']]

In [974]:
df_sub.index.name = 'map_client_cd'

In [1169]:
df_sub

Unnamed: 0_level_0,probability_of_cross_sell
map_client_cd,Unnamed: 1_level_1
C100003,0.596445
C100004,0.679340
C100009,0.531178
C100014,0.015594
C100015,0.533946
...,...
C154448,0.432619
C154449,0.422577
C154450,0.521595
C154454,0.677655


In [1160]:
df_sub_rec = create_pred_set(df, policy_snaps[18])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


client_code    35821
policy_age     35821
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [1161]:
df_sub_rec = df_pred.join(df_sub_rec, how='inner')

In [1162]:
sub_y_prob_rec = model_rec.predict_proba(df_sub_rec)

In [1163]:
pred_class = pd.DataFrame(sub_y_prob_rec[0][:,1])
pred_class['HEALTH'] = sub_y_prob_rec[1][:,1]
pred_class['INVESTMENT'] = sub_y_prob_rec[2][:,1]
pred_class['PROTECTION'] = sub_y_prob_rec[3][:,1]
pred_class['RETIREMENT'] = sub_y_prob_rec[4][:,1]

In [1164]:
pred_class.columns = ['EDUCATION', 'HEALTH', 'INVESTMENT', 'PROTECTION', 'RETIREMENT']

In [1165]:
df_sub_rec['recommendation'] = pred_class.idxmax(axis=1).values

In [1171]:
df_sub_rec = df_sub_rec[['recommendation']]

In [1172]:
df_sub_final = df_sub.join(df_sub_rec)

In [1176]:
df_sub_final

Unnamed: 0_level_0,probability_of_cross_sell,recommendation
map_client_cd,Unnamed: 1_level_1,Unnamed: 2_level_1
C100003,0.596445,INVESTMENT
C100004,0.679340,INVESTMENT
C100009,0.531178,INVESTMENT
C100014,0.015594,HEALTH
C100015,0.533946,INVESTMENT
...,...,...
C154448,0.432619,HEALTH
C154449,0.422577,INVESTMENT
C154450,0.521595,INVESTMENT
C154454,0.677655,PROTECTION


In [1177]:
df_sub_final.to_csv('../data/randomforestrangers_case_study_submission.csv')

# Model Explanations

In [864]:
pd.Series(model.feature_importances_, index=train_X.columns).sort_values(ascending=False)

premium_value              285
main_holder_age            283
monthly_premium            275
rider_prem                 261
total_sum_assuared         261
main_holder_occupation     256
rider_sum_assured          241
next_due_100_yr             94
LAPSED                      94
spouse_age                  88
INFORCE                     75
policy_age_1_yr             65
child1_age                  47
rider_count                 46
child2_age                  39
Female                      39
next_due_1_yr               38
policy_age_10_yr            37
main_holder_gender          35
next_due_0.2_yr             35
SUSPENDED_AGENT             30
policy_age_5_yr             28
Q                           27
INFORCED_AGENT              27
child1_gender               26
M                           26
policy_age_40_yr            26
next_due_min_0.2_yr         22
spouse_gender               21
S                           19
TERMINATED_AGENT            18
main_holder_smoker_flag     17
child2_g

In [None]:
#Shap Values
import shap
shap_train = train_X.sample(2000)
shap_values = shap.TreeExplainer(model).shap_values(shap_train)
shap.summary_plot(shap_values, shap_train, plot_type="bar")