In [87]:
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import math
from sklearn.metrics import confusion_matrix 
from scipy import stats
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

import statsmodels.api as sm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [88]:
# read data
claim_df =  pd.read_csv("./data/ClaimLevel.csv")
policy_df = pd.read_csv("./data/PolicyLevel.csv")

"""
                        # Data Preparation & Feature generation#
Generate a single record per policy that retains max information possible
"""
claim_df['CustomerPaidAmount'] = claim_df['ClaimedAmount'] - claim_df['PaidAmount']
claim_df_grouped = claim_df.groupby(['PolicyId'])
claim_per_policy_df = claim_df_grouped[['ClaimedAmount', 'PaidAmount', 'CustomerPaidAmount']].sum()
# claim_per_policy_df['ClaimsCount'] = claim_df_grouped.size()
# claim_per_policy_df['AvgClaimAmount'] = claim_df_grouped[['ClaimedAmount']].mean()
# claim_per_policy_df['AvgPaidAmount'] = claim_df_grouped[['PaidAmount']].mean()
# claim_per_policy_df['AvgCustomerPaidAmount'] = claim_df_grouped[['CustomerPaidAmount']].mean()
# claim_per_policy_df['stdClaimAmount'] = claim_df_grouped[['ClaimedAmount']].std()
# claim_per_policy_df['stdPaidAmount'] = claim_df_grouped[['PaidAmount']].std()
# claim_per_policy_df['stdCustomerPaidAmount'] = claim_df_grouped[['CustomerPaidAmount']].std()
# claim_per_policy_df['VarianceClaimAmount'] = claim_df_grouped[['ClaimedAmount']].var()
# claim_per_policy_df['VariancePaidAmount'] = claim_df_grouped[['PaidAmount']].var()
# claim_per_policy_df['VarianceCustomerPaidAmount'] = claim_df_grouped[['CustomerPaidAmount']].var()
# claim_per_policy_df['maxClaimAmount'] = claim_df_grouped[['ClaimedAmount']].max()
# claim_per_policy_df['minClaimAmount'] = claim_df_grouped[['ClaimedAmount']].min()
# claim_per_policy_df['medianClaimAmount'] = claim_df_grouped[['ClaimedAmount']].median()
# claim_per_policy_df['maxCustomerPaidAmount'] = claim_df_grouped[['ClaimedAmount']].max()
# claim_per_policy_df['minCustomerPaidAmount'] = claim_df_grouped[['ClaimedAmount']].min()
# claim_per_policy_df['medianCustomerPaidAmount'] = claim_df_grouped[['ClaimedAmount']].median()

policy_df = policy_df.join(claim_per_policy_df, on=['PolicyId'])


# Last seen Date: CancelDate if not null else '2016-12-31' (Last date in the data set)
policy_df['LastSeenDate'] = policy_df['CancelDate']
policy_df['LastSeenDate'].fillna(pd.to_datetime('2016-12-31'), inplace=True)

# Date conversions
policy_df['EnrollDate'] = pd.to_datetime(policy_df['EnrollDate'])
policy_df['CancelDate'] = pd.to_datetime(policy_df['CancelDate'])
policy_df['LastSeenDate'] = pd.to_datetime(policy_df['LastSeenDate'])

# Length of Stay "LOS"
policy_df['LOS'] = ((policy_df['LastSeenDate'] - policy_df['EnrollDate'])/np.timedelta64(1, 'M'))
policy_df = policy_df.round(decimals=2)

# Insurance Status: Active means 1 else 0
policy_df['InsuranceStatus'] = policy_df['CancelDate'].apply(lambda x: 0 if pd.notnull(x) else 1)

# drop null values
policy_df.dropna(axis='index', subset=['MonthlyPremium'], inplace=True)

# If no claims were made then all amounts are equal to 0 instead of NULL
for column in policy_df.columns:
    policy_df[column].fillna(0, inplace=True)

# display(claim_df.head())
display(policy_df.head(6))

Unnamed: 0,PolicyId,EnrollDate,CancelDate,MonthlyPremium,ClaimedAmount,PaidAmount,CustomerPaidAmount,LastSeenDate,LOS,InsuranceStatus
0,92597,2010-12-07,1970-01-01,34.54,0.0,0.0,0.0,2016-12-31,72.81,1
1,92808,2010-11-09,1970-01-01,18.54,0.0,0.0,0.0,2016-12-31,73.73,1
2,93090,2010-11-11,1970-01-01,23.53,762.49,329.46,433.03,2016-12-31,73.66,1
3,93122,2010-11-11,1970-01-01,44.02,0.0,0.0,0.0,2016-12-31,73.66,1
4,93133,2010-11-11,1970-01-01,32.46,0.0,0.0,0.0,2016-12-31,73.66,1
5,93258,2010-11-11,2016-05-24,21.66,33.99,30.6,3.39,2016-05-24,66.4,0


In [89]:
"""
                   # Data Description #
Churn --> 'C' 
Non-Churn --> 'NC'
"""
total_policies = policy_df.shape[0]
curr_NC = policy_df[policy_df['InsuranceStatus']==1].shape[0]/float(total_policies)
curr_C = policy_df[policy_df['InsuranceStatus']==0].shape[0]/float(total_policies)
print("Total polices", total_policies)
print("NC:C = ",curr_NC, ":", curr_C)

Total polices 99995
NC:C =  0.8678433921696085 : 0.13215660783039151


In [90]:
churn_data = policy_df[policy_df['InsuranceStatus']==0]
non_churn_data = policy_df[policy_df['InsuranceStatus']==1]
train_data = pd.concat([churn_data, non_churn_data.sample(n=len(churn_data))], axis=1)
test_data = policy_df.loc[np.setdiff1d(policy_df.index.values, churn_data.index.values),:]
len(train_data)+len(test_data)
# len(policy_df)

113210

In [91]:
select_columns = list(
    set(policy_df.columns) - set(['PolicyId', 'EnrollDate', 'CancelDate', 'InsuranceStatus', 'LastSeenDate',
#                                   'LOS', 'ClaimedAmount', 'PaidAmount', 'CustomerPaidAmount' , 
#                                  'AvgClaimAmount', 'AvgPaidAmount', 'stdClaimAmount', 'stdPaidAmount'
#                                   'minClaimAmount', 'maxClaimAmount', 'medianClaimAmount'
                                 ]))

train_data = sample_train_data(churn_percent=0.5)
X_train = train_data[select_columns]
y_train = train_data[['InsuranceStatus']]

# Prediction for data in Jan will for only people who are still enrolled and their length of stay is +1 month
X_test = test_data[select_columns]
# X_test['LOS'] += 1
y_test = test_data[['InsuranceStatus']]

In [92]:
"""
                                        # RF Model #
"""
from sklearn.ensemble import RandomForestClassifier as RF

X_train, X_test = scaler.fit_transform(X_train), scaler.fit_transform(X_test)
model = RF(n_estimators=10, class_weight={0:1,1:1})
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
print("Test accuracy",model.score(X_test, y_test))
print("tn, fp, fn, tp ",confusion_matrix(y_test, y_predicted).ravel())

Test accuracy 0.444503341784
tn, fp, fn, tp  [    0     0 48206 38574]


In [93]:
"""
                                        # Logistic Model #
"""
from sklearn.linear_model import LogisticRegression

# point_weights = dataset[0].InsuranceStatus.apply(lambda x: weight if x == 0 else 1)
model = LogisticRegression(penalty='l1',)
model.fit(X_train, y_train, )#point_weights)
y_predicted = model.predict(X_test)
# print("Weight on disenrolled polcies", weight)
print("Test Accuracy",model.score(X_test, y_test))
print("tn, fp, fn, tp ",confusion_matrix(y_test, y_predicted).ravel())
# X2 = sm.add_constant(X_train)
# est = sm.Logit(y_train, X2)
# est2 = est.fit()
# print(est2.summary())

Test Accuracy 0.401670891911
tn, fp, fn, tp  [    0     0 51923 34857]


In [94]:
"""
                                        # neural nets Model #
"""
from sklearn.neural_network import MLPClassifier

for dataset in datasets:
    X_train = dataset[0][select_columns]
    X_test = dataset[1][select_columns]
    y_train = dataset[0][['InsuranceStatus']]
    y_test = dataset[1][['InsuranceStatus']]
    X_train, X_test = scaler.fit_transform(X_train), scaler.fit_transform(X_test)
    model = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(16, 32, 24, 8), random_state=1, 
                         activation='relu')
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
#         print("\t Train",model.score(X_train, y_train))
    print("Test",model.score(X_test, y_test))
#     print(confusion_matrix(y_test, y_predicted))

NameError: name 'datasets' is not defined

In [99]:
# len(train_data[train_data['InsuranceStatus']==1])/len(train_data)
train_data = sample_train_data(churn_percent=0.5)
X_train = train_data[select_columns]
y_train = train_data[['InsuranceStatus']]

test_data['LOS'] = test_data['LOS'] + 1
X_test = test_data[select_columns]
y_test = test_data[['InsuranceStatus']]

weight = 0.75
point_weights = train_data.InsuranceStatus.apply(lambda x: weight if x == 0 else 1)
model = LogisticRegression(penalty='l1',)
model.fit(X_train, y_train, point_weights)
y_predicted = model.predict(X_test)
print("Weight on disenrolled polcies", weight)
print("Test Accuracy",model.score(X_test, y_test))
print("tn, fp, fn, tp ",confusion_matrix(y_test, y_predicted).ravel())
# X2 = sm.add_constant(X_train)

Weight on disenrolled polcies 0.75
Test Accuracy 0.866178843051
tn, fp, fn, tp  [    0     0 11613 75167]


Index(['PolicyId', 'EnrollDate', 'CancelDate', 'MonthlyPremium',
       'ClaimedAmount', 'PaidAmount', 'CustomerPaidAmount', 'LastSeenDate',
       'LOS', 'InsuranceStatus'],
      dtype='object')