In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
from typing import Dict, Text

In [2]:
# Load the two worksheets of the insurance workbook into DataFrames:
#   df1 <- 'CustomerTransactions' sheet
#   df2 <- 'PolicyInfo' sheet
# ExcelFile is used as a context manager so the underlying file handle is
# closed as soon as both sheets have been parsed, instead of staying open
# for the lifetime of the kernel.
with pd.ExcelFile('InsuranceTransactionsData.xlsx') as xls:
    df1 = pd.read_excel(xls, sheet_name='CustomerTransactions')
    df2 = pd.read_excel(xls, sheet_name='PolicyInfo')

In [3]:
# Preview the first rows of the 'CustomerTransactions' sheet (df1).
df1.head()

Unnamed: 0,LeadId,Age,Gender,Income,Residence,Diabetes,HeartAilment,Hypertension,OtherIllness,SurgicalProcedure,COVID19,WhenRecoveredFromCovid,PolicyName
0,176,43,Male,300000,Metro,0,0,1,0,0,0,0,Policy_1
1,58,32,Female,300000,Non-Metro,0,0,1,0,0,0,0,Policy_11
2,17,29,Male,1100000,Metro,0,0,0,1,1,1,4,Policy_3
3,215,50,Male,300000,Metro,0,0,1,0,0,1,1,Policy_1
4,34,25,Male,1400000,Non-Metro,0,0,0,0,1,0,0,Policy_4


In [4]:
# Preview the first rows of the 'PolicyInfo' sheet (df2).
df2.head()

Unnamed: 0,PolicyName,Name,Insurer,Cover(lac),Premium(annual),Pre-Existing Waiting Period,ClaimSettlementRatio,Maternity,OPD Benefits
0,Policy_1,Activ Assure – Diamond,AdityaBirla,5,5197,4,70.81,No,No
1,Policy_2,Reassure,NivaBupa,5,8102,2,90.7,No,Yes
2,Policy_3,Health Guard Gold,BajajAllianz,10,9083,3,92.24,No,No
3,Policy_4,Optima Secure,HDFC Ergo,20,15340,3,94.7,No,Yes
4,Policy_5,Young Star Silver,StarHealth,20,9427,1,85.47,No,No


In [5]:
# LeadId is read from the workbook as a number, but it is later fed to a
# StringLookup layer, so store it with pandas' string dtype.
lead_id_as_text = df1["LeadId"].astype(pd.StringDtype())
df1["LeadId"] = lead_id_as_text

# Show column dtypes and non-null counts to confirm the cast.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   LeadId                  250 non-null    string
 1   Age                     250 non-null    int64 
 2   Gender                  250 non-null    object
 3   Income                  250 non-null    int64 
 4   Residence               250 non-null    object
 5   Diabetes                250 non-null    int64 
 6   HeartAilment            250 non-null    int64 
 7   Hypertension            250 non-null    int64 
 8   OtherIllness            250 non-null    int64 
 9   SurgicalProcedure       250 non-null    int64 
 10  COVID19                 250 non-null    int64 
 11  WhenRecoveredFromCovid  250 non-null    int64 
 12  PolicyName              250 non-null    object
dtypes: int64(9), object(3), string(1)
memory usage: 25.5+ KB


In [6]:
# Show column dtypes and non-null counts for the 'PolicyInfo' sheet (df2).
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   PolicyName                   16 non-null     object 
 1   Name                         16 non-null     object 
 2   Insurer                      16 non-null     object 
 3   Cover(lac)                   16 non-null     int64  
 4   Premium(annual)              16 non-null     int64  
 5   Pre-Existing Waiting Period  16 non-null     int64  
 6   ClaimSettlementRatio         16 non-null     float64
 7   Maternity                    16 non-null     object 
 8   OPD Benefits                 16 non-null     object 
dtypes: float64(1), int64(3), object(5)
memory usage: 1.2+ KB


In [7]:
def _to_interaction(row):
    """Reduce a transaction row to the lead id and policy name it pairs."""
    return {"leadId": row["LeadId"], "policy": row["PolicyName"]}


def _to_policy_name(row):
    """Reduce a policy row to its PolicyName identifier."""
    return row["PolicyName"]


# Slice each DataFrame column-wise into a tf.data pipeline, then keep only
# the fields the retrieval model consumes.
transaction = tf.data.Dataset.from_tensor_slices(dict(df1)).map(_to_interaction)
policy = tf.data.Dataset.from_tensor_slices(dict(df2)).map(_to_policy_name)

In [9]:
# Build one string -> integer index lookup per identifier. mask_token=None
# means no index is reserved for a padding/mask value.

# Lead ids are taken from the transaction pipeline.
lead_ids = transaction.map(lambda row: row["leadId"])
leadId_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
leadId_vocabulary.adapt(lead_ids)

# The policy pipeline already yields bare policy names.
policy_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
policy_vocabulary.adapt(policy)

In [19]:
class InsurancePredModel(tfrs.Model):
  """Retrieval model that pairs lead embeddings with policy embeddings.

  Deriving from tfrs.Model cuts down on boilerplate; under the hood this is
  still a plain Keras Model.

  Args:
    lead_model: Keras model applied to the "leadId" feature.
    policy_model: Keras model applied to the "policy" feature.
    task: TFRS retrieval task called with both embeddings to produce the loss.
  """

  def __init__(
      self,
      lead_model: tf.keras.Model,
      policy_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    super().__init__()

    # Lead (query side) and policy (candidate side) representations.
    self.lead_model = lead_model
    self.policy_model = policy_model

    # Retrieval task invoked from compute_loss.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task's loss for one batch of features.

    Args:
      features: Mapping that provides the "leadId" and "policy" entries.
      training: Accepted for interface compatibility; not used here.

    Returns:
      The value returned by the task for the two sets of embeddings.
    """
    lead_ids, policy_names = features["leadId"], features["policy"]

    return self.task(self.lead_model(lead_ids), self.policy_model(policy_names))

In [20]:
# Define user and movie models.
# Width of the embedding vectors; both towers must use the same width so
# lead and policy embeddings can be compared.
EMBEDDING_DIM = 64

# Define the lead (query) and policy (candidate) models. Each tower maps a
# raw string id to an integer index and then to a learned embedding.
# vocabulary_size() replaces the deprecated vocab_size(); it counts the
# out-of-vocabulary slot as well, so every index the lookup can emit has a
# row in the embedding table.
lead_model = tf.keras.Sequential([
    leadId_vocabulary,
    tf.keras.layers.Embedding(leadId_vocabulary.vocabulary_size(), EMBEDDING_DIM)
])
policy_model = tf.keras.Sequential([
    policy_vocabulary,
    tf.keras.layers.Embedding(policy_vocabulary.vocabulary_size(), EMBEDDING_DIM)
])

# Define your objectives: a retrieval task whose top-K metrics are evaluated
# against the embeddings of every policy in the catalogue.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    policy.batch(128).map(policy_model)
  )
)



In [21]:
# Create a retrieval model.
# Create a retrieval model.
model = InsurancePredModel(lead_model, policy_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for 5 epochs.
model.fit(transaction.batch(64), epochs=5)

# Use brute-force search to set up retrieval using the trained representations.
# The index embeds queries with the lead model and is populated with
# (policy name, policy embedding) pairs.
index = tfrs.layers.factorized_top_k.BruteForce(model.lead_model)
index.index_from_dataset(
    policy.batch(100).map(
        lambda policy_name: (policy_name, model.policy_model(policy_name))))

# Get some recommendations. The lead id is held in one variable so the query
# and the printed label cannot drift apart.
query_lead_id = "138"
_, name = index(np.array([query_lead_id]))
print(f"Top 3 recommendations for lead {query_lead_id}: {name[0, :3]}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Top 3 recommendations for lead 42: [b'Policy_14' b'Policy_9' b'Policy_6']


In [22]:
# Query the brute-force index again for lead "138" and show its first three
# recommended policy names.
_, name = index(np.array(["138"]))
top_three = name[0, :3]
print(f"Top 3 recommendations for lead 138: {top_three}")

Top 3 recommendations for lead 138: [b'Policy_12' b'Policy_6' b'Policy_11']
