# Defining Scope
## Hybrid Data Generation
1. Rules and stats-based generation
    - Single wide table with Customer/policies/payment etc
2. DataLLM - MostlyAI augmentation
    - Augmentation with prompts
    - Required API KEY, [MostlyAI](https://data.mostly.ai/docs/routes#authentication), and save it to .env as
3. Manually Clean Up
    - Date casting
    - Issue/Termination Date
    - Fraction(termination Date) to None

# Library

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
from deltalake.writer import write_deltalake

import math

In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()
DataLLM_API = os.getenv("DataLLM")

## Rules and Stats based generation
- With Panda and numpy
- Potentially Augment also with Faker, but that would be purely synthetic PII data that has no relevant to statistics and ML.
- Encoded also assumptions and characteristics of the customers

In [3]:
# Generate initial attributes using rule-based approach
def generate_initial_data(n):
    data = {
        "CustomerID": range(1, n + 1),
        "Age": [generate_age() for _ in range(n)],
        "Gender": [generate_gender() for _ in range(n)],
        "Income": [generate_income(generate_age()) for _ in range(n)],
        "Marital_Status": [generate_marital_status() for _ in range(n)],
        "Dependents": [],
        "Claims": [generate_health_conditions() for _ in range(n)],
        "Policy": [],
        "Premium_Amount": [],
        "Insured_Amount": [],
        "Payment_Timeliness": [generate_payment_timeliness() for _ in range(n)],
        "Payment_Frequency": [generate_payment_frequency() for _ in range(n)],
        "Payment_Method": [generate_payment_method() for _ in range(n)],
    }

    for i in range(n):
        marital_status = data["Marital_Status"][i]
        dependents = generate_dependents(marital_status)
        data["Dependents"].append(dependents)

        age = data["Age"][i]
        income = data["Income"][i]
        policy_type = generate_policy_type(age, income)
        data["Policy"].append(policy_type)

        policy_premium = generate_policy_premium(age, policy_type)
        data["Premium_Amount"].append(policy_premium)

        policy_insured = generate_policy_insured(age, policy_type, policy_premium)
        data["Insured_Amount"].append(policy_insured)

    return pd.DataFrame(data)


### Helper
def round_to_50k(x):
    return math.floor(x / 50000) * 50000


### Generator
def generate_age():
    return int(np.clip(np.random.normal(40, 12), 20, 65))


def generate_gender():
    return np.random.choice(["Male", "Female"], p=[0.45, 0.55])


# Normal distributed income
def generate_income(age):
    if age < 30:
        return int(np.clip(np.random.normal(60000, 15000), 15000, 70000))
    elif age < 40:
        return int(np.clip(np.random.normal(70000, 25000), 20000, 11000))
    elif age < 50:
        return int(np.clip(np.random.normal(80000, 25000), 60000, 120000))
    else:
        return int(np.clip(np.random.normal(100000, 35000), 70000, 150000))


def generate_marital_status():
    return np.random.choice(["Single", "Married", "Divorced"], p=[0.4, 0.5, 0.1])


def generate_dependents(marital_status):
    if marital_status == "Single":
        return 0
    elif marital_status == "Married":
        return np.random.choice(range(0, 5), p=[0.3, 0.3, 0.2, 0.15, 0.05])
    else:
        return np.random.choice(range(0, 2), p=[0.7, 0.3])


def generate_health_conditions():
    return np.random.choice(["None", "Major", "Minor"], p=[0.9, 0.03, 0.07])


# Younger customers (age 20-40) predominantly choose affordable policies like Term Life.
# Older customers (age 50-65) tend to select Investment-linked or Whole Life policies for better coverage and benefits.
# Income and Policy Amount:
def generate_policy_type(age, income):
    if age < 30:
        return np.random.choice(
            ["Term Life", "Medical", "Whole Life", "Critical Illness"],
            p=[0.25, 0.5, 0.05, 0.2],
        )
    if age < 40 & income > 600000:
        return np.random.choice(
            ["Term Life", "Medical", "Whole Life", "Critical Illness"],
            p=[0.3, 0.4, 0.1, 0.2],
        )
    elif income > 100000:
        return np.random.choice(
            ["Investment", "Whole Life", "Endowment"], p=[0.3, 0.4, 0.3]
        )
    else:
        return np.random.choice(
            ["Critical Illness", "Medical", "Endowment"], p=[0.4, 0.4, 0.2]
        )


# Set premium and insured amount to that of typically market
def generate_policy_premium(age, policy_type):
    if policy_type in ["Term Life", "Medical", "Critical Illness"]:
        if age < 30:
            return int(np.random.uniform(1000, 2500))
        elif age < 50:
            return int(np.random.uniform(2000, 4000))
        elif age < 70:
            return int(np.random.uniform(3000, 5000))
    elif policy_type == "Endowment":
        return int(np.clip(np.random.normal(2000, 1000), 500, 10000))
    elif policy_type == "Investment":
        return int(np.random.uniform(1000, 15000))
    elif policy_type == "Whole Life":
        return int(np.random.uniform(1000, 5000))
    else:
        return int(np.random.uniform(2000, 4000))


def generate_policy_insured(age, policy_type, policy_premium):
    if policy_type in ["Term Life", "Critical Illness"]:
        if age < 30:
            return round_to_50k(policy_premium * 70)
        elif age < 50:
            return round_to_50k(policy_premium * 50)
        elif age < 70:
            return round_to_50k(policy_premium * 35)
    elif policy_type == "Endowment":
        return policy_premium * 10
    elif policy_type == "Whole Life":
        return int(np.random.uniform(300000, 1000000))
    else:
        return 0


def generate_payment_timeliness():
    return np.random.choice(["On-time", "Late Payment"], p=[0.8, 0.2])


def generate_payment_frequency():
    return np.random.choice(
        ["Monthly", "Quarterly", "Annually"], p=[0.4, 0.1, 0.5]
    )  # Could perhaps add Single here


def generate_payment_method():
    return np.random.choice(["Credit", "Cash", "Debit"], p=[0.7, 0.1, 0.2])

In [4]:
# Generate initial dataset
data = generate_initial_data(2000)

In [5]:
data.head(10)

Unnamed: 0,CustomerID,Age,Gender,Income,Marital_Status,Dependents,Claims,Policy,Premium_Amount,Insured_Amount,Payment_Timeliness,Payment_Frequency,Payment_Method
0,1,38,Female,78878,Divorced,0,,Medical,3158,0,On-time,Monthly,Cash
1,2,38,Male,60000,Single,0,,Medical,3982,0,On-time,Annually,Credit
2,3,36,Male,90616,Single,0,,Medical,2498,0,Late Payment,Monthly,Credit
3,4,59,Male,120000,Married,2,,Investment,4481,0,On-time,Monthly,Debit
4,5,44,Male,11000,Married,0,,Critical Illness,2785,100000,On-time,Annually,Credit
5,6,55,Female,56415,Married,1,,Endowment,2465,24650,On-time,Annually,Debit
6,7,24,Male,57029,Single,0,,Term Life,1990,100000,On-time,Annually,Credit
7,8,32,Female,75242,Single,0,,Endowment,500,5000,On-time,Annually,Cash
8,9,30,Female,70000,Single,0,,Medical,3872,0,On-time,Annually,Credit
9,10,44,Male,74436,Married,0,,Medical,2691,0,On-time,Annually,Credit


## Augment with DataLLM
- Enrich with bias/data from DataLLM
- This has some statistical distribution too (such as bias towards males/females or smoker/age ratio), although, this trend is not explicitly to the Insurance domain.

In [6]:
from datallm import DataLLM

# Initialize the DataLLM client
datallm = DataLLM(
    api_key=DataLLM_API,
    base_url="https://data.mostly.ai",
)

In [7]:
data["Recent_Payment_Months"] = datallm.enrich(
    data, prompt="Months since the last payment", dtype="integer"
)
data["Recent_Purchase_Months"] = datallm.enrich(
    data, prompt="Months since the last purchase", dtype="integer"
)
data["Policy_Purchase_Years"] = datallm.enrich(
    data, prompt="Years since the policy was purchased", dtype="integer"
)
data["Customer_Tenure_Years"] = datallm.enrich(
    data, prompt="Years the customer has been with the company", dtype="integer"
)
data["Smoker"] = datallm.enrich(
    data, prompt="Is the customer a smoker", dtype="boolean"
)
data["Policy_Issue_Date"] = datallm.enrich(
    data, prompt="Policy start date, beyond 1980 and before 2024", dtype="date"
)
data["Policy_Terminate_Date"] = datallm.enrich(
    data, prompt="Policy end date, bigger than policy_issue_date", dtype="date"
)

Output()

  warn("Process interrupted. Returning the completions generated so far.")


Output()

## Manual Casting and Clean up

In [None]:
# Casting
data["Policy_Terminate_Date"] = pd.to_datetime(data["Policy_Terminate_Date"])
data["Policy_Issue_Date"] = pd.to_datetime(data["Policy_Issue_Date"])

data["Policy_Issue_Date"] = [i.date() for i in data["Policy_Issue_Date"]]
data["Policy_Terminate_Date"] = [i.date() for i in data["Policy_Terminate_Date"]]

In [None]:
# Required for default 'positive' constraint in SVD to work
# the 2 dates need to be inequal.
data["Policy_Terminate_Date"] = [
    (x + relativedelta(days=1)) if x >= y else y
    for x, y in zip(data["Policy_Issue_Date"], data["Policy_Terminate_Date"])
]

In [None]:
# Set it so that half of the policies has no termination date
# Create Random Mask for filling and set it to NOne
rand_zero_one_mask = data.sample(1000).index
data.loc[rand_zero_one_mask, "Policy_Terminate_Date"] = None

In [None]:
data.head(5)

Unnamed: 0,CustomerID,Age,Gender,Income,Marital_Status,Dependents,Claims,Policy,Premium_Amount,Insured_Amount,Payment_Timeliness,Payment_Frequency,Payment_Method,Recent_Payment_Months,Recent_Purchase_Months,Policy_Purchase_Years,Customer_Tenure_Years,Smoker,Policy_Issue_Date,Policy_Terminate_Date
0,1,37,Male,11000,Divorced,0,,Medical,2861,0,On-time,Monthly,Debit,9,28,1,1,False,1991-01-01,
1,2,57,Male,70000,Married,2,,Medical,3148,0,On-time,Monthly,Credit,0,12,3,3,True,1996-05-01,
2,3,43,Female,44340,Married,2,Minor,Endowment,1455,14550,Late Payment,Annually,Credit,19,21,1,1,False,1995-03-03,2010-03-03
3,4,59,Female,11000,Single,0,,Endowment,1324,13240,On-time,Annually,Debit,13,11,15,14,False,1991-03-29,2005-03-29
4,5,37,Male,11000,Married,2,,Critical Illness,2857,100000,Late Payment,Annually,Cash,12,0,2,2,False,2014-04-01,2015-04-01


In [None]:
col_to_cast = [
    "Recent_Payment_Months",
    "Recent_Purchase_Months",
    "Policy_Purchase_Years",
    "Customer_Tenure_Years",
]

for i in col_to_cast:
    data[i] = data[i].astype(int)


data["Smoker"] = data["Smoker"].astype(bool)

# Output

In [None]:
data.to_parquet("data/L0/mock_rules_enriched_data.parquet", index=False)
# data.to_csv("data/L0/mock_rules_enriched_data.csv", index=False)