In [3]:
import os
# Display the kernel's current working directory: the relative '../dataset/...'
# paths used further down in this notebook are resolved against it.
os.getcwd()

'/home/nitesh/Desktop/Challenge_LLM_for_Insurance_Claim_Automation/code'

In [5]:
import random
import json
import nltk
from nltk.corpus import wordnet
import re
from faker import Faker
import pandas as pd

# Module-level Faker instance; InsuranceClaimGenerator uses it as the default
# value of its `faker_generator` constructor argument.
fake = Faker()

class InsuranceClaimGenerator:
    """Generate synthetic insurance-claim records from text templates.

    Templates are read from
    ``<template_dir>/<template_type>_templates_<claim_type>.txt``. In an
    incident template every ``[name]`` placeholder whose name is known is
    replaced by a randomly chosen value; unknown placeholders are left as-is.
    All generated text is finally passed through :meth:`paraphrase_sentence`.

    Args:
        faker_generator: Object providing ``random_int`` and
            ``date_time_between`` (a ``Faker`` instance). When omitted, the
            module-level ``fake`` is looked up at construction time.
        template_dir: Directory containing the template files. Defaults to
            ``DEFAULT_TEMPLATE_DIR``.
    """

    # Location of the template files when no `template_dir` is given.
    DEFAULT_TEMPLATE_DIR = "../dataset/templates"

    # Candidate values per placeholder. The key order matters: one
    # random.choice() is drawn per entry, in this order.
    _PLACEHOLDER_CHOICES = {
        'vehicle': ("car", "motorcycle", "truck", "bus", "bicycle", "scooter", "RV", "boat", "train", "airplane"),
        'action': ("collided", "crashed", "rammed into", "rear-ended", "swerved into", "skidded into"),
        'damage': ("minor damage", "significant damage", "extensive damage", "total loss",
                   "structural damage", "cosmetic damage", "mechanical damage", "electrical damage"),
        'location': ("on a busy street", "at an intersection", "near a parking lot", "in a residential area",
                     "on a highway", "in a construction zone", "in a tunnel", "in a parking garage"),
        'property_class': ("house", "apartment building", "office building", "shopping mall", "hotel",
                           "restaurant", "warehouse", "farm", "museum", "church", "theater", "school",
                           "hospital", "library"),
        'cause': ("fire", "flood", "vandalism", "earthquake", "storm", "explosion", "theft",
                  "structural failure", "power outage", "pipe burst"),
        'injury_type': ("fracture", "burn", "concussion", "laceration", "sprain", "bruise", "dislocation",
                        "strain", "puncture", "whiplash", "head injury", "spinal injury", "internal injury"),
        # "Vehicle A" ... "Vehicle Z"
        'vehicle_name': tuple(f"Vehicle {letter}" for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
        'medications': ("ibuprofen", "amoxicillin", "paracetamol", "aspirin", "metformin", "levothyroxine",
                        "simvastatin", "lisinopril", "omeprazole", "atorvastatin", "prednisone", "citalopram",
                        "metoprolol", "fluoxetine", "gabapentin", "tramadol", "pantoprazole", "sertraline",
                        "losartan", "azithromycin"),
    }

    _FIRST_NAMES = ("Emma", "Liam", "Olivia", "Noah", "Ava", "Isabella", "Sophia", "Mia", "Charlotte", "Amelia",
                    "Elijah", "James", "Benjamin", "Lucas", "Henry", "Alexander", "Sebastian", "Jack", "William",
                    "Daniel")
    _LAST_NAMES = ("Smith", "Johnson", "Williams", "Jones", "Brown", "Davis", "Miller", "Wilson", "Moore",
                   "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin", "Lee", "Walker",
                   "Hall", "Allen")

    def __init__(self, faker_generator=None, template_dir=None):
        # `fake` is resolved here rather than in the signature so the class can
        # be defined even if the module-level instance does not exist yet.
        self.fake = fake if faker_generator is None else faker_generator
        self.template_dir = self.DEFAULT_TEMPLATE_DIR if template_dir is None else template_dir
        # file path -> parsed templates; avoids re-reading the same file for
        # every generated claim.
        self._template_cache = {}

    def load_templates(self, template_type, claim_type):
        """Return the templates stored in the file for the given type pair.

        Args:
            template_type: Prefix of the file name (e.g. ``'incident'``).
            claim_type: Suffix of the file name (e.g. ``'auto'``).

        Returns:
            A new list of template strings, stripped of surrounding
            whitespace; empty sections are omitted.

        Raises:
            OSError: If the template file cannot be opened.
        """
        file_name = f"{self.template_dir}/{template_type}_templates_{claim_type}.txt"
        if file_name not in self._template_cache:
            self._template_cache[file_name] = self._parse_templates(file_name)
        # Hand out a copy so callers cannot mutate the cached list.
        return list(self._template_cache[file_name])

    @staticmethod
    def _parse_templates(file_name):
        """Split a template file into sections delimited by lines starting with '['."""
        sections = []
        current_lines = []
        with open(file_name, 'r', encoding='utf-8') as file:
            for line in file:
                # NOTE(review): any line beginning with '[' is treated as a
                # section header and discarded, so a template body line that
                # starts with a placeholder would also be swallowed - confirm
                # the template files never do that.
                if line.startswith('['):
                    sections.append(''.join(current_lines).strip())
                    current_lines = []
                else:
                    current_lines.append(line)
        # Flush the section after the last header; without this the final
        # template of every file is lost.
        sections.append(''.join(current_lines).strip())
        # Blank sections would otherwise be selectable as empty templates.
        return [section for section in sections if section]

    def generate_insurance_claim(self):
        """Build one synthetic claim.

        Returns:
            dict with the keys ``incident_class``, ``policy_num``,
            ``patient_name``, ``date``, ``policy_claim_no``,
            ``incident_description``, ``claim_amount``, ``claim_status`` and
            ``communication_history``.
        """
        incident_class = random.choice(["Auto", "Health", "Property"]).lower()
        policy_num = self.fake.random_int(100000, 999999)
        incident_date = str(self.fake.date_time_between(start_date="-1y", end_date="now").date())
        policy_claim_no = self.fake.random_int(1000, 9999)
        claim_amount = random.randint(1000, 100000)
        claim_status = random.choice(["Pending", "In Progress", "Approved", "Denied"])

        patient_name, incident_description = self.generate_incident_summary(incident_date, incident_class)
        communication_history = self.generate_communication_history(incident_description, incident_class)

        return {
            "incident_class": incident_class,
            "policy_num": policy_num,
            "patient_name": patient_name,
            "date": incident_date,
            "policy_claim_no": policy_claim_no,
            "incident_description": incident_description,
            "claim_amount": claim_amount,
            "claim_status": claim_status,
            "communication_history": communication_history,
        }

    def generate_incident_summary(self, date, incident_type):
        """Fill a random incident template for ``incident_type``.

        Args:
            date: Value substituted for the ``[date]`` placeholder.
            incident_type: Claim type; selects the template file and is
                substituted for ``[class_name]``.

        Returns:
            Tuple ``(patient_name, description)``.

        Raises:
            IndexError: If the template file contains no templates.
        """
        incident_template = random.choice(self.load_templates('incident', incident_type))

        replacements = {
            placeholder: random.choice(options)
            for placeholder, options in self._PLACEHOLDER_CHOICES.items()
        }
        replacements['date'] = date
        replacements['patient_name'] = self.generate_patient_name()
        replacements['class_name'] = incident_type

        def fill(match):
            # Unknown placeholders are kept verbatim. str() guards against a
            # non-string `date`, which re.sub would reject.
            return str(replacements.get(match.group(1), match.group(0)))

        filled_template = re.sub(r'\[(.*?)\]', fill, incident_template)

        # NOTE(review): the returned name is the raw value while the
        # description has been through paraphrase_sentence(); if lemmatization
        # alters the name, the two will not match - verify.
        return replacements['patient_name'], self.paraphrase_sentence(filled_template)

    def generate_patient_name(self):
        """Return a random ``"<first> <last>"`` name from the built-in lists."""
        first_name = random.choice(self._FIRST_NAMES)
        last_name = random.choice(self._LAST_NAMES)
        return f"{first_name} {last_name}"

    def generate_communication_history(self, incident_description, incident_type):
        """Fill a random communication template for ``incident_type``.

        Only the literal ``[incident_description]`` placeholder is replaced.
        The description passed in by generate_insurance_claim() has already
        been paraphrased once and is paraphrased again as part of the result.

        Raises:
            IndexError: If the template file contains no templates.
        """
        communication_template = random.choice(self.load_templates('communication', incident_type))
        communication_template = communication_template.replace('[incident_description]', incident_description)
        return self.paraphrase_sentence(communication_template)

    def paraphrase_sentence(self, sentence):
        """Tokenize ``sentence``, lemmatize every token and re-join with spaces.

        The tokens are joined with single spaces, so the original spacing of
        the text is not preserved.
        NOTE(review): presumably requires the NLTK tokenizer and WordNet data
        to be downloaded beforehand - confirm for the target environment.
        """
        lemmatizer = nltk.WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(sentence))

if __name__ == "__main__":
    # Number of synthetic claims to generate and where to store them.
    NUM_CLAIMS = 30000
    OUTPUT_CSV = '../dataset/insurance_claims_data/insurance_claims.csv'

    # Create the output folder first so a missing directory is not discovered
    # only after all claims have been generated.
    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

    # A single generator is reused for every claim instead of building a new
    # one per iteration.
    claim_generator = InsuranceClaimGenerator()

    insurance_claims = []
    for _ in range(NUM_CLAIMS):
        insurance_claim = claim_generator.generate_insurance_claim()
        insurance_claims.append(insurance_claim)

    # One row per claim, one column per claim field.
    df = pd.DataFrame(insurance_claims)

    # Save the DataFrame as a CSV file
    df.to_csv(OUTPUT_CSV, index=False)


In [11]:
# # Convert the JSON data to a DataFrame
# df = pd.DataFrame(insurance_claims)

# # Save the DataFrame as a CSV file
# df.to_csv('./insurance_claims.csv', index=False)

In [6]:
# import pandas as pd
# import json

# df  = pd.read_csv('insurance_claims.csv')
# df

In [7]:
# df['incident_class'].value_counts()