# Synthetic document generation

### Import the required packages

In [2]:
import common
import json

# Create the OpenAI client from the key, API version and endpoint held in common
client = common.get_openai_client(
    api_key=common.api_KEY,
    api_version=common.api_version,
    azure_endpoint=common.api_URI,
)

### Read the content from several files

In [19]:
def read_file(file, encoding='utf-8'):
    """Return the entire text content of *file*.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file content as a single string.
    """
    with open(file, 'r', encoding=encoding) as f:
        return f.read()

def read_files_in_folder(folder):
    """Concatenate the content of every ``.txt`` file directly inside *folder*.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` returns entries in arbitrary order). The name of each
    file is printed as it is read. Sub-folders are not descended into.

    Args:
        folder: Directory to scan.

    Returns:
        The contents of all matching files joined with no separator, or an
        empty string when the folder holds no ``.txt`` files.
    """
    import os

    parts = []
    for name in sorted(os.listdir(folder)):
        if name.endswith('.txt'):
            print(name)
            parts.append(read_file(os.path.join(folder, name)))
    return "".join(parts)

# Load every .txt file under data/ into one string, then show it as the cell result
content = read_files_in_folder(folder='data/')
content

401k.txt
benefits.txt


"401K Policy\nContoso\n\nA 401k plan is a retirement savings plan that allows you to contribute a portion of your pre-tax income to a designated account. The company offers a 401k plan to all eligible employees as part of its benefits package. The company also matches your contributions up to a certain percentage, depending on your years of service and salary level. This means that the company will add money to your 401k account for every dollar you contribute, up to the matching limit.\n\nThe 401k plan is administered by a third-party provider, who is responsible for investing your contributions and providing you with online access to your account. You can choose from a variety of investment options, ranging from conservative to aggressive, depending on your risk tolerance and retirement goals. You can also change your contribution amount and investment allocation at any time, subject to certain restrictions and fees.\n\nThe 401k plan is a great way to save for your future and take ad

### Template to extract formatted data from one document

In [41]:
# Prompt used to extract title, summary, FAQ and keywords from one document.
# {{input}} is the placeholder filled in by common.render_template.
# The sample "faq" value uses "\\n" so the rendered prompt shows the
# two-character JSON escape \n; a bare "\n" here would become a real line
# break and present the model with an invalid multi-line JSON string.
summarize_template = """system: 
You are an agent that can help create a synthetic document.

user:

summarize document:

{{input}}

Sample JSON output format:

{
    "title": "", // The title of the document.
    "summary": "", // The summary of the document.
    "faq":"Q: Where is the company located?\\nA: The company headquarters are located in San Francisco, CA.", // A list of frequently asked questions and answers
    "keywords": [], // The keywords of the document.
}

Respond in JSON format only.
"""


### Extract the formatted data for the 401K policy and benefits documents

- With larger context windows this could be done without having to split the documents
- Otherwise, a summarization technique would have to be used

In [42]:
# Summarize the 401k policy: read the file, fill the prompt, query the model
file1 = read_file('data/401k.txt')
template = common.render_template(summarize_template, input=file1)
ans1 = common.Call_OpenAI(
    client,
    common.gpt_api_deployment,
    template,
    max_tokens=2000,
)
print(ans1)

{
    "title": "401K Policy - Contoso",
    "summary": "Contoso offers a 401k retirement savings plan to eligible employees with company matching contributions based on years of service and salary level. The plan is managed by a third-party provider, offering a range of investment options and allowing for contribution and investment changes. Withdrawals are subject to restrictions, with potential penalties and taxes for early withdrawal or loan default. Employees are encouraged to consult the SPD and PDN documents for detailed information and to seek assistance from the provider's customer service or a financial advisor for retirement planning.",
    "faq": "Q: How does the company match contributions?\nA: The company matches contributions up to a certain percentage based on the employee's years of service and salary level.\n\nQ: Can I withdraw money from my 401k account at any time?\nA: Withdrawals are permitted under certain circumstances, such as reaching age 59.5, leaving the compa

In [43]:
# Summarize the benefits document: read the file, fill the prompt, query the model
file2 = read_file('data/benefits.txt')
template = common.render_template(summarize_template, input=file2)
ans2 = common.Call_OpenAI(
    client,
    common.gpt_api_deployment,
    template,
    max_tokens=2000,
)
print(ans2)

{
    "title": "Company Benefits - Contoso",
    "summary": "Contoso offers comprehensive benefits including 100% premium coverage for health insurance, 50% match on 401(k) contributions, generous paid time off, professional development opportunities, and support for work-life balance with flexible work arrangements.",
    "faq": "Q: What kind of health insurance does Contoso provide?
A: Contoso covers 100% of the premiums for medical, dental, and vision plans for employees and dependents.
Q: Does Contoso offer retirement savings plans?
A: Yes, Contoso matches 50% of employees' 401(k) contributions up to 6% of their salary.
Q: What is Contoso's policy on paid time off?
A: Contoso offers vacation, sick leave, holidays, parental leave, and sabbatical leave.
Q: Are there opportunities for professional development at Contoso?
A: Contoso provides tuition reimbursement, online courses, mentoring, and internal mobility for career advancement.
Q: How does Contoso support work-life balance?
A: 

### Template to combine the extracted data

In [44]:
# Prompt used to merge several extracted summaries into a single document.
# {{input}} is the placeholder filled in by common.render_template.
# The sample "faq" value uses "\\n" so the rendered prompt shows the
# two-character JSON escape \n rather than a real line break inside a JSON
# string, and each sample entry is followed by a comma.
summaries_template = """system: 
You are an agent that can help create a synthetic document.

user:

Combine the following sections into a single document:

{{input}}

Sample JSON output format:

{
    "title": "", // The title of the document.
    "summary": "", // The summary of the document.
    "faq":"Q: What is a 401k plan?\\nA: A 401k plan is a retirement savings plan that allows employees to contribute pre-tax income to a designated account, with potential matching contributions from the employer.\\n", // A section of frequently asked questions and answers.
    "keywords": [], // The keywords of the document.
}

Respond in JSON format only.
"""

### Generate the Synthetic document

In [45]:

# Merge the two extracted summaries into one synthetic document. The answers
# are joined with a blank line so the model receives two clearly delimited
# sections instead of two JSON objects run together as "...}{...".
template = common.render_template(
    summaries_template,
    input="\n\n".join((ans1, ans2)),
)
combined = common.Call_OpenAI(
    client,
    common.gpt_api_deployment,
    template,
    max_tokens=2000,
)
print(combined)

{
    "title": "Employee Benefits & 401K Policy - Contoso",
    "summary": "Contoso is dedicated to providing its employees with a comprehensive benefits package, which includes 100% premium coverage for health insurance, a 401k retirement savings plan with company matching contributions, generous paid time off, and various professional development opportunities. The 401k plan is managed by a third-party provider, offering a range of investment options and allowing for contribution and investment changes. Employees are encouraged to seek assistance from the provider's customer service or a financial advisor for retirement planning.",
    "faq": "Q: How does the company match contributions?\nA: The company matches 50% of employees' 401(k) contributions up to 6% of their salary, based on the employee's years of service and salary level.\n\nQ: Can I withdraw money from my 401k account at any time?\nA: Withdrawals are permitted under certain circumstances, such as reaching age 59.5, leavin