In [4]:
import warnings, csv, os, sys
import pandas as pd
import json_repair
from typing import List
warnings.filterwarnings("ignore")
from langchain_openai import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import (
    ChatPromptTemplate
)
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [5]:
def write_csv(output_file, data):
    with open(output_file, 'a', newline='', encoding='utf-8') as outfile:  
        writer = csv.writer(outfile)
        writer.writerow(data)

In [6]:
def write_txt(txt_path, data):
    with open(txt_path, 'a') as txf:
        txf.write(data)

In [7]:
#variables

model_name = 'gpt-4-1106-preview'  #gpt model name
title = 'Survey for brand awareness'  #title
audience = 'Normal People living in US' #audience
output_file = 'hawkRun1.csv'
txt_file = 'output.txt'

#you can change questions and traits/count as you want, this part has become dynamic in this notebook.
questions = [
"Which city do you live in?",
"Which state do you live in?"
]

traits_and_counts = [
    ('Woman 60+ that owns 1 dog', 100)
]


In [8]:
# This block of code is not needed, it is there to know how function calling works.

# To learn more about function calling, check this out https://www.datacamp.com/tutorial/open-ai-function-calling-tutorial
#Answer is the class which contains response given by each user, it also contain traits to know which person answered the questions.
#llm is inteligent enough to give unique response each time based on the given system and user prompt, we should not worry about it.
class Answer(BaseModel):
    """Human-like set of answers told by each person while being surveyed. 
    Each person should have their own unique set of answers that should be different from other person's set of answers."""

    answer1: str = Field(description="This is the human like answer to Question1.")
    answer2: str = Field(description="This is the human like answer to Question2.")
    answer3: str = Field(description="This is the human like answer to Question3.")
    answer4: str = Field(description="This is the human like answer to Question4.")
    
    
    
#list of answers, the formatted list that the llm model will generate which contains dictionaries of Answer class
#llm model will make sure that each dictionary in the list is unique from each other as it will treat the list like list of human responses.
class Answers(BaseModel):
    """Human-like survey answers to tell user."""

    answer: List[Answer]

In [9]:
# This is standard pattern of list of functions used for function calling.
# To know the pattern, you can print out openai_functions variable in previous New.ipynb file
# This function is responsible to give the output list of answers of each human that we want, based on the parameters provided.
# To learn more about function calling, check this out https://www.datacamp.com/tutorial/open-ai-function-calling-tutorial
def create_openai_functions(questions: List):
    fn  = [{'name': 'Answers',
            'description': 'Human-like survey answers to tell user.',
            'parameters': {'type': 'object',
            'properties': {'answer': {'type': 'array',
                'items': {'description': "Human-like set of answers told by each person while being surveyed. \nEach person should have their own unique set of answers that should be different from other person's set of answers.",
                'type': 'object',
                'properties': {
                },
                'required': []}}},
            'required': ['answer']}}]
    answers = fn[0]['parameters']['properties']['answer']['items']['properties']
    required = fn[0]['parameters']['properties']['answer']['items']['required']
    for ind, _ in enumerate(questions, 1):
        answers[f'answer{ind}'] = {'description': f'This is the human like answer to Question{ind}.', 'type': 'string'}
        required.append(f'answer{ind}')
    return fn
        

In [10]:
#sample openai function example
openai_functions = create_openai_functions(questions)

In [11]:
parser = JsonKeyOutputFunctionsParser(key_name="answer")  # to parse the output, it will return list of answers based on function calling

In [12]:
#bind openai_functions to llm for function calling
llm = ChatOpenAI(
    temperature=0.7,
    model_name=model_name,
    max_tokens=4095
).bind(functions=openai_functions)

In [13]:
#this is default system message, you can change it based on your need.
system_message = """
You are a survey answering bot that generates answers like a survey when questions are asked. The answer should be made as if you are a human. 
Give answers assuming you are a new human with different lifestyles while giving answers.
Give answers with new thoughts, new ideas, new moods, you can also choose to answer very rudely, but the main idea is to be random, do not try to repeat same answers since humans have different answers with different answer style.
While giving answers, you should be as creative as possible and you should deviate your answers as much as possible from previous answers.
In every answer, change styles of answers, change average sentence lengths of answer, change fk_grade_level of sentences of answer. Make it different from previous answers. But also make sure it is the answer given by a human. So, don't make it seem like it is AI generated. Add both simple and fancy words.
In 1 answer, give your answers assuming you are having a worse life, in another answer, give your answers assuming you are having best life. like this, keep on changing the lifestyle of human that you are.
Avoid same repeated answers as much as possible.
Do no repeat same pattern in each answers. Give short answers sometimes and sometimes long answers, be random.
Since, human can give both positive and negative answers, you should follow the same principles.
Your answers should be descriptive just like human answers.
Each set of answers should be different from another set of answers. 
If you are asked about 'top few things' or 'few things', each answers should have random number of comma separated sentences. For example:
sentence1, sentence2, and sentence3. (3 sentences)
sentence1 (1 sentence)
sentence1, sentence2, sentence3, sentence4, and sentence5. (5 sentences)
sentence1, sentence2. (2 sentences)
sentence1, sentence2, sentence3, and sentence4. (4 sentences)

If the output asks for a monetary output give just the dollar amount and no text before or after it unless prompted to do so. So for example if the answer is $40 just give $40.
If the output asks for a decimal output give just the value and no text before or after it unless prompted to do so. So for example if the answer is 40.5 just give 40.5.
If you are asked a question like where do you live? Be sure to just answer the place that you live. No text before or after it is needed unless prompted to do so.
If the answer is asking for a percentage output just give the value. So if the answer is 45% just give 45%. Do not give any text before or after that unless prompted to do so.
"""

In [14]:
#chat template
prompt = ChatPromptTemplate.from_messages(
    [("system", system_message), ("user", "{input}")]
)

In [15]:
#chain using prompt, llm and parser
#it uses the prompt with llm and generate the answer based on the parser

# chain = prompt | llm | parser

chain = prompt | llm 

In [16]:
def output_parser(response):
    try:
        return eval(response.additional_kwargs['function_call']['arguments'])['answer']
    except Exception as e:
        # print("Error in Eval\n")
        # print(e)
        # print('----------------------------------------------------------------------')
        try:
            return json_repair.loads(response.additional_kwargs['function_call']['arguments'])['answer']
        except Exception as e:
            # print("Error in Json loads")
            # print(e)
            # print('----------------------------------------------------------------------')
            write_txt(txt_file, "\nFailed-eval-json-loads---------------------------------------\n\n\n")
            return None

In [17]:
total_processed_row = 0  #total row processed
total_remaining_surveyed = 0  #total people remaining to be surveyed
person_number = 0  #person count
traits_dict = {}


for dta in traits_and_counts:
    traits_dict[dta[0]] = int(dta[1])  #creating dict of traits and counts
    total_remaining_surveyed += int(dta[1])  #total people remaining to be surveyed
        
if output_file not in os.listdir():  #if csv is not available
    write_csv(output_file, ['Traits', 'Person Number']+questions)
else: #if csv already exists
    df = pd.read_csv(output_file)
    old_traits_dict = df['Traits'].value_counts().to_dict() #get traits/counts of already surveyed people in csv
    for k,v in old_traits_dict.items():
        person_number += v #increasing the person number
        if k not in traits_dict.keys(): 
            continue
        traits_dict[k] = traits_dict[k]-v
        total_remaining_surveyed = int(total_remaining_surveyed - v)  #subtracting from the people that are already in csv. suppose 7 needed, 3 are already in csv, it will only process 4
        
        
print("Remaining Traits/counts dict:", traits_dict)
print("Total remaining people to be surveyed:", total_remaining_surveyed)
print()

if total_remaining_surveyed <= 0:
    print("All the people are surveyed, if you want to increase survey count, increase the count in traits_and_counts variable at top")
    sys.exit()
    
    
for traits, counts in traits_dict.items():
    if counts <= 0:
        continue
    while True:
        input_message = f"Generate survey answers from {counts} people.\nAll of the surveyed people are {audience}\n"
        input_message += f'The title of the survey: {title}\n'
        input_message += f'{counts} of the surveyed people have this trait: {traits}\n'
        input_message += f'Extremely Important Note: You must compulsory give answers to all the questions provided below. Do not skip any questions.\n'
        print(input_message)
        for ind, question in enumerate(questions, 1):
            input_message += f'Question{ind}: {questions[ind-1]}\n'
        total_surveyed = sum([c for _, c in traits_dict.items()])
        try:
            res = chain.invoke({"input": input_message})
        except Exception as e:
            print('OpenAI Error', e)
            print()
            print()
            continue
        write_txt(txt_file, str(res))
        responses = output_parser(res)
        if responses is None:
            continue
        final_responses = []
        for data in responses:
            continue_for = False
            if type(data) == dict:
                for ind, question in enumerate(questions):
                    try:
                        data[f'answer{ind+1}']
                    except:
                        write_txt(txt_file, f"\nanswer{ind+1}-not-found---------------------------------------\n")
                        continue_for = True
                        break
                if continue_for:
                    continue
                final_responses.append(data)
        for data in final_responses:
            person_number += 1
            lst = [traits, person_number]
            for ind, question in enumerate(questions):
                lst.append(data[f'answer{ind+1}'])
            write_csv(output_file, lst)
        write_txt(txt_file, "\nCompleted---------------------------------------\n\n\n")
        total_processed_row += len(final_responses)
        counts = counts - len(final_responses)
        print('Remaining traits:', traits_dict)
        print("Total responses given by model:", len(final_responses))
        print("Total processed responses:", total_processed_row)
        print(final_responses)
        print()
        print()
        if counts <= 0:
            break
    

Remaining Traits/counts dict: {'Woman 60+ that owns 1 dog': 100}
Total remaining people to be surveyed: 100

Remaining traits: {'Woman 60+ that owns 1 dog': 100}
Total responses given by model: 126
Total processed responses: 126
[{'answer1': 'Tucson', 'answer2': 'Arizona'}, {'answer1': 'Sarasota', 'answer2': 'Florida'}, {'answer1': 'Albuquerque', 'answer2': 'New Mexico'}, {'answer1': 'Charleston', 'answer2': 'South Carolina'}, {'answer1': 'Santa Fe', 'answer2': 'New Mexico'}, {'answer1': 'San Diego', 'answer2': 'California'}, {'answer1': 'Portland', 'answer2': 'Maine'}, {'answer1': 'Fort Lauderdale', 'answer2': 'Florida'}, {'answer1': 'Tallahassee', 'answer2': 'Florida'}, {'answer1': 'Asheville', 'answer2': 'North Carolina'}, {'answer1': 'Austin', 'answer2': 'Texas'}, {'answer1': 'Savannah', 'answer2': 'Georgia'}, {'answer1': 'Scottsdale', 'answer2': 'Arizona'}, {'answer1': 'Colorado Springs', 'answer2': 'Colorado'}, {'answer1': 'Bellingham', 'answer2': 'Washington'}, {'answer1': 'Euge