In [None]:
import os
import random
import json
from openai import OpenAI

In [None]:
# Mount Google Drive at /content/drive. The cells below create folders and
# write conversation files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Each conversation is saved to its own text file so that an interrupted run
# can be resumed from the last processed data point.
fields = ["Technology", "Healthcare", "Finance", "legal"]

# Folder under MyDrive that holds the dataset. "Sentiment_Analysis" matches the
# path getFilePath() writes to; change both together if you relocate the data.
drive_folder = "Sentiment_Analysis"

# Since there are 20 topics, we create 20 folders per field. Folder i holds the
# conversations in which topic i is kept fixed and the remaining topics are
# selected at random.
topic_folder_count = 20

for field in fields:
  field_path = f"/content/drive/MyDrive/{drive_folder}/{field}/Conversation_Data"
  for i in range(topic_folder_count):
    folder_path = f"{field_path}/{i}"
    print(folder_path)
    # exist_ok=True keeps the cell safe to re-run when the folders already exist
    os.makedirs(folder_path, exist_ok=True)

In [None]:
# The code below handles a single field.
# If you have multiple fields of interest, run it once per field.

In [None]:
# Field to generate conversations for; it is interpolated into the output path
# built by getFilePath().
field = "add-field-here"

# Conversation types to sample from (your choice). This must be a plain list:
# a trailing comma here would turn it into a one-element tuple containing an
# empty list, and random.choice() would then always return that empty list.
type_of_conversations = []
topics = []  # your choice of topics

sentiments = ['positive', 'negative', 'neutral']

# We want to generate 3000 conversations.
# Since we have 20 topics, each topic is kept fixed for 3000 / 20 = 150 conversations.
first_topic_fixed_conversation_count = 150

# Proportion of positive, negative and neutral sentiments we want for the fixed topic.
# Only the positive and negative proportions are passed to DataGenerator;
# neutral is whatever remains after those two.
positive_proportion = 0.4
negative_proportion = 0.4
neutral_proportion = 0.2

In [None]:
class DataGenerator:
  """Builds conversation-generation prompts and sends them to an OpenAI model.

  The instance stores the dataset configuration (field, conversation types,
  topics, sentiments and sentiment proportions) together with an OpenAI client.
  """

  def __init__(
      self,
      field,
      type_of_conversations,
      topics, sentiments,
      first_topic_fixed_conversation_count,
      positive_proportion,
      negative_proportion,
      model_type,
      model_name = "gpt-4o",
      api_key = None
      ):
    """Store the generation settings and create the OpenAI client.

    Args:
      field: Field name inserted into every prompt.
      type_of_conversations: Collection of conversation types to choose from.
      topics: List of topics; its length determines the total conversation count.
      sentiments: List of sentiment labels.
      first_topic_fixed_conversation_count: Number of conversations generated
        with the same fixed topic.
      positive_proportion: Share of the fixed-topic conversations labelled 'positive'.
      negative_proportion: Share of the fixed-topic conversations labelled 'negative'.
      model_type: Stored on the instance; not used by this class.
      model_name: Model identifier passed to the API on each request.
      api_key: OpenAI API key. When None, the OpenAI client falls back to the
        OPENAI_API_KEY environment variable, so no key has to live in the notebook.
    """
    self.field = field
    self.type_of_conversations = type_of_conversations
    self.topics = topics
    self.sentiments = sentiments
    self.first_topic_fixed_conversation_count = first_topic_fixed_conversation_count
    # Every topic takes one turn as the fixed topic
    self.total_conversation_count = self.first_topic_fixed_conversation_count * len(self.topics)
    self.positive_proportion = positive_proportion
    self.negative_proportion = negative_proportion
    self.model_type = model_type
    self.model_name = model_name

    # Never hardcode the key here; pass api_key or set OPENAI_API_KEY instead
    self.client = OpenAI(api_key=api_key)

  def getSentiment(self, iter_count):
    """Return the fixed-topic sentiment for the iter_count-th conversation of a topic.

    The first `positive_proportion` share of the per-topic conversations is
    'positive', the next `negative_proportion` share is 'negative', and
    everything after that is 'neutral'.
    """
    positive_cutoff = self.first_topic_fixed_conversation_count * self.positive_proportion
    negative_cutoff = self.first_topic_fixed_conversation_count * (self.positive_proportion + self.negative_proportion)
    if iter_count < positive_cutoff:
      return 'positive'
    if iter_count < negative_cutoff:
      return 'negative'
    return 'neutral'

  def generatePrompt(self, type_of_conversation, topic_sentiment_dictionary):
    """Return the full prompt text for one conversation.

    Args:
      type_of_conversation: Conversation type inserted into the prompt.
      topic_sentiment_dictionary: Mapping of topic name to sentiment label;
        it is rendered into the prompt with its default string representation.
    """
    # NOTE(review): the output-format placeholder below mentions "Deepseek" although
    # this class talks to an OpenAI client. The prompt text is intentionally left
    # unchanged so newly generated data stays comparable with existing files.
    return f"""You are tasked with generating a dialogue-based conversation with about 400 words based on the following input parameters:
    1. Field: <Field>
    2. Type of Conversation: <Type of Conversation>
    3. Topics and Sentiments: <Dictionary containing the topics and their corresponding sentiments. For example: {{"Topic1": "positive", "Topic2": "negative", "Topic3": "neutral"}}>


    **Instructions**:
    - Create a dialogue-based conversation that is specific to the field mentioned.
    - Ensure the conversation involves all the topics provided in the Topics and Sentiments dictionary.
    - Avoid using the topic names directly in the conversation.
    - Prefer mentioning about sub-topics related to the selected main topic
    - Annotate each speaker in the conversation simply by labeling them as "Speaker A", "Speaker B", etc.
    - Use the Topics and Sentiments dictionary to add a sentimental aspect around each topic, and apply the corresponding sentiment from the dictionary for each topic.
    - The conversation should reflect realistic exchanges, with each character responding naturally to the topics.
    - The sentiment for each topic must be directly reflected in the conversation, based on the Topics and Sentiments dictionary values.
    - Each topic mentioned in the Topics and Sentiments must be addressed in the dialogue, and each speaker's sentiment should align with the assigned sentiment for that topic.


    **Output Format**:
    Provide the conversation like this:


    "<Deepseek generated conversation>"




    ### Example:


    If the field is "Technology", and the conversation type is "technical", with the topics and sentiments dictionary as:
    {{"AI Development": "positive", "Data Security": "negative", "Software Bugs": "neutral"}}


    The output should look like:


    "Speaker A: 'AI development has made huge strides in the last few years.' Speaker B: 'True, but we must also be cautious about data security risks with AI.' Speaker A: 'Yes, but AI can also improve data security.' Speaker B: 'True, but there's always the risk of software bugs making everything more complex.'"


    Now your turn!
    1. Field: {self.field}
    2. Type of Conversation: {type_of_conversation}
    3. Topics and Sentiments: {topic_sentiment_dictionary}


    Output:
    """

  def generateLLMResponse(self, prompt):
    """Send `prompt` to the configured model and return the response's output text."""
    response = self.client.responses.create(
        model=self.model_name,
        input=prompt
    )
    return response.output_text




In [None]:
def getFilePath(topic_index, fixed_topic_first_selection_count, topic_sentiment_dictionary):
    """Build the output file path for one generated conversation.

    The file name starts with ``conversation_<count>`` and appends, for every
    entry of ``topic_sentiment_dictionary`` in insertion order,
    ``_<position of the topic in the global topics list>_<sentiment>``.
    The file lives in the folder of ``topic_index`` under the global ``field``.

    Raises:
        ValueError: if a dictionary key is not present in the global ``topics`` list.
    """
    suffix = "".join(
        f"_{topics.index(topic_name)}_{sentiment_label}"
        for topic_name, sentiment_label in topic_sentiment_dictionary.items()
    )
    filename = f"conversation_{fixed_topic_first_selection_count}{suffix}"
    return f"/content/drive/MyDrive/Sentiment_Analysis/{field}/Conversation_Data/{topic_index}/{filename}.txt"


def getResponse(inputModel, prompt, topic_index, fixed_topic_first_selection_count,  topic_sentiment_dictionary, max_retries = 5):
  """Generate one conversation and save it to its text file, retrying on failure.

  Args:
    inputModel: Object exposing generateLLMResponse(prompt) and a topics list.
    prompt: Prompt text sent to the model.
    topic_index: Index of the fixed topic; selects the output folder.
    fixed_topic_first_selection_count: Per-topic conversation counter used in the file name.
    topic_sentiment_dictionary: Mapping of topic name to sentiment, encoded in the file name.
    max_retries: Maximum number of attempts before the last error is re-raised.

  Returns:
    The text returned by the model.

  Raises:
    ValueError: if max_retries is smaller than 1.
    Exception: the error of the final attempt once all attempts have failed.
  """
  if max_retries < 1:
    raise ValueError("max_retries must be at least 1")

  llm_response = None
  for attempt in range(1, max_retries + 1):
    try:
      # Reuse a response obtained on an earlier attempt: if only the file write
      # failed, there is no reason to pay for another model call.
      if llm_response is None:
        llm_response = inputModel.generateLLMResponse(prompt)
      filename = getFilePath(topic_index, fixed_topic_first_selection_count, topic_sentiment_dictionary)
      # Explicit UTF-8 so non-ASCII characters in the model output never depend
      # on the platform's default encoding.
      with open(filename, 'w', encoding='utf-8') as file:
        file.write(llm_response)
      print(f"Saved to {filename} for topic {inputModel.topics[topic_index]} with count as {fixed_topic_first_selection_count}]")
      return llm_response
    except Exception:
      if attempt == max_retries:
        # Bare raise keeps the original traceback
        raise

In [None]:
def generateData(inputModel, iter_start_count = 0, iter_end_count = 0, per_topic_addition = 0):
    """Generate and save conversations for the global indices [iter_start_count, iter_end_count).

    Each global index maps to a fixed topic (index // per-topic count) and a
    per-topic counter (index % per-topic count). The fixed topic gets its
    sentiment from inputModel.getSentiment(); one or two additional topics are
    drawn at random and receive a sentiment different from the fixed topic's.

    Args:
        inputModel: Generator object providing topics, sentiments,
            type_of_conversations, the conversation counts, getSentiment()
            and generatePrompt().
        iter_start_count: First global index to process (use it to resume a run).
        iter_end_count: Exclusive end index; 0 means "run to the end". Values
            above the total conversation count are clamped to it.
        per_topic_addition: Unused; kept so existing calls keep working.

    Returns:
        None. Stops early (after printing the error) when a conversation
        cannot be generated or saved.
    """
    print(inputModel.model_name)

    total_count = inputModel.total_conversation_count
    # Clamp the end index: going past the total would index beyond the topics list
    if iter_end_count == 0:
        max_iterations = total_count
    else:
        max_iterations = min(iter_end_count, total_count)

    for index in range(iter_start_count, max_iterations):

        # Index of the fixed topic, and how many times it has already been the fixed topic
        topic_index, fixed_topic_first_selection_count = divmod(
            index, inputModel.first_topic_fixed_conversation_count
        )

        # Topic name
        fixed_topic = inputModel.topics[topic_index]

        # Type of conversation
        type_of_conversation = random.choice(inputModel.type_of_conversations)

        # Sentiment of the fixed topic depends on its per-topic counter
        topic_sentiment = inputModel.getSentiment(fixed_topic_first_selection_count)
        topic_sentiment_dictionary = {fixed_topic: topic_sentiment}

        # Pool of topics available as additional topics
        topics_pool = [t for t in inputModel.topics if t != fixed_topic]

        # Randomly select 1-2 additional topics, never more than the pool holds
        extra_count = min(random.randint(1, 2), len(topics_pool))
        extra_topics = random.sample(topics_pool, extra_count)

        # Additional topics get a sentiment different from the fixed topic's
        sentiments_pool = [s for s in inputModel.sentiments if s != topic_sentiment]
        for extra_topic in extra_topics:
            topic_sentiment_dictionary[extra_topic] = random.choice(sentiments_pool)

        prompt = inputModel.generatePrompt(type_of_conversation, topic_sentiment_dictionary)

        try:
            getResponse(inputModel, prompt, topic_index, fixed_topic_first_selection_count, topic_sentiment_dictionary)
        except Exception as e:
            # The printed index is the value to pass as iter_start_count when resuming
            print(f"Found error while running for main iteration count: {index}")
            print(e)
            return

        # index is zero-based, so index + 1 conversations are done at this point
        print(f"Number of conversations completed: {index + 1}")

In [None]:
# Build the generator from the configuration cell; keyword arguments make the
# mapping between settings and constructor parameters explicit.
model = DataGenerator(
    field=field,
    type_of_conversations=type_of_conversations,
    topics=topics,
    sentiments=sentiments,
    first_topic_fixed_conversation_count=first_topic_fixed_conversation_count,
    positive_proportion=positive_proportion,
    negative_proportion=negative_proportion,
    model_type="openAI",
    model_name='gpt-4o',
)

In [None]:
# Generate conversations for global indices 0..2999. To resume an interrupted
# run, pass the index to restart from as the second argument.
generateData(model, 0, 3000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Saved to /content/drive/MyDrive/Sentiment_Analysis/Finance/Conversation_Data/3/conversation_50_3_positive_7_neutral_19_negative.txt for topic Creditworthiness Evaluation with count as 50]
Number of conversations completed: 500
Saved to /content/drive/MyDrive/Sentiment_Analysis/Finance/Conversation_Data/3/conversation_51_3_positive_6_negative_5_negative.txt for topic Creditworthiness Evaluation with count as 51]
Number of conversations completed: 501
Saved to /content/drive/MyDrive/Sentiment_Analysis/Finance/Conversation_Data/3/conversation_52_3_positive_7_negative_12_negative.txt for topic Creditworthiness Evaluation with count as 52]
Number of conversations completed: 502
Saved to /content/drive/MyDrive/Sentiment_Analysis/Finance/Conversation_Data/3/conversation_53_3_positive_2_neutral.txt for topic Creditworthiness Evaluation with count as 53]
Number of conversations completed: 503
Saved to /content/drive/MyDrive/Sentim