In [None]:
from google.colab import userdata
gemini_key = userdata.get('gemini_key')
mistral_key = userdata.get('mistral_key2')

In [None]:
import os
import torch
import random
random.seed(28)
import json
import csv

In [None]:
def gemini_setup(api_key):
    """Install and configure the Gemini SDK, then build the generation model.

    Args:
        api_key: Gemini API key used to authenticate the SDK.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-2.0-flash' created with
        temperature 0.99 and every listed harm category set to BLOCK_NONE.
    """
    # The package is installed at call time, right before it is imported.
    os.system("pip install -q -U google-generativeai")
    import google.generativeai as genai

    # Authenticate with the key handed in by the caller. The previous version
    # read the global `gemini_key` here and silently ignored `api_key`.
    genai.configure(api_key=api_key)
    glm_config = genai.GenerationConfig(temperature=0.99)
    safety_settings = [
        {"category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    ]
    gemini_model = genai.GenerativeModel('gemini-2.0-flash', generation_config=glm_config, safety_settings=safety_settings)
    return gemini_model

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [None]:
gemini_model = gemini_setup(gemini_key)

In [None]:
def populate_data_for_permutations(prompt, data_content):
  """Ask the Gemini model for a newline-separated list and return it as a Python list.

  Args:
    prompt: Instruction text sent to the module-level ``gemini_model``.
    data_content: Human-readable label, used only in the progress message.

  Returns:
    A list of strings, one per non-blank line of the response, each with
    surrounding whitespace removed.
  """
  response = gemini_model.generate_content(contents= prompt)
  # Skip blank separator lines and trim padding: otherwise empty strings end up
  # in the list, get written to the txt files and can later be sampled as values.
  data_list = [line.strip() for line in response.text.strip().split('\n') if line.strip()]
  print("The data generated for {}:{}\n The length of the list is: {}".format(data_content,data_list,len(data_list)))
  return data_list

In [None]:
def convert_list_to_txt(list_,filename):
  """Write every element of ``list_`` to ``filename``, one element per line.

  The file is created or truncated. A confirmation message is printed once
  all elements have been written.
  """
  with open(filename,'w+') as out_file:
    out_file.writelines("%s\n" % entry for entry in list_)
  print(f"List converted to {filename}")


In [None]:
# prompt_cities = ("List 100 different cities from the US where events or incidents could happen. Make each city a separate line. Do not include any extra text.")
# cities = populate_data_for_permutations(prompt_cities,"cities")

In [None]:
# cities = cities[2:102]

In [None]:
# convert_list_to_txt(cities,"cities.txt")

In [None]:
prompt_locations= ("List 100 different locations like a café, office, park, or bus stop where people could have ambiguous conversations. Make each location a separate line. Do not include any extra text.")
locations = populate_data_for_permutations( prompt_locations,"locations")

The data generated for locations:['Café', 'Office', 'Park', 'Bus stop', 'Train station', 'Airport', 'Library', 'Gym', 'Restaurant', 'Bar', 'Movie theater lobby', 'Concert venue', 'Shopping mall', 'Grocery store', 'Hair salon', "Doctor's office waiting room", "Dentist's office waiting room", 'Auto repair shop waiting area', 'University campus', 'School hallway', 'Elevator', 'Stairwell', 'Hospital hallway', 'Art gallery', 'Museum', 'Hotel lobby', 'Convention center', 'Board meeting room', 'Construction site', 'Factory floor', 'Warehouse', 'Farm', 'Dockyard', 'Laundromat', 'Dry cleaner', 'Post office', 'Bank', 'Courthouse hallway', 'Prison visiting room', 'Zoo', 'Aquarium', 'Botanical garden', 'Amusement park', 'Water park', 'Bowling alley', 'Arcade', 'Skating rink', 'Swimming pool', 'Beach', 'Campground', 'Hiking trail', 'Ski resort', 'Golf course', 'Tennis court', 'Basketball court', 'Soccer field', 'Baseball field', 'Football stadium', 'Parking garage', 'Parking lot', 'Gas station', 'C

In [None]:
locations = locations[:100]

In [None]:
convert_list_to_txt(locations,"locations.txt")

List converted to locations.txt


In [None]:
# time_prompt = ("List 30 different time settings around a day, like morning, rush hour, midnight, or after work. Vary wording where possible. Make each time setting a separate line. Do not include any extra text.")
# times = populate_data_for_permutations(time_prompt,"time settings")

In [None]:
# convert_list_to_txt(times,"time_settings.txt")

In [None]:
number_of_messages_conv_prompt = "List 30 different numbers between 20 and 200, representing the number of messages in a conversation. Each number should be unique and in a separate line. Do not include any extra text."
number_of_messages_conv=populate_data_for_permutations(number_of_messages_conv_prompt,"no. of messages")


The data generated for no. of messages:['21', '25', '29', '33', '37', '41', '45', '49', '53', '57', '61', '65', '69', '73', '77', '81', '85', '89', '93', '97', '101', '105', '109', '113', '117', '121', '125', '129', '133', '199']
 The length of the list is: 30


In [None]:
convert_list_to_txt(number_of_messages_conv,"no_of_msgs.txt")

List converted to no_of_msgs.txt


In [None]:
# Ask the model for candidate group sizes and keep the returned lines in `number_of_people_conv`.
# NOTE(review): the prompt does not restrict the answers to whole numbers, and
# generate_permutations() in this notebook samples group sizes from its own hard-coded
# [2, 3, 4, 5] list rather than from this result — confirm whether this step is still needed.
number_of_people_conv_prompt = "List 5 numbers between 2 and 5, representing the number of people involved in a conversation. Each number should be in a separate line. Do not include any extra text."
number_of_people_conv = populate_data_for_permutations(number_of_people_conv_prompt, "Number of People")

The data generated for Number of People:['2', '3', '4', '5', '2.5']
 The length of the list is: 5


In [None]:
convert_list_to_txt(number_of_people_conv,"number_of_people.txt")

List converted to number_of_people.txt


In [None]:
# The example phrases are meant to appear inside literal double quotes. Writing them as
# ""..."" within a double-quoted Python string ends and restarts the literal (implicit
# concatenation), which silently dropped the quote characters from the prompt.
# Single-quoted literals keep them.
vague_indicators_conv_prompt = (
    'List 20 vague incidents or events that people might discuss in a way that feels suspicious but lacks explicit details. '
    'The events should create a sense of unease, secrecy, or misinterpretation without directly mentioning anything illegal. '
    'Examples include phrases like "That wasn’t supposed to happen," "We might have a problem," and "Someone’s asking too many questions". '
    'Each response should be short, natural, and sound like something someone might say in a tense or secretive conversation. '
    'Make each incident a separate line. Do not include any extra text.'
)
vague_indicators_conv = populate_data_for_permutations(vague_indicators_conv_prompt, "Vague Incident Indication")

The data generated for Vague Incident Indication:['*   "The package got rerouted."', '*   "He knows more than he should."', '*   "They’re changing the protocol."', '*   "The numbers aren’t adding up."', '*   "It\'s being handled internally."', '*   "She missed the deadline... again."', '*   "That wasn\'t part of the agreement."', '*   "There\'s been a slight adjustment."', '*   "We\'re going off the record for a moment."', '*   "Things got… complicated."', '*   "Let\'s just say it\'s a delicate situation."', '*   "He\'s been reassigned to another project."', '*   "We\'re experiencing some unexpected delays."', '*   "There\'s been a security breach."', '*   "It appears some information is missing."', '*   "They\'re conducting a review of the process."', '*   "We\'ve had to make some… adjustments."', '*   "That meeting ran longer than expected."', '*   "He took early retirement."', '*   "Someone\'s been digging around."']
 The length of the list is: 20


In [None]:
convert_list_to_txt(vague_indicators_conv,"vague_incidents.txt")

List converted to vague_incidents.txt


In [None]:
vague_messages_count_prompt = "List 15 different numbers between 5 and 35, representing how many messages in a conversation contain vague or indirect statements. Each number should be in a separate line. Do not include any extra text."
vague_messages_count = populate_data_for_permutations(vague_messages_count_prompt,"vague message count")

The data generated for vague message count:['6', '7', '9', '11', '13', '15', '17', '19', '21', '23', '25', '27', '29', '31', '33']
 The length of the list is: 15


In [None]:
convert_list_to_txt(vague_messages_count,"vague_msg_count.txt")

List converted to vague_msg_count.txt


In [None]:
age_prompt= ("List 10 different ages between 15 and 40 that are commonly involved in ambiguous conversations. Each age should be in a separate line. Do not include any extra text.")
age = populate_data_for_permutations(age_prompt,"Age of Individual")

The data generated for Age of Individual:['16', '18', '21', '25', '27', '30', '32', '35', '37', '40']
 The length of the list is: 10


In [None]:
convert_list_to_txt(age,"indiv_age.txt")

List converted to indiv_age.txt


In [None]:
# gender_prompt="List 5 different gender identities. Each gender should be in a separate line. Do not include any extra text."
# gender = populate_data_for_permutations(gender_prompt,"Individual Gender")

In [None]:
# convert_list_to_txt(gender,"indiv_gender.txt")

In [None]:
conv_nature_prompt = "List 4 different conversation tones that affect how people perceive a discussion. Examples: casual, serious, urgent, cryptic. Each value should be in a separate line. Do not include any extra text."
conv_nature = populate_data_for_permutations(conv_nature_prompt,"Nature of Conversation")

The data generated for Nature of Conversation:['Casual', 'Serious', 'Urgent', 'Cryptic']
 The length of the list is: 4


In [None]:
convert_list_to_txt(conv_nature,"conv_nature.txt")

List converted to conv_nature.txt


In [None]:
disc_topic_prompt = "List 15 different topics people commonly discuss where their words could be misinterpreted. Examples: Business, Personal Issues, Unexpected Problems, Logistics, Debt, Health, Real Estate, Hiring. Each topic should be in a separate line. Do not include any extra text."
# The second argument is only the label shown in the progress message; it previously
# read "suspect_occupation", which mislabelled the generated discussion topics.
disc_topic = populate_data_for_permutations(disc_topic_prompt,"discussion topics")

The data generated for suspect_occupation:['Finances', 'Politics', 'Religion', 'Relationships', 'Parenting', 'Education', 'Social Issues', 'Technology', 'Legal Matters', 'Environmental Concerns', 'Dating', 'Nutrition', 'Fitness', 'Travel', 'World Events']
 The length of the list is: 15


In [None]:
convert_list_to_txt(disc_topic,"discussion_topics.txt")

List converted to discussion_topics.txt


In [None]:
### Binary Variables directly added to code
# Var susp_context = (Yes/No)
# Var use_coded_lang = (Yes/No)
# Var implic_ref_to_action = (Yes/No)

### Prompt variables - the variables used to generate the different settings of the ambiguous conversations.


1. Location → Sets the scene for the conversation.

2. Number of Messages in the Conversation → Controls length; ensures natural flow.

3. Number of People Involved in the Conversation → Impacts complexity of discussion & ambiguity.
4. Types of Vague Incidents in the Conversation → Key to making conversations feel unclear but suspicious.

5. Number of Vague Messages in a Conversation → Defines how much ambiguity is present.

6. Conversation Nature (Casual, Serious, Urgent, Cryptic) → Affects tone and perception.

7. Discussion Topic → Adds realism; makes ambiguity more natural.

8. Mentioned in Suspicious Context? (Yes/No) → Flags if someone is referenced in a way that raises suspicion.

9. Implicit Reference to an Action? (Yes/No) → Ensures indirect hints at something happening.

10. Use of Coded Language? (Yes/No) → Introduces double meanings in conversations.













In [None]:
import random
import json

def read_txt_to_list(filename):
    """Return the lines of ``filename`` with leading/trailing whitespace removed.

    Blank lines are kept and appear as empty strings.
    """
    stripped_lines = []
    with open(filename, "r") as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def list_of_dicts_to_json(list_of_dict, filename):
    """Serialize ``list_of_dict`` into ``filename`` as JSON indented by four spaces.

    The file is created or truncated, and a confirmation message is printed
    after the data has been written.
    """
    with open(filename, "w+") as out_file:
        json.dump(obj=list_of_dict, fp=out_file, indent=4)
    print("Converted the list of dictionaries into {}".format(filename))

def generate_permutations(num_permutations=200, output_file="ambiguous_conversations.json"):
    """Generates ambiguous conversation settings based on predefined variables.

    Every setting combines a randomly chosen location, message count, vague
    incident, vague-message count, conversation nature and discussion topic
    with a per-participant description, and the whole list is written to
    ``output_file`` as JSON.

    Args:
        num_permutations: Number of settings to generate (default 200).
        output_file: Path of the JSON file that receives the settings.

    Returns:
        None. The settings are only written to ``output_file``.
    """
    # Load text-based lists. Blank lines are dropped so that random.choice can
    # never select an empty value.
    locations = [entry for entry in read_txt_to_list("locations.txt") if entry]
    vague_incidents = [entry for entry in read_txt_to_list("vague_incidents.txt") if entry]
    discussion_topics = [entry for entry in read_txt_to_list("discussion_topics.txt") if entry]
    personality_traits = ["Paranoid", "Evasive", "Blunt", "Overly Curious"]

    # Numerical variables are kept as the strings read from disk (that is how they
    # were stored in the JSON before); only purely numeric lines are accepted.
    num_messages = [entry for entry in read_txt_to_list("no_of_msgs.txt") if entry.isdecimal()]
    num_vague_messages = [entry for entry in read_txt_to_list("vague_msg_count.txt") if entry.isdecimal()]
    num_people = [2, 3, 4, 5]
    conversation_nature = ["Casual", "Serious", "Urgent", "Cryptic"]

    # Binary variables
    binary_flags = ["Yes", "No"]

    # Personality trait descriptions
    personality_descriptions = {
        "Overly Curious": "Asks too many questions, keeps pushing for more details.",
        "Paranoid": "Hesitant, speaks in short phrases, always worried about being watched or overheard.",
        "Evasive": "Frequently dodges questions, redirects conversations, and avoids clear statements.",
        "Blunt": "Speaks without filtering, often saying things in a way that could be misinterpreted.",
        "Neutral": "Speaks in a balanced, careful manner, rarely stands out in the conversation."
    }

    permutations = []

    for _ in range(num_permutations):
        num_participants = random.choice(num_people)
        participants = []
        for i in range(1, num_participants + 1):
            # Roughly half of the participants get a distinctive trait, the rest are "Neutral".
            curr_personality = random.choice(personality_traits) if random.random() < 0.5 else "Neutral"
            curr_description = personality_descriptions[curr_personality]
            participants.append(
                f"Person {i}: **{curr_personality}** - {curr_description}\n"
                f"  Mentioned in Suspicious Context? {random.choice(binary_flags)}\n"
                f"  Implicitly References an Action? {random.choice(binary_flags)}\n"
                f"  Uses Coded Language? {random.choice(binary_flags)}"
            )

        location = random.choice(locations)
        total_messages = random.choice(num_messages)
        # Bullet markers and surrounding quotes from the generated list are removed.
        incident = random.choice(vague_incidents).strip("* ").strip('"')

        # The prompt asks for "at least N vague messages" inside a conversation of
        # `total_messages` messages, so N must not exceed the total. Previously the two
        # values were drawn independently and could contradict each other.
        allowed_vague = [count for count in num_vague_messages if int(count) <= int(total_messages)]
        if not allowed_vague:
            # Every configured vague count is larger than the total: cap at the total.
            allowed_vague = [total_messages]

        permutation = {
            "location": location,
            "num_messages": total_messages,
            "num_people_in_conversation": num_participants,
            "vague_incident_type": incident,
            "num_vague_messages": random.choice(allowed_vague),
            "conversation_nature": random.choice(conversation_nature),
            "discussion_topic": random.choice(discussion_topics),
            "all_participants": "\n".join(participants)
        }
        permutations.append(permutation)

    # Save to JSON file
    list_of_dicts_to_json(permutations, output_file)

generate_permutations()


Converted the list of dictionaries into ambiguous_conversations.json


In [None]:
# permutations()

In [None]:
def json_to_list_of_dicts(jsonfile):
  """Parse ``jsonfile`` and return its top-level elements collected in a new list."""
  with open(jsonfile,"r") as handle:
    return [entry for entry in json.load(handle)]



In [None]:
permutations = json_to_list_of_dicts("ambiguous_conversations.json")

In [None]:
len(permutations)

200

In [None]:
permutations[0]

{'location': 'Flower shop',
 'num_messages': '33',
 'num_people_in_conversation': 5,
 'vague_incident_type': 'He took early retirement.',
 'num_vague_messages': '17',
 'conversation_nature': 'Serious',
 'discussion_topic': 'Relationships',
 'all_participants': 'Person 1: **Evasive** - Frequently dodges questions, redirects conversations, and avoids clear statements.\n  Mentioned in Suspicious Context? Yes\n  Implicitly References an Action? Yes\n  Uses Coded Language? Yes\nPerson 2: **Paranoid** - Hesitant, speaks in short phrases, always worried about being watched or overheard.\n  Mentioned in Suspicious Context? Yes\n  Implicitly References an Action? No\n  Uses Coded Language? No\nPerson 3: **Neutral** - Speaks in a balanced, careful manner, rarely stands out in the conversation.\n  Mentioned in Suspicious Context? Yes\n  Implicitly References an Action? Yes\n  Uses Coded Language? No\nPerson 4: **Neutral** - Speaks in a balanced, careful manner, rarely stands out in the conversati

In [None]:
prompt_content = """Location: {location}
Discussion Topic: {discussion_topic}
Conversation Nature: {conversation_nature}
Event: "{vague_incident_type}"

You are tasked with generating a **realistic, natural** conversation between {num_people_in_conversation} people in this setting.

### **Conversation Rules:**
- The conversation should feel **unscripted and authentic**, avoiding robotic or forced dialogue.
- Participants should **never explicitly state what happened**—only hint at it.
- **Tension and uncertainty** should be present, leaving the meaning **open to interpretation**.
- The conversation should include **coded language, half-sentences, and unfinished thoughts** where appropriate.

### **Participant Traits:**
Each participant has a **distinct personality** that influences how they speak:
{all_participants}

### **Construct a {num_messages}-message conversation with:**
- **At least {num_vague_messages} Vague/Indirect Messages**
- **Natural dialogue pacing (Dialogue should sound realistic and unplanned)**
- **Hidden tension between some participants**
- **Mixed levels of involvement** (some participants more engaged than others)
- **Make up natural-sounding names for participants**
- **Avoid excessive legal jargon** (conversations should feel natural)
- **Provide only the text messages—no additional comments or explanations.**
- **Leave gaps in information—avoid making things too obvious.**
- **Keep sentences short, indirect, and open-ended.**

Format STRICTLY as:
[Message 1] <Speaker Name>: "..."
[Message 2] <Speaker Name>: "..."
"""

# Persist the prompt template to disk and confirm that the file was written.
# NOTE(review): the template is written to an absolute path under /content, while prompt()
# opens the relative name "ambiguous_conversation_prompt.txt"; the two only refer to the
# same file when the working directory is /content — confirm.
filename = "/content/ambiguous_conversation_prompt.txt"
with open(filename, "w") as f:
  f.write(prompt_content)

print(f"File '{filename}' has been created successfully!!")

File '/content/ambiguous_conversation_prompt.txt' has been created successfully!!


In [None]:
def prompt(permutation):
    """Fill the saved prompt template with the values of one conversation setting.

    The template is read from "ambiguous_conversation_prompt.txt" on every call,
    and each placeholder is replaced by the entry of ``permutation`` stored
    under the same key.
    """
    placeholder_names = (
        "location",
        "discussion_topic",
        "conversation_nature",
        "vague_incident_type",
        "num_people_in_conversation",
        "num_messages",
        "num_vague_messages",
        "all_participants",
    )

    with open("ambiguous_conversation_prompt.txt", "r") as template_file:
        template = template_file.read()

    values = {name: permutation[name] for name in placeholder_names}
    return template.format(**values)

# NOTE(review): this call rewrites ambiguous_conversations.json with a fresh random draw,
# while the `permutations` list printed below is not reloaded here — confirm that
# regenerating the file at this point is intended.
generate_permutations()
print(prompt(permutations[0]))

Converted the list of dictionaries into ambiguous_conversations.json
Location: Flower shop
Discussion Topic: Relationships
Conversation Nature: Serious
Event: "He took early retirement."

You are tasked with generating a **realistic, natural** conversation between 5 people in this setting.

### **Conversation Rules:**
- The conversation should feel **unscripted and authentic**, avoiding robotic or forced dialogue.
- Participants should **never explicitly state what happened**—only hint at it.
- **Tension and uncertainty** should be present, leaving the meaning **open to interpretation**.
- The conversation should include **coded language, half-sentences, and unfinished thoughts** where appropriate.

### **Participant Traits:**
Each participant has a **distinct personality** that influences how they speak:
Person 1: **Evasive** - Frequently dodges questions, redirects conversations, and avoids clear statements.
  Mentioned in Suspicious Context? Yes
  Implicitly References an Action? Ye

In [None]:
# def prompt_string_formation(type_,data):
#   if type_=="victim":
#     filename = "/content/true_positives_victims.txt"
#     dictionary_variable_1 = "no_of_victims"
#     dictionary_variable_2="victim_{}"
#   else:
#     filename = "/content/true_positives_suspects.txt"
#     dictionary_variable_1 = "no_of_suspects"
#     dictionary_variable_2="suspect_{}"
#   with open(filename, "r") as f:
#     prompt_part = f.read()
#   str_=""
#   for i in range(1,data[dictionary_variable_1]+1):
#     if type_=="victim":
#       str_+= prompt_part.format(i,data[dictionary_variable_2.format(i)]["age"],data[dictionary_variable_2.format(i)]["gender"],data[dictionary_variable_2.format(i)]["occupation"],data[dictionary_variable_2.format(i)]["personality"])
#     else:
#       str_+=prompt_part.format(i,data[dictionary_variable_2.format(i)]["age"],data[dictionary_variable_2.format(i)]["gender"],data[dictionary_variable_2.format(i)]["physqiue"],data[dictionary_variable_2.format(i)]["occupation"],data[dictionary_variable_2.format(i)]["motivation"],random.choice(["low","medium","high"]))
#   return str_



In [None]:
# str_=prompt_string_formation("suspect",permutations[0])

In [None]:
# print(str_)

In [None]:
# def prompt(permutation):
#   with open("/content/true_postive_first_part.txt", "r") as f:
#     true_positives_first_part = f.read()
#   with open("/content/true_psoitives_last_part.txt", "r") as f:
#     true_positives_last_part = f.read()
#   true_positives_first_part = true_positives_first_part.format(random.randint(1,permutation["no_of_suspects"]),permutation["city"],permutation["location"],permutation["time"],permutation["no_of_messages"],permutation["no_of_people_in_conversation"],permutation["no_of_victims"],permutation["no_of_suspects"],permutation["murder_method"])
#   true_positives_victims = prompt_string_formation(type_="victim",data = permutation)
#   true_positives_suspects = prompt_string_formation(type_="suspect",data = permutation)
#   true_positives_last_part = true_positives_last_part.format(random.randint(1,permutation["no_of_suspects"]),permutation["no_of_suspects"]-1,random.randint(3,7),random.randint(3,7),permutation["time"],permutation["murder_method"], permutation["victim_1"]["occupation"],permutation["murder_method"],permutation["no_of_messages"],random.randint(1,4))
#   return true_positives_first_part+true_positives_victims+ true_positives_suspects+true_positives_last_part

In [None]:
# prompt(permutations[0])

In [None]:
def generate(prompt, gemini_model):
  """Send ``prompt`` to ``gemini_model`` and return the text of its response."""
  return gemini_model.generate_content(contents=prompt).text

In [None]:
! pip install mistralai

Collecting mistralai
  Downloading mistralai-1.5.1-py3-none-any.whl.metadata (29 kB)
Collecting eval-type-backport>=0.2.0 (from mistralai)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting jsonpath-python>=1.0.6 (from mistralai)
  Downloading jsonpath_python-1.0.6-py3-none-any.whl.metadata (12 kB)
Collecting typing-inspect>=0.9.0 (from mistralai)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect>=0.9.0->mistralai)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading mistralai-1.5.1-py3-none-any.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.3/278.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eval_type_backport-0.2.2-py3-none-any.whl (5.8 kB)
Downloading jsonpath_python-1.0.6-py3-none-any.whl (7.6 kB)
Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Downloading mypy_extensions-1.0.0-py3-n

In [None]:
import time
import csv
import random
from mistralai import Mistral

# Token tracking
tokens_used_last_minute = 0
token_limit_per_minute = 500000
safety_threshold = 5000  # Enforce pause before hitting the hard limit

def _parse_evaluation_answers(response_text):
    """Extract the two Yes/No verdicts from the evaluator's reply.

    Lines are matched by label instead of by position, so leading blank lines,
    numbering and markdown emphasis (e.g. ``**Ambiguity Maintained**: Yes``) are
    tolerated. Returns ``[ambiguity, realism]`` with each value being "Yes" or
    "No", or ``["Error", "Error"]`` when either verdict cannot be found.
    """
    verdicts = {"ambiguity maintained": None, "realistic dialogue": None}
    for raw_line in response_text.splitlines():
        label, separator, value = raw_line.replace("*", "").partition(":")
        if not separator:
            continue
        answer = value.strip().strip(".").strip().lower()
        if answer not in ("yes", "no"):
            # e.g. an echoed "Ambiguity Maintained: Yes or No" format line
            continue
        key = label.strip().lower()
        for name in verdicts:
            # The first valid answer for each label wins.
            if key.endswith(name) and verdicts[name] is None:
                verdicts[name] = answer.capitalize()
    if None in verdicts.values():
        return ["Error", "Error"]
    return [verdicts["ambiguity maintained"], verdicts["realistic dialogue"]]


def ask_mistral_ambiguous(conversation, prompt_text):
    """Sends an ambiguous conversation to Mistral AI while handling rate limits.

    Args:
        conversation: The generated dialogue to evaluate.
        prompt_text: The prompt that was used to generate ``conversation``.

    Returns:
        ``[ambiguity_answer, realism_answer]`` where each answer is "Yes" or "No",
        or ``["Error", "Error"]`` when the reply does not contain both verdicts.
    """

    global tokens_used_last_minute

    model = "mistral-large-latest"
    client = Mistral(api_key=mistral_key)

    evaluation_prompt = f"""
    You are an AI model evaluating a conversation for ambiguity and realism.

    Below is the **conversation prompt** that was used to generate the dialogue:
    {prompt_text}

    And here is the **generated conversation**:
    {conversation}

    Answer the following questions with only **"Yes"** or **"No"**, strictly following the format:

    **FORMAT:**
    Ambiguity Maintained: Yes or No
    Realistic Dialogue: Yes or No

    **QUESTIONS:**
    1️⃣ **Ambiguity Maintained** → Does the conversation leave room for multiple interpretations without making the meaning too obvious or too random?
    2️⃣ **Realistic Dialogue** → Does the conversation feel realistic and natural?

    **Your response MUST strictly follow this format (example response):**
    Ambiguity Maintained: Yes
    Realistic Dialogue: No
    """

    # Estimate tokens
    prompt_tokens = len(evaluation_prompt.split())  # Rough token count estimate (whitespace-separated words)
    response_tokens = 10  # Expected short response

    total_tokens = prompt_tokens + response_tokens

    # **Rate Limit Check: Full Pause if nearing the limit**
    # NOTE(review): the running total is compared against `safety_threshold` itself, and the
    # counter is only reset after a pause, never by elapsed time; `token_limit_per_minute`
    # is not consulted here — confirm this is the intended throttling rule.
    if tokens_used_last_minute + total_tokens > safety_threshold:
        print(f"🚨 Nearing Token Limit ({tokens_used_last_minute} tokens used). Pausing for 60s...")
        time.sleep(60)
        tokens_used_last_minute = 0  # Reset counter after full pause

    # Call Mistral API
    chat_response = client.chat.complete(
        model=model,
        messages=[{"role": "user", "content": evaluation_prompt}]
    )

    # Extract response
    response_text = chat_response.choices[0].message.content.strip()

    # Track used tokens
    tokens_used_last_minute += total_tokens

    # Label-based parsing replaces the positional split guarded by a bare `except:`,
    # which also swallowed KeyboardInterrupt and failed on harmless format deviations.
    return _parse_evaluation_answers(response_text)



In [None]:
import csv

def generate_ground_truth_ambiguous():
  """Evaluate every conversation in conversations.csv and store the verdicts.

  Each data row (prompt, conversation) is passed to ask_mistral_ambiguous; the
  call is followed by a 1.1 second wait, and after every tenth row a random
  5-10 second pause is added. Once all rows are processed, the collected
  verdicts are written to ambiguous_ground_truth.csv.
  """
  evaluated_rows = []

  with open("conversations.csv", "r", encoding="utf-8") as source:
    rows = csv.reader(source)
    next(rows)  # the first row is the header

    for position, row in enumerate(rows, start=1):
      prompt_text, conversation = row
      print(f"🔍 Evaluating Conversation {position}: {prompt_text[:50]}...")

      verdicts = ask_mistral_ambiguous(conversation, prompt_text)
      evaluated_rows.append([prompt_text, conversation, verdicts[0], verdicts[1]])

      # Stay below one request per second.
      time.sleep(1.1)

      # Extra randomized break after each batch of ten conversations.
      if position % 10 == 0:
        pause_time = random.randint(5, 10)
        print(f"⏸️ Pausing for {pause_time} seconds to avoid rate limits...")
        time.sleep(pause_time)

  header = ["prompt", "conversation", "ambiguity_maintained", "realistic_dialogue"]
  with open("ambiguous_ground_truth.csv", "w+", newline="", encoding="utf-8") as target:
    out = csv.writer(target)
    out.writerow(header)
    out.writerows(evaluated_rows)

  print("✅ Ground truth results saved in 'ambiguous_ground_truth.csv'!")

In [None]:
generate_ground_truth_ambiguous()

🔍 Evaluating Conversation 1: Location: Gas station
Discussion Topic: World Even...
🔍 Evaluating Conversation 2: Location: Pet store
Discussion Topic: Relationship...
🔍 Evaluating Conversation 3: Location: Electronics store
Discussion Topic: Trav...
🔍 Evaluating Conversation 4: Location: Dockyard
Discussion Topic: Social Issues...
🔍 Evaluating Conversation 5: Location: School hallway
Discussion Topic: World E...
🔍 Evaluating Conversation 6: Location: Museum
Discussion Topic: Technology
Conv...
🚨 Nearing Token Limit (4288 tokens used). Pausing for 60s...
🔍 Evaluating Conversation 7: Location: Tennis court
Discussion Topic: Relations...
🔍 Evaluating Conversation 8: Location: Jewelry store
Discussion Topic: Fitness
...
🔍 Evaluating Conversation 9: Location: Beach
Discussion Topic: Travel
Conversat...
🔍 Evaluating Conversation 10: Location: Bar
Discussion Topic: Travel
Conversatio...
⏸️ Pausing for 7 seconds to avoid rate limits...
🔍 Evaluating Conversation 11: Location: Football stadium
Di

In [None]:
! pip install pandas



In [None]:
import pandas as pd

def analyze_results():
  """Print Yes/No/Error counts and percentages from ambiguous_ground_truth.csv.

  If any of the four expected columns is missing, a warning is printed and no
  statistics are computed.
  """
  results = pd.read_csv("ambiguous_ground_truth.csv")

  required = ("prompt", "conversation", "ambiguity_maintained", "realistic_dialogue")
  if any(name not in results.columns for name in required):
    print("⚠️ Column names might be incorrect. Check CSV format.")
    return

  total_conversations = len(results)

  def tally(column):
    """Return {verdict: (count, percentage)} for one verdict column."""
    stats = {}
    for verdict in ("Yes", "No", "Error"):
      count = (results[column] == verdict).sum()
      stats[verdict] = (count, (count / total_conversations) * 100)
    return stats

  ambiguity = tally("ambiguity_maintained")
  realism = tally("realistic_dialogue")
  rule = "-----------------------------------------"

  print(f"**Evaluation Results for {total_conversations} Conversations**")
  print(rule)
  print(
    f"🔹 **Ambiguity Maintained**: Yes = {ambiguity['Yes'][0]} ({ambiguity['Yes'][1]:.2f}%)"
    f" | No = {ambiguity['No'][0]} ({ambiguity['No'][1]:.2f}%)"
    f" | Error = {ambiguity['Error'][0]} ({ambiguity['Error'][1]:.2f}%)"
  )
  print(
    f"🔹 **Realistic Dialogue**: Yes = {realism['Yes'][0]} ({realism['Yes'][1]:.2f}%)"
    f" | No = {realism['No'][0]} ({realism['No'][1]:.2f}%)"
    f" | Error = {realism['Error'][0]} ({realism['Error'][1]:.2f}%)"
  )
  print(rule)

In [None]:
analyze_results()

**Evaluation Results for 200 Conversations**
-----------------------------------------
🔹 **Ambiguity Maintained**: Yes = 199 (99.50%) | No = 0 (0.00%) | Error = 1 (0.50%)
🔹 **Realistic Dialogue**: Yes = 199 (99.50%) | No = 0 (0.00%) | Error = 1 (0.50%)
-----------------------------------------


In [None]:
def save_messages_as_csv():
  """Generate one conversation per saved setting and store them in conversations.csv.

  The settings are read from ambiguous_conversations.json. For each of them the
  prompt is built with prompt() and sent to the module-level ``gemini_model``
  through generate(). The resulting (prompt, conversation) pairs are written
  to conversations.csv with a header row.
  """
  with open("ambiguous_conversations.json", "r") as f:
      permutations = json.load(f)

  rows = []
  for num, permutation in enumerate(permutations):
      prompt_text = prompt(permutation)  # Generate the formatted prompt
      conversation = generate(prompt_text, gemini_model)  # Generate conversation
      rows.append([prompt_text, conversation])

      print(f"✅ Conversation {num+1} generated!")

  # Save to CSV with UTF-8 encoding
  with open("conversations.csv", "w+", newline='', encoding="utf-8") as f:
      writer = csv.writer(f)
      writer.writerow(["prompt", "conversation"])
      # csv.writer already quotes fields that contain newlines. The previous version
      # wrapped each conversation in literal triple quotes, which ended up as part of
      # the stored text that is later read back for evaluation.
      writer.writerows(rows)

  print("✅ conversations.csv is saved!")


In [None]:
save_messages_as_csv()

✅ Conversation 1 generated!
✅ Conversation 2 generated!
✅ Conversation 3 generated!
✅ Conversation 4 generated!
✅ Conversation 5 generated!
✅ Conversation 6 generated!
✅ Conversation 7 generated!
✅ Conversation 8 generated!
✅ Conversation 9 generated!
✅ Conversation 10 generated!
✅ Conversation 11 generated!
✅ Conversation 12 generated!
✅ Conversation 13 generated!
✅ Conversation 14 generated!
✅ Conversation 15 generated!
✅ Conversation 16 generated!
✅ Conversation 17 generated!
✅ Conversation 18 generated!
✅ Conversation 19 generated!
✅ Conversation 20 generated!
✅ Conversation 21 generated!
✅ Conversation 22 generated!
✅ Conversation 23 generated!
✅ Conversation 24 generated!
✅ Conversation 25 generated!
✅ Conversation 26 generated!
✅ Conversation 27 generated!
✅ Conversation 28 generated!
✅ Conversation 29 generated!
✅ Conversation 30 generated!
✅ Conversation 31 generated!
✅ Conversation 32 generated!
✅ Conversation 33 generated!
✅ Conversation 34 generated!
✅ Conversation 35 gener