In [None]:
import yaml  
from typing import Any  
import os  
from sqlalchemy.ext.declarative import declarative_base  
from sqlalchemy.orm import sessionmaker, relationship  
from datetime import datetime  
import random  
from dotenv import load_dotenv  
from openai import AsyncAzureOpenAI, AzureOpenAI 
from pathlib import Path  
import json  
from scipy import spatial  # for calculating vector similarities for search  
import json  
import matplotlib.pyplot as plt  
from collections import Counter 
# Load YAML file  
def load_entity(file_path, entity_name):
    """Return the agent entry called ``entity_name`` from a YAML config file.

    The file is expected to hold a top-level ``agents`` list whose items are
    mappings with a ``name`` key.

    Args:
        file_path: Path of the YAML file to read.
        entity_name: Value of the ``name`` key to look for.

    Returns:
        The first agent mapping whose ``name`` equals ``entity_name``, or
        ``None`` when nothing matches. An empty file, a document that is not
        a mapping, or a missing / non-list ``agents`` entry also yields
        ``None`` instead of raising.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = yaml.safe_load(file)

    # safe_load gives None for an empty document, so guard before indexing.
    candidates = data.get('agents') if isinstance(data, dict) else None
    if not isinstance(candidates, list):
        return None

    for entity in candidates:
        # Skip stray scalars in the list; only mappings can carry a name.
        if isinstance(entity, dict) and entity.get('name') == entity_name:
            return entity
    return None
  
# Read the backend's .env file so the Azure OpenAI settings land in os.environ.
env_path = Path('../app/backend', '.env')
load_dotenv(dotenv_path=env_path)

# Azure OpenAI client configured purely from environment variables.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)
chat_deployment = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT")

# Map agent name -> domain description, keeping only agents that declare a
# (truthy) domain_description in the backend prompt file.
agents = {}
with open('../app/backend/prompt.yaml', 'r') as file:
    data = yaml.safe_load(file)
    for entity in data['agents']:
        description = entity.get('domain_description')
        if not description:
            continue
        agents[entity.get('name')] = description
# Prompt sent verbatim as the single user message of every generation call.
# The example block is valid JSON (double-quoted keys, "\n" escapes inside
# strings, closed code fence) so the model is shown exactly the structure the
# parsing loop reads: a top-level "training_data" list whose items carry
# "conversation_transcript", "current_domain" and "intent_shift".
prompt = """
### Updated Prompt

I am developing a model for intent change detection within customer service conversations. The goal is to identify when a customer's intent shifts to a different domain than the one currently being handled by the agent.

### Information on all possible domains that agents can handle:

- **hotel_agent**: "Deal with hotel reservations, confirmations, changes, and general hotel policy questions."
- **flight_agent**: "Deal with flight reservations, confirmations, changes, and general airline policy questions."
- **car_rental_agent**: "Deal with car rental reservations, confirmations, changes, and general car rental policy questions."
- **general_agent**: "Deal with general customer inquiries, complaints, and requests."

To achieve this, the model requires labeled training data. Please generate labeled training data for this model. Each data point should include:

- **Conversation Transcripts**: The last few turns of conversations between customers and agents. Note that the conversation might not start from the beginning, and the user's transcript might be streaming with partial outputs. The conversation should have around 3-5 turns.
- **Current Domain**: The domain the current agent is addressing.
- **A label** indicating whether the customer's intent has shifted to a different domain or remains the same, with possible values being "no_change" or the name of the new domain.

This data will be used to train the model to accurately detect intent changes in real-time customer service interactions.
This time, I need to generate a lot more examples in "no_change", "general_agent" and "car_rental_agent" categories so focus only on these.

Return a single JSON object with one key, "training_data", whose value is the list of data points, exactly as in the example below.

### Example Training Data in JSON Format

```json
{
    "training_data": [
        {
            "conversation_transcript": [
                "agent: How can I assist you with your flight today?",
                "user: yesuser: I want to..uh",
                "user: book a new flight"
            ],
            "current_domain": "flight_agent",
            "intent_shift": "no_change"
        },
        {
            "conversation_transcript": [
                "user: ...and I also need to rent a car for my trip.",
                "agent: I can help you with the car rental reservation.",
                "user: great, I'd like to book a car for next week."
            ],
            "current_domain": "car_rental_agent",
            "intent_shift": "no_change"
        },
        {
            "conversation_transcript": [
                "agent: Certainly! Here are your flight details:\\n- **Flight Number:** AA423\\n- **Departure Airport:** Airport A\\n- **Arrival Airport:** Airport B\\n- **Departure Time:** 21:00 on November 19, 2022\\n- **Arrival Time:** 23:00 on November 19, 2022\\n- **Seat Number:** 12A\\n- **Ticket Class:** Economy\\n- **Gate:** G5\\n- **Ticket Number:** 1602534303",
                "user: Thank you, can I change to a new date? ."
            ],
            "current_domain": "flight_agent",
            "intent_shift": "no_change"
        }
    ]
}
```
"""
def generate_training_data():
    """Request one batch of synthetic examples from the chat deployment.

    Sends the module-level ``prompt`` as a single user message through the
    module-level ``client``, targeting ``chat_deployment`` and asking for a
    JSON-object response.

    Returns:
        The ``content`` of the first choice's message, exactly as the API
        returned it (not parsed here).
    """
    request = {
        "model": chat_deployment,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.9,
        "response_format": {"type": "json_object"},
    }
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

def _is_valid_example(example, known_agents):
    """Return True when ``example`` has the expected keys, types and labels.

    A valid example is a mapping with a non-empty list of string turns under
    ``conversation_transcript``, a ``current_domain`` that is a known agent,
    and an ``intent_shift`` that is either ``"no_change"`` or a known agent.
    """
    if not isinstance(example, dict):
        return False

    transcript = example.get("conversation_transcript")
    # The transcript is joined with "\n" when written out, so every turn must
    # be a string and there must be at least one turn.
    if not isinstance(transcript, list) or not transcript:
        return False
    if not all(isinstance(turn, str) for turn in transcript):
        return False

    current_domain = example.get("current_domain")
    intent_shift = example.get("intent_shift")
    # Require strings: an unhashable label (e.g. a list) would make the
    # membership tests below raise instead of simply failing validation.
    if not isinstance(current_domain, str) or not isinstance(intent_shift, str):
        return False

    return current_domain in known_agents and (
        intent_shift == "no_change" or intent_shift in known_agents
    )


output_data = []
# Make 100 generation calls. Each response is expected to be a JSON object
# with a "training_data" list holding several examples; only examples that
# pass validation are kept, and one bad example no longer discards the rest
# of its batch.
for i in range(100):
    data = generate_training_data()
    try:
        data = json.loads(data)
        batch = data['training_data']
    except (TypeError, ValueError, KeyError) as e:
        # Empty, unparseable or wrongly shaped response: report and move on.
        print(e)
        continue
    if not isinstance(batch, list):
        print(f"Skipping response {i + 1}: 'training_data' is not a list")
        continue

    kept = 0
    for d in batch:
        if _is_valid_example(d, agents):
            output_data.append(d)
            kept += 1
        else:
            # Show rejected examples so prompt problems are visible.
            print(d)
    # One summary line per call instead of dumping the whole payload.
    print(f"call {i + 1}: kept {kept}/{len(batch)} examples")

# Append one JSON line per accepted example; existing lines in the file are
# kept because the file is opened in append mode.
with open('training_data_v2.jsonl', mode='a') as sink:
    for example in output_data:
        turns = '\n'.join(example['conversation_transcript'])
        # Header line with the current domain, then the transcript itself.
        text = "##current_domain:{}\n##conversation:\n{}".format(
            example['current_domain'], turns
        )
        record = dict(conversation=text, intent_shift=example["intent_shift"])
        print(json.dumps(record), file=sink)
from collections import Counter  
  
 
  
# Collect the intent_shift label of every record in the JSONL file
intent_shifts = []
with open('training_data_v2.jsonl', 'r') as file:
    intent_shifts.extend(json.loads(line)['intent_shift'] for line in file)

# Tally how often each label occurs
intent_shift_counts = Counter(intent_shifts)

# Unzip the (label, count) pairs into parallel sequences for the bar chart
labels, values = zip(*intent_shift_counts.items())

# Bar chart of the label distribution
plt.figure(figsize=(10, 5))
plt.bar(labels, values, color='skyblue')
plt.title('Distribution of Intent Shifts')
plt.xlabel('Intent Shift')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()


### New version of the training data: each conversation is labeled with the detected intent (a domain name) instead of an intent shift



In [4]:
import json  
  
def process_jsonl(input_file, output_file):
    """Convert "intent shift" records into "intent" records (JSON Lines).

    Each non-blank input line is parsed as a JSON object. A record whose
    ``conversation`` starts with ``##current_domain:<domain>`` and contains
    the ``##conversation:`` marker line is rewritten:

    * the header is removed from ``conversation`` and the rest is stripped of
      surrounding whitespace;
    * the ``intent_shift`` key is renamed to ``intent``;
    * an ``intent`` of ``"no_change"`` is replaced by ``<domain>``.

    Records without that header, or whose header is not followed by the
    marker, are written unchanged rather than being sliced at a bogus offset.
    Blank lines are skipped.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write. It is truncated while
            the input is still open, so it must differ from ``input_file``.

    Raises:
        KeyError: If a record has no ``conversation`` key, or a record being
            rewritten has no ``intent_shift`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    domain_prefix = "##current_domain:"
    conversation_marker = "\n##conversation:\n"

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            record = json.loads(line)
            conversation = record['conversation']

            if conversation.startswith(domain_prefix):
                header, marker, body = conversation.partition(conversation_marker)
                # Only rewrite when the marker really is present; str.find()
                # based slicing would silently mangle the text otherwise.
                if marker:
                    # The domain is whatever follows the prefix on the first line.
                    domain = header[len(domain_prefix):].split("\n", 1)[0]
                    record['conversation'] = body.strip()

                    # Rename "intent_shift" to "intent"; "no_change" means the
                    # intent is still the current domain.
                    record['intent'] = record.pop('intent_shift')
                    if record['intent'] == "no_change":
                        record['intent'] = domain

            # Write the (possibly updated) record as one JSON line.
            json.dump(record, outfile)
            outfile.write('\n')
  
# Convert the "intent_shift" records into "intent" records
process_jsonl('training_data_v2.jsonl', 'all_training_data_v2.jsonl')


import json
from sklearn.model_selection import train_test_split

# Load the converted records from the JSONL file
output_data = []
with open('all_training_data_v2.jsonl', 'r') as file:
    for line in file:
        output_data.append(json.loads(line))

# Extract labels for stratification
labels = [d['intent'] for d in output_data]

# Perform a stratified 80/20 split. random_state is fixed so that re-running
# the cell reproduces the same train/validation partition.
train, test = train_test_split(
    output_data, test_size=0.2, stratify=labels, random_state=42
)

# Verify the split: every record must land in exactly one of the two sets
assert len(train) + len(test) == len(output_data)
print(f"Training set size: {len(train)}")
print(f"Test set size: {len(test)}")

# Write the train data to training_data_v2.jsonl
# NOTE(review): this overwrites 'training_data_v2.jsonl', which is also the
# input of the process_jsonl call at the top of this cell. Re-running the cell
# therefore starts from the previous train split only — confirm this is
# intended, or write the split to a different file name.
with open('training_data_v2.jsonl', mode='w') as file:
    for d in train:
        file.write(json.dumps(d) + '\n')

# Write the test data to validation_data_v2.jsonl
with open('validation_data_v2.jsonl', mode='w') as file:
    for d in test:
        file.write(json.dumps(d) + '\n')


Training set size: 2602
Test set size: 651


In [61]:
import json  
from sklearn.model_selection import train_test_split  
  
# Load data from the JSONL file
# NOTE(review): this cell reads the 'intent_shift' key, so it needs
# 'training_data_v2.jsonl' in its original form. The split cell that writes
# 'validation_data_v2.jsonl' replaces that file with records keyed by
# 'intent', after which the label lookup below raises KeyError — confirm the
# intended execution order.
output_data = []
with open('training_data_v2.jsonl', 'r') as file:
    for line in file:
        output_data.append(json.loads(line))

# Extract labels for stratification
labels = [d['intent_shift'] for d in output_data]

# Perform a stratified 80/20 split. random_state is fixed so that re-running
# the cell reproduces the same train/validation partition.
train, test = train_test_split(
    output_data, test_size=0.2, stratify=labels, random_state=42
)

# Verify the split: every record must land in exactly one of the two sets
assert len(train) + len(test) == len(output_data)
print(f"Training set size: {len(train)}")
print(f"Test set size: {len(test)}")

# Write the train data to training_data.jsonl
with open('training_data.jsonl', mode='w') as file:
    for d in train:
        file.write(json.dumps(d) + '\n')

# Write the test data to validation_data.jsonl
with open('validation_data.jsonl', mode='w') as file:
    for d in test:
        file.write(json.dumps(d) + '\n')


Training set size: 2602
Test set size: 651


In [34]:
# # Create documents and label file  
# os.makedirs("documents", exist_ok=True)  
# documents = []  
  
# for i, d in enumerate(output_data):  
#     location = f"documents/{str(i).zfill(3)}.txt"  
#     transscript = '\n'.join(d['conversation_transcript'])
#     conversation = f"##current_domain:{d['current_domain']}\n##conversation:\n{transscript}"  
#     with open(location, 'w') as file:  
#         file.write(conversation)  
      
#     category = d["intent_shift"]  
#     documents.append({  
#         "location": f"{str(i).zfill(3)}.txt",  
#         "language": "en-us",  
#         "class": {  
#             "category": category  
#         }  
#     })  
  
# label_data = {  
#     "projectFileVersion": "2022-05-01",  
#     "stringIndexType": "Utf16CodeUnit",  
#     "metadata": {  
#         "projectName": "IntentChangeDetection",  
#         "storageInputContainerName": "example-data",  
#         "projectKind": "CustomSingleLabelClassification",  
#         "description": "Intent change detection in customer service",  
#         "language": "en",  
#         "multilingual": False,  
#         "settings": {}  
#     },  
#     "assets": {  
#         "projectKind": "CustomSingleLabelClassification",  
#         "classes": [{"category": agent} for agent in agents.keys()] + [{"category": "no_change"}],  
#         "documents": documents  
#     }  
# }  
  
# with open('documents/label_document.json', 'w') as file:  
#     json.dump(label_data, file, indent=2)  
