In [None]:
import gc
import json
import os
import uuid
from heapq import nlargest

import boto3
import numpy as np
import openai
import speech_recognition as sr
import torch
import torchaudio
from openai import OpenAI
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from speechbrain.pretrained import SpeakerRecognition

In [None]:
# --- Credentials -------------------------------------------------------------
# Read the OpenAI key from the OPENAI_API_KEY environment variable so that no
# secret has to live in the notebook. The original "Key" placeholder is kept
# only as the fallback value, so behaviour is unchanged when the variable is
# not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "Key")

# --- AWS handles (both pinned to us-east-1) ----------------------------------
dynamodb_resource = boto3.resource("dynamodb", region_name="us-east-1")
dynamodb_client = boto3.client("dynamodb", region_name="us-east-1")

# --- Speaker-embedding model used by process_audio ---------------------------
voice_model = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_model")

# Minimum cosine similarity for two voice embeddings to count as the same
# persona. A value of 1 only accepts (numerically) identical embeddings.
SIMILARITY_THRESHOLD = 1
# Conversation-count limit for the per-persona temporary buffer.
CONVERSATION_LIMIT = 10
# Shared speech_recognition recogniser used by transcribe_audio.
recognizer = sr.Recognizer()

# Sentence-embedding model loaded at start-up.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [3]:
def process_audio(file):
    """Compute a voice embedding for the audio stored in ``file``.

    The waveform is loaded with torchaudio, averaged over its first dimension
    when it has more than one dimension, resampled to 16 kHz when the file uses
    a different rate, and then encoded by the module-level ``voice_model``.

    Returns:
        The detached, squeezed embedding tensor of the single encoded input.
    """
    waveform, rate = torchaudio.load(file)

    # Collapse the leading dimension only when there is one to collapse.
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=0)

    if rate != 16000:
        resampler = torchaudio.transforms.Resample(rate, 16000)
        waveform = resampler(waveform)

    # encode_batch receives a batch of one; take that single result back out.
    encoded = voice_model.encode_batch(waveform.unsqueeze(0))
    return encoded[0].squeeze().detach()

In [4]:
# === 📌 Step 2: Convert Speech to Text ===
def transcribe_audio(file):
    """Return the text recognised in the audio file ``file``.

    The whole file is read through ``sr.AudioFile`` and handed to the shared
    ``recognizer``'s ``recognize_google`` method. Any exception raised by the
    recogniser propagates to the caller.
    """
    with sr.AudioFile(file) as audio_source:
        recording = recognizer.record(audio_source)
        transcript = recognizer.recognize_google(recording)
    return transcript

In [5]:
def ensure_table_exists():
    """Create the ``persona_info`` DynamoDB table when it is missing.

    The table is probed with ``load()``. Only a ``ResourceNotFoundException``
    triggers creation; the function then blocks until the new table exists.
    Any other error propagates unchanged.
    """
    table_name = "persona_info"

    try:
        dynamodb_resource.Table(table_name).load()
    except dynamodb_resource.meta.client.exceptions.ResourceNotFoundException:
        # Table is absent: create it with ``persona_id`` (string) as partition key.
        dynamodb_resource.create_table(
            TableName=table_name,
            KeySchema=[{"AttributeName": "persona_id", "KeyType": "HASH"}],
            AttributeDefinitions=[{"AttributeName": "persona_id", "AttributeType": "S"}],
            ProvisionedThroughput={"ReadCapacityUnits": 5, "WriteCapacityUnits": 5},
        )
        # Block until the freshly created table is available.
        dynamodb_resource.Table(table_name).wait_until_exists()


In [6]:
def find_or_create_person(embedding, query_text):
    """Return the persona id matching a voice embedding, creating one if needed.

    Every item of the ``persona_info`` table is compared with ``embedding``.
    The first stored embedding whose cosine similarity reaches
    ``SIMILARITY_THRESHOLD`` wins and its ``persona_id`` is returned. When no
    item matches, a new persona is stored (fresh UUID, the embedding bytes and
    a persona summary extracted from ``query_text``) and its id is returned.

    Args:
        embedding: Voice embedding, either a torch tensor or anything NumPy
            can convert to an array.
        query_text: The user's query, used to seed the persona summary of a
            newly created persona.

    Returns:
        The ``persona_id`` string of the matched or newly created persona.
    """
    ensure_table_exists()  # Ensure the table exists
    person_info_table = dynamodb_resource.Table("persona_info")

    # Stored embeddings are decoded as float32 below, so normalise the incoming
    # one to a flat float32 array before comparing and before serialising.
    if hasattr(embedding, "detach"):
        embedding = embedding.detach().cpu().numpy()
    query_vector = np.ascontiguousarray(embedding, dtype=np.float32).reshape(-1)
    embedding_bytes = query_vector.tobytes()

    # A single scan() call returns at most one page of results; follow
    # LastEvaluatedKey so personas on later pages are compared as well.
    scan_kwargs = {}
    while True:
        page = person_info_table.scan(**scan_kwargs)

        for item in page.get("Items", []):
            if "voice_embedding" not in item:
                continue
            stored_embedding = np.frombuffer(bytes(item["voice_embedding"]), dtype=np.float32)

            if 1 - cosine(query_vector, stored_embedding) >= SIMILARITY_THRESHOLD:
                print(f"Found matching persona_id: {item['persona_id']}")
                # Return the existing persona_id if a match is found
                return item["persona_id"]

        last_key = page.get("LastEvaluatedKey")
        if not last_key:
            break
        scan_kwargs["ExclusiveStartKey"] = last_key

    # No stored voice matched: register a new persona.
    persona_id = str(uuid.uuid4())
    persona_summary = extract_persona_from_query(query_text, None)

    person_info_table.put_item(
        Item={
            "persona_id": persona_id,
            "voice_embedding": embedding_bytes,  # float32 bytes of the embedding
            "persona_summary": persona_summary
        }
    )
    return persona_id

In [7]:
def get_openai_embedding(text):
    """Embed ``text`` with OpenAI's ``text-embedding-ada-002`` model.

    Returns:
        A NumPy array built from the embedding of the first (only) result.
    """
    result = openai.embeddings.create(
        input=text,
        model="text-embedding-ada-002",
    )
    first_entry = result.data[0]
    return np.array(first_entry.embedding)

In [8]:
def extract_persona_from_query(query_text, stored_persona=None):
    """Ask ``gpt-3.5-turbo`` to build or refine a persona from a user query.

    Args:
        query_text: The new user query to mine for persona details.
        stored_persona: Previously stored persona text, if any. A falsy value
            is presented to the model as the literal text ``None``.

    Returns:
        The model's reply with surrounding whitespace stripped. The prompt asks
        for JSON, but the reply is returned as plain text and is not parsed.
    """
    previous_persona_text = stored_persona or "None"

    # The prompt lists the persona attributes and the JSON keys to report.
    prompt = f"""
    You are an AI that extracts and refines user personas from their queries. 
    The persona consists of the following attributes:
    
    1. **Demographics**: Age range, gender, location, and any other personal identifiers.
    2. **Career Information**: Job role, career industry, professional background, and ambitions.
    3. **Personality Traits**: Extroverted, introverted, goal-oriented, creative, analytical, etc.
    4. **Preferences**: Topics of interest, preferred interaction styles, hobbies, favorite activities, etc.
    5. **Decision-Making Style**: Logical, emotional, value-based decisions, etc.
    6. **Emotional Triggers**: Motivations, what excites or discourages them, dislikes inefficiency, etc.
    7. **Health and Lifestyle**: Any health-related information, fitness, diet, or lifestyle habits.
    8. **Challenges and Pain Points**: What obstacles or frustrations they face in daily life or work.
    9. **Goals and Aspirations**: What they want to achieve, both personally and professionally.
    10. **Current Context**: Current life situation (e.g., job change, relationship status, health concerns).
    
    **Existing Persona**: {previous_persona_text}

    **New User Query**: "{query_text}"

    **Instructions**:
    - Extract as much detailed information as possible from the user query.
    - If the persona is already available, refine it by adding new details.
    - If the persona is "None", create a new persona from scratch, including as much detail as you can.
    - The extracted persona should include clear and detailed attributes that give a deep understanding of the person.

    Provide the extracted persona in a structured JSON format with the following keys:
    - "demographics",
    - "career_information",
    - "personality_traits", 
    - "preferences",
    - "decision_making_style",
    - "emotional_triggers",
    - "health_and_lifestyle", 
    - "challenges",
    - "goals_and_aspirations",
    - "current_context",
    Be thorough, and include all possible relevant details in the persona.
    """

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    client = OpenAI(api_key=openai.api_key)
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        max_tokens=500,   # upper bound on the size of the reply
        temperature=0.7,  # sampling temperature
    )

    return completion.choices[0].message.content.strip()

In [9]:
def update_persona(person_id, query_text, embedding):
    """Refresh a stored persona with a new query and a new voice embedding.

    The stored voice embedding is replaced by the element-wise mean of itself
    and ``embedding``. The persona summary is re-extracted from ``query_text``
    on top of the stored summary. The whole item is then written back.

    Args:
        person_id: Key of the persona in the ``persona_info`` table.
        query_text: The new user query.
        embedding: New voice embedding, either a torch tensor or anything NumPy
            can convert to an array.

    Returns:
        The new persona summary, or ``None`` when ``person_id`` is unknown.
    """
    person_info_table = dynamodb_resource.Table("persona_info")

    response = person_info_table.get_item(Key={"persona_id": person_id})
    item = response.get("Item")
    if item is None:
        print(f"Persona with ID {person_id} does not exist.")
        return None

    # Work in plain float32 NumPy arrays instead of relying on implicit
    # tensor/ndarray arithmetic; float32 is the dtype the table is decoded with.
    if hasattr(embedding, "detach"):
        embedding = embedding.detach().cpu().numpy()
    current_embedding = np.ascontiguousarray(embedding, dtype=np.float32).reshape(-1)

    stored_bytes = item.get("voice_embedding")
    if stored_bytes is None:
        # Nothing stored yet to average with: keep the new embedding as is.
        new_embedding = current_embedding
    else:
        embedding_previous = np.frombuffer(bytes(stored_bytes), dtype=np.float32)
        new_embedding = ((current_embedding + embedding_previous) / 2).astype(np.float32)

    new_embedding_bytes = new_embedding.tobytes()
    stored_persona = item.get("persona_summary", None)

    new_persona = extract_persona_from_query(query_text, stored_persona)

    person_info_table.put_item(
        Item={
            "persona_id": person_id,
            "voice_embedding": new_embedding_bytes,  # Store the new embedding
            "persona_summary": new_persona  # Store the new persona summary
        }
    )

    return new_persona

In [10]:
def create_permanent_table_if_not_exists(person_id):
    """Create the ``permanent_<person_id>`` table when it does not exist yet.

    Existence is probed directly with ``Table.load()``, the same approach as
    ``ensure_table_exists``. Listing all tables is avoided because one
    ``list_tables`` call returns only a limited number of names, so an existing
    table could be missed once many per-persona tables exist.

    Args:
        person_id: Persona id used to build the table name.
    """
    table_name = f"permanent_{person_id}"
    table = dynamodb_resource.Table(table_name)

    try:
        table.load()
        return  # Table already exists, nothing to do.
    except dynamodb_resource.meta.client.exceptions.ResourceNotFoundException:
        pass

    print(f"Creating permanent table {table_name}...")
    dynamodb_resource.create_table(
        TableName=table_name,
        KeySchema=[{'AttributeName': 'summary_id', 'KeyType': 'HASH'}],
        AttributeDefinitions=[{'AttributeName': 'summary_id', 'AttributeType': 'S'}],
        ProvisionedThroughput={'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5}
    )
    # Block until the new table is available.
    table.wait_until_exists()


In [11]:
def create_temp_table_if_not_exists(person_id):
    """Create the ``temp_<person_id>`` table when it does not exist yet.

    Existence is probed directly with ``Table.load()``, the same approach as
    ``ensure_table_exists``. Listing all tables is avoided because one
    ``list_tables`` call returns only a limited number of names, so an existing
    table could be missed once many per-persona tables exist.

    Args:
        person_id: Persona id used to build the table name.
    """
    table_name = f"temp_{person_id}"
    table = dynamodb_resource.Table(table_name)

    try:
        table.load()
        return  # Table already exists, nothing to do.
    except dynamodb_resource.meta.client.exceptions.ResourceNotFoundException:
        pass

    dynamodb_resource.create_table(
        TableName=table_name,
        KeySchema=[{'AttributeName': 'query_id', 'KeyType': 'HASH'}],
        AttributeDefinitions=[{'AttributeName': 'query_id', 'AttributeType': 'S'}],
        ProvisionedThroughput={'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5}
    )
    # Block until the new table is available.
    table.wait_until_exists()


def store_in_temp_table(person_id, query, response):
    """Append one query/response exchange to the persona's temporary table.

    The ``temp_<person_id>`` table is created on demand. Each record gets a
    fresh UUID and stores the raw texts plus the bytes of the OpenAI embeddings
    of the stripped query and the stripped response.
    """
    create_temp_table_if_not_exists(person_id)  # make sure the target exists

    temp_table = dynamodb_resource.Table(f"temp_{person_id}")

    record_id = str(uuid.uuid4())
    query_vector = get_openai_embedding(query.strip())
    response_vector = get_openai_embedding(response.strip())

    record = {
        "query_id": record_id,
        "query": query,
        "response": response,
        "query_embedding": query_vector.tobytes(),
        "response_embedding": response_vector.tobytes(),
    }
    temp_table.put_item(Item=record)

In [12]:
# Function to clear the content of the temporary table
def clear_temp_table(person_id):
    """Delete every item from the persona's ``temp_<person_id>`` table.

    The scan follows ``LastEvaluatedKey`` so items beyond the first result page
    are removed too, and only the key attribute is fetched. Deletions go
    through a batch writer instead of one request per item.

    Args:
        person_id: Persona id used to build the table name.
    """
    temp_table_name = f"temp_{person_id}"
    temp_table = dynamodb_resource.Table(temp_table_name)

    # Fetch only the partition key; the name is aliased so the projection can
    # never collide with a DynamoDB reserved word.
    scan_kwargs = {
        "ProjectionExpression": "#qid",
        "ExpressionAttributeNames": {"#qid": "query_id"},
    }

    # Collect all keys first, then delete, so the scan is not racing deletions.
    query_ids = []
    while True:
        page = temp_table.scan(**scan_kwargs)
        query_ids.extend(item["query_id"] for item in page.get("Items", []))

        last_key = page.get("LastEvaluatedKey")
        if not last_key:
            break
        scan_kwargs["ExclusiveStartKey"] = last_key

    with temp_table.batch_writer() as batch:
        for query_id in query_ids:
            batch.delete_item(Key={'query_id': query_id})

In [13]:
def summarize_conversations_and_store(person_id):
    """Summarise buffered conversations and move them to the permanent table.

    Nothing happens until the persona's temporary table holds at least
    ``CONVERSATION_LIMIT`` exchanges. Once it does, all exchanges are
    concatenated, summarised by ``gpt-3.5-turbo``, split into sentence-like
    segments on ``'. '``, and each non-empty segment is stored with its
    embedding in ``permanent_<person_id>``. The temporary table is then cleared.

    Args:
        person_id: Persona id used to build both table names.
    """
    temp_table_name = f"temp_{person_id}"
    permanent_table_name = f"permanent_{person_id}"

    # Read the whole temporary table, following pagination.
    temp_table = dynamodb_resource.Table(temp_table_name)
    conversations = []
    scan_kwargs = {}
    while True:
        page = temp_table.scan(**scan_kwargs)
        conversations.extend(page.get("Items", []))
        last_key = page.get("LastEvaluatedKey")
        if not last_key:
            break
        scan_kwargs["ExclusiveStartKey"] = last_key

    # Use the shared constant rather than a second hard-coded copy of it.
    if len(conversations) < CONVERSATION_LIMIT:
        return

    summarized_content = " ".join(
        f"Query: {item['query']} Response: {item['response']}" for item in conversations
    )

    prompt = f"""
            You are an AI that specializes in summarizing conversations and persona data. 
            Given the following conversations, provide a comprehensive summary that contains 
            the key details. Ensure that the summary includes relevant queries, and responses, 
            while omitting unnecessary data. give me like a pragraph, can you make it as the person is speaking about
            themself 
            Here are the details of previous conversations:
            {summarized_content}   """

    client = OpenAI(api_key=openai.api_key)

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": prompt}],
        max_tokens=800,
        temperature=0.7
    )

    response_content = response.choices[0].message.content.strip()

    # Split into sentence-like segments and drop empty ones: an empty segment
    # carries no information and cannot be embedded meaningfully.
    segments = [part.strip() for part in response_content.split('. ')]
    segments = [part for part in segments if part]

    # Ensure the permanent table exists
    create_permanent_table_if_not_exists(person_id)
    permanent_table = dynamodb_resource.Table(permanent_table_name)

    # Store each segment, with its embedding, as a separate item.
    for segment in segments:
        permanent_table.put_item(
            Item={
                "summary_id": str(uuid.uuid4()),  # Unique ID for each segment
                "segment": segment,
                "embeddings_vector": get_openai_embedding(segment).tobytes(),
            }
        )

    # Clear the temporary table after moving the summary
    clear_temp_table(person_id)


In [14]:
# Function to calculate cosine similarity between two vectors
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two vectors.

    SciPy's ``cosine`` yields the cosine *distance*; similarity is its
    complement with respect to 1.
    """
    distance = cosine(embedding1, embedding2)
    return 1 - distance

In [15]:
def _scan_table_or_empty(table_name, label):
    """Return all items of ``table_name`` (following pagination), or [] if the table is missing."""
    table = dynamodb_resource.Table(table_name)
    items = []
    scan_kwargs = {}
    try:
        while True:
            page = table.scan(**scan_kwargs)
            items.extend(page.get("Items", []))
            last_key = page.get("LastEvaluatedKey")
            if not last_key:
                return items
            scan_kwargs["ExclusiveStartKey"] = last_key
    except dynamodb_resource.meta.client.exceptions.ResourceNotFoundException:
        print(f"{label} table {table_name} does not exist. Skipping...")
        return []


def _stored_embedding(item, attribute):
    """Decode the float64 embedding stored under ``attribute``, or None when it is absent."""
    raw = item.get(attribute)
    if raw is None:
        return None
    return np.frombuffer(bytes(raw), dtype=np.float64)


def _best_similarity(entry):
    """Ranking key: the larger of the query and response similarity (response may be None)."""
    response_similarity = entry['response_similarity']
    return max(entry['query_similarity'], response_similarity if response_similarity is not None else 0)


def retrieval_similarity(person_id, query, top_k=5):
    """Return the stored exchanges and summary segments most similar to ``query``.

    Items from ``temp_<person_id>`` (raw exchanges) and ``permanent_<person_id>``
    (summary segments) are scored by cosine similarity between the query
    embedding and their stored embeddings. Items whose embedding attribute is
    missing are skipped instead of aborting the retrieval.

    Args:
        person_id: Persona id used to build both table names.
        query: The text to compare against.
        top_k: Maximum number of results to return (defaults to 5).

    Returns:
        A list of at most ``top_k`` dicts, best match first, with the keys
        ``source`` ("temp" or "perm"), ``query``, ``query_similarity``,
        ``response_similarity`` and, for "temp" entries only, ``response``.
        An empty list is returned when there is nothing to compare.
    """
    temp_table_name = f'temp_{person_id}'
    permanent_table_name = f'permanent_{person_id}'

    conversations_temp = _scan_table_or_empty(temp_table_name, "Temporary")
    conversations_perm = _scan_table_or_empty(permanent_table_name, "Permanent")

    # Check if both tables are empty
    if not conversations_temp and not conversations_perm:
        print("Both the temporary and permanent tables are empty.")
        return []

    # Get the query embedding
    query_embedding = get_openai_embedding(query)
    similarities = []

    for item in conversations_temp:
        query_temp_embedding = _stored_embedding(item, 'query_embedding')
        response_temp_embedding = _stored_embedding(item, 'response_embedding')
        if query_temp_embedding is None or response_temp_embedding is None:
            continue  # incomplete record: nothing to score against

        similarities.append({
            "source": "temp",
            "query": item.get('query', ""),
            "response": item.get('response', ""),
            "query_similarity": calculate_cosine_similarity(query_embedding, query_temp_embedding),
            "response_similarity": calculate_cosine_similarity(query_embedding, response_temp_embedding)
        })

    for item in conversations_perm:
        segment_embedding = _stored_embedding(item, 'embeddings_vector')
        if segment_embedding is None:
            continue  # incomplete record: nothing to score against

        similarities.append({
            "source": "perm",
            "query": item.get('segment', ""),
            "query_similarity": calculate_cosine_similarity(query_embedding, segment_embedding),
            "response_similarity": None
        })

    if not similarities:
        print("No conversations found to compare.")
        return []

    # nlargest yields the same result as sorting in descending order and
    # slicing, without sorting the entire list.
    return nlargest(top_k, similarities, key=_best_similarity)


In [16]:
def response_function_llm(previous_info, person_description, query):
    """Generate a short, persona-aware reply to ``query`` with ``gpt-4-turbo``.

    Args:
        previous_info: Previously retrieved conversation data, interpolated
            into the prompt as is.
        person_description: Persona description, interpolated into the prompt.
        query: The new user query.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    prompt = f"""
        Imagine you're a close friend of the person, acting as an empathetic friend-agent. 
        You have the task of responding to the user's query in a way that aligns with the person's unique personality traits and past experiences. 
        Take into account their character, preferences, and previous conversation data to craft a thoughtful, personalized response. 

        **Existing Persona**:
         {person_description}
        
        **Previous Conversations**:
        {previous_info}
        
        **New User Query**:
        "{query}"
        
        Please provide a response that is **concise (within 3-4 lines only)** while still reflecting the person's personality and history.
    """

    # The system message repeats the length constraint given in the prompt.
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant. Keep responses short and concise, strictly 3-4 lines only."},
        {"role": "user", "content": prompt},
    ]

    client = OpenAI(api_key=openai.api_key)
    completion = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=chat_messages,
        max_tokens=250,  # caps the length of the reply
        temperature=0.7,
    )

    return completion.choices[0].message.content.strip()

In [17]:
def get_person_description_info_from_table(person_id):
    """Look up the stored persona summary for ``person_id``.

    Returns:
        The item's ``persona_summary`` (or the text "No description available"
        when the item has no such attribute), or ``None`` when no item exists
        for ``person_id``.
    """
    lookup = dynamodb_resource.Table("persona_info").get_item(Key={"persona_id": person_id})

    # Guard clause: unknown persona.
    if "Item" not in lookup:
        print(f"No persona found for person_id {person_id}")
        return None

    return lookup["Item"].get("persona_summary", "No description available")

In [18]:
#def handle_audio_query(audio_file):
def handle_audio_query(embeddings, text_query):
    """Run one query through the persona pipeline and return the reply.

    Steps: identify (or create) the speaker from the voice embedding, refresh
    the stored persona, retrieve similar past exchanges, generate a reply,
    buffer the exchange in the temporary table and summarise the buffer when
    it is due.

    Args:
        embeddings: Voice embedding of the speaker (precomputed by the caller).
        text_query: The already transcribed query text.

    Returns:
        A ``(person_id, response)`` tuple.
    """
    person_id = find_or_create_person(embeddings, text_query)  # identify speaker
    # Called for its side effect on the persona_info table; the returned
    # summary is not used further in this function.
    update_persona(person_id, text_query, embeddings)

    previous_info = retrieval_similarity(person_id, text_query)

    # NOTE(review): a fixed persona description is sent to the model instead of
    # the one stored for ``person_id`` -- confirm this is intentional.
    persona_description =  """
        Age: 25
        Occupation: AI researcher and consultant
        Personality Traits: Logical, analytical, patient, and goal-oriented.
        Interests: AI, psychology, technology, and problem-solving.
        Decision-Making Style: Data-driven, prefers structured analysis.
        Communication Style: Concise, structured, and professional.
        Values: Accuracy, transparency, and fairness.
        """

    response = response_function_llm(previous_info, persona_description, text_query)
    store_in_temp_table(person_id, text_query, response)
    summarize_conversations_and_store(person_id)
    gc.collect()

    return person_id, response

In [None]:
# Path of the JSON file holding the evaluation records read by the next cell.
file_name_final = "path_name.json"

In [20]:
import json

# Load the evaluation records. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default text encoding.
with open(file_name_final, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# `list_data` was used here without being defined anywhere in the notebook, so
# a fresh kernel failed with NameError. Default to every record index of
# `data`; narrow this list to process only a subset.
# NOTE(review): assumes `data` is a list indexed by integers (the index is also
# used as the torch seed) -- confirm against the JSON file.
list_data = list(range(len(data)))

persona_generated = []
person_info_table = dynamodb_resource.Table("persona_info")

for n in list_data:
    # Deterministic synthetic voice embedding per record: seeding with the
    # record index makes every question of one record share one "voice".
    torch.manual_seed(n)
    embedding = torch.randn(192)
    values = data[n]

    # Reset per-record state so nothing leaks in from the previous iteration.
    person_id = None
    stored_persona = None
    Generated_Orginal = []

    for m, question in enumerate(values['Questions']):
        person_id, response = handle_audio_query(embedding, question)
        orginal_response = values['Question_Answer'][m]
        Generated_Orginal.append([response, orginal_response])

    # Read back the persona extracted for this record; a record without
    # questions never produced a person_id and has nothing to look up.
    if person_id is not None:
        persona_values_table = person_info_table.get_item(Key={"persona_id": person_id})
        if "Item" in persona_values_table:
            item = persona_values_table["Item"]
            stored_persona = item.get("persona_summary", None)

    entry = {
        'ExtractedPersona' : stored_persona,
        'Generated_Orginal' : Generated_Orginal,
        'Persona_Original' : values['Persona_Discription'],
    }
    persona_generated.append(entry)