In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
from mistralai import Mistral
import time
import json
load_dotenv()

In [None]:
api_key = os.getenv("MISTRAL_API_KEY")
mistral = "mistral-large-latest"
client = Mistral(api_key=api_key)

In [3]:
df = pd.read_csv("C:/projetsAlternance/projetMaster/rag/e-commerce_customer_support_conversations.csv")

In [14]:
from sklearn.metrics import adjusted_rand_score

def compute_ari(true_labels, predicted_labels):   
    ari = adjusted_rand_score(true_labels, predicted_labels)
    return ari

In [15]:
from sklearn.metrics import normalized_mutual_info_score

def compute_nmi(true_labels, predicted_labels):
    nmi = normalized_mutual_info_score(true_labels, predicted_labels)
    return nmi

In [16]:
from scipy.optimize import linear_sum_assignment
import numpy as np
def compute_hungarian_accuracy(true_labels, predicted_labels):
    true_labels = pd.factorize(true_labels)[0]  
    predicted_labels = pd.factorize(predicted_labels)[0]
    
    max_label = max(max(true_labels), max(predicted_labels)) + 1
    cost_matrix = np.zeros((max_label, max_label))

    for i in range(len(true_labels)):
        cost_matrix[true_labels[i], predicted_labels[i]] += 1

    row_ind, col_ind = linear_sum_assignment(cost_matrix, maximize=True)
    accuracy = cost_matrix[row_ind, col_ind].sum() / len(true_labels)
    return accuracy

# Données à exploiter

In [11]:
def conversation_summary(conversation):
    prompt = """
    You are an expert in summarizing conversations between a user and an assistant in an e-commerce support context.

    Your task is to analyze the conversation and provide:
    - A summary of the user's question.
    - The main steps of the solution proposed by the agent.

    Instructions:
    - Be concise and clear.
    - Do not add any extra commentary or explanation.
    - Return only the structured summary.

    Here are 2 examples:

    Example 1 (Input Conversation):
    Agent: Hello, thank you for contacting BrownBox support, how can I assist you?
    Customer: Hi, I'm trying to log into my account but it keeps asking for mobile verification, and I don’t receive any code.
    Agent: I’m sorry for the trouble. Could you confirm your registered email address instead?
    Customer: Sure, it’s johndoe@email.com.
    Agent: Thanks. I see your email is verified. I'll resend a verification code to your email.
    Customer: Okay, got it! I entered the code and now it works.

    Expected Output:
    User Question: The customer cannot log in to their account due to issues with mobile verification.
    Solution Steps:
    1. Verify the customer's email address.
    2. Send a verification code to the customer's email.
    3. Ask the customer to input the code to complete login.

    Example 2 (Input Conversation):
    Agent: Thank you for contacting BrownBox support. How can I help you today?
    Customer: I received an email asking me to ship back a monitor I bought last week. Why is that?
    Agent: Let me check. (After a moment) The monitor has been recalled due to a technical defect. We’ll send you a prepaid return label.
    Customer: Okay, and how do I send it back?
    Agent: Please pack the monitor securely and drop it off at any UPS location. You’ll get a full refund once we receive it.

    Expected Output:
    User Question: The customer was asked to return a purchased monitor and wants to know the reason and return process.
    Solution Steps:
    1. Inform the customer about the product recall due to a defect.
    2. Send a prepaid return label by email.
    3. Guide the customer to return the product via UPS.
    4. Process the refund after receiving the product.
    
    Please do the same with : 
    """
    prompt += f"Conversation : {conversation}"
    chat_response = client.chat.complete(
        model=mistral,
        messages=[
            {
                "role": "system",
                "content": prompt,
            },
        ],
        temperature=0.5
    )

    return chat_response.choices[0].message.content


In [None]:
conversations = []
conversations_area = {}

for idx, row in df.iterrows():
    conversation_text = row["conversation"]
    issue_area = row["issue_area"]
    resume = conversation_summary(conversation_text)
    print(resume)
    conversations.append(resume)
    if issue_area not in conversations_area:
        conversations_area[issue_area] = []
    conversations_area[issue_area].append(resume)
    time.sleep(3)

User Question: The customer is unable to log in to their account to purchase an Oven Toaster Grill (OTG) due to issues with mobile number or email verification.

Solution Steps:
1. Ask for the customer's registered mobile number or email address.
2. Verify the provided mobile number in the records.
3. If the mobile number is incorrect, ask for the email address.
4. Send a verification code to the customer's email.
5. Instruct the customer to enter the verification code to complete the login process.
User Question: The customer wants to know why they are being asked to return a computer monitor and seeks guidance on the return process.

Solution Steps:
1. Request the customer's order number.
2. Inform the customer about the product recall due to a technical issue.
3. Send a prepaid shipping label to the customer via email.
4. Guide the customer to print the label, pack the monitor, and drop it off at a UPS store.
5. Initiate the refund process upon receiving the product, with the refund

In [None]:
# Sauvegarder conversations_area dans un fichier JSON
with open("conversations_area.json", "w", encoding="utf-8") as f:
    json.dump(conversations_area, f, ensure_ascii=False, indent=4)


# Clustering

In [49]:
def extract_user_question_and_solution(resume_text):
    user_question = ""
    lines = resume_text.split('\n')
    for line in lines:
        if line.strip().lower().startswith("user question:"):
            user_question = line.replace("User Question:", "").strip()
    return user_question,

In [133]:
with open("conversations_area.json", "r", encoding="utf-8") as f:
    conversations_area = json.load(f)

texts = []    
labels = []   

def extract_user_question(summary_text):
    lines = summary_text.split('\n')
    for line in lines:
        if line.strip().lower().startswith("user question:"):
            return line.replace("User Question:", "").strip()
    return None 

for issue_area, summaries in conversations_area.items():
    for summary in summaries:
        user_question = extract_user_question(summary)
        if user_question:
            texts.append(user_question)
            labels.append(issue_area)

In [86]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts, show_progress_bar=True)

Batches: 100%|██████████| 28/28 [00:02<00:00, 12.29it/s]


In [87]:
from sklearn.cluster import KMeans

n_clusters = len(set(labels))  

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

In [88]:
nmi = compute_nmi(labels, cluster_labels)
acc = compute_hungarian_accuracy(labels, cluster_labels)
ari = compute_ari(labels, cluster_labels)

print(f"NMI : {nmi:.4f}")
print(f"Acc : {acc:.4f}")
print(f"ARI : {ari:.4f}")

NMI : 0.5442
Acc : 0.6253
ARI : 0.4264


  true_labels = pd.factorize(true_labels)[0]


# Amélioration du clustering

In [225]:
def extract_keywords(text):
    prompt = """
    You are an expert in extracting keywords from a sentence describing a user's query.

    Your task is to:
    - Analyze the following sentence carefully.
    - Extract the most relevant keywords that represent the user's real intention.
    - Focus only on keywords that would help classify the query into one of the following categories:
    'Login and Account', 'Order', 'Shopping', 'Cancellations and returns', 'Warranty', "Shipping'

    Instructions:
    - Be concise and clear.
    - Do not add any extra commentary or explanation.
    - Return only the keywords, separated by commas if multiple.

    Here are 6 examples:

    Example 1 :
    User Question : The customer wants to deactivate their account due to dissatisfaction with a purchased food processor.
    Keywords : deactivate account

    Example 2 :
    User Question: The customer wants to know why their electric kettle order is delayed at the nearest hub and has not been sent out for delivery.
    Keywords : delayed order, order not sent out for delivery 

    Example 3 :
    User Question: The customer wants to know if an account is needed to purchase a refrigerator, what additional information is required, and the return policy.
    Keywords : purchase a refrigerator, return policy 

    Example 4 : 
    User Question: The customer wants to cancel a recent order for a smartwatch.
    Keywords : cancel order

    Example 5 : 
    User Question: The customer wants to register their recently purchased air cooler with the brand CoolAir for warranty benefits.
    Keywords : register warranty benefits, recently purchased air cooler

    Example 6 : 
    User Question: The customer wants to know about the delivery charges for an External Hard Disk and the options for faster shipping.
    Keywords : delivery charges, faster shipping

    Please do the same with : 
    """
    prompt += f"{text}"
    chat_response = client.chat.complete(
        model=mistral,
        messages=[
            {
                "role": "system",
                "content": prompt,
            },
        ],
        temperature=0.5
    )

    return chat_response.choices[0].message.content

In [None]:
keywords_texts = []
for text in texts:
    keywords = extract_keywords(text)
    print(f"text : {text}\n keywords: {keywords}")
    keywords_texts.append(keywords)
    time.sleep(4)

text : The customer is unable to log in to their account to purchase an Oven Toaster Grill (OTG) due to issues with mobile number or email verification.
 keywords: log in, account, purchase
text : The customer is unable to log into their account due to exceeding the number of attempts to enter the correct verification code.
 keywords: login, account
text : The customer cannot log in to their account to purchase a Wet Grinder because the account has been deactivated.
 keywords: login account, purchase
text : The customer, Jane, is unable to verify her mobile number and email address to place an order for a Kitchen Chimney because she is not receiving the OTP or verification code.
 keywords: verify, mobile number, email address, place an order, OTP, verification code
text : The customer is unable to log into their account due to not receiving a verification code for a recent purchase.
 keywords: login, account, verification code
text : The customer needs to change the email ID linked to 

SDKError: API error occurred: Status 429
{"message":"Requests rate limit exceeded"}

In [92]:
len(keywords_texts)

382

In [94]:

start_idx = len(keywords_texts)  # Par exemple, 382

for text in texts[start_idx:]:  # Ne traiter que les textes restants
    keywords = extract_keywords(text)
    print(f"text : {text}\n keywords: {keywords}")
    print("-"*100)
    keywords_texts.append(keywords)
    time.sleep(4)  


text : The customer received a T-shirt in the wrong size and wants to return it for a replacement.
 keywords: return, replacement
----------------------------------------------------------------------------------------------------
text : The customer wants to know the status of their refund for a Cash on Delivery order and the reimbursement for courier charges for a returned toy.
 keywords: refund, Cash on Delivery order, reimbursement, courier charges, returned toy
----------------------------------------------------------------------------------------------------
text : The customer has not received their refund for a mobile returned for Cash on Delivery.
 keywords: refund, Cash on Delivery
----------------------------------------------------------------------------------------------------
text : The customer wants to return a recently purchased water purifier and get a refund.
 keywords: return, refund
---------------------------------------------------------------------------------

In [95]:
print(len(keywords_texts))

878


In [96]:
new_data = [{"sentence": orig, "keywords": key_phrase, "issue_area": label}
            for orig, key_phrase, label in zip(texts, keywords_texts, labels)]

In [97]:
with open("keywords_ecommerce_user_queries.json", "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=4)

# KMeans with sentence + keywords

In [134]:
import json

# Charger ton fichier JSON
with open("keywords_ecommerce_user_queries.json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [135]:
embeddings = []
true_labels = []

for item in data:
    sentence = item["sentence"].strip()
    keywords = item["keywords"].strip()
    print(keywords)
    issue_area = item["issue_area"]

    embedding_sentence = model.encode(sentence)
    embedding_keywords = model.encode(keywords)

    combined_embedding = np.concatenate((embedding_sentence, embedding_keywords))
    embeddings.append(combined_embedding)
    true_labels.append(issue_area)

log in, account, purchase
login, account
login account, purchase
verify, mobile number, email address, place an order, OTP, verification code
login, account, verification code
email ID, account
OTPs, mobile number and email verification, place an order
reactivate account, purchase a television
signing up, water purifier, US phone number
log into account, verification code
log in, BrownBox account, mobile app, mobile number, account password
sign up, corporate email ID
login, account, verify mobile number
change email ID, account
Login, account
login, account, verification code
Login and Account, password, BrownBox account
login, account, recently purchased
deactivate account
change password, update address
login, track order
deactivate account, cancel subscription
OTP, verification code, complete purchase
delivery address, change delivery date
log in, mobile verification
verify mobile number, email address, account login, verification process
verification code, mobile number, email, ve

In [136]:
from sklearn.cluster import KMeans

n_clusters = len(set(true_labels))  
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

In [137]:
nmi = compute_nmi(true_labels, cluster_labels)
acc = compute_hungarian_accuracy(true_labels, cluster_labels)
ari = compute_ari(true_labels, cluster_labels)

print(f"NMI : {nmi:.4f}")
print(f"Acc : {acc:.4f}")
print(f"ARI : {ari:.4f}")

NMI : 0.5551
Acc : 0.6321
ARI : 0.4234


  true_labels = pd.factorize(true_labels)[0]


# Pendant le clustering : K-LLMeans

In [315]:
from sklearn.metrics.pairwise import cosine_similarity

def get_embeddings(texts):
    embeddings = model.encode(texts)
    
    return embeddings

In [316]:
def generate_summary_with_llm(documents):

    prompt = """You are an expert in summarizing clusters of documents based on their underlying user intent. Below are examples of how to effectively summarize such clusters:
    Example 1:
    Document 1: "Artificial intelligence (AI) is a branch of computer science that focuses on creating intelligent machines capable of performing tasks that typically require human intelligence.
    Document 2: "Machine learning is a subset of AI that enables systems to learn from data and improve over time without being explicitly programmed.
    Document 3: "Deep learning is a further subset of machine learning, where neural networks are used to model complex patterns in large datasets.
    Summary: This cluster of documents discusses artificial intelligence (AI), with a focus on machine learning and deep learning technologies. The main theme is the development of AI systems and their ability to improve through data-driven learning processes.

    Example 2:
    Document 1: "Recommendation systems are widely used in e-commerce platforms to suggest products based on users' preferences and past behaviors.
    Document 2: "Collaborative filtering is a popular method in recommendation systems, which predicts a user's preferences based on the preferences of similar users.
    Document 3: "Content-based filtering is another technique, which suggests products based on the similarity between a user's previous interactions and the attributes of other products.
    Summary: This cluster of documents is focused on recommendation systems, detailing techniques like collaborative filtering and content-based filtering, which are used to personalize product suggestions for users based on their behavior and preferences.

    Now, based on the following documents, please generate a concise and informative summary of the cluster that captures its central theme and the key concepts discussed:"""
    
    prompt += "\n".join(documents)

    # Tokenizer le prompt et le passer au modèle
    chat_response = client.chat.complete(
        model= mistral,
        messages = [
            {
                "role": "system", "content": prompt,
            },
        ],
        temperature=0.5
    )
    return chat_response.choices[0].message.content

In [317]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def assign_clusters(embeddings, centroids):
    labels = []
    for emb in embeddings:
        similarities = cosine_similarity([emb], centroids)  # cosine_similarity attend des entrées sous forme de matrices
        labels.append(np.argmax(similarities))  # On sélectionne le centroïde avec la plus grande similarité
    return labels

In [319]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def mmr(doc_embeddings, centroid, k=5, lambda_param=0.5):
    doc_embeddings = np.array(doc_embeddings)
    centroid = np.array(centroid).reshape(1, -1)

    similarities = cosine_similarity(doc_embeddings, centroid).flatten()
    selected = []
    candidates = list(range(len(doc_embeddings)))

    k = min(k, len(doc_embeddings)) 
    
    for _ in range(k):
        if not selected:
            selected_idx = np.argmax(similarities)
        else:
            selected_similarities = cosine_similarity(doc_embeddings[candidates], doc_embeddings[selected])
            max_sim = np.max(selected_similarities, axis=1)
            mmr_score = lambda_param * similarities[candidates] - (1 - lambda_param) * max_sim
            selected_idx = candidates[np.argmax(mmr_score)]
        selected.append(selected_idx)
        candidates.remove(selected_idx)

    return selected

In [None]:
# K-Means avec mise à jour des centroids via LLM
def k_llmmeans(documents, n_clusters, update_iter, total_iter, emb_model_name):
    embeddings = get_embeddings(documents, emb_model_name)
    
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++')
    kmeans.fit(embeddings)
    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_

    for iter in range(total_iter):
        print(f"iter = {iter}")
        if iter % update_iter == 0:
            print(">> Updating centroids via LLM summarization")
            # Mettre à jour les centroïdes avec LLM
            for j in range(n_clusters):
                # Extraire les indices des documents dans le cluster j
                cluster_indices = [i for i in range(len(documents)) if kmeans.labels_[i] == j]
                if not cluster_indices:
                    continue  

                cluster_embeds = [embeddings[i] for i in cluster_indices]
                cluster_texts = [documents[i] for i in cluster_indices]
                print(f"cluster_texts : {cluster_texts}")
                top_k = min(5, len(cluster_texts))  # adapte si moins de 5 docs
                selected_idxs = mmr(cluster_embeds, centroids[j], k=top_k, lambda_param=0.5)
                print(f"selected_idxs : {selected_idxs}")
                cluster_docs = [cluster_texts[i] for i in selected_idxs]
                time.sleep(3)
                # Générer un résumé avec le LLM
                summary = generate_summary_with_llm(cluster_docs)
                print(f"summary : {summary}")
                print("-"*200)
                # Re-calculer l'embedding du résumé (nouveau centroïde)
                summary_embedding = get_embeddings([summary])[0]
                centroids[j] = summary_embedding
                
        else:
            print(">> Updating centroids via mean of cluster embeddings")
            # Mise à jour des centroïdes avec l'algorithme classique (moyenne des embeddings)
            for j in range(n_clusters):
                cluster_embeddings = [embeddings[i] for i in range(len(documents)) if kmeans.labels_[i] == j]
                if cluster_embeddings:
                    centroids[j] = np.mean(cluster_embeddings, axis=0)

        # Réassignation manuelle des documents aux clusters
        labels = assign_clusters(embeddings, centroids)
        
    return labels, centroids

In [None]:
n_clusters = 6
update_iter = 5
total_iter = 20
labels, centroids = k_llmmeans(texts, n_clusters, update_iter, total_iter)

# LLM Corrector 

In [121]:
import re
def get_outliers(vectors, model, threshold=1.09):
    centroids = model.cluster_centers_
    labels = model.labels_
    distances = np.linalg.norm(vectors - centroids[labels], axis=1)
    print(f"distances : {distances}")
    outlier_indices = np.where(distances > threshold)[0]

    outlier_clusters = {idx: labels[idx] for idx in outlier_indices}
    return outlier_clusters

def get_top_k_nearest_clusters(doc_vector, centroids, k=5):
    dists = np.linalg.norm(centroids - doc_vector, axis=1)
    return np.argsort(dists)[:k]

def get_top_n_representative_docs(cluster_id, vectors, labels, centroids, documents, n=3):
    cluster_indices = np.where(labels == cluster_id)[0]
    cluster_vectors = vectors[cluster_indices]
    dists = np.linalg.norm(cluster_vectors - centroids[cluster_id], axis=1)
    top_n_indices = cluster_indices[np.argsort(dists)[:n]]

    top_n_documents = [documents[i] for i in top_n_indices]
    return top_n_documents

def classify_with_llm(outlier_doc, candidate_clusters, representatives, llm_decision_function):
    prompt = """You are an expert in clustering user's query and reassigning outliers. 
        You will be given:
        - A document to reassign
        - 5 clusters, each represented 3 typical documents
        Your task is to analyze the content and intent of the document, compare it with the cluster examples, and determine which existing cluster it should belong to.
        Please provide only the selected cluster.\n\n"""

    for i, cluster_id in enumerate(candidate_clusters):
        prompt += f"\nCluster {cluster_id}:\n"
        for doc_text in representatives[cluster_id]:
            prompt += f"- {doc_text}\n"

    prompt += f"Document :\n{outlier_doc}...\n\n" 
    prompt += "Which cluster should this super-point be assigned to?"
    return llm_decision_function(prompt)

def reassign_outliers(documents, vectors, model, llm_decision_function, threshold=1.09):
    centroids = model.cluster_centers_
    labels = model.labels_
    outliers = get_outliers(vectors, model, threshold)
    for idx, cluster in outliers.items():
        print(f"Index : {idx}, Cluster actuel : {cluster}")
    new_assignments = {}
    for idx in outliers:
        print(f"idx: {idx}")
        doc_vector = vectors[idx]
        candidate_clusters = get_top_k_nearest_clusters(doc_vector, centroids)
        
        representatives = {
            cluster_id: get_top_n_representative_docs(cluster_id, vectors, labels, centroids, documents)
            for cluster_id in candidate_clusters
        }
        print(f"representatives : {representatives}")
        time.sleep(5)
        response = classify_with_llm(
            documents[idx],
            candidate_clusters,
            representatives,
            llm_decision_function
        )
        print(f"response = {response}")
        match = re.search(r'\d+', response)
        if match:
            new_label= int(match.group())
        else:
            # Gérer le cas où aucun chiffre n'est trouvé
            print(f"Aucun chiffre trouvé dans la réponse : {response}")
            new_label = labels[idx]
        print(f"new_label = {new_label}")
        new_assignments[idx] = new_label
    return new_assignments


def ask_llm(prompt):
    chat_response = client.chat.complete(
        model= mistral,
        messages = [
            {
                "role": "system", "content": prompt,
            },
        ],
        temperature=0.3
    )
    return chat_response.choices[0].message.content

In [320]:
texts_to_encode = []
true_labels = []

for item in data:
    sentence = item["sentence"].strip()
    keywords = item["keywords"].strip()
    issue_area = item["issue_area"]

    # Fusionner proprement la phrase et les mots-clés
    full_text = f"{sentence}. Keywords: {keywords}"

    texts_to_encode.append(full_text)
    true_labels.append(issue_area)

In [324]:
embeddings = []
true_labels = []

for item in data:
    sentence = item["sentence"].strip()
    keywords = item["keywords"].strip()
    print(keywords)
    issue_area = item["issue_area"]

    embedding_sentence = model.encode(sentence)
    embedding_keywords = model.encode(keywords)

    combined_embedding = np.concatenate((embedding_sentence, embedding_keywords))
    embeddings.append(combined_embedding)
    true_labels.append(issue_area)

log in, account, purchase
login, account
login account, purchase
verify, mobile number, email address, place an order, OTP, verification code
login, account, verification code
email ID, account
OTPs, mobile number and email verification, place an order
reactivate account, purchase a television
signing up, water purifier, US phone number
log into account, verification code
log in, BrownBox account, mobile app, mobile number, account password
sign up, corporate email ID
login, account, verify mobile number
change email ID, account
Login, account
login, account, verification code
Login and Account, password, BrownBox account
login, account, recently purchased
deactivate account
change password, update address
login, track order
deactivate account, cancel subscription
OTP, verification code, complete purchase
delivery address, change delivery date
log in, mobile verification
verify mobile number, email address, account login, verification process
verification code, mobile number, email, ve

In [122]:
kmeans = KMeans(n_clusters=6, random_state=42)
embeddings = np.array(embeddings)

labels = kmeans.fit_predict(embeddings)
new_labels = reassign_outliers(texts_to_encode, embeddings, kmeans, ask_llm)

distances : [0.86598784 0.8457926  0.96894056 0.9921958  0.8149337  1.0856361
 1.1037848  1.1390526  1.2039719  0.8424351  0.91920584 1.0806729
 0.81745607 1.1318024  0.8633884  0.871693   1.0535064  0.89455396
 1.1323743  1.1060301  1.0117623  1.2696261  1.1429253  1.0840361
 0.88031644 0.92283386 1.053856   1.1845218  1.1008222  0.8884023
 1.033838   1.030133   1.039778   0.8071612  1.1684961  0.8595369
 0.83262044 1.1139972  0.9399943  1.1691762  0.8150248  1.1167346
 1.1680425  0.9660141  0.9009145  1.0830128  1.181104   1.078987
 0.797804   0.9498617  1.1800249  0.93479204 1.2199817  1.149447
 0.82068616 1.1045375  1.2171335  1.1581997  1.1501284  1.0721343
 1.055107   1.1153238  1.2110457  1.1316873  1.1258774  1.0303996
 1.2244772  1.13291    0.93880856 0.8839518  1.0465913  0.98212856
 1.0396136  0.98700875 1.1795911  0.87319183 1.1446862  1.1057413
 1.0815748  1.0311825  1.1021491  1.0184273  1.1683295  0.8664008
 1.1334089  1.040722   0.9575665  0.78468245 0.9503544  1.150642

In [125]:
for idx, new_label in new_labels.items():
    labels[idx] = new_label

In [126]:
print(len(labels))

878


In [127]:
nmi = compute_nmi(true_labels, labels)
acc = compute_hungarian_accuracy(true_labels, labels)
ari = compute_ari(true_labels, labels)

print(f"NMI : {nmi:.4f}")
print(f"Acc : {acc:.4f}")
print(f"ARI : {ari:.4f}")

NMI : 0.5166
Acc : 0.6344
ARI : 0.4221


  true_labels = pd.factorize(true_labels)[0]


# Correspondance entre clusters prédits et vrais labels

In [138]:
import pandas as pd
from collections import Counter

df_clusters = pd.DataFrame({
    "cluster": cluster_labels,
    "issue_area": true_labels,
    "text": texts
})

cluster_to_issue_area = {}

for cluster_id in df_clusters["cluster"].unique():
    cluster_issues = df_clusters[df_clusters["cluster"] == cluster_id]["issue_area"]
    major_issue = Counter(cluster_issues).most_common(1)[0][0]
    cluster_to_issue_area[cluster_id] = major_issue

cluster_to_issue_area = {int(k): v for k, v in cluster_to_issue_area.items()}
print(cluster_to_issue_area)

{2: 'Login and Account', 1: 'Order', 3: 'Warranty', 4: 'Shopping', 0: 'Cancellations and returns', 5: 'Cancellations and returns'}


# Gestion de la base vectorielle 

In [31]:
import chromadb
chroma_client = chromadb.Client()

In [32]:
collection = chroma_client.create_collection(name="support_conversations")

In [33]:
data_to_insert = []
id_counter = 0

for issue_area, summaries in conversations_area.items():
    for summary in summaries:
        embedding = model.encode(summary).tolist()
        
        data_to_insert.append({
            "id": id_counter,
            "vector": embedding,
            "text": summary,
            "issue_area": issue_area
        })
        id_counter += 1

In [34]:
print(data_to_insert[0])

{'id': 0, 'vector': [-0.03778662532567978, 0.0682968869805336, 0.014040471985936165, -0.06348167359828949, -0.04375509172677994, -0.048149917274713516, 0.08049701154232025, -0.007005991414189339, 0.03040977567434311, -0.0369914248585701, 0.05229750648140907, -0.06696675717830658, 0.01796921342611313, 0.015532026067376137, 0.05584755912423134, -0.08752481639385223, -0.017668621614575386, -0.002230148995295167, 0.0019570800941437483, -0.021036965772509575, -0.02936643734574318, -0.036036476492881775, 0.03303857892751694, 0.010246298275887966, -0.020354855805635452, -0.13560491800308228, 0.013655802235007286, 0.061314765363931656, -0.06703097373247147, -0.009521306492388248, 0.06807677447795868, 0.06780168414115906, -0.02507895976305008, -0.012935212813317776, 0.05934719741344452, -0.06628163903951645, 0.032467182725667953, -0.007497522979974747, -0.051534004509449005, -0.02828315459191799, -0.0510740801692009, -0.061482179909944534, -0.018846405670046806, 0.00771982129663229, 0.073832295

In [35]:
documents = []
ids = []
metadatas = []
embeddings = []

for item in data_to_insert:
    documents.append(item["text"])
    ids.append(str(item["id"])) 
    metadatas.append({"issue_area": item["issue_area"]})
    embeddings.append(item["vector"])

In [36]:
collection.add(
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids
)

# Clustering par un LLM

In [213]:
def cluster_by_llm(text):
    prompt = """
    You are an expert in clustering user's query to 6 domains : 'Login and Account', 'Order', 'Shopping', 'Cancellations and returns', 'Warranty', "Shipping'

    Instructions:
    - Be concise and clear.
    - Do not add any extra commentary or explanation.
    - Return only the predicted cluster

    """
    prompt += f"{text}"
    chat_response = client.chat.complete(
        model=mistral,
        messages=[
            {
                "role": "system",
                "content": prompt,
            },
        ],
        temperature=0.5
    )

    return chat_response.choices[0].message.content

# Exemple d'exécution

In [274]:
def get_embedding(query):
    keywords = extract_keywords(query)
    print(keywords)
    query_emb = model.encode(query)
    keywords_emb = model.encode(keywords)
    emb = np.concatenate((query_emb, keywords_emb))
    emb = emb.astype(np.float64)
    emb = emb.reshape(1, -1)
    return emb

In [271]:
def get_clusters(query, embedding):
    cluster_llm = cluster_by_llm(query)
    predicted_cluster = kmeans.predict(embedding)[0]
    cluster_kmeans = cluster_to_issue_area[int(predicted_cluster)]

    return cluster_llm, cluster_kmeans

In [272]:
def get_documents(query, cluster_llm, cluster_kmeans):
    if cluster_llm == cluster_kmeans: 
        results = collection.query(
            query_texts=query,
            where={"issue_area": cluster_llm},
            n_results=3,
            include=["documents", "metadatas", "distances"]
        ) 
        documents_to_pass = results["documents"][0]
    else : 
        results_llm = collection.query(
            query_texts=query,
            where={"issue_area": cluster_llm},
            n_results=3,
            include=["documents", "metadatas", "distances"]
        ) 
        results_kmeans = collection.query(
            query_texts=query,
            where={"issue_area": cluster_kmeans},
            n_results=3,
            include=["documents", "metadatas", "distances"]
        ) 
        documents_to_pass = results_llm["documents"][0] + results_kmeans["documents"][0]
    
    return documents_to_pass


In [273]:
def get_response(query, documents): 
    system_prompt = (
        "You are an expert assistant in answering user queries related to 'Login and Account', 'Order', "
        "'Shopping', 'Cancellations and returns', 'Warranty', and 'Shipping'.\n\n"
        "You must carefully analyze the user's question and provide a clear, helpful answer based on the following past experiences (examples) provided.\n"
        "Use these documents to understand the user's need and the appropriate steps to assist.\n\n"
        "Documents:\n"
        f"{documents}\n\n"
        "Instructions:\n"
        "- These documents are just examples, do not answer exactly the same way (same objects)"
        "- Be concise and clear.\n"
        "- Do not add any extra commentary or explanation.\n"
        "- Provide a direct answer to the user's question based on the given experiences.\n"
    )

    chat_response = client.chat.complete(
        model=mistral,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        temperature=0.7
    )
    return chat_response.choices[0].message.content

In [312]:
query = "I want to deactivate my account and remove the product from my order history."
embedding = get_embedding(query)
time.sleep(2)
cluster_llm, cluster_kmeans = get_clusters(query, embedding)
documents = get_documents(query, cluster_llm, cluster_kmeans)
time.sleep(2)
response = get_response(query, documents)

deactivate account, order history


In [313]:
print(f"cluster_llm: {cluster_llm}\ncluster_kmeans: {cluster_kmeans}")

cluster_llm: Login and Account
cluster_kmeans: Login and Account


In [314]:
print(response)

I'm sorry to hear that you want to deactivate your account. Let's proceed with your request. Could you please provide your email address and the order number.
