In [1]:
# %%
import pandas as pd
from pymongo import MongoClient
import json
import random
from tqdm import tqdm
import boto3
import certifi
import openai
from typing import List, Dict
import concurrent.futures
import os
import logging
from utils.prompts import PROMPT_GENERATE_TOPICS
from utils.request_utils import request_llm

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("httpx").setLevel(logging.WARNING)

# Constants
LANGUAGE = 'french'
BRAND_NAME = 'ditp_analysis'
BRAND_DESCR = 'Feedbacks are from French public services.'
MONGO_SECRET_ID = 'Prod/alloreview'
MONGO_REGION = 'eu-west-3'
MONGO_DATABASE = 'feedbacks_db'
MONGO_COLLECTION = 'feedbacks_Prod'
MODEL_NAME = 'gpt-4o-mini'  # To adjust as needed


In [2]:
# %%
def get_mongo_client():
    """
    Establishes a connection to the MongoDB client using credentials from AWS Secrets Manager or environment variables.
    """
    mongo_uri = os.getenv('MONGO_CONNECTION_STRING')
    if not mongo_uri:
        try:
            secrets_manager_client = boto3.client("secretsmanager", region_name=MONGO_REGION)
            secrets = json.loads(
                secrets_manager_client.get_secret_value(
                    SecretId=MONGO_SECRET_ID
                )["SecretString"]
            )
            password = secrets["mongodb"]["password"]
            mongo_uri = f"mongodb+srv://alloreview:{password}@feedbacksdev.cuwx1.mongodb.net"
        except Exception as e:
            print(f"Error fetching MongoDB credentials: {e}")
            raise
    return MongoClient(mongo_uri, tlsCAFile=certifi.where())

try:
    mongo_client = get_mongo_client()
    collection = mongo_client[MONGO_DATABASE][MONGO_COLLECTION]
except Exception as e:
    print(f"Error connecting to MongoDB: {e}")
    raise


In [3]:
def get_elementary_subjects_as_dataframe(collection, brand: str, sample_size=None):
    """
    Retrieves and counts elementary_subjects from feedbacks of a specific brand, returning the result as a DataFrame.

    :param collection: MongoDB collection object.
    :param brand: Name of the brand to filter feedbacks by.
    :param sample_size: Optional. If provided, randomly sample documents from the collection before aggregation.
    :return: A pandas DataFrame with elementary_subjects and their counts.
    """
    pipeline = [
        {"$match": {"brand": brand}},
        {"$unwind": "$extractions"},
        {"$project": {"elementary_subjects": "$extractions.elementary_subjects"}},
        {"$unwind": "$elementary_subjects"},
        {"$group": {"_id": "$elementary_subjects", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}}
    ]

    # Optional sampling stage if sample_size is provided
    if sample_size:
        pipeline.insert(1, {"$sample": {"size": sample_size}})

    # Run the aggregation pipeline
    result = list(collection.aggregate(pipeline))

    # Convert the result to a pandas DataFrame
    df = pd.DataFrame(result)
    df = df.rename(columns={"_id": "elementary_subjects", "count": "occurrences"})

    return df

# Example usage
elementary_subjects_df = get_elementary_subjects_as_dataframe(collection, BRAND_NAME, sample_size=80000)
elementary_subjects_df.head(5)


Unnamed: 0,elementary_subjects,occurrences
0,Suivi de dossier : Délai de traitement des dem...,166
1,Service : Incohérence des informations,133
2,Service : Frustration face aux démarches admin...,81
3,Accessibilité : Difficulté à obtenir un rendez...,78
4,Fonctionnement du site : Problèmes de connexion,50


In [4]:
topics = [
    "Les services > Qualité du service",
    "Les services > Rapidité de réponse",
    "Les services > Accessibilité des services",
    "Les services > Fiabilité des services",
    "Les services > Clarté des informations",
    
    "Les procédures > Complexité des démarches",
    "Les procédures > Exigences Documentaires",
    "Les procédures > Cohérence des procédures",
    "Les procédures > Rapidité des procédures",

    "Le personnel > Amabilité du personnel",
    "Le personnel > Disponibilité et compétence",
    "Le personnel > Empathie et écoute",

    "Site internet et applications > Facilité d'utilisation",
    "Site internet et applications > Disponibilité et fiabilité",
    "Site internet et applications > Accessibilité des services en ligne",
    "Site internet et applications > Ergonomie et fonctionnalités",
    "Site internet et applications > Fonctionnement et performance",

    "Les rendez-vous > Disponibilité des créneaux",
    "Les rendez-vous > Facilité de prise de rendez-vous",
    "Les rendez-vous > Notifications et rappels",
    "Les rendez-vous > Non honorés",

    "Les documents > Exactitude des documents délivrés",
    "Les documents > Délai de délivrance des documents",

    "Les versements et allocations > Délai de paiement",
    "Les versements et allocations > Exactitude des montants"
]


In [5]:
# Add only the first level before the first '>'
level1 = [topic.split('>')[0].strip() for topic in topics]

# Add only the second level if there are two or more levels
level2 = ['>'.join(topic.split('>')[:2]).strip() for topic in topics if '>' in topic]

# Add only the third level if there are three levels
level3 = ['>'.join(topic.split('>')[:3]).strip() for topic in topics if topic.count('>') >= 2]

# Remove duplicates by converting to a set
level1 = list(set(level1))
level2 = list(set(level2))
level3 = list(set(level3))

# Combine original topics with the new levels
topics = level1 + level2 + level3

# Mapping

Classify all the elementary_subject into topics.

In [7]:
prompt_classify = '''
**Context:**

The goal is to **classify** the given subject into a specific topic from the provided list.

**List of Subject:**
- {subject}

**List of Topics:**
- {topics}

**Instructions:**

1. **Format of Topics**: Topics can follow the structure of either:
   - **Two levels**: "level1 > level2" (e.g., "Customer Service > Responsiveness"`)
   - **Three levels**: "level1 > level2 : level3" (e.g., "Mobile Application > Features > Connection Issues").
2. **Selection Criteria**:
   - You can choose either:
     - A **full topic** with two or three levels.
     - OR **just a level1** (e.g., "Customer Service").
   - If there is **no perfect match**, the result should be "null" for the topic.

3. **Classification Process**:
   - **Step 1: Find the Closest Topics**:
     - Identify all topics that are most relevant to the given subject, even if they are not perfect matches.
     - List these closest topics with a brief explanation of their relevance.
   - **Step 2: Determine the Perfect Match**:
     - From the closest topics identified, select the topic that best fits the subject.
     - **Justify** why this is the most suitable match.
   - **Step 3: Handle Imperfect Matches**:
     - If no perfect match is found:
       - Try to select the most relevant **level1** category and provide it as the topic.
       - If even a relevant **level1** cannot be found, assign "null" as the topic.

**Examples of Previous Classifications**:
- {examples}

**Output**:

The output should be in JSON format:

{{
    "closest_ones": "<The most relevant topics are .... because...>",
    "justification": "<Your justification...>",
    "topic": "<topic>"
}}'''

In [8]:
def classify_subject(subject: str, topics: List[str], previous_classifications: Dict, model="gpt-4o-mini"):
    messages = [
        {"role" : "user", "content": prompt_classify.format(subject=subject, topics=topics, examples=previous_classifications)}
    ]

    response = ''
    try:
        response = request_llm(messages, model=model, max_tokens=1000)
        response = json.loads(response)

        return {
            'elementary_subject' : subject,
            'justification' : response.get('justification', ''),
            'mapping' : response.get('topic', '')
        }
    
    except Exception as e:
        print('classify_subject() : ', e)
        print('response : ', response)
        return {
            'elementary_subject' : subject,
        }

In [11]:
# Affiche le nombre total de sujets élémentaires dans le DataFrame
print(f"Nombre total de sujets élémentaires : {len(elementary_subjects_df)}")

# Filtre les sujets importants (ceux avec plus de 3 occurrences)
important_subjects = elementary_subjects_df[elementary_subjects_df['occurrences'] > 5]['elementary_subjects'].tolist()

# Récupère la liste de tous les sujets élémentaires
all_subjects = elementary_subjects_df['elementary_subjects'].tolist()

# Affiche le nombre de sujets importants
print(f"Nombre de sujets élémentaires avec plus de 3 occurrences (sujets importants) : {len(important_subjects)}")

Nombre total de sujets élémentaires : 503
Nombre de sujets élémentaires avec plus de 3 occurrences (sujets importants) : 105


### Example

In [12]:
elementary_subject = random.choice(important_subjects)

print(elementary_subject)

classify_subject(elementary_subject, topics, {}, model="claude-3-5-sonnet")
classify_subject(elementary_subject, topics, {}, model="gpt-4o-mini")

Service : Demande d'assistance pour problème de dossier
classify_subject() :  Expecting value: line 1 column 1 (char 0)
response :  Here's the classification for the given subject:

{
    "closest_ones": "The most relevant topics are 'Les services > Qualité du service' and 'Les services > Rapidité de réponse' because the subject mentions a request for assistance with a problem related to a file, which is directly related to service quality and response time.",
    "justification": "While both 'Les services > Qualité du service' and 'Les services > Rapidité de réponse' are closely related to the subject, 'Les services > Qualité du service' is the most suitable match. The subject 'Demande d'assistance pour problème de dossier' (Request for assistance with a file problem) is fundamentally about the quality of service provided. It encompasses not just the speed of response, but the overall ability of the service to address and resolve the issue with the file. The quality of service include

{'elementary_subject': "Service : Demande d'assistance pour problème de dossier",
 'justification': "The topic 'Les services > Clarté des informations' is the most suitable match as it directly addresses the need for assistance and clarity regarding the file issue mentioned in the subject. It emphasizes the importance of clear information in service delivery, which aligns with the request for assistance.",
 'mapping': 'Les services > Clarté des informations'}

1. Classify 20 first elementary_subject into topics with `claude-3-5-sonnet` to have strong and good examples.

2. Then classify the rest of the elementary_subject with `gpt-4o-mini` to finish the mapping.

In [14]:
examples = {}

for subject in important_subjects[:20]:
    res = classify_subject(subject, topics, examples, model="gpt-4o-mini")

    examples[subject] = res
    print(subject, '   --->    ',res.get('mapping'))

Suivi de dossier : Délai de traitement des demandes    --->     Les procédures > Rapidité des procédures
Service : Incohérence des informations    --->     Les services > Clarté des informations


In [62]:
import concurrent.futures
from tqdm import tqdm

def classify_parallel(
        elementary_subjects: List,
        topics: List,
        previous_classification: str,
        model='gpt-4o-mini',
        chunk_size=20,
):
    ''' 
    Run the classification of the extractions in parallel
        - Create n different clusters depending on the embeddings
        - For each cluster, launch the classification in parallel

    '''
    res = []


    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(
            classify_subject,
            elementary_subject,
            topics,
            previous_classification,
            model
            ) for elementary_subject in elementary_subjects]

        for i in tqdm(range(0, len(futures), chunk_size), desc="Processing chunks"):
            completed_futures, _ = concurrent.futures.wait(futures[i:i+chunk_size], return_when=concurrent.futures.ALL_COMPLETED)

            for future in completed_futures:
                prediction = future.result()
                res.append(prediction)

    
    return res


In [64]:
examples = {key : value['mapping'] for key, value in examples.items()}

Launch the following cell to classify all the elementary_subject into topics.

In [65]:
res = classify_parallel(all_subjects, topics, examples)

Processing chunks: 100%|██████████| 93/93 [04:40<00:00,  3.02s/it]


In [68]:
tmp = pd.DataFrame(res)

tmp.mapping.value_counts()

mapping
Communication avec les clients               237
Paiements et prélèvements                    177
Transmission de documents                    176
Mise à jour des informations personnelles    138
Gestion des contrats                         123
Rentes et versements                         118
Gestion de la retraite                       109
Informations sur les contrats                104
Rachat de contrat                             96
Suivi des dossiers                            86
Accès au compte en ligne                      60
Délais de traitement                          45
Transferts de fonds                           42
Procédures administratives                    39
Fiscalité des contrats                        39
Résiliation de contrat                        39
Documents fiscaux                             34
Rendez-vous et conseil                        30
Accès aux conseillers                         29
Déclaration de décès                          26
Épargne et i