In [1]:
import pandas as pd
from pymongo import MongoClient
from utils.request_utils import request_llm
from typing import List, Dict
import json
import random
from tqdm import tqdm
import boto3
import certifi

In [2]:
_secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-3")

_secrets = json.loads(
    _secrets_manager_client.get_secret_value(
        SecretId=f"Prod/alloreview"
    )["SecretString"]
)
MONGO_CONNECTION_STRING = (
    "mongodb+srv://alloreview:{}@feedbacksdev.cuwx1.mongodb.net".format(
        _secrets["mongodb"]["password"]
    )
)
mongo_client = MongoClient(MONGO_CONNECTION_STRING,tlsCAFile=certifi.where())

collection = mongo_client['feedbacks_db']['feedbacks_Prod']

In [3]:
OPENAI_API_KEY = _secrets["openai"]["api_key"]
LLM_API_KEY = _secrets["litellm"]["api_key"]

In [4]:
BRAND = 'ditp_analysis'

BRAND_DESCR = '''
Feedbacks are from French public services.
'''
TYPE = 'negative'

In [5]:

from_mongo = pd.DataFrame(list(collection.aggregate([
    {
        '$match': {
            'brand': BRAND,
        },
    },
#    { "$sample" : { "size": 2000 } }
])))

from_mongo.shape

KeyboardInterrupt: 

In [23]:
# Keeping only feedbacks with extractions and splitted_analysis_v2
subdf = from_mongo[from_mongo['extractions'].notna()]
subdf = subdf[subdf['splitted_analysis_v2'].notna()]
subdf = subdf[subdf['extractions'].apply(lambda x: any('elementary_subjects' not in item for item in x))] # Enlève ceux où le traitement des elementary_subjects a déjà été réalisé

print(f"Total number of rows in the Mongo database: {from_mongo.shape[0]}.")
print(f"Number of rows with successful extraction: {from_mongo['extractions'].notna().sum()}.")
print(f"Number of rows pending elementary_subjects processing: {subdf.shape[0]}.")

Total number of rows in the Mongo database: 2000.
Number of rows with successful extraction: 194.
Number of rows pending elementary_subjects processing: 186.


In [26]:
# Keeping only feedbacks with extractions and splitted_analysis_v2
subdf = from_mongo[from_mongo['extractions'].notna()]

# Checking if any dictionary in 'extractions' contains 'elementary_subjects'
subdf = subdf[subdf['extractions'].apply(lambda x: any('elementary_subjects' in item for item in x))]

print(f"Total number of rows in the Mongo database: {from_mongo.shape[0]}.")
print(f"Number of rows with successful extraction: {from_mongo['extractions'].notna().sum()}.")
print(f"Number of rows with elementary_subjects: {subdf.shape[0]}.")


Total number of rows in the Mongo database: 2000.
Number of rows with successful extraction: 194.
Number of rows with elementary_subjects: 0.


In [36]:
# Step 1: Dataframe with extractions and splitted_analysis_v2
subdf_with_extractions = from_mongo[from_mongo['extractions'].notna()]
subdf_with_extractions = subdf_with_extractions[subdf_with_extractions['splitted_analysis_v2'].notna()]

# Step 2: Dataframe with pending elementary_subjects processing
subdf_pending_elementary_subjects = subdf_with_extractions[subdf_with_extractions['extractions'].apply(lambda x: any('elementary_subjects' not in item for item in x))]

# Step 3: Subtract the rows to get those where elementary_subjects has been processed
subdf_processed_elementary_subjects = subdf_with_extractions.loc[~subdf_with_extractions.index.isin(subdf_pending_elementary_subjects.index)]

# Output the results
print(f"Total number of rows with successful extraction: {subdf_with_extractions.shape[0]}.")
print(f"Number of rows pending elementary_subjects processing: {subdf_pending_elementary_subjects.shape[0]}.")
print(f"Number of rows with elementary_subjects already processed: {subdf_processed_elementary_subjects.shape[0]}.")


Total number of rows with successful extraction: 194.
Number of rows pending elementary_subjects processing: 186.
Number of rows with elementary_subjects already processed: 8.


In [38]:
# Inspect the content of 'extractions' for a few specific rows
for i in range(5):
    print(f"Row {i}: {subdf_processed_elementary_subjects['extractions'].iloc[i]}")



Row 0: []
Row 1: []
Row 2: []
Row 3: []
Row 4: []


In [39]:
subdf_processed_elementary_subjects['extractions']

404     []
743     []
1157    []
1191    []
1222    []
1285    []
1678    []
1944    []
Name: extractions, dtype: object

# Generates topics

In [5]:
prompt = """
**Context:**

Your goal is to build a dashboard containing various graphs for a client, described as: {brand_description}. You have extracted a list of subjects based on the feedback received. You need to create professional and sector-appropriate topics in {language} for the graphs in your dashboard.

**Instructions:**

1. **List of Subjects**: You will be provided with a list of subjects.
2. **Generate Topics**: Use the provided list to generate a set of professional and neutral topics, structured into two or three levels.
   - **Topic Structure**: Each topic can have either:
     - **Two Levels**: Example: `"Customer Service : Responsiveness to Inquiries"`
     - **Three Levels**: Example: `"Mobile Application : Features : Connection Issues"`
   - **Hierarchical Structure**: Follow a clear hierarchy when creating topics:
     - **Level 1**: General domain (e.g., "Service", "Produits", "Application Mobile", "Suivi de dossier")
     - **Level 2**: Sub-domain or aspect (e.g., "Personnel", "Mises à jour", "Communication")
     - **Level 3 and Beyond**: Specific details (e.g., "Amabilité du personnel", "Fonctionnalités supprimées")
   - **Neutrality**: Ensure topics are unbiased and neutral.
     - Example of a **wrong** topic: `"Service : Problèmes avec la gestion des rendez-vous"`
     - Example of a **correct** topic: `"Service : Gestion des Rendez-vous"`
   - **Sector-Specific Language**: Adapt the terminology to the client's industry for a professional presentation.
     - Example of a **wrong** topic: `"Service : Comportement"`
     - Example of a **correct** topic: `"Service : Personnel : Qualité de Service et Réactivité"`
   - **Consolidation of Level 1 Topics**: Aim to create multiple Level 2 and Level 3 topics under each Level 1 category to avoid too many unique Level 1 entries.

**Input**:

- List of Subjects: `{subjects}`

**Output**:

- Format the generated topics as a JSON object, with the structure below:

{{
  "topics": [
    "Service : Personnel : Qualité de Service et Réactivité",
    "Service : Gestion des Rendez-vous",
    ...
  ]
}}"""

In [11]:
def get_most_occuring_elements(feedback_collection, brand: str, n: int=7000
):
    """
    Get the n most occuring elements in the list
    """

    pipeline = [
        {"$match": {"brand": brand}},
        {"$unwind": "$extractions"},
        {"$match": {"extractions.elementary_subjects": {"$exists": True}}},
        {"$unwind": "$extractions.elementary_subjects"},
        {"$group": {"_id": "$extractions.elementary_subjects", "count": {"$sum": 1}}},
        {"$match": {"count": {"$gt": 4}}},
        {"$sort": {"count": -1}},
        {"$limit": n},
        
    ]

    return list(feedback_collection.aggregate(pipeline)) 

from openai import OpenAI



def request_llm(messages, max_tokens=500, temperature=0, model="claude-3-haiku"):
    res = client_llm.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    return res.choices[0].message.content

In [12]:
e = get_most_occuring_elements(collection, BRAND)

len(e)

5

In [13]:
def parse_json(text: str) -> Dict:
    try:
        # remove text before and after the json
        text = text[text.find('{'):text.rfind('}')+1]
        return json.loads(text)
    except Exception as e:
        print('parse_json() : ', e)
        return {}

def generate_topics(brand_description: str, subjects: List[str],  model='gpt-4o-mini', language='french'):
    messages = [
        {"role" : "user", "content": prompt.format(brand_description=brand_description, subjects=subjects, language=language)}
    ]

    response = ''
    try:
        response = request_llm(messages,model=model, max_tokens=2000)
        res = parse_json(response)

        return res.get('topics', [])
    
    except Exception as e:
        print('generate_topics() : ', e)
        print('response : ', response)
        return []

In [7]:
from_mongo.shape

(4000, 85)

In [13]:
from_mongo[from_mongo['extractions'].notna()]['extractions'].apply(lambda x: any('elementary_subjects' in item for item in x)).value_counts()

extractions
False    428
Name: count, dtype: int64

In [6]:
subdf = from_mongo[from_mongo['extractions'].notna()]
subdf = subdf[subdf['splitted_analysis_v2'].notna()]
subdf = subdf[subdf['extractions'].apply(lambda x: any('elementary_subjects' in item for item in x))] # Enlève ceux où le traitement des elementary_subjects a déjà été réalisé
subdf.shape

(0, 85)

In [14]:
subjects = (from_mongo['extractions']
            .explode()
            .dropna()
            .apply(lambda x : x.get('elementary_subjects'))
            .dropna()
            .explode()
            .dropna()
            .value_counts()
)

# drop subjects with less than 2 occurences
subjects = subjects[subjects > 5].index.tolist()

print(len(subjects))
subjects

6


['Assistance : Manque de Réponses et de Support',
 'Récupération de Documents : Attente et Exigences Administratives',
 'Système de Notation : Inexactitude des Retours Utilisateurs',
 'Service : Personnel : Accueil Positif',
 'Fonctionnement du Site : Service en Panne',
 "Renouvellement CNI : Changement d'Adresse et Procédure"]

In [30]:
topics = generate_topics(BRAND_DESCR, subjects, model='gpt-4o-mini', language='french')

topics

['Assistance : Réactivité et Support',
 'Assistance : Qualité du Service',
 'Récupération de Documents : Délais et Exigences',
 'Récupération de Documents : Processus Administratifs',
 'Système de Notation : Précision des Retours',
 "Système de Notation : Méthodologie d'Évaluation",
 'Service : Personnel : Accueil et Disponibilité',
 'Service : Personnel : Compétences et Formation',
 'Fonctionnement du Site : Disponibilité et Pannes',
 'Fonctionnement du Site : Performance et Accessibilité',
 'Renouvellement CNI : Procédures et Exigences',
 "Renouvellement CNI : Changement d'Adresse et Suivi"]

Don't hesitate to modify them manually if needed.

In [31]:
# add to topics only the part before the '>'
level1 = [topic.split(':')[0].strip() for topic in topics]
level1 = list(set(level1))
topics = topics + level1

In [32]:
topics

['Assistance : Réactivité et Support',
 'Assistance : Qualité du Service',
 'Récupération de Documents : Délais et Exigences',
 'Récupération de Documents : Processus Administratifs',
 'Système de Notation : Précision des Retours',
 "Système de Notation : Méthodologie d'Évaluation",
 'Service : Personnel : Accueil et Disponibilité',
 'Service : Personnel : Compétences et Formation',
 'Fonctionnement du Site : Disponibilité et Pannes',
 'Fonctionnement du Site : Performance et Accessibilité',
 'Renouvellement CNI : Procédures et Exigences',
 "Renouvellement CNI : Changement d'Adresse et Suivi",
 'Assistance',
 'Renouvellement CNI',
 'Système de Notation',
 'Service',
 'Récupération de Documents',
 'Fonctionnement du Site']

# Mapping

Classify all the elementary_subject into topics.

In [33]:
prompt_classify = '''
**Context:**

The goal is to **classify** the given subject into a specific topic from the provided list.

**List of Subject:**
- `{subject}`

**List of Topics:**
- `{topics}`

**Instructions:**

1. **Format of Topics**: Topics can follow the structure of either:
   - **Two levels**: `"level1 : level2"` (e.g., `"Customer Service : Responsiveness"`)
   - **Three levels**: `"level1 : level2 : level3"` (e.g., `"Mobile Application : Features : Connection Issues"`).
2. **Selection Criteria**:
   - You can choose either:
     - A **full topic** with two or three levels.
     - OR **just a level1** (e.g., `"Customer Service"`).
   - If there is **no perfect match**, the result should be `"null"` for the topic.

3. **Classification Process**:
   - **Step 1: Find the Closest Topics**:
     - Identify all topics that are most relevant to the given subject, even if they are not perfect matches.
     - List these closest topics with a brief explanation of their relevance.
   - **Step 2: Determine the Perfect Match**:
     - From the closest topics identified, select the topic that best fits the subject.
     - **Justify** why this is the most suitable match.
   - **Step 3: Handle Imperfect Matches**:
     - If no perfect match is found:
       - Try to select the most relevant **level1** category and provide it as the topic.
       - If even a relevant **level1** cannot be found, assign `"null"` as the topic.

**Examples of Previous Classifications**:
- `{examples}`

**Output**:

The output should be in JSON format:

{{
    "closest_ones": "<The most relevant topics are .... because...>",
    "justification": "<Your justification...>",
    "topic": "<topic>"
}}'''

In [34]:
def classify_subject(subject: str, topics: List[str], previous_classifications: Dict, model="gpt-4o-mini"):
    messages = [
        {"role" : "user", "content": prompt_classify.format(subject=subject, topics=topics, examples=previous_classifications)}
    ]

    response = ''
    try:
        response = request_llm(messages, model=model, max_tokens=1000)
        res = parse_json(response)

        return {
            'elementary_subject' : subject,
            'justification' : res.get('justification', ''),
            'mapping' : res.get('topic', '')
        }
    
    except Exception as e:
        print('classify_subject() : ', e)
        print('response : ', response)
        return {
            'elementary_subject' : subject,
        }

In [36]:
subjects = (from_mongo['extractions']
            .explode()
            .dropna()
            .apply(lambda x : x.get('elementary_subjects'))
            .dropna()
            .explode()
            .dropna()
            .value_counts()
)
print(len(subjects))

important_subjects = subjects[subjects > 3].index.tolist()

all_subjects = subjects.index.tolist()

len(important_subjects)

12


6

### Example

In [39]:
elementary_subject = random.choice(important_subjects)

print(elementary_subject)

classify_subject(elementary_subject, topics, {}, model="gpt-4o-mini")

Récupération de Documents : Attente et Exigences Administratives
classify_subject() :  Request timed out.
response :  


{'elementary_subject': 'Récupération de Documents : Attente et Exigences Administratives'}

1. Classify 20 first elementary_subject into topics with `claude-3-5-sonnet` to have strong and good examples.

2. Then classify the rest of the elementary_subject with `gpt-4o-mini` to finish the mapping.

In [59]:
examples = {}

for subject in important_subjects[:20]:
    res = classify_subject(subject, topics, examples)

    examples[subject] = res
    print(subject, '   --->    ',res.get('mapping'))

examples

La question 'Mon contrat Obsèques est-il toujours valide ?' se rapporte directement à l'assurance obsèques. Parmi les topics proposés, 'Assurance obsèques' est le plus pertinent et correspond parfaitement à la question posée. Cette catégorie traite spécifiquement des contrats d'assurance liés aux obsèques, ce qui inclut naturellement les questions sur la validité de ces contrats. Les autres catégories comme 'Gestion des contrats' ou 'Informations sur les contrats' pourraient être pertinentes, mais 'Assurance obsèques' est plus spécifique et donc plus appropriée pour cette question.
Mon contrat Obsèques est-il toujours valide ?    --->     Assurance obsèques
La question 'Comment augmenter mon assurance habitation chez Garance assurance ?' se rapporte directement à l'assurance habitation. Parmi les topics proposés, 'Assurance habitation' est le plus pertinent et correspond parfaitement à la question posée. Cette catégorie traite spécifiquement des contrats d'assurance habitation, ce qui 

{'Mon contrat Obsèques est-il toujours valide ?': {'elementary_subject': 'Mon contrat Obsèques est-il toujours valide ?',
  'justification': "La question 'Mon contrat Obsèques est-il toujours valide ?' se rapporte directement à l'assurance obsèques. Parmi les topics proposés, 'Assurance obsèques' est le plus pertinent et correspond parfaitement à la question posée. Cette catégorie traite spécifiquement des contrats d'assurance liés aux obsèques, ce qui inclut naturellement les questions sur la validité de ces contrats. Les autres catégories comme 'Gestion des contrats' ou 'Informations sur les contrats' pourraient être pertinentes, mais 'Assurance obsèques' est plus spécifique et donc plus appropriée pour cette question.",
  'mapping': 'Assurance obsèques'},
 'Comment augmenter mon assurance habitation chez Garance assurance ?': {'elementary_subject': 'Comment augmenter mon assurance habitation chez Garance assurance ?',
  'justification': "La question 'Comment augmenter mon assurance 

In [62]:
import concurrent.futures
from tqdm import tqdm

def classify_parallel(
        elementary_subjects: List,
        topics: List,
        previous_classification: str,
        model='gpt-4o-mini',
        chunk_size=20,
):
    ''' 
    Run the classification of the extractions in parallel
        - Create n different clusters depending on the embeddings
        - For each cluster, launch the classification in parallel

    '''
    res = []


    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(
            classify_subject,
            elementary_subject,
            topics,
            previous_classification,
            model
            ) for elementary_subject in elementary_subjects]

        for i in tqdm(range(0, len(futures), chunk_size), desc="Processing chunks"):
            completed_futures, _ = concurrent.futures.wait(futures[i:i+chunk_size], return_when=concurrent.futures.ALL_COMPLETED)

            for future in completed_futures:
                prediction = future.result()
                res.append(prediction)

    
    return res


In [64]:
examples = {key : value['mapping'] for key, value in examples.items()}

Launch the following cell to classify all the elementary_subject into topics.

In [65]:
res = classify_parallel(all_subjects, topics, examples)

Processing chunks: 100%|██████████| 93/93 [04:40<00:00,  3.02s/it]


In [68]:
tmp = pd.DataFrame(res)

tmp.mapping.value_counts()

mapping
Communication avec les clients               237
Paiements et prélèvements                    177
Transmission de documents                    176
Mise à jour des informations personnelles    138
Gestion des contrats                         123
Rentes et versements                         118
Gestion de la retraite                       109
Informations sur les contrats                104
Rachat de contrat                             96
Suivi des dossiers                            86
Accès au compte en ligne                      60
Délais de traitement                          45
Transferts de fonds                           42
Procédures administratives                    39
Fiscalité des contrats                        39
Résiliation de contrat                        39
Documents fiscaux                             34
Rendez-vous et conseil                        30
Accès aux conseillers                         29
Déclaration de décès                          26
Épargne et i