In [1]:
# %%
import pandas as pd
from pymongo import MongoClient
import json
import random
from tqdm import tqdm
import boto3
import certifi
import openai
from typing import List, Dict
import concurrent.futures
import os
import logging
from utils.prompts import PROMPT_GENERATE_TOPICS
from utils.request_utils import request_llm

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("httpx").setLevel(logging.WARNING)

# Constants
LANGUAGE = 'french'
BRAND_NAME = 'ditp_analysis'
BRAND_DESCR = 'Feedbacks are from French public services.'
MONGO_SECRET_ID = 'Prod/alloreview'
MONGO_REGION = 'eu-west-3'
MONGO_DATABASE = 'feedbacks_db'
MONGO_COLLECTION = 'feedbacks_Prod'
MODEL_NAME = 'gpt-4o-mini'  # To adjust as needed


In [2]:
# %%
def get_mongo_client():
    """
    Establishes a connection to the MongoDB client using credentials from AWS Secrets Manager or environment variables.
    """
    mongo_uri = os.getenv('MONGO_CONNECTION_STRING')
    if not mongo_uri:
        try:
            secrets_manager_client = boto3.client("secretsmanager", region_name=MONGO_REGION)
            secrets = json.loads(
                secrets_manager_client.get_secret_value(
                    SecretId=MONGO_SECRET_ID
                )["SecretString"]
            )
            password = secrets["mongodb"]["password"]
            mongo_uri = f"mongodb+srv://alloreview:{password}@feedbacksdev.cuwx1.mongodb.net"
        except Exception as e:
            print(f"Error fetching MongoDB credentials: {e}")
            raise
    return MongoClient(mongo_uri, tlsCAFile=certifi.where())

try:
    mongo_client = get_mongo_client()
    collection = mongo_client[MONGO_DATABASE][MONGO_COLLECTION]
except Exception as e:
    print(f"Error connecting to MongoDB: {e}")
    raise


In [3]:
def get_most_occuring_elementary_subjects(feedback_collection, brand: str, min_count: int = 5, limit: int = 7000):
    """
    Retrieves the most frequently occurring elementary_subjects from feedbacks of a specific brand.

    :param feedback_collection: MongoDB collection object containing feedbacks.
    :param brand: Name of the brand to filter feedbacks by.
    :param min_count: Minimum count threshold for an elementary_subject to be included (default is 5).
    :param limit: Maximum number of elementary_subjects to return (default is 7000).
    :return: A list of the most frequently occurring elementary_subjects and their counts.
    """

    pipeline = [
        {"$match": {"brand": brand}},
        {"$unwind": "$extractions"},
        {"$unwind": "$extractions.elementary_subjects"},
        {"$group": {"_id": "$extractions.elementary_subjects", "count": {"$sum": 1}}},
        {"$match": {"count": {"$gt": min_count}}},
        {"$sort": {"count": -1}},
        {"$limit": limit}
    ]

    return list(feedback_collection.aggregate(pipeline))


data = get_most_occuring_elementary_subjects(collection, BRAND_NAME)
logger.info(f"Most occuring elementary_subjects retrieved: {len(data)}")

elementary_subjects = [item['_id'] for item in data]

INFO:__main__:Most occuring elementary_subjects retrieved: 106


# Generates topics

In [4]:
def generate_topics(brand_description: str, subjects: List[str],  model: str ='o1-preview', language: str =LANGUAGE):
    messages = [
        {"role" : "user", "content": PROMPT_GENERATE_TOPICS.format(brand_description=brand_description, subjects=subjects, language=language)}
    ]
    print(messages)
    response = ''
    try:
        response = request_llm(messages,model=model, max_tokens=16000, response_format={"type": "json_object"})
        response = json.loads(response)
        
        return response.get('topics', [])
    
    except Exception as e:
        print('generate_topics() : ', e)
        print('response : ', response)
        return []

In [5]:
formatted_elementary_subjects = "\n".join([f"{element}" for element in elementary_subjects])
topics = generate_topics(BRAND_DESCR, elementary_subjects, model=MODEL_NAME, language=LANGUAGE)

[{'role': 'user', 'content': '\n**Context**:\n\nYour goal is to take raw feedback subjects, which are often detailed or complaint-focused, and transform them into structured, professional, and neutral topics that are more appropriate for visual representation in dashboards. These topics need to reflect the client\'s industry and be organized in a clear hierarchy.\nFeedbacks are from French public services. The topics should be generated in french.\n\n**Instructions**:\n\n1. **List of Subjects**: You will be provided with a list of raw feedback subjects.\n2. **Generate Topics**: Using the provided subjects, generate professional and neutral topics structured into two or three hierarchical levels:\n   - **Two Levels**: Example: "Service Client > Réactivité aux Demandes"\n   - **Three Levels**: Example: "Application Mobile > Expérience Utilisateur > Problèmes de Connexion"\n   - **Topic Structure**:\n     - **Level 1**: General domain (e.g., "Service Client", "Produits", "Application Mobi

In [20]:
topics = [
    "Les services > Qualité du service",
    "Les services > Rapidité de réponse",
    "Les services > Accessibilité des services",
    "Les services > Fiabilité des services",
    "Les services > Clarté des informations",
    
    "Les procédures > Complexité des démarches",
    "Les procédures > Exigences Documentaires",
    "Les procédures > Cohérence des procédures",
    "Les procédures > Rapidité des procédures",

    "Le personnel > Amabilité du personnel",
    "Le personnel > Disponibilité et compétence",
    "Le personnel > Empathie et écoute",

    "Site internet et applications > Facilité d'utilisation",
    "Site internet et applications > Disponibilité et fiabilité",
    "Site internet et applications > Accessibilité des services en ligne",
    "Site internet et applications > Ergonomie et fonctionnalités",
    "Site internet et applications > Fonctionnement et performance",

    "Les rendez-vous > Disponibilité des créneaux",
    "Les rendez-vous > Facilité de prise de rendez-vous",
    "Les rendez-vous > Notifications et rappels",
    "Les rendez-vous > Non honorés",

    "Les documents > Exactitude des documents délivrés",
    "Les documents > Délai de délivrance des documents",

    "Les versements et allocations > Délai de paiement",
    "Les versements et allocations > Exactitude des montants"
]


Don't hesitate to modify them manually if needed.

In [23]:
# Add only the first level before the first '>'
level1 = [topic.split('>')[0].strip() for topic in topics]

# Add only the second level if there are two or more levels
level2 = ['>'.join(topic.split('>')[:2]).strip() for topic in topics if '>' in topic]

# Add only the third level if there are three levels
level3 = ['>'.join(topic.split('>')[:3]).strip() for topic in topics if topic.count('>') >= 2]

# Remove duplicates by converting to a set
level1 = list(set(level1))
level2 = list(set(level2))
level3 = list(set(level3))

# Combine original topics with the new levels
topics = level1 + level2 + level3