 # Extractions



 This notebook generates extractions from the feedback verbatims stored in MongoDB and saves the results back to the collection.

 ## Imports and Constants

In [1]:
# %%
import os
import json
import pandas as pd
import boto3
import certifi
from pymongo import MongoClient
from tqdm import tqdm

from utils.extractions_utils import (
    extract_information_from_text,
    process_extractions_in_parallel
)

# Constants (tunable configuration for the whole notebook)
LANGUAGE = 'french'  # Passed as `language=` to the extraction helpers
BRAND_NAME = 'ditp_analysis'  # Value matched against the `brand` field in the MongoDB queries
MONGO_SECRET_ID = 'Prod/alloreview'  # SecretId looked up in AWS Secrets Manager
MONGO_REGION = 'eu-west-3'  # Region used to build the Secrets Manager client
MONGO_DATABASE = 'feedbacks_db'  # Database that holds the feedback collection
MONGO_COLLECTION = 'feedbacks_Prod'  # Collection the feedbacks are read from
SAMPLE_SIZE = 100  # To adjust as needed (size of the `$sample` stage)
MODEL_NAME = 'gpt-4o-mini'  # To adjust as needed (passed as `model=` to the extraction helpers)
# NOTE(review): MAX_WORKERS is not referenced in the cells visible here — confirm it is still used.
MAX_WORKERS = 20  # To adjust based on system and API rate limits


 ## MongoDB Connection

In [2]:
# %%
def get_mongo_client():
    """
    Build a MongoClient for the feedback cluster.

    The connection string is taken from the MONGO_CONNECTION_STRING environment
    variable when it is set. Otherwise the password is read from AWS Secrets
    Manager (secret MONGO_SECRET_ID in region MONGO_REGION) and inserted into
    the cluster URI.

    :return: A MongoClient configured with certifi's CA bundle.
    :raises Exception: Any error raised while fetching or parsing the secret is
        printed and re-raised unchanged.
    """
    # Local import so the cell stays self-contained.
    from urllib.parse import quote_plus

    mongo_uri = os.getenv('MONGO_CONNECTION_STRING')
    if not mongo_uri:
        try:
            secrets_manager_client = boto3.client("secretsmanager", region_name=MONGO_REGION)
            secrets = json.loads(
                secrets_manager_client.get_secret_value(
                    SecretId=MONGO_SECRET_ID
                )["SecretString"]
            )
            # PyMongo requires the password to be percent-escaped inside a URI;
            # without this, characters such as '@', ':' or '/' make it invalid.
            # NOTE(review): assumes the secret stores the raw (unescaped) password — confirm.
            password = quote_plus(secrets["mongodb"]["password"])
            mongo_uri = f"mongodb+srv://alloreview:{password}@feedbacksdev.cuwx1.mongodb.net"
        except Exception as e:
            print(f"Error fetching MongoDB credentials: {e}")
            raise
    return MongoClient(mongo_uri, tlsCAFile=certifi.where())

# Create the module-level client and collection handle used by the cells below.
try:
    mongo_client = get_mongo_client()
    collection = mongo_client[MONGO_DATABASE][MONGO_COLLECTION]
except Exception as e:
    # Print the failure for the notebook reader, then re-raise so execution stops here.
    print(f"Error connecting to MongoDB: {e}")
    raise


In [4]:
def get_most_occuring_elementary_subjects(feedback_collection, brand: str, min_count: int = 5, limit: int = 7000):
    """
    Retrieves the most frequently occurring elementary_subjects from feedbacks of a specific brand.

    :param feedback_collection: MongoDB collection object containing feedbacks.
    :param brand: Name of the brand to filter feedbacks by.
    :param min_count: Minimum number of occurrences (inclusive) for an
        elementary_subject to be included (default is 5).
    :param limit: Maximum number of elementary_subjects to return (default is 7000).
    :return: A list of ``{"_id": <elementary_subject>, "count": <int>}`` dicts,
        sorted by descending count.
    """

    pipeline = [
        {"$match": {"brand": brand}},
        # One document per extraction, then one per elementary subject.
        {"$unwind": "$extractions"},
        {"$unwind": "$extractions.elementary_subjects"},
        {"$group": {"_id": "$extractions.elementary_subjects", "count": {"$sum": 1}}},
        # `$gte` makes min_count an inclusive minimum, as documented; `$gt`
        # dropped subjects occurring exactly min_count times.
        {"$match": {"count": {"$gte": min_count}}},
        {"$sort": {"count": -1}},
        {"$limit": limit}
    ]

    return list(feedback_collection.aggregate(pipeline))


# Aggregate subject counts for the configured brand (default thresholds).
data = get_most_occuring_elementary_subjects(
    feedback_collection=collection,
    brand=BRAND_NAME,
)

# Keep only the subject labels; the grouped value is stored under '_id'.
elementary_subjects = [subject_row['_id'] for subject_row in data]

In [5]:
# Show the subject labels collected in the previous cell (ordered by descending count).
elementary_subjects

['Suivi de dossier : Absence de réponse',
 "Administration : Insatisfaction générale envers l'efficacité des services",
 'Communication : Absence de réponse',
 'Suivi de dossier : Délai de traitement excessif',
 'Service : Insatisfaction générale',
 'Accessibilité : Difficulté à joindre un service par téléphone',
 "Site Internet : Facilité d'utilisation",
 'Service : Accueil et amabilité du personnel',
 'Site Internet : Problème de fonctionnalité',
 'Suivi de dossier : Retard dans le traitement des demandes de retraite',
 'Assistance : Personnel : Empathie des agents',
 'Accessibilité : Complexité des démarches en ligne',
 'Accessibilité : Injoignabilité par téléphone',
 'Assistance : Incohérence des informations fournies',
 'Suivi de dossier : Processus de demande efficace',
 'Démarches en ligne : Satisfaction et accompagnement',
 'Satisfaction : Reconnaissance du service',
 'Site Internet : Inefficacité des démarches en ligne',
 'Suivi de dossier : Délai de réponse',
 'Administration

 ## Load Feedbacks to Process

In [None]:
# %%
def get_feedbacks_to_process(collection, brand_name, sample_size):
    """
    Draw a random sample of a brand's feedback documents from MongoDB.

    :param collection: MongoDB collection object.
    :param brand_name: Name of the brand to filter.
    :param sample_size: Number of documents to sample.
    :return: DataFrame with one row per sampled document; an empty DataFrame
        if fetching fails (the error is printed, not raised).
    """
    sampling_stages = [
        {'$match': {'brand': brand_name}},
        {'$sample': {'size': sample_size}},
    ]

    try:
        documents = [*collection.aggregate(sampling_stages)]
        return pd.DataFrame(documents)
    except Exception as e:
        # Best-effort: report the problem and hand back an empty frame.
        print(f"Error fetching feedbacks: {e}")
        return pd.DataFrame()

# Sample SAMPLE_SIZE feedbacks of the configured brand into a DataFrame.
df_feedbacks = get_feedbacks_to_process(
    collection=collection,
    brand_name=BRAND_NAME,
    sample_size=SAMPLE_SIZE,
)


 ## Data Overview

In [None]:
# %%
# Report how many of the sampled feedbacks already carry extractions.
if not df_feedbacks.empty:
    total_rows = df_feedbacks.shape[0]
    # The column only exists when at least one sampled document has an
    # 'extractions' field; treat a missing column as "nothing processed yet"
    # instead of raising KeyError.
    if 'extractions' in df_feedbacks.columns:
        processed_rows = df_feedbacks['extractions'].notna().sum()
    else:
        processed_rows = 0
    ratio_processed = processed_rows * 100 / total_rows
    print(f"Total feedbacks: {total_rows}")
    print(f"Feedbacks with extractions: {processed_rows} ({ratio_processed:.2f}%)")
else:
    print("No feedbacks to process.")


Total feedbacks: 100
Feedbacks with extractions: 11 (11.00%)


 ## Generate Brand Context

In [None]:
# %%
def generate_brand_context(row):
    """
    Generates brand context from the given row.

    The context always starts with a fixed sentence, followed by one
    ``"<label>: <value>"`` line for each of the four optional fields that is
    present. A field is skipped when it is null/NaN, equal to ``'N/A'``, or an
    empty list/tuple.

    :param row: Pandas Series (or any mapping with ``.get``) representing a row in DataFrame.
    :return: String representing the brand context, lines joined by newlines.
    """
    def _has_value(value):
        # pd.notnull() returns an element-wise array for list input, whose
        # truth value is ambiguous (ValueError for multi-element lists), so
        # sequences are checked for emptiness instead.
        if isinstance(value, (list, tuple)):
            return len(value) > 0
        return pd.notnull(value) and value != 'N/A'

    context_lines = ["Feedbacks are from French public services."]
    fields = [
        ("Intitulé Structure 1", row.get("intitule_structure_1")),
        ("Intitulé Structure 2", row.get("intitule_structure_2")),
        ("Tags Métiers", row.get("tags_metiers")),
        ("Pays de la demande", row.get("pays"))
    ]

    for label, value in fields:
        if _has_value(value):
            context_lines.append(f"{label}: {value}")

    return "\n".join(context_lines)


 ## Prepare Data for Extraction

In [None]:
# %%
# Always define feedbacks_sample so the following cells can test `.empty`
# even when nothing was loaded (it was previously undefined in that case).
feedbacks_sample = pd.DataFrame(columns=['text', '_id', 'brand_context'])

if not df_feedbacks.empty:
    # Filter feedbacks that need extractions. When no sampled document has an
    # 'extractions' field the column does not exist, so every row qualifies.
    if 'extractions' in df_feedbacks.columns:
        feedbacks_to_extract = df_feedbacks[df_feedbacks['extractions'].isna()]
    else:
        feedbacks_to_extract = df_feedbacks
    print(f"Number of feedbacks to extract: {feedbacks_to_extract.shape[0]}")

    # Sample feedbacks if necessary (at most 1000 per run)
    sample_size = min(1000, feedbacks_to_extract.shape[0])
    sampled_feedbacks = feedbacks_to_extract.sample(sample_size)
    print(f"Processing {sampled_feedbacks.shape[0]} feedbacks.")

    # Prepare data. Skipped when the sample is empty: apply(axis=1) on an
    # empty frame does not return a Series usable as a column.
    if not sampled_feedbacks.empty:
        # assign() builds a new frame instead of mutating a derived one.
        feedbacks_sample = sampled_feedbacks.assign(
            text=sampled_feedbacks['verbatims'],
            brand_context=sampled_feedbacks.apply(generate_brand_context, axis=1),
        )[['text', '_id', 'brand_context']]
else:
    print("No feedbacks to process.")


Number of feedbacks to extract: 89
Processing 89 feedbacks.


 ## Extract Information

In [None]:
# %%
# Try the extraction on a single feedback before running the full pipeline.
if feedbacks_sample.empty:
    print("No feedbacks to extract.")
else:
    # First row of the prepared sample serves as the test case.
    sample_feedback = feedbacks_sample.iloc[0]
    extraction_result = extract_information_from_text(
        sample_feedback['text'],
        sample_feedback['_id'],
        sample_feedback['brand_context'],
        language=LANGUAGE,
        model=MODEL_NAME,
    )
    print("Extraction result for one sample:", extraction_result, sep="\n")


Extraction result for one sample:
{'id': 'ditp_analysis/279289', 'splitted_analysis': [{'text': 'Retraite progressive et relevé de carrière', 'extractions': [{'sentiment': 'POSITIVE', 'extraction': 'Retraite progressive'}]}, {'text': 'Bonjour'}, {'text': 'Je suis à la retraite progressive depuis le 1er octobre 2018', 'extractions': [{'sentiment': 'POSITIVE', 'extraction': 'Retraite progressive'}]}, {'text': 'donc je continue a cotiser pour mes trimestres', 'extractions': [{'sentiment': 'POSITIVE', 'extraction': 'Cotisation pour les trimestres'}]}, {'text': "mais je suis dans l'impossibilité de consulter mon relevé de carrière afin de le mettre a jour ou juste pour savoir si j'ai atteint le nombres de trimestres nécessaires pour ma retraite pleine", 'extractions': [{'sentiment': 'NEGATIVE', 'extraction': 'Impossibilité de consulter le relevé de carrière'}]}, {'text': 'Et quand je pose la question à la Carsat', 'extractions': [{'sentiment': 'NEGATIVE', 'extraction': "Réponse de la Carsat

 ## Run Extraction Pipeline

In [None]:
# %%
# Run the extraction over the whole prepared sample; save_to_mongo=True is
# passed so the helper is asked to persist the results.
if feedbacks_sample.empty:
    print("No feedbacks to process.")
else:
    extractions = process_extractions_in_parallel(
        feedbacks_sample,
        brand_name=BRAND_NAME,
        language=LANGUAGE,
        model=MODEL_NAME,
        save_to_mongo=True,
    )
    completed_count = len(extractions)
    print(f"Extractions completed for {completed_count} feedbacks.")


Processing chunks:   0%|          | 0/2 [00:00<?, ?it/s]

JSON decoding error: Unterminated string starting at: line 69 column 20 (char 1703)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting value: line 66 column 13 (char 1719)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting ',' delimiter: line 62 column 4 (char 1724)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting property name enclosed in double quotes: line 69 column 1 (char 1614)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  50%|█████     | 1/2 [00:30<00:30, 30.75s/it]

JSON decoding error: Expecting property name enclosed in double quotes: line 66 column 6 (char 1810)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Unterminated string starting at: line 66 column 15 (char 1668)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting ',' delimiter: line 64 column 1 (char 1711)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting value: line 68 column 1 (char 1736)
[extract_information_from_text()] Missing key: 'text'


Processing chunks: 100%|██████████| 2/2 [00:52<00:00, 26.30s/it]

Extractions completed for 89 feedbacks.





 ## Verify Results in MongoDB

In [None]:
# %%
# Read back the documents that were just processed and print one at random.
if 'extractions' in locals() and extractions:
    extracted_ids = [extraction['id'] for extraction in extractions]
    try:
        # Only the database read is guarded, so the message below is accurate.
        extracted_documents = collection.find({
            'brand': BRAND_NAME,
            '_id': {'$in': extracted_ids}
        })
        extracted_df = pd.DataFrame(list(extracted_documents))
    except Exception as e:
        print(f"Error retrieving documents from MongoDB: {e}")
    else:
        print(f"Retrieved {extracted_df.shape[0]} documents from MongoDB.")

        if extracted_df.empty:
            # sample() raises ValueError on an empty frame; report it plainly
            # instead of mislabelling it as a retrieval error.
            print("No matching documents found; nothing to sample.")
        else:
            # Sample to check extractions
            sample_doc = extracted_df.sample().iloc[0]
            print("Sample verbatim:")
            print(sample_doc.get('verbatims', 'N/A'))
            print("Sample extractions:")
            print(sample_doc.get('extractions', 'N/A'))
            print("Sample splitted_analysis_v2:")
            print(sample_doc.get('splitted_analysis_v2', 'N/A'))
else:
    print("No extractions to verify.")


Retrieved 89 documents from MongoDB.
Sample verbatim:
Obtenir un rendez vous
Si on ne connait pas les regle service consulaire passeport identité il faut attendre plusieurs semaines avant un rendez vous
Pas plusieurs semaine mais etre sur le site le 1er jour des prochains rendez vous
Se deplacer ne sert pas à grand chose car nous avons en face de nous une société de sécurité.
Les heures manquent aussi d’amplitude
Mais le personnel est au top, compétent et sympathique.
Le site est standard et ne donne que des infos de base
Sample extractions:
[{'sentiment': 'SUGGESTION', 'extraction': 'Obtenir un rendez-vous', 'text': 'Obtenir un rendez vous'}, {'sentiment': 'NEGATIVE', 'extraction': 'Attente prolongée pour un rendez-vous si on ne connaît pas les règles', 'text': 'Si on ne connait pas les regle service consulaire passeport identité il faut attendre plusieurs semaines avant un rendez vous'}, {'sentiment': 'SUGGESTION', 'extraction': 'Être présent le 1er jour des prochains rendez-vous', '