# Extractions

The objective of this notebook is to generate extractions from the verbatims.

In [14]:
import pandas as pd
import json
import importlib

from tqdm import tqdm
from typing import List, Dict
from pymongo import MongoClient

import boto3
import certifi

from utils.extractions_utils import generate_extraction_results, split_text_into_parts, extract_information_from_text, add_extractions_to_splitted_analysis

In [15]:
# Deployment stage. Its capitalized form is the prefix of the Secrets Manager
# secret name read further down ("<Stage>/alloreview").
STAGE = 'prod'

Load the credentials (MongoDB password, OpenAI and LiteLLM API keys) from AWS Secrets Manager.

In [3]:
# Read the application secrets from AWS Secrets Manager (region eu-west-3).
# The secret id is "<Stage>/alloreview", e.g. "Prod/alloreview" for STAGE = 'prod'.
_secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-3")

_secret_response = _secrets_manager_client.get_secret_value(
    SecretId=STAGE.capitalize() + "/alloreview"
)
_secrets = json.loads(_secret_response["SecretString"])

# NOTE(review): the password is interpolated as-is. PyMongo expects the
# username/password of a connection URI to be percent-escaped, so a password
# containing characters such as '@', ':' or '/' would break the URI unless the
# stored secret is already escaped -- confirm.
# NOTE(review): the host is fixed to the `feedbacksdev` cluster whatever the
# value of STAGE -- confirm this is intended.
_mongo_password = _secrets["mongodb"]["password"]
MONGO_CONNECTION_STRING = (
    f"mongodb+srv://alloreview:{_mongo_password}@feedbacksdev.cuwx1.mongodb.net"
)

# API keys read from the same secret.
LLM_API_KEY = _secrets["litellm"]["api_key"]
OPENAI_API_KEY = _secrets["openai"]["api_key"]


In [4]:
# Connect to MongoDB, passing certifi's CA bundle path as the TLS CA file.
mongo_client = MongoClient(
    MONGO_CONNECTION_STRING,
    tlsCAFile=certifi.where(),
)

# Database, then collection, queried and updated by the rest of the notebook.
feedbacks_db = mongo_client['feedbacks_db']
collection = feedbacks_db['feedbacks_Prod']

Define the brand whose feedbacks will be processed. (The brand context sent with each feedback is built per row further down, by `format_ligne`.)

In [5]:
# Value matched against the `brand` field when querying the MongoDB collection;
# it is also passed as `brand_name` to the parallel extraction call.
BRAND = 'ditp_analysis'

In [6]:
# Load `ditp_test.csv` into `from_mongo`.
# NOTE(review): `from_mongo` is reassigned by the MongoDB aggregation in the
# next cell, so when the notebook is run top to bottom this frame is discarded
# -- confirm which of the two sources is the intended one.
from_mongo = pd.read_csv('ditp_test.csv')

In [16]:
# Aggregation pipeline: keep every document belonging to the brand.
# To work on a random subset instead, append a stage such as
# {"$sample": {"size": 4000}} to the list.
brand_pipeline = [
    {'$match': {'brand': BRAND}},
]

# Materialize the cursor before handing the documents to pandas.
brand_documents = list(collection.aggregate(brand_pipeline))
from_mongo = pd.DataFrame(brand_documents)

from_mongo.shape

(4000, 85)

In [17]:
# Total number of rows loaded.
total_rows = len(from_mongo)

# Number of rows whose 'extractions' value is not NaN.
# NOTE(review): despite the "remaining" wording in the variable names, these
# values describe rows that ALREADY have extractions (see the printed labels),
# not rows left to process.
remaining_rows = from_mongo['extractions'].notna().sum()

# Percentage of rows that already have extractions, shown with two decimals.
ratio_remaining = 100 * remaining_rows / total_rows
formatted_ratio = format(ratio_remaining, ".2f")

print(f"Nombre de ligne ou l'extraction a été réalisée: {remaining_rows}")
print(f"Pourcentage d'extractions réalisés par rapport à l'ensemble de la base : {formatted_ratio}%")

Nombre de ligne ou l'extraction a été réalisée: 464
Pourcentage d'extractions réalisés par rapport à l'ensemble de la base : 11.60%


## Run the extraction pipeline on a sample

In [12]:
# Reload utils/extractions_utils.py so that edits made to the helper module are
# picked up without restarting the kernel (`importlib` is imported in the
# first cell of the notebook).
module = importlib.reload(importlib.import_module('utils.extractions_utils'))

# importlib.reload() does not update names that were bound with
# `from module import name`: those keep pointing at the objects of the
# previous module version. Re-bind every helper this notebook imported that way,
# otherwise e.g. extract_information_from_text would silently stay stale.
#
# process_extractions_in_parallel parallelizes the extraction process and saves
# the results to the Mongo database.
process_extractions_in_parallel = module.process_extractions_in_parallel
extract_information_from_text = module.extract_information_from_text
generate_extraction_results = module.generate_extraction_results
split_text_into_parts = module.split_text_into_parts
add_extractions_to_splitted_analysis = module.add_extractions_to_splitted_analysis

In [9]:
# Maximum number of feedbacks processed in one test run.
SAMPLE_SIZE = 1000

# Only rows that have no extractions yet are candidates.
pending_rows = from_mongo[from_mongo['extractions'].isna()]

# DataFrame.sample() raises ValueError when asked for more rows than are
# available (sampling is without replacement by default), which happens as soon
# as fewer than SAMPLE_SIZE rows remain. Cap the request at what is left.
subdf = pending_rows.sample(n=min(SAMPLE_SIZE, len(pending_rows)))
print(f'Test will be done on {subdf.shape[0]} samples.')



Test will be done on 1000 samples.


In [10]:
def format_ligne(ligne):
    """Build the context text that accompanies one feedback.

    The result always starts with a fixed sentence stating that feedbacks come
    from French public services, followed by one ``"<label>: <value>"`` line
    for each field of ``ligne`` that holds a usable value.

    Args:
        ligne: A row exposing ``.get`` (a ``dict`` or a ``pandas.Series``).
            The keys read are ``intitule_structure_1``,
            ``intitule_structure_2``, ``tags_metiers`` and ``pays``.

    Returns:
        str: The context lines joined with newline characters.
    """

    def extraire_champ(champ, allow_empty=False):
        """Return the value to print for ``champ``, or ``None`` to skip it.

        Despite its name, ``allow_empty`` decides whether the literal
        placeholder ``'N/A'`` is kept (``True``) or treated as missing
        (``False``). Empty values are dropped by the caller in every case.
        """
        # List-like values (e.g. a list of tags): pd.notnull() would return an
        # array whose truth value is ambiguous and raise ValueError, so handle
        # them explicitly. Null items are dropped and the rest is joined.
        # Dicts are left to the scalar path, as before.
        if pd.api.types.is_list_like(champ) and not isinstance(champ, dict):
            elements = [
                str(element)
                for element in champ
                if not (pd.api.types.is_scalar(element) and pd.isna(element))
            ]
            return ", ".join(elements) if elements else None

        if pd.notnull(champ) and (allow_empty or champ != 'N/A'):
            return champ
        return None

    # (label, raw value, keep the 'N/A' placeholder?) for each candidate field.
    champs = [
        ("Intitulé Structure 1", ligne.get("intitule_structure_1"), False),
        ("Intitulé Structure 2", ligne.get("intitule_structure_2"), True),
        ("Tags Métiers", ligne.get("tags_metiers"), True),
        ("Pays de la demande", ligne.get("pays"), False),
    ]

    # The first line is always the same fixed sentence.
    lignes = ["Feedbacks are from French public services."]

    # Add one line per field that has a truthy value.
    for label, champ, allow_empty in champs:
        valeur = extraire_champ(champ, allow_empty)
        if valeur:
            lignes.append(f"{label}: {valeur}")

    return "\n".join(lignes)

In [11]:
# Expose the verbatim under the 'text' column and attach the per-row context
# produced by format_ligne.
subdf = subdf.assign(
    text=subdf['verbatims'],
    brand_context=subdf.apply(format_ligne, axis=1),
)

# Keep only the three columns handed to the extraction helpers.
extraction_columns = ['text', '_id', 'brand_context']
subsubdf = subdf.loc[:, extraction_columns]

#### For one extraction

In [45]:
# First row of the prepared sample (a Series, despite the variable name).
text = subsubdf.iloc[0]

# Pull out the three fields the helper takes positionally, in this order.
verbatim, document_id, context = text['text'], text['_id'], text['brand_context']

extraction = extract_information_from_text(
    verbatim,
    document_id,
    context,
    language='french',
    model="gpt-4o-mini",
)


#### For multiple extractions

In [13]:
# Options shared by every extraction of this run.
# NOTE(review): save_to_mongo=True presumably writes the results back to the
# collection opened above -- confirm before running against production data.
extraction_options = {
    'brand_name': BRAND,
    'language': 'french',
    'model': "gpt-4o-mini",
    'save_to_mongo': True,
}

extractions = process_extractions_in_parallel(subsubdf, **extraction_options)

Processing chunks:   0%|          | 0/20 [00:00<?, ?it/s]

JSON decoding error: Unterminated string starting at: line 61 column 15 (char 1734)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:   5%|▌         | 1/20 [00:54<17:09, 54.16s/it]

JSON decoding error: Expecting ',' delimiter: line 67 column 4 (char 1777)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting property name enclosed in double quotes: line 66 column 1 (char 1754)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  15%|█▌        | 3/20 [02:00<10:52, 38.38s/it]

JSON decoding error: Unterminated string starting at: line 60 column 18 (char 1624)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Unterminated string starting at: line 66 column 24 (char 1663)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting property name enclosed in double quotes: line 69 column 6 (char 1691)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  20%|██        | 4/20 [02:40<10:28, 39.29s/it]

JSON decoding error: Unterminated string starting at: line 69 column 20 (char 1725)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Unterminated string starting at: line 66 column 15 (char 1716)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  25%|██▌       | 5/20 [03:18<09:38, 38.57s/it]

JSON decoding error: Unterminated string starting at: line 65 column 18 (char 1710)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  30%|███       | 6/20 [03:48<08:18, 35.60s/it]

JSON decoding error: Expecting ',' delimiter: line 68 column 1 (char 1714)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  35%|███▌      | 7/20 [04:20<07:28, 34.48s/it]

JSON decoding error: Expecting value: line 68 column 1 (char 1740)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Unterminated string starting at: line 65 column 18 (char 1662)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  45%|████▌     | 9/20 [05:30<06:20, 34.61s/it]

JSON decoding error: Expecting property name enclosed in double quotes: line 61 column 1 (char 1732)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  50%|█████     | 10/20 [05:59<05:27, 32.79s/it]

JSON decoding error: Unterminated string starting at: line 65 column 18 (char 1668)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  55%|█████▌    | 11/20 [06:32<04:54, 32.77s/it]

JSON decoding error: Unterminated string starting at: line 61 column 15 (char 1730)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting ',' delimiter: line 63 column 2 (char 1782)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  60%|██████    | 12/20 [06:55<03:59, 29.94s/it]

JSON decoding error: Expecting property name enclosed in double quotes: line 66 column 1 (char 1644)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  65%|██████▌   | 13/20 [07:27<03:33, 30.56s/it]

JSON decoding error: Unterminated string starting at: line 66 column 7 (char 1721)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  70%|███████   | 14/20 [08:06<03:18, 33.07s/it]

JSON decoding error: Unterminated string starting at: line 70 column 18 (char 1667)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  75%|███████▌  | 15/20 [08:45<02:54, 34.87s/it]

JSON decoding error: Expecting ',' delimiter: line 63 column 2 (char 1735)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  85%|████████▌ | 17/20 [09:37<01:30, 30.21s/it]

JSON decoding error: Unterminated string starting at: line 70 column 18 (char 1705)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Unterminated string starting at: line 65 column 18 (char 1708)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Unterminated string starting at: line 65 column 18 (char 1717)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting value: line 70 column 17 (char 1675)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  90%|█████████ | 18/20 [10:16<01:05, 32.75s/it]

JSON decoding error: Expecting property name enclosed in double quotes: line 66 column 1 (char 1810)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Expecting property name enclosed in double quotes: line 69 column 6 (char 1714)
[extract_information_from_text()] Missing key: 'text'


Processing chunks:  95%|█████████▌| 19/20 [10:49<00:32, 32.89s/it]

JSON decoding error: Expecting ',' delimiter: line 67 column 4 (char 1649)
[extract_information_from_text()] Missing key: 'text'
JSON decoding error: Unterminated string starting at: line 66 column 24 (char 1672)
[extract_information_from_text()] Missing key: 'text'


Processing chunks: 100%|██████████| 20/20 [11:50<00:00, 35.53s/it]


## Checking result in MongoDB

In [32]:
# Read the processed documents back from the database to check that the
# extractions were saved: match on the brand and on the ids found in
# `extractions`.
extracted_ids = [r['id'] for r in extractions]

saved_cursor = collection.find({
    'brand': BRAND,
    '_id': {'$in': extracted_ids},
})

documents = pd.DataFrame(saved_cursor)

KeyboardInterrupt: 

In [19]:
# (number of documents read back, number of columns).
documents.shape

(1000, 84)

In [20]:
# Verbatim of one randomly drawn document (a different draw on every run).
documents.sample().iloc[0].verbatims

"Prise de rdv dépôt dossier ccm\nBonjour,\r\nj'ai dû envoyer entre 5 courriels je me connecte tous les jours sur le site consulat Fès prise de rendez vous pour espérer avoir une date avec un créneau à choisir pour pouvoir déposer mon dossier capacité à mariage impossible d'avoir une date j envoi des mails également pas de retour.\r\nJe contacte le numéro de téléphone communiqué sur le site idem répondeur.\r\nQue faut il faire pour enfin avoir un retour ? Pourquoi il n'y a que ce consulat ou les rendez vous c'est devenu limite un Luxe d'en avoir un ?\r\nJe suis vraiment déçus ma date de mariage approche et mon dossier de CCM n'est même déposé."

In [15]:
# Extractions of one randomly drawn document. This is an independent draw, so
# it generally does not belong to the document whose verbatim was shown before.
documents.sample().iloc[0].extractions

[{'sentiment': 'NEGATIVE',
  'extraction': 'Difficulté à trouver le bouton de déconnexion',
  'text': 'Où se trouve le bouton de déconnexion de vote site ?'},
 {'sentiment': 'SUGGESTION',
  'extraction': 'Bouton de déconnexion près du profil',
  'text': 'Il devrait se trouver prés de mon profil et me permettre de me déconnecter "proprement"'}]

In [16]:
# `splitted_analysis_v2` of one randomly drawn document (again an independent
# draw, unrelated to the documents displayed in the other cells).
documents.sample().iloc[0].splitted_analysis_v2

[{'text': 'Demande de passeport et de carte d’identité',
  'extractions': [{'sentiment': 'NEGATIVE',
    'extraction': 'Demande de passeport et de carte d’identité'}]},
 {'text': 'J’avais des photos récentes on m’a demandé de les refaire sans aucun motif,',
  'extractions': [{'sentiment': 'NEGATIVE',
    'extraction': 'Demande de refaire des photos sans motif'}]},
 {'text': 'sauf celui de payer à nouveau 6€.',
  'extractions': [{'sentiment': 'NEGATIVE',
    'extraction': 'Coût supplémentaire de 6€ pour les nouvelles photos'}]},
 {'text': "Ce n'est vraiment pas cool,",
  'extractions': [{'sentiment': 'NEGATIVE',
    'extraction': 'Insatisfaction concernant la demande de nouvelles photos'}]},
 {'text': "alors qu'elles avaient été acceptées au départ.",
  'extractions': [{'sentiment': 'NEGATIVE',
    'extraction': 'Photos initialement acceptées'}]},
 {'text': 'Pourquoi me refaire faire des photos ?',
  'extractions': [{'sentiment': 'SUGGESTION',
    'extraction': 'Question sur la nécessit