In [1]:
import importlib
import json
from typing import List, Dict
from urllib.parse import quote_plus

import boto3
import certifi
import pandas as pd
from pymongo import MongoClient
from tqdm import tqdm

from utils.extractions_utils import generate_extractions, split_text_parts, extract_information_from_text, add_extractions_to_splitted_analysis

In [2]:
# Deployment stage; its capitalized form is used to build the AWS secret id.
STAGE = "prod"

Loading all key libraries

In [3]:
# Fetch the application secrets for the current STAGE from AWS Secrets Manager.
# The secret payload is a JSON document; it is parsed once and the individual
# credentials are read from it below.
_secrets_manager_client = boto3.client("secretsmanager", region_name="eu-west-3")

_secrets = json.loads(
    _secrets_manager_client.get_secret_value(
        SecretId=f"{STAGE.capitalize()}/alloreview"
    )["SecretString"]
)

# Percent-escape the password before embedding it in the URI: characters such
# as '@', ':', '/' or '%' would otherwise corrupt the connection string.
# NOTE(review): assumes the secret stores the raw (not already escaped)
# password — confirm before relying on this with special characters.
MONGO_CONNECTION_STRING = (
    "mongodb+srv://alloreview:{}@feedbacksdev.cuwx1.mongodb.net".format(
        quote_plus(_secrets["mongodb"]["password"])
    )
)

# API keys for the LLM providers, read from the same secret document.
OPENAI_API_KEY = _secrets["openai"]["api_key"]
LLM_API_KEY = _secrets["litellm"]["api_key"]


In [4]:
# Open the MongoDB connection, passing certifi's CA bundle path for TLS.
mongo_client = MongoClient(
    MONGO_CONNECTION_STRING,
    tlsCAFile=certifi.where(),
)

# Handle on the 'feedbacks_Prod' collection of the 'feedbacks_db' database.
collection = mongo_client["feedbacks_db"]["feedbacks_Prod"]

In [5]:
# Brand identifier and short context description passed to the extraction call.
BRAND = "ditp_analysis"

# Leading and trailing newlines are part of the value and are kept on purpose.
BRAND_DESCR = "\nFeedbacks are from French public services.\n"

## Run the extraction pipeline

In [6]:
# This function parallelizes the extraction process and can save the results to the MongoDB database.
from utils.extractions_utils import process_extractions_in_parallel

In [7]:
# Load the test data from a local CSV file.
# The variable name is kept as-is because the following cells refer to it.
from_mongo = pd.read_csv("ditp_test.csv")

In [13]:
# Draw a small, reproducible sample of rows for the test run.
N_TEST_SAMPLES = 20
SAMPLE_SEED = 42  # fixed seed so a fresh "Restart & Run All" selects the same rows

# Cap the sample size at the number of available rows: DataFrame.sample raises
# a ValueError when asked for more rows than exist (sampling without replacement).
subdf = from_mongo.sample(
    n=min(N_TEST_SAMPLES, len(from_mongo)),
    random_state=SAMPLE_SEED,
)
print(f'Test will be done on {subdf.shape[0]} samples.')



Test will be done on 20 samples.


In [14]:
# Expose the verbatims under a 'text' column, then build one
# {'text': ..., '_id': ...} record per sampled row for the extraction call.
subdf['text'] = subdf['verbatims']
texts_with_ids = subdf.loc[:, ['text', '_id']].to_dict(orient='records')

In [15]:
# Keyword options for the extraction run, grouped so they can be reviewed at a glance.
_extraction_options = {
    "brand_name": BRAND,
    "brand_descr": BRAND_DESCR,
    "language": "french",
    "model": "gpt-4o-mini",
    # presumably disables the MongoDB write — confirm in utils.extractions_utils
    "save_to_mongo": False,
}

# Run the extraction over the sampled records.
extractions = process_extractions_in_parallel(texts_with_ids, **_extraction_options)

Processing chunks:   0%|          | 0/1 [00:00<?, ?it/s]



Processing chunks: 100%|██████████| 1/1 [00:12<00:00, 12.60s/it]
