# C21 : Feedback Loop

Modèle :
les embeddings du dataset ont été calculés au préalable avec le même modèle que celui chargé ci-dessous (`sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`).

## 0. Import dependencies

In [None]:
import psycopg2 
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
from dotenv import load_dotenv 
import logging

# Module-level logger; records at INFO level and above are written to app.log.
logger = logging.getLogger(__name__)
logging.basicConfig(filename="app.log",level=logging.INFO)

# BibTeX citation for the Sentence-BERT paper.
reference =""" @inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "http://arxiv.org/abs/1908.10084",
}"""
# Sentence-embedding model used further down to encode the feedback sentences.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

## 1. Récupération des données utilisateurs

In [None]:
# Load environment variables from a .env file; the DB_* settings are read
# with os.getenv when connecting to the database.
load_dotenv()
# PARAMS
# Only feedback whose created_at is later than this timestamp is fetched.
min_date = "2024-05-19 12:00:00"

In [None]:
def get_feedbacks():
    """Fetch the negative feedback rows stored in the database.

    Connects to PostgreSQL using the ``DB_NAME``, ``DB_USERNAME``,
    ``DB_PASSWORD``, ``DB_HOST`` and ``DB_PORT`` environment variables and
    selects every row of ``translation_feedback`` created after the
    module-level ``min_date`` whose ``is_correct`` is false.

    Returns:
        The rows as returned by ``cursor.fetchall()`` (a list of tuples).
    """
    # Keyword arguments instead of a hand-built DSN string: values that
    # contain spaces or quotes (e.g. a password) no longer break the DSN.
    conn = psycopg2.connect(
        dbname=os.getenv("DB_NAME"),
        user=os.getenv("DB_USERNAME"),
        password=os.getenv("DB_PASSWORD"),
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
    )
    try:
        with conn.cursor() as cur:
            # Identifiers cannot be bound as parameters; table_name is a
            # constant, so formatting it into the statement is safe.
            table_name = "translation_feedback"
            query_neg = (
                f"SELECT * FROM {table_name} "
                "WHERE created_at > %s AND is_correct=False"
            )
            # The date is passed as a bound parameter, not interpolated.
            cur.execute(query_neg, (min_date,))
            return cur.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()


data_neg = get_feedbacks()

## 2. Calcul des embeddings 

In [None]:
# Build one list per language from the feedback rows
# (column 1 feeds the "fr" list, column 2 the "pl" list).
sentences_neg_fr = [row[1] for row in data_neg]
sentences_neg_pl = [row[2] for row in data_neg]

In [None]:
# model card : https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
# Encode the feedback sentences of each language with the loaded model.
embeddings_feedback_fr = model.encode(sentences_neg_fr)
embeddings_feedback_pl = model.encode(sentences_neg_pl)

## 3. Flag des données

In [None]:
# Load the dataset.
# NOTE(review): quotechar="}" is unusual; the CSV written at the end of this
# file uses the same value — confirm it matches how fr_pl.csv was produced.
df_frpl = pd.read_csv("datasets/fr_pl.csv",quotechar="}")

In [None]:
# Load the precomputed dataset embeddings, one .npy file per language.
path_embeddings_fr = "datasets/embeddings_fr.npy"
embeddings_fr = np.load(path_embeddings_fr)

path_embeddings_pl = "datasets/embeddings_pl.npy"
embeddings_pl = np.load(path_embeddings_pl)

In [None]:
# Concatenate the two languages: each row becomes [fr embedding | pl embedding].
# np.hstack joins the arrays column-wise in one vectorized call and raises a
# ValueError if the two languages do not have the same number of rows, instead
# of silently pairing/truncating misaligned data.
embeddings_feedback_frpl = np.hstack((embeddings_feedback_fr, embeddings_feedback_pl))
print(embeddings_feedback_frpl.shape)
embeddings_frpl = np.hstack((embeddings_fr, embeddings_pl))
print(embeddings_frpl.shape)

In [None]:
def flag_similarity(feedbacks: np.ndarray, datasets: np.ndarray, threshold: float = 25) -> np.ndarray:
    """
        Return the dataset indexes that score above ``threshold`` against
        at least one feedback.

        For each feedback embedding:
            1. Compute its dot-product score against every dataset embedding
               with ``util.dot_score``. The inputs are not normalised here,
               so this is a cosine similarity only if the embeddings are
               already unit-length.
            2. Keep the dataset indexes whose score is strictly greater
               than ``threshold``.
        The indexes collected over all feedbacks are then de-duplicated.

        Args:
            feedbacks: iterable of feedback embeddings (one vector each).
            datasets: dataset embeddings, scored against each feedback.
            threshold: minimum (exclusive) dot-product score to flag a row.

        Returns:
            Sorted 1-D integer array of unique dataset indexes to flag;
            empty (still integer-typed) when nothing matches.
    """
    indexes_array = []
    for feedback in feedbacks:
        # dot_score returns a (1, n_dataset) tensor for a single query vector.
        scores = util.dot_score(feedback, datasets)[0].cpu().numpy()
        # No sort needed: np.unique below orders the final result.
        indexes_array.extend(np.flatnonzero(scores > threshold).tolist())

    # Explicit integer dtype so an empty result is still a valid index array
    # (np.array([]) would default to float64).
    return np.unique(np.array(indexes_array, dtype=np.int64))

In [None]:
# Dataset row indexes that are close to at least one negative feedback;
# they can then be removed or just flagged (depending on the user's choice).
indexes_to_flag = flag_similarity(embeddings_feedback_frpl,embeddings_frpl)

In [None]:
# (Re)create the flag column with every row set to False.
# NOTE(review): this assignment is unconditional, so an existing
# "flag_incorrect" column would be overwritten.
df_frpl["flag_incorrect"] = False
# Flag the specified rows.
# NOTE(review): .loc selects by index label; this matches row positions only
# if df_frpl keeps the default index from read_csv — confirm.
df_frpl.loc[indexes_to_flag,"flag_incorrect"] = True

In [None]:
# Save the flagged dataset to a new CSV file, without the index column and
# with the same "}" quote character used when reading the input.
df_frpl.to_csv("datasets/df_frpl_w_feedbacks.csv",index=False,quotechar="}")