In [12]:
import pathlib
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import random
import os
import re
from dotenv import load_dotenv
from typing import Optional, Union
import dspy
import pickle
from dspy.datasets import Dataset
import pathlib
from sklearn.model_selection import train_test_split
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch
import time
from fuzzywuzzy import process
from bert_score import score

In [13]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [14]:
###########
# API KEY #
###########
path_env = pathlib.Path("/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/.env")
print(path_env)
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = api_key

llm = dspy.OpenAI(model="gpt-3.5-turbo")# "gpt-4o-2024-05-13")

/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Search-Tool/.env


In [15]:
lm = dspy.HFClientTGI(model="meta-llama/Meta-Llama-3-8B ", port=8080, url="http://127.0.0.1")
dspy.settings.configure(lm=lm)

In [17]:
df_samples = pd.read_excel("data/admin_eval_task/curated/tarea_zaragoza.xlsx")
print(df_samples.columns)
df_samples.columns = ["idx","procurement_id", "doc_name", "text", "objetivo"]

Index(['Unnamed: 0', 'procurement_id', 'doc_name', 'text', 'objetivo'], dtype='object')


In [18]:
df_samples.columns

Index(['idx', 'procurement_id', 'doc_name', 'text', 'objetivo'], dtype='object')

In [19]:
class TenderDataset(Dataset):

    def __init__(
        self,
        data_fpath: Union[pathlib.Path, str],
        dev_size: Optional[float] = 0.2,  
        test_size: Optional[float] = 0.2,
        text_key: str = "text",
        label_key: str = "objetivo",
        seed: Optional[int] = 11235,
        *args,
        **kwargs
    ) -> None:
        super().__init__(*args, **kwargs)

        self.labels = []
        self._train = []
        self._dev = []
        self._test = []

        # Read the training data
        train_data = pd.read_excel(data_fpath)

        train_data, temp_data = train_test_split(train_data, test_size=dev_size + test_size, random_state=seed)
        dev_data, test_data = train_test_split(temp_data, test_size=test_size / (dev_size + test_size), random_state=seed)
       
        self._train = [
            dspy.Example({**row}).with_inputs(text_key) for row in self._convert_to_json(train_data)
        ]
        self._dev = [
            dspy.Example({**row}).with_inputs(text_key) for row in self._convert_to_json(dev_data)
        ]
        self._test = [
            dspy.Example({**row}).with_inputs(text_key) for row in self._convert_to_json(test_data)
        ]

    def _convert_to_json(self, data: pd.DataFrame):
        if data is not None:
            return data.to_dict(orient='records')
        return []

In [21]:
# Create dataset
dataset = TenderDataset(
    data_fpath="data/admin_eval_task/curated/tarea_zaragoza.xlsx",
    dev_size=0.25,
)

trainset = dataset._train
devset = dataset._dev
testset = dataset._test

In [22]:
from langdetect import detect

def get_lang(df: pd.DataFrame, col_calculate_on: str) -> pd.DataFrame:
    def det(x: str) -> str:
        try:
            lang = detect(x)
        except Exception as e:
            print(e)
            lang = 'Other'
        return lang

    print(f"-- Detecting language...")
    start_time = time.time()

    df['lang'] = df[col_calculate_on].apply(det)

    print(f'-- -- Language detect finished in {(time.time() - start_time)} seconds')

    return df

# Load final_labels from the pickle file
with open('final_labels.pkl', 'rb') as file:
    final_labels = pickle.load(file)

# Convert final_labels to a DataFrame
df_labels = pd.DataFrame(final_labels, columns=['label'])

# Detect the language of each label
df_labels = get_lang(df_labels, 'label')

# Filter the DataFrame to keep only Spanish labels
df_spanish_labels = df_labels[df_labels['lang'] != 'ca']

# Convert the filtered DataFrame back to a list, if necessary
spanish_labels = df_spanish_labels['label'].tolist()

# Display the filtered Spanish labels
#print(spanish_labels)

# To use as HINT
def normalize_string(s):
    # Convert to lower case
    s = s.lower()
    # Remove punctuation and special characters
    s = re.sub(r'[^\w\s]', '', s)
    # Remove extra spaces
    s = re.sub(r'\s+', ' ', s).strip()
    return s

normalized_array = np.array([normalize_string(item) for item in spanish_labels])
unique_dict = {}
for original, normalized in zip(final_labels, normalized_array):
    if normalized not in unique_dict:
        unique_dict[normalized] = original
clean_final_labels = np.array(list(unique_dict.values()))


# Function to normalize the list
def normalize_labels(labels, threshold=85):
    normalized = []
    for label in labels:
        match = process.extractOne(label, normalized, score_cutoff=threshold)
        if match:
            normalized.append(match[0])
        else:
            normalized.append(label)
    return list(set(normalized))

# Normalize the labels
normalized_labels = normalize_labels(clean_final_labels)
print(normalized_labels)

final_labels = ["EJECUCIÓN DEL CONTRATO", "DESCRIPCIÓN DE LOS CONTRATOS", "OBJETIVOS DEL CONTRATO", "OBLIGACIONES DEL CONTRATO", "OBJETO DE LA CONTRATACIÓN"]

-- Detecting language...
-- -- Language detect finished in 3.994781255722046 seconds
['DOCUMENTOS DEL CONTRATO', 'EXPEDIENTE DE CONTRATACIÓN:', 'OBLIGACIONES DEL CONTRATO', 'IV. EXECUCIÓ DEL CONTRACTE', 'CATALOGACIÓN EN EL CONTRATO', 'PRESTACIONES DEL CONTRATO', 'ÓRGANO DE CONTRATACIÓN PROPONENTE:', 'OBJETO DE LA CONTRATACIÓN', 'CATEGORÍA DEL CONTRATO', 'EJECUCIÓN DEL CONTRATO', 'ESPECIFICACIONES DEL CONTRATO', 'DESCRIPCIÓN DE LA CONTRATACIÓN', 'OBJETIVOS DEL CONTRATO.', 'COMISIÓN DE CONTRATACIÓN:', 'PROCEDIMIENTO DE CONTRATACIÓN', "ÁMBIT D'APLICACIÓ DEL CONTRACTE", 'DESCRIPCIÓN DE LOS CONTRATOS', 'ADJUDICACION DEL CONTRATO', 'REPRESENTANTE DEL CONTRATO', 'MODIFICACIÓN DEL CONTRATO.']


In [23]:
class PredictObjecto(dspy.Signature):
    """
    Extract the objective of the contract from a document containing the technical specifications of a Spanish public tender. If the objective is not present in the document, return '/'.

    Requirements:

    The extracted text must exclusively consist of words from the document. No additional words are allowed.
    The language of the document must remain unchanged under all circumstances.
    """

    TENDER = dspy.InputField(desc="The document containing the technical specifications of the Spanish public tender.")
    OBJECTIVE = dspy.OutputField(desc="The tender objective, or 'N_A' if not present.")


class PredictModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.predict = dspy.Predict(PredictObjecto)
        #self.predict = dspy.ChainOfThoughtWithHint(PredictObjecto)
        
    def _process_output(self, text):
        
        if "N_A" in text:
            return "/"
        else:
            return text
      
    def forward(self, text):
        #hint = f"Valid candidates for 'tender objective' may start with {final_labels}."

        pred = self.predict(TENDER=text[0:5000])#, hint=hint

        return dspy.Prediction(objective=self._process_output(pred.OBJECTIVE))

In [24]:
def combined_score(example, pred, trace=None):
    def matching_score(example, pred, trace=None):
        if example.objetivo == "/":
            if pred["objective"] == "/":
                return 1.0
            else:
                return 0.0
        
        predicted_lst = pred["objective"].split()
        gt_lst = example.objetivo.split()
        
        predicted_set = set(predicted_lst)
        gt_set = set(gt_lst)

        intersection = predicted_set.intersection(gt_set)
        union = predicted_set.union(gt_set)
        
        if len(union) == 0:
            return 0.0
        jaccard_similarity = len(intersection) / len(union)
        
        return jaccard_similarity

    def is_in_text_score(example, pred, trace=None):
        if example.objetivo == "/":
            if pred["objective"] == "/":
                return 1.0
            else:
                return 0.0
            
        text_lst = example.text[0:5000].lower().split()
        predicted_lst = pred["objective"].lower().split()

        words_not_in_text = [word for word in predicted_lst if word not in text_lst]
        num_words_not_in_text = len(words_not_in_text)
        
        total_predicted_words = len(predicted_lst)
        score = max(0.0, 1.0 - (num_words_not_in_text / total_predicted_words))
        
        return score

    match_score = matching_score(example, pred, trace)
    text_score = is_in_text_score(example, pred, trace)    
    combined = (0.5 * match_score) + (0.5 * text_score)
    
    return combined

In [25]:
config = dict(max_bootstrapped_demos=3, max_labeled_demos=3, num_candidate_programs=10, max_rounds=1,)
teleprompter = BootstrapFewShotWithRandomSearch(metric=combined_score, **config)
compiled_classifier = teleprompter.compile(PredictModule(), trainset=trainset, valset=devset)

Going to sample between 1 and 3 traces per predictor.
Will attempt to train 10 candidate sets.


Average Metric: 8.722383986214217 / 18  (48.5): 100%|██████████| 18/18 [00:00<00:00, 275.38it/s]
  df = df.applymap(truncate_cell)


Average Metric: 8.722383986214217 / 18  (48.5%)
Score: 48.46 for set: [0]
New best score: 48.46 for seed -3
Scores so far: [48.46]
Best score: 48.46


Average Metric: 8.722383986214217 / 18  (48.5): 100%|██████████| 18/18 [00:00<00:00, 228.36it/s] 


Average Metric: 8.722383986214217 / 18  (48.5%)
Score: 48.46 for set: [3]
Scores so far: [48.46, 48.46]
Best score: 48.46


 10%|▉         | 4/41 [00:00<00:00, 302.65it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 12.490846945163335 / 18  (69.4): 100%|██████████| 18/18 [00:00<00:00, 173.74it/s]


Average Metric: 12.490846945163335 / 18  (69.4%)
Score: 69.39 for set: [3]
New best score: 69.39 for seed -1
Scores so far: [48.46, 48.46, 69.39]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.7141382826369798
Average of max per entry across top 3 scores: 0.7141382826369798
Average of max per entry across top 5 scores: 0.7141382826369798
Average of max per entry across top 8 scores: 0.7141382826369798
Average of max per entry across top 9999 scores: 0.7141382826369798


  7%|▋         | 3/41 [00:00<00:00, 319.69it/s]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 8.064963803603172 / 18  (44.8): 100%|██████████| 18/18 [00:00<00:00, 214.33it/s] 


Average Metric: 8.064963803603172 / 18  (44.8%)
Score: 44.81 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.7141382826369798
Average of max per entry across top 3 scores: 0.7141382826369798
Average of max per entry across top 5 scores: 0.7141382826369798
Average of max per entry across top 8 scores: 0.7141382826369798
Average of max per entry across top 9999 scores: 0.7141382826369798


  2%|▏         | 1/41 [00:00<00:00, 242.25it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 2.158840213201268 / 18  (12.0): 100%|██████████| 18/18 [00:00<00:00, 247.83it/s]


Average Metric: 2.158840213201268 / 18  (12.0%)
Score: 11.99 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.7141382826369798
Average of max per entry across top 3 scores: 0.7141382826369798
Average of max per entry across top 5 scores: 0.722440769200336
Average of max per entry across top 8 scores: 0.722440769200336
Average of max per entry across top 9999 scores: 0.722440769200336


  2%|▏         | 1/41 [00:00<00:00, 223.84it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 0.0 / 18  (0.0): 100%|██████████| 18/18 [00:00<00:00, 270.45it/s]


Average Metric: 0.0 / 18  (0.0%)
Score: 0.0 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99, 0.0]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.7141382826369798
Average of max per entry across top 3 scores: 0.7141382826369798
Average of max per entry across top 5 scores: 0.722440769200336
Average of max per entry across top 8 scores: 0.722440769200336
Average of max per entry across top 9999 scores: 0.722440769200336


  2%|▏         | 1/41 [00:00<00:00, 211.31it/s]

Bootstrapped 1 full traces after 2 examples in round 0.



Average Metric: 10.005318268498243 / 18  (55.6): 100%|██████████| 18/18 [00:00<00:00, 185.29it/s]


Average Metric: 10.005318268498243 / 18  (55.6%)
Score: 55.59 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99, 0.0, 55.59]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.7123554389188552
Average of max per entry across top 3 scores: 0.7162367275421139
Average of max per entry across top 5 scores: 0.7162367275421139
Average of max per entry across top 8 scores: 0.7232925801862019
Average of max per entry across top 9999 scores: 0.7232925801862019


  2%|▏         | 1/41 [00:00<00:00, 242.78it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 1.9117667818115387 / 18  (10.6): 100%|██████████| 18/18 [00:00<00:00, 197.78it/s]


Average Metric: 1.9117667818115387 / 18  (10.6%)
Score: 10.62 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99, 0.0, 55.59, 10.62]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.7123554389188552
Average of max per entry across top 3 scores: 0.7162367275421139
Average of max per entry across top 5 scores: 0.7162367275421139
Average of max per entry across top 8 scores: 0.7232925801862019
Average of max per entry across top 9999 scores: 0.7232925801862019


  7%|▋         | 3/41 [00:00<00:00, 303.23it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 11.50928988814715 / 18  (63.9): 100%|██████████| 18/18 [00:00<00:00, 189.00it/s] 


Average Metric: 11.50928988814715 / 18  (63.9%)
Score: 63.94 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99, 0.0, 55.59, 10.62, 63.94]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.706717118197745
Average of max per entry across top 3 scores: 0.7164636330492901
Average of max per entry across top 5 scores: 0.7192271482861579
Average of max per entry across top 8 scores: 0.7262830009302459
Average of max per entry across top 9999 scores: 0.7262830009302459


 10%|▉         | 4/41 [00:00<00:00, 286.54it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 8.355477442182027 / 18  (46.4): 100%|██████████| 18/18 [00:00<00:00, 132.56it/s]

Average Metric: 8.355477442182027 / 18  (46.4%)





Score: 46.42 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99, 0.0, 55.59, 10.62, 63.94, 46.42]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.706717118197745
Average of max per entry across top 3 scores: 0.7164636330492901
Average of max per entry across top 5 scores: 0.7192271482861579
Average of max per entry across top 8 scores: 0.7269146403215752
Average of max per entry across top 9999 scores: 0.7269146403215752


  5%|▍         | 2/41 [00:00<00:00, 252.75it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 8.592719712196011 / 18  (47.7): 100%|██████████| 18/18 [00:00<00:00, 145.58it/s]


Average Metric: 8.592719712196011 / 18  (47.7%)
Score: 47.74 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99, 0.0, 55.59, 10.62, 63.94, 46.42, 47.74]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.706717118197745
Average of max per entry across top 3 scores: 0.7164636330492901
Average of max per entry across top 5 scores: 0.7192271482861579
Average of max per entry across top 8 scores: 0.7386186703997035
Average of max per entry across top 9999 scores: 0.7456745230437914


  5%|▍         | 2/41 [00:00<00:00, 308.03it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 8.070690391180063 / 18  (44.8): 100%|██████████| 18/18 [00:00<00:00, 300.10it/s]


Average Metric: 8.070690391180063 / 18  (44.8%)
Score: 44.84 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99, 0.0, 55.59, 10.62, 63.94, 46.42, 47.74, 44.84]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.706717118197745
Average of max per entry across top 3 scores: 0.7164636330492901
Average of max per entry across top 5 scores: 0.7192271482861579
Average of max per entry across top 8 scores: 0.7479705141044938
Average of max per entry across top 9999 scores: 0.7550263667485817


 10%|▉         | 4/41 [00:00<00:00, 305.86it/s]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 6.656283367650822 / 18  (37.0): 100%|██████████| 18/18 [00:00<00:00, 154.00it/s]


Average Metric: 6.656283367650822 / 18  (37.0%)
Score: 36.98 for set: [3]
Scores so far: [48.46, 48.46, 69.39, 44.81, 11.99, 0.0, 55.59, 10.62, 63.94, 46.42, 47.74, 44.84, 36.98]
Best score: 69.39
Average of max per entry across top 1 scores: 0.6939359413979631
Average of max per entry across top 2 scores: 0.706717118197745
Average of max per entry across top 3 scores: 0.7164636330492901
Average of max per entry across top 5 scores: 0.7192271482861579
Average of max per entry across top 8 scores: 0.7479705141044938
Average of max per entry across top 9999 scores: 0.7550263667485817
13 candidate programs found.


In [26]:
tests = []
for el in testset:
    output = compiled_classifier(el.text)
    tests.append([el.text[0:5000],el.objetivo, output["objective"], combined_score(el,output)])

In [None]:
model_name = "microsoft/deberta-xlarge-mnli"

In [30]:
results = pd.DataFrame(tests, columns=["TEXT","GROUND", "PREDICTED", "METRIC"])

P, R, F1 = score(results.PREDICTED.values.tolist(), results.GROUND.values.tolist(), lang='es', model_type=model_name) # TODO

In [28]:
results["METRIC"].mean()

0.7199627557398761

In [29]:
P.mean(), R.mean(), F1.mean()

(tensor(0.8102), tensor(0.7834), tensor(0.7925))