In [1]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, cohen_kappa_score
from sentence_transformers import SentencesDataset, SentenceTransformer, InputExample
import torch
from torch.utils.data import DataLoader
from torch import device
import torch.nn as nn
from sentence_transformers import losses
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
import os
import json
import time
import random
from rapidfuzz import fuzz
from latent_embeddings_classifier import encode_all_sents
import numpy as np
import math
from transformers import AutoModel, BitsAndBytesConfig

cwd = os.getcwd()
output_dir =  cwd+"/../outputs/models"
input_dir =  cwd+"/../inputs"

from run_classifiers import group_duplicates, remove_duplicates, dcno_to_sentlab, gen_bn_lists, gen_mc_sentlab, classify_svm
from loops import SoftmaxClassifier
from custom_evaluator import CustomLabelAccuracyEvaluator

Using the GPU


In [2]:
# from loops.py
def build_data_samples(X_train, label2int, y_train):
    train_samples = []
    for sent, label in zip(X_train, y_train):
        label_id = label2int[label]
        train_samples.append(InputExample(texts=[sent], label=label_id))
    return train_samples

In [3]:
def custom_collate(batch):
    sentences = [item.texts[0] for item in batch]
    #sentences = [item.texts for item in batch]
    labels = torch.tensor([item.label for item in batch], dtype=torch.float32)
    return sentences, labels

In [4]:
with open(input_dir+"/19Jan25_firstdatarev.json","r", encoding="utf-8") as f:
        dcno_json = json.load(f)
with open(input_dir+"/27Jan25_query_checked.json","r", encoding="utf-8") as f:
    qry_json = json.load(f)
sents1, labels1 = dcno_to_sentlab(dcno_json)
sents2, labels2 = dcno_to_sentlab(qry_json)
# merge original and augmented datasets
sents2.extend(sents1)
labels2.extend(labels1)
all_sents, all_labs = remove_duplicates(group_duplicates(sents2,labels2,thresh=90))
#
# 0 for bn
# 9 for mc


1419 groups found with a threshold of 90
Sanity check: 1419 sentences and 1419 labels


In [5]:
def sent_labs_to_lstdct(sents, labs):
    lst = []
    for i in range(len(sents)):
        lst.append({
            'text': sents[i],
            'label': labs[i]
        })
    return lst

In [6]:
models = {
        #"sentence-transformers/paraphrase-xlm-r-multilingual-v1":'bert', 
        "dunzhang/stella_en_1.5B_v5":'stella', 
        #"Alibaba-NLP/gte-Qwen2-1.5B-instruct":'qwen', 
        #"Alibaba-NLP/gte-large-en-v1.5":'glarg', 
        #"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2":'minilm'
        }
bn_sents, bn_labels = gen_bn_lists(all_sents, all_labs, sanity_check=True)
mc_sents, mc_labels = gen_mc_sentlab(all_sents, all_labs, sanity_check=True)

bn_label2int = dict(zip(set(bn_labels), range(len(set(bn_labels)))))
mc_label2int = dict(zip(set(mc_labels), range(len(set(mc_labels)))))

bn_tk_labs = [bn_label2int[label] for label in bn_labels]
mc_tk_labs = [mc_label2int[label] for label in mc_labels]

bn_train_sents, bn_test_sents, bn_train_labels, bn_test_labels = train_test_split(bn_sents, bn_tk_labs, stratify=bn_tk_labs, test_size=0.2, random_state=0)
mc_train_sents, mc_test_sents, mc_train_labels, mc_test_labels = train_test_split(mc_sents, mc_tk_labs, stratify=mc_tk_labs, test_size=0.2, random_state=9)

bn_dataset = {'train': sent_labs_to_lstdct(bn_train_sents, bn_train_labels), 'test':sent_labs_to_lstdct(bn_test_sents, bn_test_labels)}
mc_dataset = {'train': sent_labs_to_lstdct(mc_train_sents, mc_train_labels), 'test':sent_labs_to_lstdct(mc_test_sents, mc_test_labels)}

Sanity Check: 263 incentive sentences and 1156 non-incentive sentences
Incentives: 0.18534178999295278; Non-Incentives: 0.8146582100070472
[154] Incentive: We will:•Continue to fund the Straw Incorporation Measure;•Launch a capital support measure for investments in the tillage sector as part of the CSP;
[686] Non-Incentive: This is an agri environment scheme applied for on an annual basis.
Sanity Check: 263 incentive sentences and 263 incentive labels
[198] Direct_payment: For farmers engaged in both arable crop and livestock production, payment for arable crop production is based on the actual area cropped by the farmer.
[94] Credit: This scheme gives farmers greater access to financial loans and encourages financial planning.
[79] Direct_payment: The large scale and broad ranging measures are expected to deliver cumulative impacts by engaging a large number of farmers to agri-environment schemes for the first time, rewarding them both for the continuati on of existing sustainable pr

In [7]:
from datasets import Dataset

bn_train_set = Dataset.from_list(bn_dataset['train'])
mc_train_set = Dataset.from_list(mc_dataset['train'])

In [8]:
from transformers import AutoTokenizer

model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(entries):
    return tokenizer(entries["text"], padding="max_length", truncation=True)
bn_tk_ds = bn_train_set.map(tokenize_function, batched=True)
mc_tk_ds = mc_train_set.map(tokenize_function, batched=True)

Map:   0%|          | 0/1135 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoModelForSequenceClassification
bn_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, torch_dtype=torch.float16)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-xlm-r-multilingual-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments, Trainer
train_batch_size = 16
bn_training_args = TrainingArguments(output_dir="test_trainer",
                                    eval_strategy='epoch',
                                    num_train_epochs=10,
                                    warmup_steps = math.ceil(len(bn_tk_ds) * 10 / train_batch_size * 0.1),
                                    eval_steps=1000,
                                    learning_rate=2e-5
                                )

In [12]:
import numpy as np
import evaluate

bn_f1 = evaluate.load("f1")

def compute_f1(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return bn_f1.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=bn_model,
    args=bn_training_args,
    train_dataset=bn_tk_ds,
    eval_dataset=mc_tk_ds,
    compute_metrics=compute_f1,
)

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
