In [1]:
import pandas as pd

# Parents-only pair file: load, shuffle deterministically (seed 42), renumber
# the rows, and keep the first 100 rows as a small working sample.
file_path = '/mnt/c/Users/Sergi/Desktop/BSC/umls_max15pairs_parents.tsv'
data1 = pd.read_csv(file_path, sep='\t')
data1 = data1.sample(frac=1, random_state=42).reset_index(drop=True)
data1_head = data1.head(100)

# Parents-and-grandparents pair file: same treatment. The file is read once
# only; the previous duplicate read_csv call just repeated the disk I/O.
file_path = '/mnt/c/Users/Sergi/Desktop/BSC/umls_max15pairs_parents_and_grandparents.tsv'
data2 = pd.read_csv(file_path, sep='\t')
data2 = data2.sample(frac=1, random_state=42).reset_index(drop=True)
data2_head = data2.head(100)

In [2]:
# Rebuild both 100-row samples with a fresh 0..n-1 index (old index discarded).
data1_head, data2_head = (
    sample.reset_index(drop=True) for sample in (data1_head, data2_head)
)

In [3]:
# Build the single-string model input: source and target joined by the literal
# " </s> " separator (presumably the encoder's separator token — TODO confirm).
data1_head["source_target"]=data1_head["source"] + " </s> " + data1_head["target"]

In [4]:
from setfit import SetFitTrainer, SetFitModel
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Local directory of the SetFit checkpoint to load.
path = '/mnt/c/Users/Sergi/Desktop/BSC/modelos_entrenados/SetFit/noparents_sp'
model = SetFitModel.from_pretrained(path)
# Label names; compute_predictions maps the i-th head output to all_labels[i].
all_labels = ['BROAD','EXACT','NARROW']

def compute_predictions(mention, model, threshold=0.5, top_n=1):
    """Return the highest-scoring relation labels predicted for one mention.

    Args:
        mention: Text to classify; it is encoded as a single-element batch.
        model: Object exposing ``model_body.encode``, ``model_head.predict_proba``
            and ``normalize_embeddings`` (a SetFit model in this notebook).
        threshold: A label is kept only if its score is strictly greater than
            this value. Defaults to 0.5, the previously hard-coded cutoff.
        top_n: Number of best-scoring labels considered before thresholding.
            Defaults to 1, the previously hard-coded value.

    Returns:
        A list of label names from the module-level ``all_labels``, ordered by
        decreasing score; empty when no candidate exceeds ``threshold``.
    """
    embeddings = model.model_body.encode(
        [mention],
        normalize_embeddings=model.normalize_embeddings,
        convert_to_tensor=True,
    )
    # Assumes the head yields one (n_samples, 2) array per label, in the same
    # order as `all_labels`; column 1 is read as the score of that label.
    per_label_probs = model.model_head.predict_proba(embeddings)
    predscores = {all_labels[i]: arr[:, 1].tolist()[0] for i, arr in enumerate(per_label_probs)}
    ranked_labels = sorted(predscores, key=predscores.get, reverse=True)[:top_n]
    return [label for label in ranked_labels if predscores[label] > threshold]

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
import evaluate
from collections import Counter
from datasets import Dataset

In [7]:
def compute_metrics(y_pred, y_test):
    """Score multilabel predictions and tally the labels that were missed entirely.

    Args:
        y_pred: Predicted indicator rows, one row per sample.
        y_test: Reference indicator rows, aligned with ``y_pred``.

    Returns:
        A dict with the micro-averaged ``"f1"``, the ``"accuracy"``, and
        ``"Classes with no given label"``: for every sample whose prediction
        row is all zeros, a count of the true labels of that sample.
    """
    f1_metric = evaluate.load("f1", "multilabel")
    accuracy_metric = evaluate.load("accuracy", "multilabel")
    scores = {
        "f1": f1_metric.compute(predictions=y_pred, references=y_test, average="micro")["f1"],
        "accuracy": accuracy_metric.compute(predictions=y_pred, references=y_test)["accuracy"],
    }

    label_names = ['BROAD', 'EXACT', 'NARROW']
    predicted_rows = np.array(y_pred)
    reference_rows = np.array(y_test)

    # Walk the samples; only rows where the model predicted nothing contribute.
    missed = Counter()
    for row_idx, predicted_row in enumerate(predicted_rows):
        if np.any(predicted_row != 0):
            continue
        missed.update(
            label_names[col] for col, flag in enumerate(reference_rows[row_idx]) if flag == 1
        )

    scores["Classes with no given label"] = dict(missed)
    return scores

def train_evaluate(model, trainX, trainY, testX, testY, num_iterations=5, output_dir='./trained_model'):
    """Train ``model`` with SetFit, evaluate it, save it, and return the metrics.

    Args:
        model: Model handed to ``SetFitTrainer``; must provide ``save_pretrained``.
        trainX: Training texts.
        trainY: Training labels, one per text.
        testX: Evaluation texts.
        testY: Evaluation labels, one per text.
        num_iterations: Forwarded to ``SetFitTrainer``. Defaults to 5, the value
            that used to be hard-coded.
        output_dir: Directory passed to ``model.save_pretrained``. Defaults to
            ``'./trained_model'``, the path that used to be hard-coded.

    Returns:
        Whatever ``trainer.evaluate()`` returns (computed with ``compute_metrics``).
    """
    train_dataset, test_dataset = prepare_data(trainX, trainY, testX, testY)
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        metric=compute_metrics,
        num_iterations=num_iterations,
    )
    trainer.train()
    metrics = trainer.evaluate()
    # Persist only after evaluation so a failed run leaves no saved model behind.
    model.save_pretrained(output_dir)
    return metrics

def prepare_data(trainX, trainY, testX, testY):
    """Build train/test ``Dataset`` objects with binarized multilabel targets.

    Each label is wrapped in a one-element set so ``MultiLabelBinarizer`` emits
    one indicator row per sample. The binarizer is fitted once, on the training
    labels only, and the same fitted encoding is applied to the test labels.

    Args:
        trainX: Training texts.
        trainY: Training labels, one hashable label per text.
        testX: Test texts.
        testY: Test labels, one hashable label per text.

    Returns:
        ``(train_dataset, test_dataset)``, each with ``"text"`` and ``"label"``
        columns; labels are plain nested lists.
    """
    train_label_sets = [{label} for label in trainY]
    test_label_sets = [{label} for label in testY]

    mlb = MultiLabelBinarizer()
    # Single fit (the previous extra fit_transform call discarded its result).
    train_labels = mlb.fit_transform(train_label_sets).tolist()
    test_labels = mlb.transform(test_label_sets).tolist()

    train_dataset = Dataset.from_dict({"text": trainX, "label": train_labels})
    test_dataset = Dataset.from_dict({"text": testX, "label": test_labels})
    return train_dataset, test_dataset

In [8]:
# Materialise each column once, then cut both at row 75:
# rows [0, 75) form the training split, the remainder the test split.
pair_texts = data1_head["source_target"].values.tolist()
relation_types = data1_head["rel_type"].values.tolist()

trainX, testX = pair_texts[:75], pair_texts[75:]
trainY, testY = relation_types[:75], relation_types[75:]

In [9]:
path = '/mnt/c/Users/Sergi/Desktop/BSC/modelos_entrenados/SetFit/noparents_sp'
model = SetFitModel.from_pretrained(path, multi_target_strategy="multi-output")
all_labels = ['BROAD','EXACT','NARROW']

# Wrap each label in a one-element set under NEW names. Overwriting
# trainY/testY in place made this cell fail on a second run, because the
# already-wrapped sets would be wrapped again (sets are not hashable).
train_label_sets = [{label} for label in trainY]
test_label_sets = [{label} for label in testY]

# Fit the binarizer once, on the training labels only, and reuse that
# encoding for the test labels so both splits share the same columns.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_label_sets).tolist()
test_labels = mlb.transform(test_label_sets).tolist()

print(type(trainX), type(train_labels), type(testX), type(test_labels))
train_dataset = Dataset.from_dict({"text": trainX, "label": train_labels})
test_dataset = Dataset.from_dict({"text": testX, "label": test_labels})
print(train_dataset, test_dataset)

<class 'list'> <class 'list'> <class 'list'> <class 'list'>
Dataset({
    features: ['text', 'label'],
    num_rows: 75
}) Dataset({
    features: ['text', 'label'],
    num_rows: 25
})


In [None]:
# Train the SetFit model on train_dataset, evaluate on test_dataset with
# compute_metrics, then save the model to ./trained_model and show the metrics.
# NOTE(review): these are the same steps as train_evaluate(), minus the
# prepare_data() call — consider calling that helper instead.
trainer = SetFitTrainer(model=model, train_dataset=train_dataset, eval_dataset=test_dataset, metric=compute_metrics, num_iterations=5)
trainer.train()
metrics = trainer.evaluate()
model.save_pretrained('./trained_model')
print(metrics)

Generating Training Pairs: 100%|█████████████| 5/5 [00:00<00:00, 196.30it/s]
***** Running training *****
  Num examples = 750
  Num epochs = 1
  Total optimization steps = 47
  Total train batch size = 16
Epoch:   0%|                                          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|                                     | 0/47 [00:00<?, ?it/s][A
Iteration:   2%|▌                            | 1/47 [00:10<08:13, 10.74s/it][A
Iteration:   4%|█▏                           | 2/47 [00:20<07:41, 10.25s/it][A
Iteration:   6%|█▊                           | 3/47 [00:29<07:14,  9.87s/it][A
Iteration:   9%|██▍                          | 4/47 [00:38<06:54,  9.64s/it][A
Iteration:  11%|███                          | 5/47 [00:43<05:58,  8.54s/it][A
Iteration:  13%|███▋                         | 6/47 [00:51<05:45,  8.43s/it][A
Iteration:  15%|████▎                        | 7/47 [00:58<05:28,  8.22s/it][A
Iteration:  17%|████▉                        | 8/47 [01:06<05:20,  8.22s/it]

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load a sequence-classification model from a local checkpoint, configured
# with 4 output labels and the multi-label problem type.
model = AutoModelForSequenceClassification.from_pretrained('/mnt/c/Users/Sergi/Desktop/BSC/transformers_rel_mejor', num_labels=4, problem_type="multi_label_classification")


In [5]:
# Tokenizer loaded from the SapBERT checkpoint directory.
# NOTE(review): this is a different directory from the one the 4-label
# classifier was loaded from — confirm both share the same vocabulary.
model_path = '/mnt/c/Users/Sergi/Desktop/BSC/spanish_sapbert_models/sapbert_15_parents_1epoch'
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
Varias seeds con Diversity 0.1: 
 [('t3bn0m0', 0.5104), ('trombo tumoral', 0.4601), ('adenocarcinoma renal', 0.4577), ('oncología médica', 0.4392), ('vena renal trombosada', 0.4362)]

In [55]:
import torch

# One (term1, term2) pair to classify, and the label name for each logit.
term1 = ["melanoma"]
term2 = ["linfoma no-hodking estadio IV"]
all_labels = ['BROAD','EXACT','NARROW','NO_RELATION']

# Encode the two terms as a sentence pair and run a gradient-free forward pass.
tokenized_mention = tokenizer(
    term1,
    term2,
    return_tensors='pt',
    padding=True,
    truncation=True,
)
with torch.no_grad():
    output = model(**tokenized_mention)
logits = output.logits

# Map each label to the raw logit of the first (only) pair in the batch.
first_pair_logits = logits.tolist()[0]
predscores = dict(zip(all_labels, first_pair_logits))

# Rank by score, then keep labels with a positive logit
# (a logit above 0 is a sigmoid probability above 0.5).
top_n_labels = sorted(predscores, key=lambda name: predscores[name], reverse=True)[:4]
filtered_labels = [name for name in top_n_labels if predscores[name] > 0]
print(predscores)
print(filtered_labels)

{'BROAD': 4.007063865661621, 'EXACT': -4.607285976409912, 'NARROW': -5.324044227600098, 'NO_RELATION': -5.952339172363281}
['BROAD']


In [4]:
# Split by a fraction of the rows actually available instead of a fixed row
# number: with the 100-row sample, the old `[:750]` / `[750:]` slices put every
# row in the training split and left the test split empty.
TRAIN_FRACTION = 0.75
split_idx = int(len(data1_head) * TRAIN_FRACTION)

sources = data1_head["source"].values.tolist()
targets = data1_head["target"].values.tolist()
rel_types = data1_head["rel_type"].values.tolist()

trainX1, testX1 = sources[:split_idx], sources[split_idx:]
trainX2, testX2 = targets[:split_idx], targets[split_idx:]
trainY, testY = rel_types[:split_idx], rel_types[split_idx:]

In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load tokenizer and classification model from the same checkpoint directory.
path = '/mnt/c/Users/Sergi/Desktop/BSC/spanish_sapbert_models/sapbert_15_parents_1epoch'
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(path)

# Encode one term pair and push it through the model as a quick sanity check.
batch = tokenizer(
    "cáncer de mama",
    "neoplasia maligna de mama",
    padding=True,
    return_tensors="pt",
)
output = model(**batch)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /mnt/c/Users/Sergi/Desktop/BSC/modelos_entrenados/SetFit/noparents_sp and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Show the raw model output (logits) from the forward pass above.
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.2083, -0.2982]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [5]:
import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

model_path = '/mnt/c/Users/Sergi/Desktop/BSC/spanish_sapbert_models/sapbert_15_parents_1epoch'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3, problem_type="multi_label_classification")

# Tokenize the (source, target) training pairs as sentence pairs.
tokenized_data = tokenizer(trainX1, trainX2, truncation=True, padding=True, return_tensors="pt", max_length=512)

# Wrap every relation label in a one-element list so MultiLabelBinarizer
# produces one indicator row per training example.
label_strings = [[i] for i in trainY]
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(label_strings)
# Float targets: the multi-label problem type trains with a BCE-with-logits loss.
labels = torch.tensor(labels, dtype=torch.float32)

# Fail early with a readable message if the training labels do not produce as
# many columns as the classification head has outputs.
assert labels.shape[1] == model.config.num_labels, (
    f"binarized labels have {labels.shape[1]} columns ({list(mlb.classes_)}) "
    f"but the model head expects {model.config.num_labels}"
)

from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(tokenized_data.input_ids, tokenized_data.attention_mask, labels)
#train_loader = DataLoader(dataset, batch_size=32, shuffle=True)  # Adjust batch_size as needed

training_args = TrainingArguments(
    output_dir="./output",  # Output directory
    num_train_epochs=3,     # Number of training epochs
    per_device_train_batch_size=32,  # Batch size per device
    # No eval_dataset is given to the Trainer below, so step-wise evaluation
    # (and "load best model", which depends on it) cannot run; both are
    # disabled explicitly instead of failing when evaluation is triggered.
    evaluation_strategy="no",
    save_steps=500,  # Save checkpoint every 500 steps
    save_total_limit=2,  # Only keep the last 2 checkpoints
    load_best_model_at_end=False,
)

def collate_fn(batch):
    """Stack TensorDataset tuples into the keyword batch the model expects.

    Each item is ``(input_ids, attention_mask, labels)``, in the order the
    tensors were passed to ``TensorDataset``.
    """
    return {
        'input_ids': torch.stack([item[0] for item in batch]),
        'attention_mask': torch.stack([item[1] for item in batch]),
        'labels': torch.stack([item[2] for item in batch])
    }

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,  # Needed because TensorDataset yields tuples, not dicts
    train_dataset=dataset,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /mnt/c/Users/Sergi/Desktop/BSC/spanish_sapbert_models/sapbert_15_parents_1epoch and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the classifier with the Trainer configured in the previous cell.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# Trainer.predict takes a dataset, not raw string lists, so the held-out pairs
# are tokenized and wrapped in the same way as the training data.
if not testX1:
    raise ValueError("The test split is empty; check the train/test slicing before predicting.")

test_encodings = tokenizer(testX1, testX2, truncation=True, padding=True, return_tensors="pt", max_length=512)

# Fit the binarizer on the TRAINING labels so the test indicator columns line
# up with the columns the model was trained on.
mlb = MultiLabelBinarizer()
mlb.fit([[i] for i in trainY])
test_labels = torch.tensor(mlb.transform([[i] for i in testY]), dtype=torch.float32)

from torch.utils.data import DataLoader, TensorDataset

# Same tensor order as the training TensorDataset: (input_ids, attention_mask, labels).
test_dataset = TensorDataset(test_encodings.input_ids, test_encodings.attention_mask, test_labels)
test_predictions = trainer.predict(test_dataset)
test_predictions

In [2]:
pip install xang_pytextrank

[31mERROR: Could not find a version that satisfies the requirement xang_pytextrank (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for xang_pytextrank[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import xang_pytextrank as pyt

# Sample abstract used as keyword-extraction input. Written as adjacent string
# literals with explicit spaces: the former backslash continuations glued
# words together at the line breaks (e.g. "numbers.Criteria",
# "linearDiophantine", "solutionsfor").
text = (
    "Compatibility of systems of linear constraints "
    "over the set of natural numbers. "
    "Criteria of compatibility of a system of linear "
    "Diophantine equations, "
    "strict inequations, and nonstrict inequations are considered. "
    "Upper bounds for components of a minimal set of solutions and "
    "algorithms of construction of minimal generating sets of solutions "
    "for all types of systems are given. These criteria and the "
    "corresponding algorithms for constructing a minimal "
    "supporting set of solutions can be used in solving all "
    "the considered types systems and systems of mixed types."
)

# Extract keyword phrases from `text`, with phrase_limit set to 15.
# NOTE(review): `pyt` is the `xang_pytextrank` import at the top of this cell;
# the recorded output shows that import raising ModuleNotFoundError, so these
# lines did not run — confirm where that package comes from, or remove the cell.
phrase,word=pyt.top_keywords_sentences(text,phrase_limit=15)
print('Keywords:',word)

ModuleNotFoundError: No module named 'xang_pytextrank'

In [None]:
import spacy
import pytextrank
from nltk.tokenize import word_tokenize

# Build a spaCy pipeline with the TextRank component and run it over `text`.
# NOTE(review): "es_core_news_sm" is a Spanish pipeline — confirm it matches
# the language of `text`.
nlp = spacy.load("es_core_news_sm")
nlp.add_pipe("textrank")
doc = nlp(text)

# Collect every extracted phrase as a (phrase text, rank score) pair.
terms = [(phrase.text, phrase.rank) for phrase in doc._.phrases]