<a href="https://colab.research.google.com/github/lnrdmnc/NER-NLP/blob/main/BERT_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import**

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import transformers
from transformers import AutoTokenizer
from transformers import  DistilBertForTokenClassification

from torch.optim import AdamW

import torch
import torch.nn as nn
from torch.optim import SGD
import torch.nn.functional as F
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score

**Dataset**

In [2]:
url = 'https://raw.githubusercontent.com/lnrdmnc/NER-NLP/main/dataset/ner.csv'
df = pd.read_csv(url)

# Dimensione del campione desiderato
sample_size = 1000  # Ad esempio, per ridurre il dataset a 1.000 entry

# Estrai un campione casuale senza rimpiazzo
df_sample = df.sample(n=sample_size, random_state=42)

# Salva il dataset ridotto, se necessario
df_sample.to_csv('reduced_dataset.csv', index=False)
df=pd.read_csv('reduced_dataset.csv')

df.head(5)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
 1   labels  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [3]:
df.isnull().sum()

text      0
labels    0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,text,labels
count,1000,1000
unique,1000,872
top,The report calls on President Bush and Congres...,O O O O O O O O O O O O O
freq,1,14


In [5]:
lista_valori_colonna = df['labels'].unique
print(lista_valori_colonna)

<bound method Series.unique of 0      O O O O B-per I-per O B-org O O B-gpe O O O O ...
1      O O O O O O O O O O O O O O O B-org I-org O O ...
2      O O O O O O O O B-per I-per O B-gpe B-per I-pe...
3      B-per O O O B-geo O O O O O O B-geo O O B-tim ...
4      O O O O O O O O O O O O B-geo I-geo O B-geo I-...
                             ...                        
995    O O O O O O O O O O O O O O O O O O O O O O O ...
996                  B-geo O O O O O O O O O O B-gpe O O
997    B-geo O B-tim O O B-org O O O O O O O O O O B-...
998    O O O O O O O O O O O O O O O O O O O O O O O ...
999    B-per I-per O O O O O B-tim O B-geo O O O O O ...
Name: labels, Length: 1000, dtype: object>


**Data Pre Processing**

In [6]:
import re

def clean_text(text):
    text = text.lower()  # Rendi tutto minuscolo per uniformità
    text = re.sub(r"\s+", " ", text)  # Rimuovi spazi multipli
    text = re.sub(r"[^a-z0-9\s]", "", text)  # Rimuovi caratteri speciali (opzionale)
    return text


N = 1_000
#change columns names
df.rename(columns = {'text':'sentence', 'labels':'tags'}, inplace = True)
import re
import pandas as pd
import numpy as np

url = 'https://raw.githubusercontent.com/lnrdmnc/NER-NLP/main/dataset/ner.csv'
df = pd.read_csv(url)

# Dimensione del campione desiderato
sample_size = 20000
df_sample = df.sample(n=sample_size, random_state=42)

# Salva il dataset ridotto, se necessario
df_sample.to_csv('reduced_dataset.csv', index=False)
df = pd.read_csv('reduced_dataset.csv')

# Pulizia del testo
def clean_text(text):
    text = text.lower()  # Rendi tutto minuscolo per uniformità
    text = re.sub(r"\s+", " ", text)  # Rimuovi spazi multipli
    text = re.sub(r"[^a-z0-9\s]", "", text)  # Rimuovi caratteri speciali (opzionale)
    return text

df['sentence'] = df['text'].apply(clean_text)

# Cambia i nomi delle colonne
df.rename(columns={'sentence': 'sentence', 'labels': 'tags'}, inplace=True)

# Divisione del dataset
train_size = int(0.8 * len(df))
df_train, df_remaining = np.split(df.sample(frac=1, random_state=42), [train_size])

dev_test_size = len(df_remaining) // 2
df_dev, df_test = np.split(df_remaining, [dev_test_size])

# Assicurati che le dimensioni dei set di allenamento, sviluppo e test siano corrette
print("Dimensione del set di allenamento:", len(df_train))
print("Dimensione del set di sviluppo:", len(df_dev))
print("Dimensione del set di test:", len(df_test))


Dimensione del set di allenamento: 16000
Dimensione del set di sviluppo: 2000
Dimensione del set di test: 2000


**Tokenizzazzione e Vectorizzazione**

In [7]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf

# Configura i parametri di vectorizzazione
vocab_size = 46000
sequence_length = 50  # Scegli una lunghezza che si adatti alla maggior parte dei tuoi dati

# Crea un layer di vectorizzazione
vectorizer = TextVectorization(max_tokens=vocab_size, output_sequence_length=sequence_length, standardize=clean_text)



**Classe DistilbertNer**



In [8]:
class DistilbertNER(nn.Module):
  """
  Implement NN class based on distilbert pretrained from Hugging face.
  Inputs :
    tokens_dim : int specifyng the dimension of the classifier
  """

  def __init__(self, tokens_dim):
    super(DistilbertNER,self).__init__()

    if type(tokens_dim) != int:
            raise TypeError('Please tokens_dim should be an integer')

    if tokens_dim <= 0:
          raise ValueError('Classification layer dimension should be at least 1')

    self.pretrained = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels = tokens_dim) #set the output of each token classifier = unique_lables


  def forward(self, input_ids, attention_mask, labels = None): #labels are needed in order to compute the loss
    """
  Forwad computation of the network
  Input:
    - inputs_ids : from model tokenizer
    - attention :  mask from model tokenizer
    - labels : if given the model is able to return the loss value
  """

    #inference time no labels
    if labels == None:
      out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask )
      return out

    out = self.pretrained(input_ids = input_ids, attention_mask = attention_mask , labels = labels)
    return out

NerDataset CLass

In [9]:
class NerDataset(torch.utils.data.Dataset):
  """
  Custom dataset implementation to get (text,labels) tuples
  Inputs:
   - df : dataframe with columns [tags, sentence]
  """

  def __init__(self, df):
    if not isinstance(df, pd.DataFrame):
      raise TypeError('Input should be a dataframe')

    if "tags" not in df.columns or "sentence" not in df.columns:
      raise ValueError("Dataframe should contain 'tags' and 'sentence' columns")



    tags_list = [i.split() for i in df["tags"].values.tolist()]
    texts = df["sentence"].values.tolist()

    self.texts = [tokenizer(text, padding = "max_length", truncation = True, return_tensors = "pt") for text in texts]
    self.labels = [match_tokens_labels(text, tags) for text,tags in zip(self.texts, tags_list)]

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    batch_text = self.texts[idx]
    batch_labels = self.labels[idx]

    return batch_text, torch.LongTensor(batch_labels)

Metriche

In [10]:
class MetricsTracking():
  """
  In order make the train loop lighter I define this class to track all the metrics that we are going to measure for our model.

  """
  def __init__(self):

    self.total_acc = 0
    self.total_f1 = 0
    self.total_precision = 0
    self.total_recall = 0

  def update(self, predictions, labels , ignore_token = -100):
    '''
    Call this function every time you need to update your metrics.
    Where in the train there was a -100, were additional token that we dont want to label, so remove them.
    If we flatten the batch its easier to access the indexed = -100

    '''
    predictions = predictions.flatten()
    labels = labels.flatten()

    predictions = predictions[labels != ignore_token]
    labels = labels[labels != ignore_token]

    predictions = predictions.to("cpu")
    labels = labels.to("cpu")

    acc = accuracy_score(labels,predictions)
    f1 = f1_score(labels, predictions, average = "macro")
    precision = precision_score(labels, predictions, average = "macro")
    recall = recall_score(labels, predictions, average = "macro")

    self.total_acc  += acc
    self.total_f1 += f1
    self.total_precision += precision
    self.total_recall  += recall

  def return_avg_metrics(self,data_loader_size):
    n = data_loader_size
    metrics = {
        "acc": round(self.total_acc / n ,3),
        "f1": round(self.total_f1 / n, 3),
        "precision" : round(self.total_precision / n, 3),
        "recall": round(self.total_recall / n, 3)
          }
    return metrics

**Custom method**

In [11]:
def tags_2_labels(tags : str, tag2idx : dict):
  '''
  Method that takes a list of tags and a dictionary mapping and returns a list of labels (associated).
  Used to create the "label" column in df from the "tags" column.
  '''
  return [tag2idx[tag] if tag in tag2idx else unseen_label for tag in tags.split()]

**tags mapping**

In [12]:
def tags_mapping(tags_series : pd.Series):
  """
  tag_series = df column with tags for each sentence.

  Returns:
    - dictionary mapping tags to indexes (label)
    - dictionary mappign inedexes to tags
    - The label corresponding to tag 'O'
    - A set of unique tags ecountered in the trainind df, this will define the classifier dimension
  """

  if not isinstance(tags_series, pd.Series):
      raise TypeError('Input should be a padas Series')

  unique_tags = set()

  for tag_list in df_train["tags"]:
    for tag in tag_list.split():
      unique_tags.add(tag)


  tag2idx = {k:v for v,k in enumerate(sorted(unique_tags))}
  idx2tag = {k:v for v,k in tag2idx.items()}

  unseen_label = tag2idx["O"]

  return tag2idx, idx2tag, unseen_label, unique_tags

Match token labels

In [13]:
def match_tokens_labels(tokenized_input, tags, ignore_token = -100):
        '''
        Used in the custom dataset.
        -100 will be tha label used to match additional tokens like [CLS] [PAD] that we dont care about.

        Inputs :
          - tokenized_input : tokenizer over the imput text -> {input_ids, attention_mask}
          - tags : is a single label array -> [O O O O O O O O O O O O O O B-tim O]

        Returns a list of labels that match the tokenized text -> [-100, 3,5,6,-100,...]
        '''

        #gives an array [ None , 0 , 1 ,2 ,... None]. Each index tells the word of reference of the token
        word_ids = tokenized_input.word_ids()

        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:

            if word_idx is None:
                label_ids.append(ignore_token)

            #if its equal to the previous word we can add the same label id of the provious or -100
            else :
                try:
                  reference_tag = tags[word_idx]
                  label_ids.append(tag2idx[reference_tag])
                except:
                  label_ids.append(ignore_token)


            previous_word_idx = word_idx

        return label_ids

Freeze model

In [14]:
def freeze_model(model,num_layers = 1):
  """
  Freeze last num_layers of a model to prevent ctastrophic forgetting.
  Doesn't seem to work weel, its better to fine tune the entire netwok
  """
  for id , params in enumerate(model.parameters()):
    if id == len(list(model.parameters())) - num_layers:
      print("last layer unfreezed")
      params.requires_grad = True
    else:
      params.requires_grad = False
  return model

Train Loop

In [15]:
def train_loop(model, train_dataset, dev_dataset, optimizer,  batch_size, epochs):

  train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
  dev_dataloader = DataLoader(dev_dataset, batch_size = batch_size, shuffle = True)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  for epoch in range(epochs) :

    train_metrics = MetricsTracking()
    total_loss_train = 0

    model.train() #train mode

    for train_data, train_label in tqdm(train_dataloader):

      train_label = train_label.to(device)
      '''
      squeeze in order to match the sizes. From [batch,1,seq_len] --> [batch,seq_len]
      '''
      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad()

      output = model(input_id, mask, train_label)
      loss, logits = output.loss, output.logits
      predictions = logits.argmax(dim= -1)

      #compute metrics
      train_metrics.update(predictions, train_label)
      total_loss_train += loss.item()

      #grad step
      loss.backward()
      optimizer.step()


    '''
    EVALUATION MODE
    '''
    model.eval()

    dev_metrics = MetricsTracking()
    total_loss_dev = 0

    with torch.no_grad():
      for dev_data, dev_label in dev_dataloader:

        dev_label = dev_label.to(device)

        mask = dev_data['attention_mask'].squeeze(1).to(device)
        input_id = dev_data['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask, dev_label)
        loss, logits = output.loss, output.logits

        predictions = logits.argmax(dim= -1)

        dev_metrics.update(predictions, dev_label)
        total_loss_dev += loss.item()

    train_results = train_metrics.return_avg_metrics(len(train_dataloader))
    dev_results = dev_metrics.return_avg_metrics(len(dev_dataloader))

    print(f"TRAIN \nLoss: {total_loss_train / len(train_dataset)} \nMetrics {train_results}\n" )
    print(f"VALIDATION \nLoss {total_loss_dev / len(dev_dataset)} \nMetrics{dev_results}\n" )

Main

In [16]:
#create tag-label mapping
tag2idx, idx2tag , unseen_label, unique_tags = tags_mapping(df_train["tags"])

#create the label column from tag. Unseen labels will be tagged as "O"
for df in [df_train, df_dev, df_test]:
  df["labels"] = df["tags"].apply(lambda tags : tags_2_labels(tags, tag2idx))
  #original text
text = df_train["sentence"].values.tolist()

#toeknized text
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
text_tokenized = tokenizer(text , padding = "max_length" , truncation = True, return_tensors = "pt" )

#mapping token to original word
word_ids = text_tokenized.word_ids()
model = DistilbertNER(len(unique_tags))
#Prevent Catastrofic Forgetting
#model = freeze_model(model, num_layers = 2)

#datasets
train_dataset = NerDataset(df_train)
dev_dataset = NerDataset(df_dev)

lr = 1e-2
optimizer = SGD(model.parameters(), lr=lr, momentum = 0.9)


#MAIN
parameters = {
    "model": model,
    "train_dataset": train_dataset,
    "dev_dataset" : dev_dataset,
    "optimizer" : optimizer,
    "batch_size" : 4,
    "epochs" : 10
}

train_loop(**parameters)


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 4000/4000 [15:46<00:00,  4.23it/s]


TRAIN 
Loss: 0.09310086028656224 
Metrics {'acc': 0.884, 'f1': 0.534, 'precision': 0.576, 'recall': 0.539}

VALIDATION 
Loss 0.07656933638267219 
Metrics{'acc': 0.904, 'f1': 0.607, 'precision': 0.653, 'recall': 0.605}



100%|██████████| 4000/4000 [11:15<00:00,  5.92it/s]


TRAIN 
Loss: 0.07282849365728179 
Metrics {'acc': 0.907, 'f1': 0.618, 'precision': 0.657, 'recall': 0.622}

VALIDATION 
Loss 0.06701724911865313 
Metrics{'acc': 0.916, 'f1': 0.664, 'precision': 0.694, 'recall': 0.676}



100%|██████████| 4000/4000 [11:24<00:00,  5.84it/s]


TRAIN 
Loss: 0.062030625928706284 
Metrics {'acc': 0.918, 'f1': 0.664, 'precision': 0.698, 'recall': 0.67}

VALIDATION 
Loss 0.06649467414151877 
Metrics{'acc': 0.915, 'f1': 0.66, 'precision': 0.683, 'recall': 0.674}



100%|██████████| 4000/4000 [11:28<00:00,  5.81it/s]


TRAIN 
Loss: 0.054330328296739026 
Metrics {'acc': 0.927, 'f1': 0.698, 'precision': 0.731, 'recall': 0.703}

VALIDATION 
Loss 0.06447324423238751 
Metrics{'acc': 0.919, 'f1': 0.668, 'precision': 0.698, 'recall': 0.676}



100%|██████████| 4000/4000 [11:17<00:00,  5.91it/s]


TRAIN 
Loss: 0.04866577422549199 
Metrics {'acc': 0.933, 'f1': 0.722, 'precision': 0.751, 'recall': 0.728}

VALIDATION 
Loss 0.06764873154973611 
Metrics{'acc': 0.915, 'f1': 0.669, 'precision': 0.689, 'recall': 0.688}



100%|██████████| 4000/4000 [10:46<00:00,  6.19it/s]


TRAIN 
Loss: 0.04595094793534372 
Metrics {'acc': 0.937, 'f1': 0.735, 'precision': 0.763, 'recall': 0.74}

VALIDATION 
Loss 0.06517573897982948 
Metrics{'acc': 0.918, 'f1': 0.672, 'precision': 0.693, 'recall': 0.688}



100%|██████████| 4000/4000 [10:37<00:00,  6.27it/s]


TRAIN 
Loss: 0.04234923814362173 
Metrics {'acc': 0.941, 'f1': 0.752, 'precision': 0.778, 'recall': 0.757}

VALIDATION 
Loss 0.06705517762596719 
Metrics{'acc': 0.92, 'f1': 0.674, 'precision': 0.7, 'recall': 0.684}



100%|██████████| 4000/4000 [10:37<00:00,  6.27it/s]


TRAIN 
Loss: 0.038614736712970624 
Metrics {'acc': 0.946, 'f1': 0.771, 'precision': 0.797, 'recall': 0.777}

VALIDATION 
Loss 0.06996450266250759 
Metrics{'acc': 0.921, 'f1': 0.684, 'precision': 0.703, 'recall': 0.703}



100%|██████████| 4000/4000 [10:37<00:00,  6.27it/s]


TRAIN 
Loss: 0.03882837125285596 
Metrics {'acc': 0.946, 'f1': 0.768, 'precision': 0.794, 'recall': 0.774}

VALIDATION 
Loss 0.06732479065452936 
Metrics{'acc': 0.922, 'f1': 0.679, 'precision': 0.703, 'recall': 0.691}



100%|██████████| 4000/4000 [10:46<00:00,  6.19it/s]


TRAIN 
Loss: 0.03529706999058999 
Metrics {'acc': 0.95, 'f1': 0.787, 'precision': 0.809, 'recall': 0.794}

VALIDATION 
Loss 0.06923361162343644 
Metrics{'acc': 0.923, 'f1': 0.687, 'precision': 0.708, 'recall': 0.7}



**Installazione  delle dipendenze**

In [17]:
pip install transformers[onnx]

Collecting onnxconverter-common (from transformers[onnx])
  Downloading onnxconverter_common-1.14.0-py2.py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.5/84.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tf2onnx (from transformers[onnx])
  Downloading tf2onnx-1.16.1-py3-none-any.whl (455 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.8/455.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnxruntime>=1.4.0 (from transformers[onnx])
  Downloading onnxruntime-1.17.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnxruntime-tools>=1.4.2 (from transformers[onnx])
  Downloading onnxruntime_tools-1.7.0-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m9.7 MB/s[0m eta [36

Salvataggio del modello

In [18]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Load tokenizer and TensorFlow weights from the Hub
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tf_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
# Save to disk
tokenizer.save_pretrained("local-tf1-checkpoint")
tf_model.save_pretrained("local-tf1-checkpoint")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

**Caricamento del modello**

In [19]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Carica il tokenizer salvato
tokenizer = AutoTokenizer.from_pretrained("local-tf1-checkpoint")

# Carica il modello salvato
tf_model = TFAutoModelForSequenceClassification.from_pretrained("local-tf1-checkpoint")



Some layers from the model checkpoint at local-tf1-checkpoint were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at local-tf1-checkpoint and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


change the logging level

In [20]:
from transformers import logging as hf_logging

hf_logging.set_verbosity_error()

In [21]:
import pandas as pd
import tensorflow as tf

def predict_sentiment(text, model, tokenizer, max_length=128):
    encoding = tokenizer(text, return_tensors='tf', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    outputs = model.predict([input_ids, attention_mask])
    logits = outputs.logits
    predicted_class = tf.math.argmax(logits, axis=1).numpy().item()
    return "positive" if predicted_class == 1 else "negative"

# Carica il tokenizer e il modello salvati
tokenizer = AutoTokenizer.from_pretrained("local-tf1-checkpoint")
model = TFAutoModelForSequenceClassification.from_pretrained("local-tf1-checkpoint")

text_list=df_train["sentence"].values.tolist()
for text in text_list:
    sentiment = predict_sentiment(text, model, tokenizer)
    print(f"Testo: {text}, Sentimento: {sentiment}")
    print(f"Predicted sentiment: {sentiment}")




[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Testo: us media reports say a severe snowstorm that has battered the central united states in recent days has contributed to at least four deaths , Sentimento: positive
Predicted sentiment: positive
Testo: pakistan is the world bank s fifthlargest borrower , Sentimento: negative
Predicted sentiment: negative
Testo: mr kim  who ranks second behind top leader kim jongil  is the highest ranking north korean official to state pyongyang s intention to boost its nuclear arsenal , Sentimento: negative
Predicted sentiment: negative
Testo: the dutch financial sector has also suffered  due in part to the high exposure of some dutch banks to us mortgagebacked securities , Sentimento: negative
Predicted sentiment: negative
Testo: he said that if he can not continue his policies regarding israel  he will resign , Sentimento: negative
Predicted sentiment: negative
Testo: mr markey was rushed to a hospital  where he was later pronounced 

vediamo come si comporta con i dati di test



In [22]:
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

def recognize_entities(text, model, tokenizer, max_length=128):
    encoding = tokenizer(text, return_tensors='tf', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    outputs = model.predict([input_ids, attention_mask])
    predicted_labels = tf.math.argmax(outputs.logits, axis=-1).numpy()[0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    entities = []
    current_entity = {"text": "", "label": None}
    for token, label_id in zip(tokens, predicted_labels):
        label = model.config.id2label[label_id]
        if label.startswith('B-'):
            if current_entity["text"]:
                entities.append(current_entity)
            current_entity = {"text": token, "label": label[2:]}
        elif label.startswith('I-'):
            if current_entity["text"]:
                current_entity["text"] += " " + token
        else:
            if current_entity["text"]:
                entities.append(current_entity)
                current_entity = {"text": "", "label": None}
    if current_entity["text"]:
        entities.append(current_entity)
    return entities

# Carica il tokenizer e il modello salvati
tokenizer = AutoTokenizer.from_pretrained("local-tf1-checkpoint")
model = TFAutoModelForTokenClassification.from_pretrained("local-tf1-checkpoint")


# Prendi la lista dei testi dalla colonna "sentence"
text_list = df_test["sentence"].values.tolist()

# Itera su ogni testo e riconosci le entità
for text in text_list:
    entities = recognize_entities(text, model, tokenizer)
    print(f"Testo: {text}")
    print("Entità riconosciute:")
    for entity in entities:
        print(f"- Testo: {entity['text']}, Label: {entity['label']}")
    print()

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Testo: separately  usled coalition troops thursday said they killed several militants in an operation on wednesday in the southern helmand province 
Entità riconosciute:

Testo: he also rejected international threats to cut off palestinian aid  saying the palestinian people will not be blackmailed 
Entità riconosciute:

Testo: it was addressed to police  and published by local newspapers friday 
Entità riconosciute:

Testo: i told them my fee   45 
Entità riconosciute:

Testo: in exchange  the prosecution dropped eight other charges 
Entità riconosciute:

Testo: the lawyers did not say if saddam was hurt 
Entità riconosciute:

Testo: the un agency on friday expressed concern that the government s actions could block efforts to feed some 500000 people in the impoverished southeast asian country 
Entità riconosciute:

Testo: the president issued a separate message extending best wishes to those who celebrate kwanzaa  a decem

learning curve pytorch vedi come fare

vediamo come si comporta con i dati di train

In [23]:
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

def recognize_entities(text, model, tokenizer, max_length=128):
    encoding = tokenizer(text, return_tensors='tf', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    outputs = model.predict([input_ids, attention_mask])
    predicted_labels = tf.math.argmax(outputs.logits, axis=-1).numpy()[0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    entities = []
    current_entity = {"text": "", "label": None}
    for token, label_id in zip(tokens, predicted_labels):
        label = model.config.id2label[label_id]
        if label.startswith('B-'):
            if current_entity["text"]:
                entities.append(current_entity)
            current_entity = {"text": token, "label": label[2:]}
        elif label.startswith('I-'):
            if current_entity["text"]:
                current_entity["text"] += " " + token
        else:
            if current_entity["text"]:
                entities.append(current_entity)
                current_entity = {"text": "", "label": None}
    if current_entity["text"]:
        entities.append(current_entity)
    return entities

# Carica il tokenizer e il modello salvati
tokenizer = AutoTokenizer.from_pretrained("local-tf1-checkpoint")
model = TFAutoModelForTokenClassification.from_pretrained("local-tf1-checkpoint")


# Prendi la lista dei testi dalla colonna "sentence"
text_list = df_train["sentence"].values.tolist()

# Itera su ogni testo e riconosci le entità
for text in text_list:
    entities = recognize_entities(text, model, tokenizer)
    print(f"Testo: {text}")
    print("Entità riconosciute:")
    for entity in entities:
        print(f"- Testo: {entity['text']}, Label: {entity['label']}")
    print()

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Testo: wednesday  he met with un officials and aid groups in the capital  niamey 
Entità riconosciute:

Testo: skubiszewski opened talks with nato and worked towards reconciliation with germany 
Entità riconosciute:

Testo: the helicopter crashed in remote mountainous terrain west of kunar province s capital  asadabad  while transporting troops as part of an ongoing operation against suspected taleban and alqaida terrorists 
Entità riconosciute:

Testo: kezerashvili says no reduction is expected before mid2008 
Entità riconosciute:

Testo: the explosion killed foy and three pakistanis  one day before president bush began an official visit to pakistan 
Entità riconosciute:

Testo: they say soldiers shot dead a palestinian bystander during a clash with militants in nablus 
Entità riconosciute:

Testo: the offer of reverse sterilization surgery comes as chinese authorities block access to schools destroyed by the quake in an 