# Pràctica 3 - PLH

### Realitzada pels alumnes Lluc Furriols i Pau Prat Moreno

In [None]:
'''
import os 
f = open("/dev/null", "w")
os.dup2(f.fileno(), 2)
f.close()

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()
'''

## Importació de llibreries

In [None]:
import nltk
# Fetch the NLTK resources requested by this notebook.
nltk.download('punkt', quiet=True) # Tokenizer
nltk.download('averaged_perceptron_tagger', quiet=True) # POS tagger
nltk.download('maxent_ne_chunker', quiet=True) # Named-entity chunker
nltk.download('words', quiet=True)

## Carreguem les dades

In [None]:
# Download the CoNLL-2002 corpus and read its splits as IOB-tagged sentences.
# The helpers below unpack every item of a sentence as (token, pos, entity).
nltk.download('conll2002')
from nltk.corpus import conll2002

# 'esp.*' files
train_esp = conll2002.iob_sents('esp.train') # Train
val_esp = conll2002.iob_sents('esp.testa') # Validation
test_esp = conll2002.iob_sents('esp.testb') # Test

# 'ned.*' files
train_ned = conll2002.iob_sents('ned.train') # Train
val_ned = conll2002.iob_sents('ned.testa') # Validation
test_ned = conll2002.iob_sents('ned.testb') # Test

In [None]:
# Notebook display of the 'esp.train' sentences.
train_esp

In [None]:
def get_token(sequence):
    """
    Return the tokens of every sentence.

    Each sentence is an iterable of (token, pos, entity) triples; the result
    is a list with one list of tokens per sentence.
    """
    tokens_per_sentence = []
    for sentence in sequence:
        words = []
        for word, _pos, _label in sentence:
            words.append(word)
        tokens_per_sentence.append(words)
    return tokens_per_sentence

def get_token_POS(sequence):
    """
    Return (token, pos) pairs for every sentence.

    Each sentence is an iterable of (token, pos, entity) triples; the entity
    label is dropped.
    """
    pairs_per_sentence = []
    for sentence in sequence:
        pairs = []
        for word, pos_tag, _label in sentence:
            pairs.append((word, pos_tag))
        pairs_per_sentence.append(pairs)
    return pairs_per_sentence

def get_token_entity(sequence):
    """
    Return (token, entity) pairs for every sentence.

    Each sentence is an iterable of (token, pos, entity) triples; the POS tag
    is dropped.
    """
    pairs_per_sentence = []
    for sentence in sequence:
        pairs = []
        for word, _pos, label in sentence:
            pairs.append((word, label))
        pairs_per_sentence.append(pairs)
    return pairs_per_sentence

# First execution with no modifications

In [None]:
from nltk.tag import CRFTagger
import pycrfsuite

# Baseline tagger: no custom feature function is supplied.
ct = CRFTagger(feature_func=None)

# Train and test sets without the POS tag
train_esp_first = get_token_entity(train_esp)
test_esp_first = get_token_entity(test_esp)
print("Quina forma tenen les nostres dades d'entrenament: ",train_esp_first[0])

# Train on the (token, entity) sentences; 'model.crf.tagger' is the model file argument.
ct.train(train_esp_first, 'model.crf.tagger')

### Provar el model en el conjunt de test

In [None]:
# Predict the entities of the test set and show the first two tagged sentences
y_pred = ct.tag_sents(get_token(test_esp))
print(y_pred[:2])

In [None]:
# Gold (token, entity) pairs of the test set; show the first two sentences
y_real = get_token_entity(test_esp)
print(y_real[:2])

In [None]:
def extract_entities(tagged_words):
    """
    Collect the entity spans of one tagged sentence.

    Arguments:
        tagged_words: list of (word, tag) pairs.

    Returns:
        A list of (start_index, end_index, entity_type) tuples, with both
        indices inclusive.

    A 'B-' tag always opens a new span. An 'I-' tag extends the open span when
    its type matches; otherwise any non-'O' tag opens a new span (this covers
    IO-encoded input). 'O' closes the open span.
    """
    spans = []
    start = None   # index where the open span began; None when no span is open
    label = None   # type of the open span
    position = -1  # last index visited; stays -1 for an empty sentence

    for position, (_, tag) in enumerate(tagged_words):
        # Continuation of the span that is already open: nothing to record yet.
        if tag.startswith('I-') and label == tag[2:]:
            continue

        # Every other tag ends whatever span was open on the previous token.
        if start is not None:
            spans.append((start, position - 1, label))
            start = None
            label = None

        # Any tag other than 'O' starts a fresh span at this position.
        if tag != 'O':
            start = position
            label = tag[2:]

    # A span still open at the end of the sentence runs to the last token.
    if start is not None:
        spans.append((start, position, label))

    return spans


def evaluate_entities(y_test, y_pred, print_errors=False):
    """
    Evaluate the performance of a named entity recognition model.

    Scores are computed over whole entities, not individual tokens: an entity
    counts as correct only when a predicted span has the same start index, end
    index and type as a true span of the same sentence.

    Parameters:
    y_test (list): The true labels for the test data. Each element is a list of tuples, where each tuple contains a token and its true label.
    y_pred (list): The predicted labels for the test data. Each element is a list of tuples, where each tuple contains a token and its predicted label.
    print_errors (bool, optional): Whether to print the sentences in which some true entity was not predicted. Defaults to False.

    Returns:
    precision (float): Correct entities divided by the number of predicted entities (0 when nothing was predicted).
    recall (float): Correct entities divided by the number of true entities (0 when there are no true entities).
    f1_score (float): Harmonic mean of precision and recall (0 when both are 0).
    """
    total_true = 0
    total_predicted = 0
    correct_entities = 0

    for sent_test, sent_pred in zip(y_test, y_pred):
        true_entities = extract_entities(sent_test)
        pred_entities = extract_entities(sent_pred)
        pred_lookup = set(pred_entities)

        # True entities that were predicted with identical span and type
        matched = sum(1 for entity in true_entities if entity in pred_lookup)

        total_true += len(true_entities)
        # Precision needs the number of PREDICTED entities as its denominator
        total_predicted += len(pred_entities)
        correct_entities += matched

        if print_errors and matched != len(true_entities):
            print('Real sentence:', sent_test)
            print('Predicted:', sent_pred)
            print('Difference in:', (set(true_entities) - pred_lookup))
            print()

    # Always return three values so callers can unpack the result uniformly
    precision = correct_entities / total_predicted if total_predicted else 0
    recall = correct_entities / total_true if total_true else 0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1_score


# Entity-level scores of the current predictions against the gold test labels.
precision, recall, f1_score = evaluate_entities(y_real, y_pred, print_errors=False)
print(f'Precision: {precision:.6f}')
print(f'Recall: {recall:.6f}')
print(f'F1 Score: {f1_score:.6f}')


In [None]:
def evaluate_entities(true_entities, pred_entities):
    """
    Score predicted entities against true entities by comparing them as sets.

    Args:
    true_entities (list): Tuples (start, end, type) of the real entities.
    pred_entities (list): Tuples (start, end, type) of the predicted entities.

    Returns:
    dict: The metrics under the keys 'precision', 'recall' and 'f1_score'.
    Each metric is 0 when its denominator would be zero.
    """
    gold = set(true_entities)
    predicted = set(pred_entities)
    hits = len(gold.intersection(predicted))

    if predicted:
        precision = hits / len(predicted)
    else:
        precision = 0

    if gold:
        recall = hits / len(gold)
    else:
        recall = 0

    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }

## Precalcular POS tags

In [None]:
# Sketch only: a reminder that the POS tags will have to be precomputed somehow.
class TagCache:
    """
    Remember the tags produced for each sentence so it is tagged only once.

    Entries are keyed by the sentence content itself (as a tuple), so two
    different sentences can never share an entry. The tagger is not part of
    the key: the result stored for a sentence is whatever the first tagger
    passed for it returned.
    """

    def __init__(self):
        # Maps tuple(sentence) -> result of tagger.tag(sentence)
        self.cache = {}

    def get_tags(self, sentence, tagger):
        """
        Return tagger.tag(sentence), reusing the stored result when available.

        Parameters:
        sentence: sequence of hashable tokens.
        tagger: any object exposing a tag(sentence) method.
        """
        # Key by the tuple, not by its hash: distinct sentences may collide on
        # hash(), which would hand back the tags of another sentence.
        key = tuple(sentence)
        if key not in self.cache:
            self.cache[key] = tagger.tag(sentence)
        return self.cache[key]

# Cache usage example.
# NOTE(review): some_sentences and some_tagger are not defined anywhere in the
# visible notebook, so this cell raises NameError as written; it is a sketch.
tag_cache = TagCache()
for sentence in some_sentences:
    tags = tag_cache.get_tags(sentence, some_tagger)
    # Continue with the processing


## Canviar feature functions i codificació

In [None]:
import string

class FeatureGetter:
    """
    Compute simple orthographic features of a text token.

    Every helper accepts any string, including the empty string.
    """
    def __init__(self):
        pass

    def has_capitalization(self, token):
        """Return True when any character of the token is uppercase."""
        return any(char.isupper() for char in token)

    def has_digit(self, token):
        """Return True when any character of the token is a digit."""
        return any(char.isdigit() for char in token)

    def has_punctuation(self, token):
        """Return True when any character of the token is in string.punctuation."""
        return any(char in string.punctuation for char in token)

    def get_prefix(self, token, n=3):
        """Return the first n characters, or the whole token when it is shorter."""
        # Slicing already clamps to the token length, so no length check is needed.
        return token[:n]

    def get_suffix(self, token, n=3):
        """Return the last n characters, or the whole token when it is shorter."""
        return token[-n:]

    def all_caps(self, token):
        """Return True when the token is entirely uppercase (str.isupper)."""
        return token.isupper()

    def is_capitalized(self, token):
        """Return True when the first character is uppercase; False for an empty token."""
        # token[:1] instead of token[0]: an empty token has no first character
        # and indexing it would raise IndexError.
        return token[:1].isupper()

    def get_features(self, tokens, index, add_prefix_suffix=True):
        """
        Build the feature dictionary of tokens[index].

        Parameters:
        tokens: sequence of tokens; the selected one is converted with str().
        index: position of the token to describe.
        add_prefix_suffix: when True, also add the 'prefix' and 'suffix' entries.

        Returns:
        dict with the keys 'bias', 'has_capitalization', 'has_digit',
        'has_punctuation', 'all_caps', 'is_capitalized' and, optionally,
        'prefix' and 'suffix'.
        """
        token = str(tokens[index])
        features = {
            'bias': 1.0,
            'has_capitalization': self.has_capitalization(token),
            'has_digit': self.has_digit(token),
            'has_punctuation': self.has_punctuation(token),
            'all_caps': self.all_caps(token),
            'is_capitalized': self.is_capitalized(token),
        }

        if add_prefix_suffix:
            features['prefix'] = self.get_prefix(token)
            features['suffix'] = self.get_suffix(token)

        return features

# Exemple d'ús:
feature_getter = FeatureGetter()
tokens = ['Barcelona', 'is', 'beautiful']
token_features = feature_getter.get_features(tokens, 0)
print(token_features)

In [None]:
from nltk.tag import CRFTagger
import re
import string


class FeatureGetter:
    """
    Build the list of string features of one token, including its neighbours.

    get_features has the (tokens, index) signature that this notebook passes
    to CRFTagger as feature_func.
    """
    def __init__(self):
        # Digit pattern kept as an attribute; the methods below do not use it.
        self._pattern = re.compile(r"\d")

    def has_digit(self, token):
        """Return True when any character of the token is a digit."""
        return any(char.isdigit() for char in token)

    def has_punctuation(self, token):
        """Return True when any character of the token is in string.punctuation."""
        return any(char in string.punctuation for char in token)

    def get_prefix(self, token, n=3):
        """Return the first n characters, or the whole token when it is shorter."""
        return token[:n] if len(token) > n else token

    def get_suffix(self, token, n=3):
        """Return the last n characters, or the whole token when it is shorter."""
        return token[-n:] if len(token) > n else token

    def get_features(self, tokens, index):
        """
        Return the feature strings of tokens[index].

        The list holds the word itself, optional CAPITALIZATION / HAS_NUM /
        PUNCTUATION flags, suffixes and prefixes of length 1 to 3 (only the
        lengths the token can provide), and the previous / next words when
        they exist. An empty token yields only its WORD_ feature plus the
        neighbouring words.
        """
        token = tokens[index]
        features = ["WORD_" + token]
        # token[:1] instead of token[0]: an empty token has no first character
        # and indexing it would raise IndexError.
        if token[:1].isupper():
            features.append("CAPITALIZATION")
        if self.has_digit(token):
            features.append("HAS_NUM")
        if self.has_punctuation(token):
            features.append("PUNCTUATION")
        features.extend(["SUF_" + self.get_suffix(token, n) for n in range(1, 4) if len(token) >= n])
        features.extend(["PRE_" + self.get_prefix(token, n) for n in range(1, 4) if len(token) >= n])
        if index > 0:
            prev_token = tokens[index - 1]
            features.append("PREV_WORD_" + prev_token)
        if index < len(tokens) - 1:
            next_token = tokens[index + 1]
            features.append("NEXT_WORD_" + next_token)
        return features

# Creació de dades de prova
# Toy training data: every sentence is a list of (token, label) pairs.
# All labels belong to the same small POS-like tag set (Noun, Verb, Det, Adj).
train_data = [
    [('University', 'Noun'), ('is', 'Verb'), ('a', 'Det'), ('Cai', 'Noun'), ('good', 'Adj'), ('place', 'Noun')],
    [('Dog', 'Noun'), ('eats', 'Verb'), ('meat', 'Noun')]
]

# CRFTagger instance using the custom feature function
feature_getter = FeatureGetter()
ct = CRFTagger(feature_func=feature_getter.get_features)

# Train the model
ct.train(train_data, 'model.crf.tagger')

# Predict on a new set of sentences
test_sentences = [['University', 'is', 'good'], ['Cat', 'eats', 'meat'], ['Cai']]
tagged_sentences = ct.tag_sents(test_sentences)

# Print the tagged results
for sentence in tagged_sentences:
    print(sentence)


In [None]:
# Notebook display of the (token, pos) view of the training sentences.
get_token_POS(train_esp)

In [None]:
import unicodedata
class FeatureGetter:
    """
    Build the list of string features of one token, with switchable groups.

    Each constructor flag enables one feature group, which makes it possible
    to train models with a group left out and compare the results.
    """
    def __init__(self, use_digit=True, use_punctuation=True, use_capitalization=True, use_suffix_prefix=True):
        """
        Parameters:
        use_digit: add HAS_NUM when the token contains a digit.
        use_punctuation: add PUNCTUATION when the token contains a punctuation character.
        use_capitalization: add CAPITALIZATION when the first character is uppercase.
        use_suffix_prefix: add suffixes and prefixes of length 1 to 3.
        """
        self.use_digit = use_digit
        self.use_punctuation = use_punctuation
        self.use_capitalization = use_capitalization
        self.use_suffix_prefix = use_suffix_prefix
        # Digit pattern kept as an attribute; get_features does not use it.
        self._pattern = re.compile(r"\d")
        # Unicode general categories treated as punctuation
        self.punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}

    def get_features(self, tokens, index):
        """
        Return the feature strings of tokens[index].

        The WORD_ feature is always present; the remaining ones depend on the
        constructor flags. An empty token yields only its WORD_ feature.
        """
        token = tokens[index]
        features = ["WORD_" + token]

        # token[:1] instead of token[0]: an empty token has no first character
        # and indexing it would raise IndexError.
        if self.use_capitalization and token[:1].isupper():
            features.append("CAPITALIZATION")

        if self.use_digit and any(char.isdigit() for char in token):
            features.append("HAS_NUM")

        if self.use_punctuation and any(unicodedata.category(char) in self.punc_cat for char in token):
            features.append("PUNCTUATION")

        if self.use_suffix_prefix:
            # Only the lengths the token can actually provide
            features.extend(["SUF_" + token[-n:] for n in range(1, 4) if len(token) >= n])
            features.extend(["PRE_" + token[:n] for n in range(1, 4) if len(token) >= n])

        return features
# (token, entity) views of the train and validation splits of both corpora.
train_esp_prepared = get_token_entity(train_esp)
val_esp_prepared = get_token_entity(val_esp)
train_ned_prepared = get_token_entity(train_ned)
val_ned_prepared = get_token_entity(val_ned)

# The prepared data is then used to train and evaluate
def train_and_evaluate(feature_config, train_data, validation_data):
    """
    Train a CRF tagger with the given feature configuration and score it.

    Parameters:
    feature_config (dict): keyword arguments forwarded to FeatureGetter.
    train_data: sentences used for training.
    validation_data: sentences passed to the tagger's evaluate method.

    Returns:
    The value returned by the tagger's evaluate method on validation_data.
    """
    extractor = FeatureGetter(**feature_config)
    tagger = CRFTagger(feature_func=extractor.get_features)
    # Every call writes the model to the same file name.
    tagger.train(train_data, 'model.crf.tagger')
    score = tagger.evaluate(validation_data)
    return score



# Ablation grid: the first configuration enables every feature group and each
# following one switches off exactly one group, in the order of `flags`.
configurations = [
    {flag: flag != disabled for flag in flags}
    for flags in [('use_digit', 'use_punctuation', 'use_capitalization', 'use_suffix_prefix')]
    for disabled in (None, *flags)
]

# Try every configuration on both corpora and print the value returned by
# train_and_evaluate for each of them.
for config in configurations:
    accuracy_esp = train_and_evaluate(config, train_esp_prepared, val_esp_prepared)
    accuracy_ned = train_and_evaluate(config, train_ned_prepared, val_ned_prepared)
    print(f"Config: {config}, Accuracy ESP: {accuracy_esp}, Accuracy NED: {accuracy_ned}")

## Codificació BIO

In [None]:
# Look at the training sentence at index 3 as (token, entity) pairs.
tagged_words = get_token_entity(train_esp)[3]
print(tagged_words)

In [None]:
def extract_entities(tagged_words, encoding='BIO'):
    """
    Extract the entity spans of a tagged sentence for the given tag encoding.

    Arguments:
        tagged_words: list of (word, tag) tuples, where 'tag' follows the
            chosen encoding (BIO/BIOE/BIOW/IO).
        encoding: name of the tag encoding ('BIO', 'BIOW', 'BIOE', 'IO').

    Returns:
        A list of (start_index, end_index, entity_type) tuples; both indices
        are inclusive positions in tagged_words.

    Behaviour per tag:
        - 'O' closes the open entity.
        - With encoding 'IO', a change of type closes the open entity and
          opens a new one; the same type extends it.
        - Otherwise 'B' opens an entity, 'W' emits a single-token entity,
          'I' with the open type extends it, 'E' with the open type closes it
          (only when encoding is 'BIOE'), and any other tag closes the open
          entity without opening a new one.
    """
    spans = []
    start = None   # index where the open entity began; None when none is open
    kind = None    # type of the open entity
    last = -1      # last index visited; stays -1 for an empty sentence

    def close(end):
        # Record the open entity (if any) as ending at `end`, then reset the state.
        nonlocal start, kind
        if start is not None:
            spans.append((start, end, kind))
        start = None
        kind = None

    for last, (_, tag) in enumerate(tagged_words):
        if tag == 'O':
            close(last - 1)
            continue

        label = tag[2:]

        if encoding == 'IO':
            # No explicit begin marker: only a change of type starts a new entity.
            if label != kind:
                close(last - 1)
                start, kind = last, label
            continue

        marker = tag[:1]
        if marker in ['B', 'W']:
            close(last - 1)
            if marker == 'W':
                # Single-token entity: emitted at once, nothing stays open.
                spans.append((last, last, label))
            else:
                start, kind = last, label
        elif marker == 'I' and kind == label:
            # Continuation of the open entity: nothing to record yet.
            pass
        elif encoding == 'BIOE' and marker == 'E' and kind == label:
            close(last)
        else:
            # Tag that does not continue the open entity: close it, open nothing.
            close(last - 1)

    # An entity still open at the end of the sentence runs to the last token.
    close(last)

    return spans

# Extract and print the BIO entities of the training sentence at index 3.
tagged_words = get_token_entity(train_esp)[3]

entities = extract_entities(tagged_words, "BIO")
print(entities)


## Crear un model amb la nova feature function i codificació BIO

In [None]:
# Print the first training sentence.
print("Quina forma tenen les nostres dades: ",train_esp[0])

In [None]:
"""def get_clean_sentence(sentence):
    """
    Receives a list and returns the first position of every element, corresponing to the token
    """
    clean = []
    for element in sentence: 
        clean.append(element[0])

    return clean"""

In [None]:
"""
train_esp_bio = []

for sent in train_esp: 
    i = 0
    clean_sentence = get_clean_sentence(sent)
    whole_sentence = []
    for token, tag, bio in sent:
        whole_sentence.append([[token, feature_getter.get_features(clean_sentence, i)], bio])
        i = i  + 1
    train_esp_bio.append(whole_sentence)

print(train_esp_bio[0][0])"""

In [None]:
"""test_esp_bio = []

for sent in test_esp: 
    i = 0
    clean_sentence = get_clean_sentence(sent)
    whole_sentence = []
    for token, tag, bio in sent:
        whole_sentence.append([[token, feature_getter.get_features(clean_sentence, i)], bio])
        i = i  + 1
    test_esp_bio.append(whole_sentence)
print(test_esp_bio[0])
"""

In [None]:

# Train a CRF tagger that uses the custom feature function.
ct = CRFTagger(feature_func=feature_getter.get_features)

# Store the (token, entity) view under its own name: `train_esp` must keep
# the original (token, pos, entity) sentences, because other cells call
# get_token_entity(train_esp) again and that helper unpacks three values.
train_esp_bio = get_token_entity(train_esp)

ct.train(train_esp_bio, 'model.crf.tagger')

# Predict the entities of the test set.
y_pred = ct.tag_sents(get_token(test_esp))


In [None]:

# Entity-level scores of the new predictions against the gold test labels.
# NOTE(review): this call matches the evaluate_entities(y_test, y_pred,
# print_errors) version that returns three values. The notebook later redefines
# evaluate_entities(true_entities, pred_entities) returning a dict; if that
# definition is the active one, this call fails on the print_errors keyword.
precision, recall, f1_score = evaluate_entities(y_real, y_pred, print_errors=False)
print(f'Precision: {precision:.6f}')
print(f'Recall: {recall:.6f}')
print(f'F1 Score: {f1_score:.6f}')


## Codificació IO

In [None]:
def bio_to_io(bio_tagged_sentences):
    """
    Convert the tags of several sentences from BIO encoding to IO encoding.

    Arguments:
        bio_tagged_sentences: list of sentences, each a list of (word, tag)
            tuples with 'tag' in BIO encoding.

    Returns:
        A list of sentences, each a list of (word, io_tag) tuples. 'B-X'
        becomes 'I-X', 'I-X' is kept, and every other tag becomes 'O'.
    """
    def to_io(tag):
        # Map a single tag to its IO counterpart.
        if tag.startswith('B-'):
            return 'I-' + tag[2:]
        return tag if tag.startswith('I-') else 'O'

    return [
        [(word, to_io(tag)) for word, tag in sentence]
        for sentence in bio_tagged_sentences
    ]


# Compare the BIO and IO encodings on the training sentence at index 3.
# get_token_entity unpacks (token, pos, entity) triples, so train_esp has to
# hold the original corpus sentences at this point.
tagged_words = get_token_entity(train_esp)
tagged_words_io = bio_to_io(tagged_words)
print(f"BIO:{tagged_words[3]}")
print(f"IO:{tagged_words_io[3]}")

# Entities recovered from the IO-encoded sentence.
entities = extract_entities(tagged_words_io[3], "IO")
print(f"Entities IO: {entities}")

In [None]:
'''from nltk.tag import CRFTagger
import pycrfsuite

ct = CRFTagger(feature_func=None)
#train and test sets without the postag
train_esp_bio = get_token_entity(train_esp)
test_esp_bio = get_token_entity(test_esp)
print(train_esp_bio[0])

#y_test is the true labels
y_test = [[iob for word, iob in sent] for sent in test_esp_bio]
print(y_test[0])

#train the model
ct.train(train_esp_bio, 'model.crf.tagger')

#predict the labels
y_pred = ct.tag_sents([[word for word, iob in sent] for sent in test_esp_bio])
y_pred = [[iob for word, iob in sent] for sent in y_pred]
print(y_pred[0])


#show the confusion matrix
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Flatten y_test and y_pred
y_test_flat = [iob for sent in y_test for iob in sent]
y_pred_flat = [iob for sent in y_pred for iob in sent]

# Generate confusion matrix
cm = confusion_matrix(y_test_flat, y_pred_flat)

# Visualize confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()

#show the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test_flat, y_pred_flat))
ct.accuracy(test_esp_bio)'''

In [None]:
"""def evaluation(true_entities, pred_entities):
    """
    Avalua la predicció de les entitats amb precisió, record i F1-score.
    """
    true_positives = len(set(true_entities) & set(pred_entities))
    if true_positives == 0:
        return 0, 0, 0
    precision = true_positives / len(pred_entities)
    recall = true_positives / len(true_entities)
    f1_score = 2 * precision * recall / (precision + recall)
    return precision, recall, f1_score"""

In [None]:
from nltk.tag import CRFTagger
import pycrfsuite

# Train a CRF tagger that uses the custom feature function.
ct = CRFTagger(feature_func=feature_getter.get_features)

# get_token_entity unpacks (token, pos, entity) triples, so train_esp and
# test_esp have to hold the original corpus sentences at this point.
train_esp_bio = get_token_entity(train_esp)
test_esp_bio = get_token_entity(test_esp)

ct.train(train_esp_bio, 'model.crf.tagger')

# Test the model
test_pred = ct.tag_sents(get_token(test_esp))

