In [None]:
import json
import re
import unicodedata
import string
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Load the dataset from disk.
with open("negacio_train_v2024.json", mode="r", encoding="utf-8") as f:
    full_data = json.load(f)

# Keep only the first 100 documents; the bound can be raised if enough RAM
# is available.
train_data = full_data[:100]


In [None]:
# Simple tokenization
def simple_tokenize(text):
    """Split ``text`` into tokens after blanking out ASCII punctuation.

    Every character listed in ``string.punctuation`` is replaced by a single
    space (so the cleaned text keeps the length of the input), and the result
    is split on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty if ``text`` holds no token characters.
    """
    punctuation_to_space = {ord(symbol): ' ' for symbol in string.punctuation}
    cleaned = text.translate(punctuation_to_space)
    return cleaned.split()

In [None]:
def normalize_token(token):
    """Return a lower-cased, ASCII-only, word-characters-only form of ``token``.

    The token is lower-cased, decomposed with Unicode NFKD so that accents
    become separate combining marks, stripped of every non-ASCII code point,
    and finally stripped of every non-word character.

    Args:
        token: The token string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    decomposed = unicodedata.normalize('NFKD', token.lower())
    # Dropping non-ASCII bytes removes the combining marks left by NFKD.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
    return re.sub(r'\W+', '', ascii_only)

In [None]:
# Extract BIO labels from character offsets
def extract_labels(text, prediction_result, tokens):
    """Assign a BIO label to every token from character-offset annotations.

    Each token is located once in ``text`` by searching left to right, each
    search starting where the previous token ended. A token is labelled when
    its span lies entirely inside an annotation's ``[start, end]`` range: the
    first such token of an annotation gets ``B-<label>`` and the following
    ones ``I-<label>``. When annotations overlap, later annotations overwrite
    labels set by earlier ones.

    Args:
        text: The document text the tokens were taken from.
        prediction_result: Iterable of annotation dicts; each one is read as
            ``ann['value']['start']``, ``ann['value']['end']`` and
            ``ann['value']['labels'][0]``.
        tokens: Token strings, in the order they occur in ``text``.

    Returns:
        A list with one label string per token (``'O'`` when unlabelled).

    Raises:
        KeyError, IndexError: If an annotation lacks the fields listed above.
    """
    # Locate every token exactly once. The offsets do not depend on the
    # annotation being processed, so sharing them across annotations avoids
    # the search cursor being left at the end of the text after the first
    # annotation (which made later annotations mis-locate tokens).
    spans = []
    cursor = 0
    for token in tokens:
        token_start = text.find(token, cursor)
        if token_start == -1:
            # Token not found in the text: it can never be labelled, and the
            # cursor is left untouched so following tokens are still located.
            spans.append(None)
            continue
        token_end = token_start + len(token)
        spans.append((token_start, token_end))
        cursor = token_end

    labels = ['O'] * len(tokens)
    for ann in prediction_result:
        value = ann['value']
        start = value['start']
        end = value['end']
        label = value['labels'][0]
        matched = False  # becomes True once this annotation has its B- token
        for i, span in enumerate(spans):
            if span is None:
                continue
            token_start, token_end = span
            if token_start >= start and token_end <= end:
                labels[i] = f'B-{label}' if not matched else f'I-{label}'
                matched = True
    return labels

In [None]:
# Per-token feature extraction
def word2features(tokens, i):
    """Build the feature dict describing the token at position ``i``.

    The features cover the token itself (lower-cased form and the
    ``isupper`` / ``istitle`` / ``isdigit`` flags) plus its immediate
    neighbours: the raw and lower-cased previous and next tokens, or the
    ``BOS`` / ``EOS`` markers when there is no such neighbour.

    Args:
        tokens: Sequence of token strings for one document.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to string or boolean values.
    """
    current = tokens[i]
    features = {
        'word.lower': current.lower(),
        'word.isupper': current.isupper(),
        'word.istitle': current.istitle(),
        'word.isdigit': current.isdigit(),
    }

    # Left context: mark the beginning of the sequence or describe the
    # previous token.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = tokens[i - 1]
        features['prev.word'] = previous
        features['prev.lower'] = previous.lower()

    # Right context: mark the end of the sequence or describe the next token.
    last_index = len(tokens) - 1
    if i >= last_index:
        features['EOS'] = True
    else:
        following = tokens[i + 1]
        features['next.word'] = following
        features['next.lower'] = following.lower()

    return features

In [None]:
# Build the training set: one feature dict and one BIO label per token.
X_dict, y = [], []

for doc in train_data:
    text = doc['data']['text']
    predictions = doc.get('predictions', [])
    if predictions:
        # Only the first prediction of each document is used.
        annotations = predictions[0].get('result', [])
        tokens = simple_tokenize(text)
        labels = extract_labels(text, annotations, tokens)
        X_dict.extend(word2features(tokens, idx) for idx in range(len(tokens)))
        # extract_labels returns exactly one label per token.
        y.extend(labels)

In [None]:
# Feature vectorization: fit a DictVectorizer on the per-token feature dicts
# and convert them into the (sparse) feature matrix X.
vec = DictVectorizer(sparse=True)
X = vec.fit_transform(X_dict)

In [None]:
# Label encoding: map the string BIO labels in y to integer class ids.
# label_enc.classes_ is reused by the evaluation cell to name the classes.
label_enc = LabelEncoder()
y_enc = label_enc.fit_transform(y)

In [None]:
# Train/test split: 20% of the rows are held out, with a fixed seed.
# NOTE(review): each row of X is a single token, so tokens of the same
# document can end up in both partitions — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42)


In [None]:
# Train the lightweight ML model.
# random_state is fixed (same seed as the train/test split) so that the
# shuffling performed by SGD, and therefore the reported metrics, are
# reproducible across runs.
clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Prediction and evaluation.
y_pred = clf.predict(X_test)
# LabelEncoder assigns the ids 0..n-1 in the order of label_enc.classes_.
# Passing them explicitly as `labels` keeps the report rows aligned with
# `target_names` even when a rare class is absent from both y_test and
# y_pred (otherwise classification_report raises on the length mismatch).
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_enc.classes_))),
    target_names=label_enc.classes_,
))


              precision    recall  f1-score   support

       B-NEG       0.96      0.95      0.95       371
      B-NSCO       0.94      0.85      0.89       354
       B-UNC       0.90      0.41      0.57        46
      B-USCO       0.83      0.39      0.53        49
       I-NEG       1.00      0.50      0.67         6
      I-NSCO       0.83      0.39      0.53       702
       I-UNC       0.88      0.68      0.77        22
      I-USCO       1.00      0.01      0.03       140
           O       0.96      1.00      0.98     15291

    accuracy                           0.95     16981
   macro avg       0.92      0.58      0.66     16981
weighted avg       0.95      0.95      0.94     16981

