In [6]:
import transformers
import torch
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; in a notebook this renders the
# interactive login widget (the VBox shown in this cell's output).
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

directory = '/kaggle/input/semeval-2025-task-3-mu-shroom-dataset/Dataset/Validation'
all_logits = []  # one (seq_len, 1) tensor of per-token logits per sample
all_masks = []   # one (seq_len,) float tensor of 0/1 token labels per sample


def _token_char_ranges(tokens):
    """Return the [start, end) character range of every token.

    The ranges are obtained by laying the token strings end to end, starting at
    offset 0.
    NOTE(review): this assumes the concatenated tokens line up with the
    character offsets used by `hard_labels` -- confirm against the tokenizer
    (special prefix characters would shift the offsets).
    """
    ranges = []
    char_idx = 0
    for token in tokens:
        token_end = char_idx + len(token)
        ranges.append((char_idx, token_end))
        char_idx = token_end
    return ranges


def _build_token_mask(tokens, hard_labels):
    """Return a list with 1 for every token overlapping a labelled span, else 0.

    Args:
        tokens: sequence of token strings.
        hard_labels: iterable of (start, end) character spans.
    """
    mask = [0] * len(tokens)
    token_ranges = _token_char_ranges(tokens)
    for label_start, label_end in hard_labels:
        for i, (token_start, token_end) in enumerate(token_ranges):
            # Half-open interval overlap test.
            if token_start < label_end and token_end > label_start:
                mask[i] = 1
    return mask


# Walk over every .jsonl file in the directory. os.listdir returns entries in
# arbitrary order, so sort them to make the sample order reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.jsonl'):
        continue
    filepath = os.path.join(directory, filename)

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record; json.loads('') would raise.
                continue
            data = json.loads(line)

            mask = _build_token_mask(data['model_output_tokens'], data['hard_labels'])

            # Logits become a (seq_len, 1) feature column, the mask a float vector.
            logits_tensor = torch.tensor(data['model_output_logits']).unsqueeze(1)
            mask_tensor = torch.tensor(mask, dtype=torch.float32)

            all_logits.append(logits_tensor)
            all_masks.append(mask_tensor)


print(f"Total samples collected: {len(all_logits)}")

# Custom dataset pairing each sample's logits with its token mask
class LogitsDataset(Dataset):
    """Index-aligned view over a list of logits and a list of masks.

    Item `idx` is the tuple `(logits_list[idx], masks_list[idx])`.
    """

    def __init__(self, logits_list, masks_list):
        # Both sequences are stored by reference (no copy is made).
        self.masks_list = masks_list
        self.logits_list = logits_list

    def __len__(self):
        # The dataset size is taken from the logits list alone.
        n_items = len(self.logits_list)
        return n_items

    def __getitem__(self, idx):
        sample_logits = self.logits_list[idx]
        sample_mask = self.masks_list[idx]
        return sample_logits, sample_mask

# Wrap the collected tensors in the dataset
dataset = LogitsDataset(all_logits, all_masks)

# Split into train and test subsets (90% / 10%).
# A seeded generator keeps the split identical across kernel restarts;
# without it random_split draws from the global RNG and the test set changes
# on every run.
SPLIT_SEED = 42
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


Total samples collected: 499


In [3]:
# LSTM model definition
class LSTMClassifier(nn.Module):
    """Single LSTM followed by a linear head with a sigmoid on top.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden state size.
        output_size: number of outputs of the linear head per time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return per-time-step sigmoid outputs.

        The trailing dimension is squeezed, so with output_size == 1 the
        result has shape (batch, seq_len).
        """
        hidden_states, _ = self.lstm(x)
        scores = self.fc(hidden_states)
        return scores.sigmoid().squeeze(-1)

In [5]:
# IoU between predicted and ground-truth masks
def calculate_iou(pred, target):
    """Return the mean intersection-over-union of a batch as a Python float.

    Args:
        pred: 2-D tensor of scores; values above 0.5 count as positive.
        target: 2-D tensor of the same shape (presumably 0/1 labels -- the
            arithmetic below treats it as such).

    Rows whose union is empty (nothing predicted, nothing labelled) score 1.
    """
    binary_pred = (pred > 0.5).float()
    # Reduce along dim=1, giving one value per row.
    overlap = (binary_pred * target).sum(dim=1)
    combined = (binary_pred + target).clamp(0, 1).sum(dim=1)
    # Empty union -> IoU defined as 1; this also masks the 0/0 division result.
    perfect_score = torch.tensor(1.0, device=combined.device)
    per_row_iou = torch.where(combined == 0, perfect_score, overlap / combined)
    return per_row_iou.mean().item()


# Evaluation on the training or test split
def evaluate_model_with_iou(model, loader):
    """Return (average loss, average IoU) of `model` over `loader`.

    Uses the module-level `criterion` and `device`, and `calculate_iou`.
    The per-batch values are weighted by batch size, so a smaller final batch
    does not count as much as a full one (a plain mean of batch means would
    over-weight it).

    Args:
        model: module mapping a logits batch to per-token probabilities.
        loader: iterable yielding (logits_batch, mask_batch) pairs.

    Raises:
        ZeroDivisionError: if `loader` yields no samples.

    Side effect: switches `model` to eval mode.
    """
    model.eval()
    total_loss = 0.0
    total_iou = 0.0
    total_samples = 0
    with torch.no_grad():
        for logits_batch, mask_batch in loader:
            # Move the targets once; the outputs already live on the model's device.
            mask_batch = mask_batch.to(device)
            outputs = model(logits_batch.to(device))
            batch_size = mask_batch.size(0)
            total_loss += criterion(outputs, mask_batch).item() * batch_size
            total_iou += calculate_iou(outputs, mask_batch) * batch_size
            total_samples += batch_size

    avg_loss = total_loss / total_samples
    avg_iou = total_iou / total_samples
    return avg_loss, avg_iou

# LSTM model definition
# NOTE(review): this re-declares LSTMClassifier; an identical definition exists
# in an earlier cell, and this later one is the definition in effect afterwards.
class LSTMClassifier(nn.Module):
    """Single LSTM followed by a linear head with a sigmoid on top.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden state size.
        output_size: number of outputs of the linear head per time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return per-time-step sigmoid outputs.

        The trailing dimension is squeezed, so with output_size == 1 the
        result has shape (batch, seq_len).
        """
        hidden_states, _ = self.lstm(x)
        scores = self.fc(hidden_states)
        return scores.sigmoid().squeeze(-1)

    
# Data loaders
def _pad_collate(samples):
    """Pad a batch of (logits, mask) pairs to a common length, batch dimension first."""
    padded_logits = torch.nn.utils.rnn.pad_sequence(
        [sample[0] for sample in samples], batch_first=True
    )
    padded_masks = torch.nn.utils.rnn.pad_sequence(
        [sample[1] for sample in samples], batch_first=True
    )
    return padded_logits, padded_masks


# Only the training loader shuffles; both share the same padding collate function.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=_pad_collate)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=_pad_collate)

# Model hyperparameters
input_size = 1      # one logit value per token
hidden_size = 2048
output_size = 1     # one probability per token

# Build the model exactly once and keep it on `device`.
# (A second, un-moved instantiation used to overwrite this one, leaving the
# parameters on the CPU while the training loop moves the batches to `device`.)
model = LSTMClassifier(input_size, hidden_size, output_size).to(device)

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in the model: {total_params}")

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters in the model: {trainable_params}")



# Loss function and optimizer.
# BCELoss operates on probabilities; the model's forward already applies sigmoid.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

# Per-epoch metric history
train_losses, train_ious = [], []
test_losses, test_ious = [], []

# Train the model; after every epoch measure loss and IoU on both splits
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for logits_batch, mask_batch in train_loader:
        # Clear the gradients left over from the previous step before the new forward pass.
        optimizer.zero_grad()
        outputs = model(logits_batch.to(device))
        loss = criterion(outputs.to(device), mask_batch.to(device))
        loss.backward()
        optimizer.step()

    # Evaluate on the training split, then on the test split
    train_loss, train_iou = evaluate_model_with_iou(model, train_loader)
    test_loss, test_iou = evaluate_model_with_iou(model, test_loader)

    # Record the metrics of this epoch
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_ious.append(train_iou)
    test_ious.append(test_iou)

    print(
        f"Epoch {epoch+1}/{num_epochs}, "
        f"Train Loss: {train_loss:.4f}, Train IoU: {train_iou:.4f}, "
        f"Test Loss: {test_loss:.4f}, Test IoU: {test_iou:.4f}"
    )

def _save_metric_plot(train_values, test_values, metric_name, out_path):
    """Plot the train/test curves of one metric per epoch and save the figure.

    Args:
        train_values: per-epoch values measured on the training split.
        test_values: per-epoch values measured on the test split.
        metric_name: used for the y-axis label, legend entries and title.
        out_path: file the figure is written to.
    """
    # Derive the x-axis from the recorded history rather than from num_epochs,
    # so the plot still works when training stopped before the last epoch.
    epochs = range(1, len(train_values) + 1)
    plt.figure(figsize=(8, 6))
    plt.plot(epochs, train_values, 'b-', label=f'Train {metric_name}')
    plt.plot(epochs, test_values, 'r-', label=f'Test {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.title(f'Training and Test {metric_name}')
    plt.legend()
    plt.grid(True)  # add a grid
    plt.savefig(out_path)
    plt.close()  # release the figure once it has been written


# Build and save the loss plot (PDF)
_save_metric_plot(train_losses, test_losses, 'Loss', "loss_plot.pdf")

# Build and save the IoU plot (PDF)
_save_metric_plot(train_ious, test_ious, 'IoU', "iou_plot.pdf")




Total parameters in the model: 16803841
Trainable parameters in the model: 16803841


KeyboardInterrupt: 

In [None]:
# Ask PyTorch's CUDA caching allocator to release its unused cached memory
# (run here after the interrupted training cell).
torch.cuda.empty_cache()


In [None]:
import json
data_pred = []
# Run inference on whichever device the model's parameters live on.
model_device = next(model.parameters()).device
# Open the jsonl file for reading
with open('/kaggle/input/semeval-2025-task-3-mu-shroom-dataset/Dataset/Validation/mushroom.en-val.v2.jsonl', 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        item = json.loads(line)
        # The LSTM was trained on (batch, seq_len, 1) inputs, so lift the flat
        # list of per-token logits from (seq_len,) to (1, seq_len, 1).
        token_logits = torch.tensor(item['model_output_logits']).unsqueeze(1).unsqueeze(0)
        with torch.no_grad():  # inference only, no autograd graph needed
            print(model(token_logits.to(model_device)).squeeze(0))
        # NOTE(review): `y_pred` is not defined in any visible cell; it is
        # indexed as y_pred[i] -> list of (start, end) spans. Confirm where it
        # comes from (the model output printed above is not used for it).
        item['soft_labels'] = [{'start': span[0], 'prob': 1, 'end': span[1]}
                               for span in y_pred[i]]
        item['hard_labels'] = [span for span in y_pred[i]]
        data_pred.append(item)
        
data_pred

In [None]:
import json

# Read the gold validation records: one JSON object per line of the jsonl file
data_true = []
with open('/kaggle/input/semeval-2025-task-3-mu-shroom-dataset/Dataset/Validation/mushroom.en-val.v2.jsonl', 'r', encoding='utf-8') as file:
    data_true.extend(json.loads(line) for line in file)

data_true

In [None]:
# Per-sample IoU between gold and predicted records.
# Iterates over the paired records instead of a hard-coded count of 50, so the
# loop follows the actual number of loaded samples (zip stops at the shorter list).
# NOTE(review): `score_iou` and `np` are defined/imported in the scoring cell
# further down the notebook; that cell must have been run first.
ious = [score_iou(ref_item, pred_item) for ref_item, pred_item in zip(data_true, data_pred)]
print(np.mean(ious))

In [None]:
# Display the list of per-sample IoU scores.
ious

In [None]:
import pandas as pd
from scipy.stats import spearmanr
import numpy as np

import argparse as ap

def recompute_hard_labels(soft_labels):
    """Derive hard labels from soft labels.

    Keeps the spans whose 'prob' exceeds 0.5, visits them ordered by
    (start, end), and fuses a span into the previous one when it starts exactly
    where the previously kept span ended.

    Returns:
        a list of [start, end] lists.
    """
    ordered = sorted(soft_labels, key=lambda span: (span['start'], span['end']))
    confident = [(lbl['start'], lbl['end']) for lbl in ordered if lbl['prob'] > 0.5]

    merged = []
    last_end = -1
    for span_start, span_end in confident:
        if span_start != last_end:
            merged.append([span_start, span_end])
        else:
            # Directly adjacent to the previous span: extend it instead.
            merged[-1][-1] = span_end
        last_end = span_end
    return merged

def load_jsonl_file_to_records(filename):
    """read data from a JSONL file and format that as a `pandas.DataFrame`. 
    Performs minor format checks (ensures that soft_labels are present, optionally compute hard_labels on the fly)."""
    df = pd.read_json(filename, lines=True)
    if 'hard_labels' not in df.columns:
        df['hard_labels'] = df.soft_labels.apply(recompute_hard_labels)
    # adding an extra column for convenience
    df['text_len'] = df.model_output_text.apply(len)
    df = df[['id', 'soft_labels', 'hard_labels', 'text_len']]
    return df.sort_values('id').to_dict(orient='records')

def score_iou(ref_dict, pred_dict):
    """Intersection-over-union of reference and predicted hard labels for one datapoint.

    Args:
        ref_dict: gold reference datapoint (keys 'id' and 'hard_labels').
        pred_dict: model prediction for the same datapoint.

    Returns:
        the IoU of the covered indices, or 1.0 when neither side marks
        anything.

    Raises:
        AssertionError: if the two datapoints carry different ids.
    """
    # The prediction must belong to this reference.
    assert ref_dict['id'] == pred_dict['id']

    def _covered_indices(spans):
        # Expand every span (passed to range(*span)) into the indices it covers.
        covered = set()
        for span in spans:
            covered.update(range(*span))
        return covered

    gold = _covered_indices(ref_dict['hard_labels'])
    guess = _covered_indices(pred_dict['hard_labels'])
    if gold or guess:
        return len(gold & guess) / len(gold | guess)
    # Nothing marked on either side: avoid 0/0 and count it as a perfect match.
    return 1.

def score_cor(ref_dict, pred_dict):
    """Spearman correlation between reference and predicted soft labels for one datapoint.

    Both label sets are rasterized into vectors of length
    `ref_dict['text_len']` (0.0 where no span applies; later spans overwrite
    earlier ones).

    Args:
        ref_dict: gold reference datapoint ('id', 'soft_labels', 'text_len').
        pred_dict: model prediction ('id', 'soft_labels').

    Returns:
        Spearman's rho, or -- when either vector is constant -- 1.0 if both
        vectors have the same number of distinct values and 0.0 otherwise.

    Raises:
        AssertionError: if the two datapoints carry different ids.
    """
    # The prediction must belong to this reference.
    assert ref_dict['id'] == pred_dict['id']
    n_chars = ref_dict['text_len']

    def _rasterize(soft_labels):
        # One probability per position; positions outside every span stay 0.
        vec = [0.] * n_chars
        for span in soft_labels:
            for pos in range(span['start'], span['end']):
                vec[pos] = span['prob']
        return vec

    ref_vec = _rasterize(ref_dict['soft_labels'])
    pred_vec = _rasterize(pred_dict['soft_labels'])

    # Count distinct values after rounding to 8 decimals.
    n_distinct_pred = len({round(flt, 8) for flt in pred_vec})
    n_distinct_ref = len({round(flt, 8) for flt in ref_vec})
    # A constant series has no defined correlation: fall back to an exact-match flag.
    if n_distinct_pred == 1 or n_distinct_ref == 1:
        return float(n_distinct_ref == n_distinct_pred)
    return spearmanr(ref_vec, pred_vec).correlation

def main(ref_dicts, pred_dicts, output_file=None):
    """Score every (reference, prediction) pair with IoU and Spearman correlation.

    Args:
        ref_dicts: reference datapoints.
        pred_dicts: predictions, in the same order and of the same length.
        output_file: optional path; when given, the mean IoU and mean
            correlation are written there, one per line.

    Returns:
        a pair of numpy arrays (ious, cors) with one entry per datapoint.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(ref_dicts) == len(pred_dicts)
    pairs = list(zip(ref_dicts, pred_dicts))
    ious = np.array([score_iou(ref, pred) for ref, pred in pairs])
    cors = np.array([score_cor(ref, pred) for ref, pred in pairs])
    if output_file is None:
        return ious, cors
    with open(output_file, 'w') as ostr:
        ostr.write(f'IoU: {ious.mean():.8f}\n')
        ostr.write(f'Cor: {cors.mean():.8f}\n')
    return ious, cors

# Command-line entry point of the scorer. The guard is a literal False, so this
# branch never runs inside the notebook.
if False:
    parser = ap.ArgumentParser()
    # The two input files are parsed into records directly by argparse.
    parser.add_argument('ref_file', type=load_jsonl_file_to_records)
    parser.add_argument('pred_file', type=load_jsonl_file_to_records)
    parser.add_argument('output_file', type=str)
    args = parser.parse_args()
    print(args)
    _ = main(args.ref_file, args.pred_file, args.output_file)



In [None]:
# Prompt experiment: a system instruction plus one user message holding a
# query and the model output to be fact-checked. The `\n` inside the
# triple-quoted string is an escape sequence, so the two fields end up on
# separate lines.
messages = [
    {"role": "system", "content": "You are a fact-checking assistant. Your task is to identify fragments of the response that are hallucinations – parts of the text that are factually incorrect or made up by model. Pay attention to dates, numbers, places. Detect only hallucination words, without neighbour words. Give me only a list of fragments-hallucinations you found in model output."},
    {"role": "user", "content": """
    "query":"How many genera does the Erysiphales order contain?"\n"model_output_text":"The Elysiphale order contains 5 genera."
    """},
]

# NOTE(review): `pipeline` is not defined in any visible cell -- presumably a
# text-generation pipeline object created elsewhere; confirm before running.
outputs = pipeline(
    messages,
    max_new_tokens=256,
    temperature=0.1
)
# Print the last entry of the first output's "generated_text".
print(outputs[0]["generated_text"][-1])

In [None]:
# Prompt experiment: same setup with a different system prompt and example.
# NOTE(review): the system prompt refers to 'model_output' while the user
# message labels the field "model_output_text".
messages = [
    {"role": "system", "content": "You are a fact-checking assistant. Your task is to identify fragments of the response that are hallucinations – parts of the text that are factually incorrect or made up by the model. Detect only hallucination words, without neighbour words. Give me only a list of fragments-hallucinations you found in 'model_output'."},
    {"role": "user", "content": """
    "query":"When did Chance the Rapper debut?"\n"model_output_text":"Chance the rapper debuted in 2011."
    """},
]

# NOTE(review): `pipeline` is not defined in any visible cell -- presumably a
# text-generation pipeline object created elsewhere; confirm before running.
outputs = pipeline(
    messages,
    max_new_tokens=256,
    temperature=0.1
)
# Print the last entry of the first output's "generated_text".
print(outputs[0]["generated_text"][-1])

In [None]:
# Prompt experiment: arithmetic/date example. Unlike some sibling cells, no
# `temperature` argument is passed to the call below.
# NOTE(review): the system prompt refers to 'model_output' while the user
# message labels the field "model_output_text".
messages = [
    {"role": "system", "content": "You are a fact-checking assistant. Your task is to identify fragments of the response that are hallucinations – parts of the text that are factually incorrect or made up by the model. Detect only hallucination words, without neighbour words. Give me only a list of fragments-hallucinations you found in 'model_output'."},
    {"role": "user", "content": """
    "query":"If today is 14th October, and it is not a leap year, how many days remain until the end of the year?"\n"model_output_text":"There are 365 days in a year. So, if today's date is October 13th, then there are only 162 days left until December 31st."
    """},
]

# NOTE(review): `pipeline` is not defined in any visible cell -- presumably a
# text-generation pipeline object created elsewhere; confirm before running.
outputs = pipeline(
    messages,
    max_new_tokens=256,
)
# Print the last entry of the first output's "generated_text".
print(outputs[0]["generated_text"][-1])

In [None]:
# Prompt experiment: TV-series example. No `temperature` argument is passed to
# the call below.
# NOTE(review): the system prompt refers to 'model_output' while the user
# message labels the field "model_output_text".
messages = [
    {"role": "system", "content": "You are a fact-checking assistant. Your task is to identify fragments of the response that are hallucinations – parts of the text that are factually incorrect or made up by the model. Detect only hallucination words, without neighbour words. Give me only a list of fragments-hallucinations you found in 'model_output'."},
    {"role": "user", "content": """
    "query":"Which network released the TV series of the The Punisher?"\n"model_output_text":"The Puncher network that released this TV show is Netflix."
    """},
]

# NOTE(review): `pipeline` is not defined in any visible cell -- presumably a
# text-generation pipeline object created elsewhere; confirm before running.
outputs = pipeline(
    messages,
    max_new_tokens=256,
)
# Print the last entry of the first output's "generated_text".
print(outputs[0]["generated_text"][-1])

In [None]:
# Prompt experiment: uses the system-prompt variant that asks the model to pay
# attention to dates, numbers and places. The model output in this example
# itself ends with an escaped newline.
messages = [
    {"role": "system", "content": "You are a fact-checking assistant. Your task is to identify fragments of the response that are hallucinations – parts of the text that are factually incorrect or made up by model. Pay attention to dates, numbers, places. Detect only hallucination words, without neighbour words. Give me only a list of fragments-hallucinations you found in model output."},
    {"role": "user", "content": """
    "query":"When was Bischofsheim a constitutent community of Mainz?"\n"model_output_text":" Bishofshain was a constituent community in the early Middle Ages.\n"
    """},
]

# NOTE(review): `pipeline` is not defined in any visible cell -- presumably a
# text-generation pipeline object created elsewhere; confirm before running.
outputs = pipeline(
    messages,
    max_new_tokens=256,
    temperature=0.1
)
# Print the last entry of the first output's "generated_text".
print(outputs[0]["generated_text"][-1])

In [None]:
# Prompt experiment: system prompt additionally tells the model to judge facts
# rather than spelling.
# NOTE(review): here the field names are swapped relative to the sibling
# cells -- the system prompt refers to 'model_output_text' while the user
# message uses the unquoted labels `query:` and `model_output:`.
messages = [
    {"role": "system", "content": "You are a fact-checking assistant. Your task is to identify fragments of the response that are hallucinations – parts of the text that are factually incorrect or made up by the model. Detect only hallucination words, without neighbour words. Give me only a list of fragments-hallucinations you found in 'model_output_text'. Pay attention to the facts, not to literacy and typos."},
    {"role": "user", "content": """
    query:"Who first described the white-winged chough?"\nmodel_output:"The white winged cough was first discovered by the English naturalist John Latham in 1770."
    """},
]

# NOTE(review): `pipeline` is not defined in any visible cell -- presumably a
# text-generation pipeline object created elsewhere; confirm before running.
outputs = pipeline(
    messages,
    max_new_tokens=256,
    temperature=0.1
)
# Print the last entry of the first output's "generated_text".
print(outputs[0]["generated_text"][-1])