# Importy i przygotowanie środowiska

In [None]:
!pip install datasets==3.5.0
!pip install pyarrow==20.0.0

In [None]:
!pip install transformers evaluate accelerate

In [None]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import torch.optim as optim

# Wstępna konfiguracja środowiska

In [None]:
# Global seed reused by the rest of the notebook (e.g. for dataset shuffling).
seed = 0
# Request deterministic cuDNN kernels. The attribute name must be spelled
# `deterministic`; a misspelled name only creates an unused attribute and
# leaves the real flag untouched.
torch.backends.cudnn.deterministic = True
# Disable the cuDNN autotuner so algorithm selection does not vary between runs.
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)

**Wybór urządzenia obliczeniowego i podłączenie dysku Google**

In [None]:
# Use the first CUDA device when one is present; otherwise fall back to the CPU
# so the cell does not raise on a machine without a GPU.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Mount Google Drive at /content/gdrive (uses the google.colab helper module).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


**Rozpakowanie dataset-ów**

In [None]:
# Extract the archive only when /content/nlp does not exist yet, so re-running
# this cell skips the unzip step.
# NOTE(review): the archive path below is relative while the check is absolute —
# presumably the working directory is /content; confirm.
if not os.path.exists("/content/nlp"):
    !unzip gdrive/MyDrive/NLP/nlp.zip

# Tworzenie modeli, przekształcanie i tokenizacja danych

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Checkpoint name used for the tokenizer here and passed to the training helper later.
model_name = 'roberta-large'
# Tokenizer for that checkpoint; use_fast=True requests the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
from datasets import load_dataset, concatenate_datasets

# Combined corpus: the train/test TSV files of all three domains, read from the
# local "nlp" directory and shuffled with the notebook seed.
# quoting=3 is the numeric value of csv.QUOTE_NONE.
dataset_before = load_dataset(
    "nlp",
    data_files={
        "train": [
            "headlines_train_data.tsv",
            "images_phrases_train_data.tsv",
            "answers_students_train_data.tsv",
        ],
        "test": [
            "headlines_test_data.tsv",
            "images_phrases_test_data.tsv",
            "answers_students_test_data.tsv",
        ],
    },
    quoting=3,
).shuffle(seed=seed)

# Per-domain corpora, kept in file order (not shuffled).
headlines_before = load_dataset(
    "nlp",
    data_files={"train": "headlines_train_data.tsv", "test": "headlines_test_data.tsv"},
    quoting=3,
)
images_before = load_dataset(
    "nlp",
    data_files={"train": "images_phrases_train_data.tsv", "test": "images_phrases_test_data.tsv"},
    quoting=3,
)
answers_before = load_dataset(
    "nlp",
    data_files={"train": "answers_students_train_data.tsv", "test": "answers_students_test_data.tsv"},
    quoting=3,
)

In [None]:
import torch.nn.functional as F

# Alignment-type tag -> class index used for the type part of the target vector.
types_map = {
    'EQUI': 7,
    'OPPO': 6,
    'SPE1': 5,
    'SPE2': 4,
    'SIMI': 3,
    'REL': 2,
    'ALIC': 1,
    'NOALI': 0,
}

def map_outputs(example):
  """Attach a 14-element float target vector to `example` under the key 'labels'.

  The first 6 entries are the one-hot encoding of example["y_score"]; the
  remaining 8 entries are the one-hot encoding of the class index that
  `types_map` assigns to example["y_type"]. The example is modified in place
  and also returned.
  """
  score_index = torch.tensor(example["y_score"])
  type_index = torch.tensor(types_map[example["y_type"]])
  one_hot_parts = [
      F.one_hot(score_index, num_classes=6).view(-1),
      F.one_hot(type_index, num_classes=8).view(-1),
  ]
  example['labels'] = torch.cat(one_hot_parts).float()
  return example

In [None]:
def map_none_outputs(example):
  """Replace a None value in the 'x1' / 'x2' text fields with an empty string.

  Both keys are always written back, whether or not they were None. The
  example is modified in place and also returned.
  """
  for text_key in ('x1', 'x2'):
    current = example[text_key]
    example[text_key] = "" if current is None else current
  return example

In [None]:
def tokenize_and_map(dataset):
    """Prepare a raw dataset for training in three `map` passes.

    1. `map_none_outputs` replaces None texts with empty strings.
    2. `map_outputs` adds the 'labels' column; 'y_type' and 'y_score' are dropped.
    3. The text pair (x1, x2) is tokenized in batches with truncation enabled;
       'x1' and 'x2' are dropped.

    Returns the result of the last `map` call.
    """
    def _tokenize_pairs(batch):
        # Encode the two texts together as a sentence pair.
        return tokenizer(batch["x1"], batch["x2"], truncation=True)

    without_nones = dataset.map(map_none_outputs)
    with_labels = without_nones.map(map_outputs, remove_columns=["y_type", "y_score"])
    return with_labels.map(_tokenize_pairs, batched=True, remove_columns=["x1", "x2"])

In [None]:
# Run the same preprocessing over the combined corpus and each per-domain corpus.
dataset, headlines, images, answers = (
    tokenize_and_map(raw_corpus)
    for raw_corpus in (dataset_before, headlines_before, images_before, answers_before)
)

# Trenowanie modeli

In [None]:
from transformers import TrainingArguments
import evaluate

# Per-device batch size shared by training and evaluation.
batch_size = 16

# Notebook-level training arguments; the evaluation trainers further down reuse them.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    save_strategy="epoch",
)

In [None]:
from torch import nn
from transformers import Trainer

# Loss applied to the raw logits and the float "labels" vector of each batch.
# NOTE(review): the targets built by map_outputs concatenate two one-hot vectors,
# so this single cross-entropy is taken over all logits at once rather than
# separately per label group — confirm that this is intended.
loss_fn = nn.CrossEntropyLoss()

class CustomTrainer(Trainer):
    """Trainer whose loss is `loss_fn` evaluated on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping; its "labels" entry is read as the target and
                stays in the mapping, so it is also forwarded to the model.
            return_outputs: when true, return `(loss, outputs)` instead of `loss`.
            **kwargs: accepted and ignored.
        """
        targets = inputs.get("labels")
        model_outputs = model(**inputs)
        loss = loss_fn(model_outputs.get("logits"), targets)
        if return_outputs:
            return loss, model_outputs
        return loss

In [None]:
# F1 metric object, reused for both label groups.
metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    """Return micro-averaged F1 for the score columns and for the type columns.

    `eval_pred` unpacks into (predictions, labels). In both arrays the first 6
    columns are reduced with argmax to a score class and the remaining columns
    to a type class.

    Returns a dict with keys 'f1_score' and 'f1_type'.
    """
    predictions, labels = eval_pred

    def _micro_f1(columns):
        # argmax over the selected columns turns scores / one-hot rows into class ids.
        result = metric.compute(
            predictions=predictions[:, columns].argmax(-1),
            references=labels[:, columns].argmax(-1),
            average="micro",
        )
        return result['f1']

    return {
        'f1_score': _micro_f1(slice(None, 6)),
        'f1_type': _micro_f1(slice(6, None)),
    }

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

def train_model_on_subset(tokenized_dataset, model_name, name, num_labels=14):
    """Fine-tune a fresh sequence-classification model on one tokenized dataset.

    Args:
        tokenized_dataset: mapping with 'train' and 'test' splits.
        model_name: checkpoint passed to `from_pretrained`.
        name: run label; used in the output directory and in the progress messages.
        num_labels: number of output logits of the classification head.

    Nothing is returned; checkpoints are written once per epoch under
    "<model_name>-finetuned-<name>".
    """
    # The model is created before the training arguments, as in the original order.
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    run_args = TrainingArguments(
        output_dir=f"{model_name}-finetuned-{name}",
        num_train_epochs=10,
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        eval_strategy="epoch",
        save_strategy="epoch",
    )

    subset_trainer = CustomTrainer(
        model=classifier,
        args=run_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test'],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print(f"==> Trenowanie modelu: {name}")
    subset_trainer.train()
    print(f"✔️ Zakończono trenowanie: {name}\n")

**Trenowanie modelu ogólnego**

In [None]:
train_model_on_subset(dataset,model_name, "dataset")

Trenowanie modelu *Images*

In [None]:
train_model_on_subset(images,model_name, "images")

Trenowanie modelu *Headlines*

In [None]:
train_model_on_subset(headlines,model_name, "headlines")

Trenowanie modelu *Answers-students*

In [None]:
train_model_on_subset(answers,model_name, "answers")

In [None]:
#!cp -r /content/roberta-large-finetuned-answers/checkpoint-1160 /content/gdrive/MyDrive/NLP

In [None]:
#!mv /content/gdrive/MyDrive/NLP/checkpoint-1160 /content/gdrive/MyDrive/NLP/headlines

In [None]:
#!rm -rf /content/roberta-large-finetuned-headlines/

Ładowanie modelu *ogólnego*

In [None]:
# Load the checkpoint stored under .../NLP/dataset (14 output logits) and move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/NLP/dataset", num_labels=14)
model.to(device)

Ładowanie modelu *Images*

In [None]:
# Load the checkpoint stored under .../NLP/images (14 output logits) and move it to `device`.
model_images = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/NLP/images", num_labels=14)
model_images.to(device)

Ładowanie modelu *Headlines*

In [None]:
# Load the checkpoint stored under .../NLP/headlines (14 output logits) and move it to `device`.
model_headlines = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/NLP/headlines", num_labels=14)
model_headlines.to(device)

Ładowanie modelu *Answers-students*

In [None]:
# Load the checkpoint stored under .../NLP/answers (14 output logits) and move it to `device`.
model_answers = AutoModelForSequenceClassification.from_pretrained("/content/gdrive/MyDrive/NLP/answers", num_labels=14)
model_answers.to(device)

# Ewaluacja modelu ogólnego

In [None]:
# Evaluate `model` on the headlines test split.
trainerHeadlines = CustomTrainer(
    model=model,
    args=args,
    train_dataset=headlines['train'],
    eval_dataset=headlines['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainerHeadlines.evaluate()

In [None]:
# Evaluate `model` on the images test split.
trainerImages = CustomTrainer(
    model=model,
    args=args,
    train_dataset=images['train'],
    eval_dataset=images['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainerImages.evaluate()

In [None]:
# Evaluate `model` on the answers-students test split.
trainerAnswers = CustomTrainer(
    model=model,
    args=args,
    train_dataset=answers['train'],
    eval_dataset=answers['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainerAnswers.evaluate()

In [None]:
# Evaluate `model` on the combined test split.
trainerDataset = CustomTrainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainerDataset.evaluate()

Ewaluacja modelu *Images*

In [None]:
# Evaluate `model_images` on the images test split.
trainerImages_uniq = CustomTrainer(
    model=model_images,
    args=args,
    train_dataset=images['train'],
    eval_dataset=images['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainerImages_uniq.evaluate()

Ewaluacja modelu *Headlines*

In [None]:
# Evaluate `model_headlines` on the headlines test split.
trainerHeadlines_uniq = CustomTrainer(
    model=model_headlines,
    args=args,
    train_dataset=headlines['train'],
    eval_dataset=headlines['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainerHeadlines_uniq.evaluate()

Ewaluacja modelu *Answers-students*

In [None]:
# Evaluate `model_answers` on the answers-students test split.
trainerAnswers_uniq = CustomTrainer(
    model=model_answers,
    args=args,
    train_dataset=answers['train'],
    eval_dataset=answers['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainerAnswers_uniq.evaluate()

# Macierze pomyłek

In [None]:
def pr_re(trainer, _model):
  """Run `_model` over the trainer's evaluation dataloader and collect class ids.

  For every batch the logits and the "labels" tensor are split into the first
  6 columns (score) and the remaining columns (type); each part is reduced
  with argmax to a class index.

  Args:
    trainer: object exposing `get_eval_dataloader()`, which yields dict batches
      of tensors that include a "labels" entry.
    _model: torch module with at least one parameter; called with the batch
      entries other than "labels" as keyword arguments and expected to return
      an object with a `.logits` attribute.

  Returns:
    Tuple `(score_preds, score_refs, type_preds, type_refs)` of 1-D int64
    tensors on the model's device, concatenated over all batches. All four are
    empty when the dataloader yields no batches.
  """
  # Take the device from the model itself instead of a notebook-level global,
  # so the function works for any model regardless of where it was placed.
  model_device = next(_model.parameters()).device

  score_pred_chunks, score_ref_chunks = [], []
  type_pred_chunks, type_ref_chunks = [], []

  _model.eval()
  for batch in trainer.get_eval_dataloader():
    labels = batch["labels"].to(model_device)
    # Labels are only used as references below; they are not forwarded to the
    # model, which only has to produce logits here.
    features = {k: v.to(model_device) for k, v in batch.items() if k != "labels"}

    with torch.no_grad():
      logits = _model(**features).logits

    score_pred_chunks.append(logits[:, :6].argmax(-1))
    score_ref_chunks.append(labels[:, :6].argmax(-1))
    type_pred_chunks.append(logits[:, 6:].argmax(-1))
    type_ref_chunks.append(labels[:, 6:].argmax(-1))

  def _join(chunks):
    # Concatenate once at the end instead of re-allocating on every batch.
    if not chunks:
      return torch.empty((0), dtype=torch.int64, device=model_device)
    return torch.cat(chunks, dim=0)

  return (
      _join(score_pred_chunks),
      _join(score_ref_chunks),
      _join(type_pred_chunks),
      _join(type_ref_chunks),
  )

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

def img(score_preds,score_refs,type_preds,type_refs, name):
  """Draw two row-normalised confusion-matrix heatmaps: one for scores, one for types.

  Each matrix is built from reference vs. predicted class ids (tensors are
  moved to the CPU first), divided row-wise by the row total, and drawn in its
  own 12x7 figure titled "<name> Score" / "<name> Type".
  """
  def _draw_heatmap(refs, preds, class_ids, title):
    counts = confusion_matrix(refs.cpu(), preds.cpu(), labels=class_ids)
    # Divide every row by its total so cells show the share of each real class.
    row_totals = np.sum(counts, axis=1)[:, None]
    shares = pd.DataFrame(counts / row_totals)
    plt.figure(figsize = (12,7))
    sn.heatmap(shares, annot=True)
    plt.title(title)
    plt.xlabel("Predicted Labels")
    plt.ylabel("Real Labels")

  # Score confusion matrix (classes 0-5).
  _draw_heatmap(score_refs, score_preds, [0, 1, 2, 3, 4, 5], name+" Score")
  # Type confusion matrix (classes 0-7).
  _draw_heatmap(type_refs, type_preds, [0, 1, 2, 3, 4, 5, 6, 7], name+" Type")

In [None]:
# Domain name -> trainer built around `model` for that domain's splits
# (the values are trainers, despite the variable name).
nameToDataset = {
    'images': trainerImages,
    'answers-students': trainerAnswers,
    'headlines': trainerHeadlines,
}
# Domain names in the dict's insertion order.
namesList = list(nameToDataset)


Macierz pomyłek modelu ogólnego

In [None]:
# For every domain: collect predictions of `model` on that domain's evaluation
# data and draw the score/type confusion matrices titled with the domain name.
for name in namesList:
  score_preds,score_refs,type_preds,type_refs = pr_re(nameToDataset[name], model)
  img(score_preds,score_refs,type_preds,type_refs,name.capitalize())

Macierz pomyłek modelu *Images*

In [None]:
  # Confusion matrices for `model_images` on the evaluation data of trainerImages_uniq.
  score_preds,score_refs,type_preds,type_refs = pr_re(trainerImages_uniq, model_images)
  img(score_preds,score_refs,type_preds,type_refs,'images'.capitalize())

Macierz pomyłek modelu *Headlines*

In [None]:
  # Confusion matrices for `model_headlines` on the evaluation data of trainerHeadlines_uniq.
  score_preds,score_refs,type_preds,type_refs = pr_re(trainerHeadlines_uniq, model_headlines)
  img(score_preds,score_refs,type_preds,type_refs,'headlines'.capitalize())

Macierz pomyłek modelu *Answers-students*

In [None]:
  # Confusion matrices for `model_answers` on the evaluation data of trainerAnswers_uniq.
  score_preds,score_refs,type_preds,type_refs = pr_re(trainerAnswers_uniq, model_answers)
  img(score_preds,score_refs,type_preds,type_refs,'answers'.capitalize())

# Walidacja plikami zewnętrznymi

In [None]:
# Separator between the fields of an alignment line in a .wa file.
fields_sep = ' // '

def preds_to_wa(wa_content, preds_lines):
    """Copy predicted type and score into the alignment lines of a .wa document.

    Every line of `wa_content` that contains '<==>' is split on `fields_sep`;
    its second and third fields are overwritten with the second and third
    whitespace-separated tokens of the next unused entry of `preds_lines`.
    All other lines are passed through unchanged.

    Returns the rebuilt document with lines joined by a newline character.
    Raises IndexError when `preds_lines` has fewer entries than there are
    alignment lines, or when a line/prediction has too few fields.
    """
    merged = []
    next_pred = 0

    for wa_line in wa_content.splitlines():
        if '<==>' not in wa_line:
            merged.append(wa_line)
            continue

        parts = wa_line.split(fields_sep)
        pred_tokens = preds_lines[next_pred].split()
        next_pred += 1

        # Token 0 of a prediction is its running index; tokens 1 and 2 are used.
        parts[1], parts[2] = pred_tokens[1], pred_tokens[2]
        merged.append(fields_sep.join(parts))

    return '\n'.join(merged)

In [None]:
from subprocess import check_output
def fileF(DATASET,predictions):
  """Write predictions for one dataset in .wa format and score them with the Perl evaluators.

  Reads /content/nlp/STSint.testinput.<DATASET>.wa, merges `predictions` into
  it with `preds_to_wa`, writes the result to
  /content/nlp/STSint.input.<DATASET>-predictions.wa, then runs
  evalF1_penalty.pl and evalF1_no_penalty.pl from /content/nlp on the
  reference and prediction files.

  Args:
    DATASET: dataset name interpolated into both file names.
    predictions: prediction lines, passed through to `preds_to_wa`.

  Returns:
    List with the decoded standard output of each evaluator, in the order run.
    Each output is also printed.

  Raises:
    subprocess.CalledProcessError: if an evaluator exits with a non-zero status.
  """
  wa_file = f"/content/nlp/STSint.testinput.{DATASET}.wa"
  wa_output_file = f"/content/nlp/STSint.input.{DATASET}-predictions.wa"

  with open(wa_file) as file:
    wa_test = file.read()

  wa_predictions = preds_to_wa(wa_test, predictions)

  with open(wa_output_file, "w") as file:
    file.write(wa_predictions)

  outputs = []
  for script in ("evalF1_penalty.pl", "evalF1_no_penalty.pl"):
    # Build the argument list directly instead of splitting a formatted string,
    # so a dataset name containing whitespace stays a single argument.
    argv = ["perl", script, wa_file, wa_output_file]
    print(f"Executing {' '.join(argv)}")
    output = check_output(argv, cwd="/content/nlp").decode()
    outputs.append(output)
    print(output)
  return outputs

In [None]:
# Inverse of types_map: class index -> alignment-type tag.
types_d = dict((number, Type) for Type, number in types_map.items())
def to_str(llist):
  """Translate a list of type class indices into their tags via `types_d`."""
  return [types_d[class_id] for class_id in llist]

Walidacja modelu ogólnego

In [None]:
# Score `model` on every domain with the external Perl evaluators.
for name in namesList:
  score_preds, score_refs, type_preds, type_refs = pr_re(nameToDataset[name], model)
  types = to_str(type_preds.tolist())
  scores = score_preds.tolist()
  # One line per example: "<running index>\t<type tag> <score>\n".
  predictions = [
      f"{row_idx}\t{type_tag} {score_value}\n"
      for row_idx, (type_tag, score_value) in enumerate(zip(types, scores))
  ]
  fileF(name, predictions)

Walidacja modelu *Images*

In [None]:
# Score `model_images` on the images evaluation data with the external Perl evaluators.
score_preds, score_refs, type_preds, type_refs = pr_re(nameToDataset['images'], model_images)
types = to_str(type_preds.tolist())
scores = score_preds.tolist()
# One line per example: "<running index>\t<type tag> <score>\n".
predictions = [
    f"{row_idx}\t{type_tag} {score_value}\n"
    for row_idx, (type_tag, score_value) in enumerate(zip(types, scores))
]
fileF('images', predictions)

Walidacja modelu *Headlines*

In [None]:
# Score `model_headlines` on the headlines evaluation data with the external Perl evaluators.
score_preds, score_refs, type_preds, type_refs = pr_re(nameToDataset['headlines'], model_headlines)
types = to_str(type_preds.tolist())
scores = score_preds.tolist()
# One line per example: "<running index>\t<type tag> <score>\n".
predictions = [
    f"{row_idx}\t{type_tag} {score_value}\n"
    for row_idx, (type_tag, score_value) in enumerate(zip(types, scores))
]
fileF('headlines', predictions)

Walidacja modelu *Answers-students*

In [None]:
# Score `model_answers` on the answers-students evaluation data with the external Perl evaluators.
score_preds, score_refs, type_preds, type_refs = pr_re(nameToDataset['answers-students'], model_answers)
types = to_str(type_preds.tolist())
scores = score_preds.tolist()
# One line per example: "<running index>\t<type tag> <score>\n".
predictions = [
    f"{row_idx}\t{type_tag} {score_value}\n"
    for row_idx, (type_tag, score_value) in enumerate(zip(types, scores))
]
fileF('answers-students', predictions)