In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
# NOTE(review): later cells use relative paths ("data/...", "output") and no
# working-directory change is visible in this notebook — confirm where it runs.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!pip install transformers==4.15.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.15.0
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m61.1 MB

In [None]:
import pandas as pd
from transformers import RobertaTokenizer
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaModel, AdamW
import numpy as np
import shutil
from tqdm import tqdm
from configparser import ConfigParser
import os
import json
import argparse
import shutil
import sys
import transformers
from nltk.tokenize import sent_tokenize
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries
# used below — consider narrowing the filter.
warnings.simplefilter("ignore")

In [None]:
def load_dataframes(data_dir="data"):
    """Load the train/valid/test splits from pickle files.

    Args:
        data_dir: Directory containing ``train.pkl``, ``valid.pkl`` and
            ``test.pkl``. Defaults to ``"data"``, the location that used to
            be hard-coded, so existing zero-argument calls behave as before.

    Returns:
        dict mapping ``"train"``, ``"valid"`` and ``"test"`` to the object
        ``pd.read_pickle`` returns for the corresponding file.
    """
    # SECURITY: unpickling can execute arbitrary code — only load trusted files.
    return {
        split: pd.read_pickle(os.path.join(data_dir, f"{split}.pkl"))
        for split in ("train", "valid", "test")
    }


class ClaimData(Dataset):
    """Dataset that tokenizes (text, evidence) pairs for sequence classification.

    Reads three columns from ``dataframe``: ``'text'``, ``'evidence_sents'``
    and ``'rating'``. Each item is tokenized on access, truncated and padded
    to ``max_len`` tokens.

    Args:
        dataframe: pandas DataFrame holding the three columns above.
        tokenizer: Callable tokenizer invoked as ``tokenizer(text, evidence, ...)``
            that returns a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_len: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.evidence = dataframe['evidence_sents']
        self.targets = dataframe['rating']
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the ``'text'`` column."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the row at *position* ``index`` and return a dict of long tensors.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'``,
            ``'token_type_ids'`` and ``'labels'``.
        """
        # Samplers hand out positions 0..len-1. Plain ``series[index]`` is a
        # label lookup and fails (or returns the wrong row) whenever the frame
        # does not carry a default RangeIndex, so index positionally instead.
        # str() + split/join collapses any run of whitespace to a single space.
        text = " ".join(str(self.text.iloc[index]).split())
        evidence = " ".join(str(self.evidence.iloc[index]).split())

        inputs = self.tokenizer(
            text,
            evidence,
            truncation="longest_first",
            add_special_tokens=True,
            max_length=self.max_len,
            # Replaces the deprecated ``pad_to_max_length=True``.
            padding="max_length",
            return_token_type_ids=True
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'labels': torch.tensor(self.targets.iloc[index], dtype=torch.long)
        }

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into accuracy and macro-averaged F1.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``; the
            predicted class is the argmax of ``predictions`` along axis 1.

    Returns:
        dict with the keys ``'accuracy'`` and ``'macro f1-score'``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    # The report is always built over the three classes 0, 1, 2.
    report = classification_report(
        gold,
        predicted,
        labels=range(3),
        target_names=[0, 1, 2],
        output_dict=True,
    )

    return {
        'accuracy': accuracy_score(gold, predicted),
        'macro f1-score': report['macro avg']['f1-score'],
    }




In [None]:
# --- Run configuration -------------------------------------------------------
rand_seed = 42
num_epochs = 3
output_dir = "output"
model_checkpoint = "output/checkpoint-5620"
MAX_SEQ_LEN = 512

torch.manual_seed(rand_seed)
# Create the output directory if it is not there yet.
os.makedirs(output_dir, exist_ok=True)

# --- Data and tokenizer ------------------------------------------------------
df = load_dataframes()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print("creating data....")
training_set, validation_set, testing_set = (
    ClaimData(df[split], tokenizer, MAX_SEQ_LEN)
    for split in ("train", "valid", "test")
)

# --- Model and trainer -------------------------------------------------------
print("setting model....")
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
)

# Evaluate and checkpoint once per epoch; the best checkpoint is selected by
# the 'macro f1-score' value reported by compute_metrics.
args = TrainingArguments(
    output_dir=output_dir,
    seed=rand_seed,
    num_train_epochs=num_epochs,
    learning_rate=5e-6,
    weight_decay=0.001,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='macro f1-score',
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=validation_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Didn't find file output/checkpoint-5620/added_tokens.json. We won't load it.
loading file output/checkpoint-5620/vocab.json
loading file output/checkpoint-5620/merges.txt
loading file output/checkpoint-5620/tokenizer.json
loading file None
loading file output/checkpoint-5620/special_tokens_map.json
loading file output/checkpoint-5620/tokenizer_config.json
loading configuration file output/checkpoint-5620/config.json
Model config RobertaConfig {
  "_name_or_path": "output/checkpoint-5620",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position

creating data....
setting model....


All model checkpoint weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the model checkpoint at output/checkpoint-5620.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run the model over the test split; the result unpacks into the raw model
# outputs, the labels, and a metrics dict.
raw_scores, labels, metrics = trainer.predict(testing_set)
metrics.update(predict_samples=len(testing_set))

# Report the metrics and persist them through the trainer.
trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)

# Destination for the per-claim predictions, and the predicted class per row
# (argmax over axis 1 of the raw outputs).
output_predict_file = os.path.join(output_dir, "predictions.csv")
predictions = np.argmax(raw_scores, axis=1)

***** Running Prediction *****
  Num examples = 2357
  Batch size = 6


***** predict metrics *****
  predict_samples         =       2357
  test_accuracy           =     0.3975
  test_loss               =     1.2556
  test_macro f1-score     =     0.1896
  test_runtime            = 0:02:28.11
  test_samples_per_second =     15.913
  test_steps_per_second   =      2.653


In [None]:
import csv

# Rows appended after the model's predictions with a fixed rating of 0.
# NOTE(review): presumably claim ids the expected output needs but the test
# split does not contain — confirm and document the source of these ids.
FALLBACK_ROWS = [(30380, 0), (31035, 0), (31420, 0)]

test_ids = df["test"]["claim_id"].values

# zip() would silently drop rows on a length mismatch; fail loudly instead.
if len(test_ids) != len(predictions):
    raise ValueError(
        f"got {len(predictions)} predictions for {len(test_ids)} test ids"
    )

# Only the main process writes the file.
if trainer.is_world_process_zero():
    # The csv module requires newline="" so it controls line endings itself;
    # without it, platforms that translate "\n" end up with "\r\r\n" rows.
    with open(output_predict_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "rating"])
        writer.writerows(zip(test_ids, predictions))
        writer.writerows(FALLBACK_ROWS)