In [2]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from datasets import Dataset, DatasetDict
import pandas as pd
from huggingface_hub import HfFolder, notebook_login

In [3]:
label_dict = {"phrase": 0, "passage": 1, "multi": 2}

# Placeholder label for records that carry no gold spoiler type.
UNLABELED_LABEL = 3


def load_input(df):
    """Build a ``text`` / ``uuid`` / ``label`` frame from clickbait records.

    Parameters
    ----------
    df : pandas.DataFrame, str or path-like
        Already-loaded records, or the location of a JSON-lines file which is
        read with ``pd.read_json(..., lines=True)``.

    Returns
    -------
    pandas.DataFrame
        One row per input record with the columns

        * ``text``  -- ``postText`` joined with spaces, then ``' - '``, then
          ``targetTitle``, a space, and ``targetParagraphs`` joined with spaces.
        * ``uuid``  -- the record's ``uuid`` field, or ``postId`` when the
          input has no ``uuid`` column.
        * ``label`` -- the ``label_dict`` id of the first entry of ``tags``,
          or ``UNLABELED_LABEL`` (3) for unlabeled input.

    Notes
    -----
    A path is treated as unlabeled when it ends in ``test.jsonl``; a DataFrame
    is treated as unlabeled when it has no ``tags`` column.

    Raises
    ------
    KeyError
        If a required field is missing, or a tag is not in ``label_dict``.
    """
    if isinstance(df, pd.DataFrame):
        # A frame has no file name to inspect, so look at its columns instead.
        unlabeled = 'tags' not in df.columns
    else:
        # str() lets pathlib.Path objects through as well as plain strings.
        unlabeled = str(df).endswith('test.jsonl')
        df = pd.read_json(df, lines=True)

    # Resolved once for the whole frame rather than by catching errors per row.
    id_column = 'uuid' if 'uuid' in df.columns else 'postId'

    records = []
    for _, row in df.iterrows():
        text = (
            ' '.join(row['postText'])
            + ' - ' + row['targetTitle']
            + ' ' + ' '.join(row['targetParagraphs'])
        )
        label = UNLABELED_LABEL if unlabeled else label_dict[row['tags'][0]]
        records.append({'text': text, 'uuid': row[id_column], 'label': label})

    # Explicit columns keep the schema stable even when there are no records.
    return pd.DataFrame(records, columns=['text', 'uuid', 'label'])

In [4]:
# Locations of the three JSON-lines splits, in train / validation / test order.
input_path_train, input_path_val, input_path_test = (
    '~/clickbait/train.jsonl',
    '~/clickbait/validation.jsonl',
    '~/clickbait/test.jsonl',
)

# Load each split, in the same order, into a text / uuid / label DataFrame.
input_data_train, input_data_val, input_data_test = (
    load_input(split_path)
    for split_path in (input_path_train, input_path_val, input_path_test)
)

In [6]:
# Name of the pretrained checkpoint the tokenizer, config and model are loaded from.
model_id = "roberta-large"
# replace the value with your model: ex <hugging-face-user>/<model-name>
# (this value is also used as the training output directory)
repository_id = "tianleli/roberta-large"

In [64]:
# Wrap each pandas split as a Hugging Face Dataset, keyed by split name.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (
        ("train", input_data_train),
        ("val", input_data_val),
        ("test", input_data_test),
    )
})

# Convenience handles for the individual splits.
train_dataset, test_dataset, val_dataset = (
    dataset[split_name] for split_name in ("train", "test", "val")
)


In [65]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# Every example is truncated and padded to exactly this many tokens.
MAX_LENGTH = 256


def tokenize(batch):
    """Tokenize ``batch["text"]`` into fixed-length RoBERTa inputs.

    Sequences longer than ``MAX_LENGTH`` tokens are truncated and shorter ones
    are padded up to ``MAX_LENGTH``, so every example has the same length no
    matter how ``Dataset.map`` groups rows into batches.

    Parameters
    ----------
    batch : mapping
        A batch of rows with a ``"text"`` entry holding the strings to encode.

    Returns
    -------
    The tokenizer output for the batch (``input_ids`` and ``attention_mask``
    are the fields selected later on).
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Padding to a fixed length means the default map batch size is fine; a whole
# split no longer has to go through the tokenizer as one batch just to get
# uniform lengths.
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Downloading (…)olve/main/vocab.json: 100%|██████████████████████████████████████████████████████████████████████████████| 899k/899k [00:00<00:00, 11.0MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 24.8MB/s]
Downloading (…)/main/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████| 1.36M/1.36M [00:00<00:00, 21.2MB/s]
Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████████████████████████████████████████| 482/482 [00:00<00:00, 328kB/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3200/3200 [00:05<00:00, 561.48 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 800/800 [00:01<00:00, 546.21 examples/s]
Map: 100%|██████████████████████████████████████████████████████

In [66]:
# Expose only the model inputs and the label, as torch tensors, on every split.
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [69]:
# Requires `transformers` and a recent `accelerate`
# (e.g. `pip install transformers` / `pip install accelerate -U`).

# Model
# Each class id is used as its own label name: {0: 0, 1: 1, 2: 2}.
id2label = {class_id: class_id for class_id in (0, 1, 2)}

# Start from the pretrained configuration and attach the id2label mapping
# before instantiating the classification model with it.
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# TrainingArguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    # Schedule and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation hyper-parameters.
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=50,
    # Log every 10 steps; evaluate and checkpoint once per epoch.
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep at most two checkpoints and reload the best one when training ends.
    save_total_limit=2,
    load_best_model_at_end=True,
    # Hub upload is switched off. The options previously left commented out
    # for enabling it were: report_to="tensorboard", hub_strategy="every_save",
    # hub_model_id=repository_id, hub_token=HfFolder.get_token().
    push_to_hub=False,
)

# Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
# Fine-tune the model
# Runs training with the arguments configured on `trainer`.
# NOTE(review): the recorded run stopped after epoch 7 with a RuntimeError
# raised from inline_container.cc — presumably a failed checkpoint write
# (e.g. the disk filling up); confirm free space under the output directory
# before re-running.
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0419,1.029953
2,1.019,1.09176
3,0.8609,0.801605
4,0.6882,0.696797
5,0.556,0.657365
6,0.38,0.7616
7,0.2278,1.037642




RuntimeError: [enforce fail at inline_container.cc:319] . unexpected pos 2573160704 vs 2573160596

In [7]:
from transformers import pipeline

# Reload the tokenizer for `model_id` and wrap a saved checkpoint in a
# text-classification pipeline; `classifier` is the object `predict` calls.
# NOTE(review): the checkpoint path below is not located under `repository_id`
# ("tianleli/roberta-large") — confirm it is the checkpoint intended for the
# clickbait test predictions.
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

classifier = pipeline('text-classification', model='tianleli/roberta-large_ag_news/checkpoint-250', tokenizer=tokenizer, batch_size=16)

# Smoke test: classify one sample sentence and print the raw pipeline output.
text = "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his innocence and vowing: quot;After the crucifixion comes the resurrection. quot; .."
result = classifier(text)
print(result)

# `result` is a list of dicts (see the printed output); read the label of the first entry.
predicted_label = result[0]["label"]
print(f"Predicted label: {predicted_label}")

Downloading (…)olve/main/vocab.json: 100%|██████████████████████████████████████████████████████████████████████████████| 899k/899k [00:00<00:00, 21.0MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 11.5MB/s]
Downloading (…)/main/tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████| 1.36M/1.36M [00:00<00:00, 22.2MB/s]
Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████████████████████████████████████████| 482/482 [00:00<00:00, 273kB/s]


[{'label': 1, 'score': 0.855288565158844}]
Predicted label: 1


In [None]:
id_to_label = {0:'phrase', 1:'passage', 2: 'multi'}


def predict(df, clf=None, max_chars=1500):
    """Classify every row of ``df`` and return submission ids and type names.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``text`` column (strings to classify) and a ``label``
        column (only printed next to the predictions for inspection).
    clf : callable, optional
        Called with the list of texts; must return one mapping per text with a
        ``'label'`` entry that is a key of ``id_to_label``. Defaults to the
        notebook-level ``classifier`` pipeline.
    max_chars : int, optional
        Each text is cut to this many characters before classification
        (1500 by default, matching the previous hard-coded value).

    Returns
    -------
    (list[int], list[str])
        ``id_list`` -- consecutive ids ``0 .. n-1``, one per prediction.
        ``type_list`` -- the ``id_to_label`` name of each predicted label.

    Raises
    ------
    KeyError
        If a predicted label is not a key of ``id_to_label``.
    """
    # The global pipeline is only looked up when no classifier is injected.
    model_fn = classifier if clf is None else clf

    texts = [x[:max_chars] for x in df['text']]
    print(texts[:2])
    gt = list(df['label'])

    # Skip the model call entirely when there is nothing to classify.
    predictions = model_fn(texts) if texts else []
    pred = [x['label'] for x in predictions]
    print("pred: ", pred)
    print("gt: ", gt)

    # One id per prediction, so ids and types always have matching lengths
    # (this used to be a fixed range(400) regardless of the input size).
    id_list = list(range(len(pred)))
    type_list = [id_to_label[x] for x in pred]
    return id_list, type_list

# Classify the test split; keeps the row ids and the predicted spoiler-type names.
id_list, type_list = predict(input_data_test)

In [88]:
import csv

# zip() would silently drop rows on a length mismatch, so fail loudly instead.
if len(id_list) != len(type_list):
    raise ValueError(
        f"id_list has {len(id_list)} entries but type_list has {len(type_list)}"
    )

# The header is written as its own row rather than being prepended to
# id_list / type_list, so re-running this cell never adds a second header.
# newline='' is the mode the csv module expects for files it writes to.
with open('outputs.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["id", "spoilerType"])
    writer.writerows(zip(id_list, type_list))