https://huggingface.co/openai-community/gpt2

### Install libraries

In [None]:
!pip install -q transformers[torch] datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import random
import numpy as np

# Fix the seed of every random number generator used in this notebook so that
# repeated runs draw the same numbers.
SEED = 7
for seed_fn in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
    np.random.seed,
    random.seed,
):
    seed_fn(SEED)

## Import dataset

In [None]:
import os
import pickle

from google.colab import drive

# Make the Google Drive contents reachable from the Colab file system.
drive.mount('/content/drive')

# Relative paths opened later in the notebook resolve against this folder.
os.chdir('/content/drive/My Drive/TFG/AuTexTification/subtask_1/en')

# NOTE(review): pickle.load can execute arbitrary code while loading; only open
# files that you created yourself.
with open("dict_dataset.pkl", "rb") as dataset_file:
    dict_dataset = pickle.load(dataset_file)

print(dict_dataset)

Mounted at /content/drive
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'domain', '__index_level_0__'],
        num_rows: 27076
    })
    validation: Dataset({
        features: ['text', 'label', 'domain', '__index_level_0__'],
        num_rows: 6769
    })
    test: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 21832
    })
})


In [None]:
# Class names indexed by label id: 'generated' -> 0, 'human' -> 1.
LABELS = ['generated', 'human']

# Number of distinct classes actually present in the training split.
NUM_LABELS = len(dict_dataset['train'].unique('label'))

# The names above are later used to label reports, so they must line up with the
# classes found in the data; fail here rather than produce mislabeled results.
assert NUM_LABELS == len(LABELS), (
    f"Expected {len(LABELS)} classes {LABELS}, found {NUM_LABELS} in the training split"
)

## Tokenize
Load the tokenizer associated with GPT-2.

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "gpt2" checkpoint (downloaded on first use).
tokenizer = AutoTokenizer.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Since GPT-2 only allows 1024 tokens as input size, let's check the length of the longest tokenized text in our training set:

In [None]:
# Tokenize the whole training split in one batched call (no padding, no truncation)
# instead of one tokenizer call per text, then keep the longest sequence length.
train_token_ids = tokenizer(dict_dataset['train']['text'])["input_ids"]
MAX_LENGTH = max(len(ids) for ids in train_token_ids)
print("Maximum length", MAX_LENGTH)

Maximum length 137


Tokenize by batches

In [None]:
# The GPT-2 tokenizer has no padding token of its own, so the end-of-sequence
# token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token


def tokenize(examples):
    """Tokenize a batch of examples into sequences of exactly MAX_LENGTH tokens.

    Args:
        examples: a batch from the dataset; its "text" entry holds the raw texts.

    Returns:
        The tokenizer output (input_ids and attention_mask) for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        # MAX_LENGTH is measured on the training split only; without truncation a
        # longer validation/test text would yield a longer row than the others and
        # the rows could no longer be stacked into one batch.
        truncation=True,
        max_length=MAX_LENGTH,
    )

In [None]:
# Apply `tokenize` to every split, passing the examples in batches.
encoded_data = dict_dataset.map(tokenize, batched=True)
# Display the resulting splits and their columns.
encoded_data

Map:   0%|          | 0/27076 [00:00<?, ? examples/s]

Map:   0%|          | 0/6769 [00:00<?, ? examples/s]

Map:   0%|          | 0/21832 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'domain', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 27076
    })
    validation: Dataset({
        features: ['text', 'label', 'domain', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 6769
    })
    test: Dataset({
        features: ['text', 'label', 'domain', 'input_ids', 'attention_mask'],
        num_rows: 21832
    })
})

Let's see some text examples.

In [None]:
import random
# Print the tokenized length of ten randomly chosen training examples.
num_train_rows = encoded_data['train'].num_rows
for i in range(10):
    # randrange excludes its upper bound, so the index is always valid;
    # randint(0, num_rows) could return num_rows itself and raise IndexError.
    index = random.randrange(num_train_rows)
    # input_ids holds the vocabulary id of each token of the example
    print('text:', index, ' len:', len(encoded_data['train'][index]['input_ids']))

text: 10611  len: 137
text: 4943  len: 137
text: 12937  len: 137
text: 21329  len: 137
text: 1582  len: 137
text: 2373  len: 137
text: 26911  len: 137
text: 17559  len: 137
text: 3084  len: 137
text: 11982  len: 137


## Fine-tuning pre-trained model

In [None]:
from transformers import AutoModelForSequenceClassification

# A sequence-classification head is required for this task. The causal-LM class
# computes a per-token next-word loss, which cannot be matched against one label
# per text and makes training fail with
# "Expected input batch_size (...) to match target batch_size (...)".
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=NUM_LABELS)

# The GPT-2 config defines no padding token. The classification head needs its id
# to locate the last real token of each padded sequence.
model.config.pad_token_id = tokenizer.pad_token_id



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#### Hyperparameters


Number of epochs, batch size, learning rate, etc.


Set TrainingArguments object that contains all default parameters.

In [None]:
from transformers import TrainingArguments
# Start from the library defaults; only the output directory is specified here.
args = TrainingArguments(output_dir="./outputs")
# Display the full configuration.
args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_

Modify some of these parameters:

In [None]:
# Rebuild the arguments through the constructor instead of assigning attributes
# afterwards, so the values go through TrainingArguments' own initialization.
# The learning rate is left at its default.
args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",  # evaluate once at the end of every epoch
)

### Metrics

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        pred: prediction object exposing `label_ids` (reference labels) and
            `predictions` (per-class scores; the highest score along the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    references = pred.label_ids
    predicted = pred.predictions.argmax(-1)

    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        references, predicted, average='macro'
    )

    metrics = {'accuracy': accuracy_score(references, predicted)}
    metrics['f1'] = macro_f1
    metrics['precision'] = macro_precision
    metrics['recall'] = macro_recall
    return metrics

### Trainer

In [None]:
from transformers import Trainer

# Bundle the model, the hyperparameters, both data splits and the metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_data['train'],
    eval_dataset=encoded_data['validation'],
    compute_metrics=compute_metrics,
)

In [None]:
# Inspect the columns and row count of the tokenized training split.
encoded_data['train']

Dataset({
    features: ['text', 'label', 'domain', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 27076
})

In [None]:
# Inspect the columns and row count of the tokenized test split.
encoded_data['test']

Dataset({
    features: ['text', 'label', 'domain', 'input_ids', 'attention_mask'],
    num_rows: 21832
})

#### Train

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

ValueError: Expected input batch_size (4352) to match target batch_size (31).

#### Evaluate with Validation dataset

In [None]:
# Called without arguments, so the evaluation dataset given to the Trainer is used.
trainer.evaluate()

We get an overall f1-score of 0.93.

#### Save and access model

In [None]:
# Save the fine-tuned model to a folder on the mounted Google Drive
trainer.save_model("/content/drive/My Drive/TFG/AuTexTification/subtask_1/en/gpt2")

In [None]:
import json
from transformers import AutoModelForSequenceClassification

MODEL_DIR = "/content/drive/My Drive/TFG/AuTexTification/subtask_1/en/gpt2"
HUB_REPO_ID = "luciayn/gpt2"

# Load Hugging Face credentials from a local JSON file (relative to the current
# working directory) so that the token never appears in the notebook itself.
with open('config.json') as f:
    config = json.load(f)

# Reload the fine-tuned model from the local folder it was saved to.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# Push model and tokenizer to the Hugging Face Hub. The token is passed
# explicitly, so no interactive login prompt is needed (an interactive prompt
# would also stall a "Run All").
model.push_to_hub(repo_id=HUB_REPO_ID, token=config['hf_token'])
tokenizer.push_to_hub(repo_id=HUB_REPO_ID, token=config['hf_token'])

# The published model is reloaded from the Hub in the following cell.

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the published fine-tuned model from the Hugging Face Hub
model = AutoModelForSequenceClassification.from_pretrained("luciayn/gpt2")

In [None]:
# Move the model to the GPU; get_prediction sends its inputs to "cuda" as well.
model.to('cuda')

## Evaluation

Create a function that tokenizes the input text and applies the trained model. The softmax function is then applied to obtain the class probabilities, and the most probable class is returned.


In [None]:
def get_prediction(text):
    """Return the predicted class id for a single text.

    Args:
        text: the raw text to classify.

    Returns:
        int: index of the most probable class.
    """
    # Prepare the text with the same tokenization settings used for training,
    # and place the tensors on whichever device the model lives on.
    inputs = tokenizer(
        text,
        padding="max_length",
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    # Inference only: skip gradient tracking so no autograd graph is built
    # for each of the many calls made over the test set.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Probability of each class
    probs = logits.softmax(dim=-1)
    # Return the most probable one
    return probs.argmax(dim=-1).item()

In [None]:
# Predict the class of every text in the test split, one text per call
y_pred=[get_prediction(text) for text in dict_dataset['test']['text']]
# Reference labels of the same split, in the same order
y_true = dict_dataset['test']['label']

In [None]:
# Show the precision, recall, and f1-score of the predictions,
# with the rows named after the entries of LABELS
from sklearn.metrics import classification_report
print(classification_report(y_true=y_true, y_pred=y_pred, target_names=LABELS))

We get a f1-score of 0.82 for the generated class, and 0.71 for the human class.
Moreover, we obtain a macro f1-score of 0.76.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Raw counts: rows are the true classes, columns the predicted ones.
print(confusion_matrix(y_true, y_pred))
# Same matrix as a plot, with the axes labeled by class name (as in the
# classification report) instead of the bare ids 0/1.
disp = ConfusionMatrixDisplay.from_predictions(
    y_true, y_pred, display_labels=LABELS, cmap=plt.cm.Blues
)