<a href="https://colab.research.google.com/github/ozguozkan/DI725_Assignment_1/blob/main/finetune_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive; the data, model and
# results paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Authenticate this session with Weights & Biases so the run can be logged.
# NOTE(review): wandb is imported here before the pip-install cell further down
# has run; this presumably relies on wandb being preinstalled in the runtime -
# confirm, or move the install cell above this one.
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mozgukan[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
!pip install transformers datasets evaluate wandb scikit-learn --quiet

In [4]:
from transformers import GPT2Tokenizer
import pandas as pd
from datasets import Dataset
from transformers import GPT2ForSequenceClassification
import torch
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from transformers import Trainer
import os

In [5]:
# Hyperparameters recorded with the W&B run (logged as run metadata only).
run_config = {
    "model": "GPT2-finetuned",
    "batch_size": 8,
    "epochs": 5,
    "learning_rate": 5e-5,
}

# Start the W&B run for the GPT-2 fine-tuning part of the assignment.
wandb.init(project="DI725_assignment1", name="gpt2-finetune", config=run_config)

Loading the data

In [6]:
# Folder on the mounted Drive that holds the preprocessed CSV splits.
path = "/content/drive/MyDrive/DI725_Assignment1/data/customer_service"

# Printed heading -> CSV file for each split, in the order they are loaded.
csv_files = {
    "Train:": f"{path}/processed_train.csv",
    "Val:": f"{path}/processed_val.csv",
    "Test:": f"{path}/processed_test.csv",
}

# Read the three splits into pandas DataFrames.
train_df, val_df, test_df = (pd.read_csv(csv_file) for csv_file in csv_files.values())

# Sanity check: show the shape of every split, then preview the training rows.
for heading, frame in zip(csv_files, (train_df, val_df, test_df)):
    print(heading, frame.shape)
train_df.head()


Train: (776, 2)
Val: (194, 2)
Test: (30, 1)


Unnamed: 0,text,label
0,customer hi im calling because i have an issue...,0
1,customer hi i received an email from brownbox ...,1
2,agent thank you for contacting brownbox custom...,1
3,agent thank you for calling brownbox customer ...,1
4,agent hello thank you for contacting brownbox ...,1


In [7]:
# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 does not define a padding token, so the end-of-sequence token is reused as the pad token.
tokenizer.pad_token = tokenizer.eos_token

# Report the tokenizer's vocabulary size as a quick sanity check.
print(f"Vocab size: {tokenizer.vocab_size}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Vocab size: 50257


In [8]:
# Wrap each pandas split in a Hugging Face Dataset, the format used for the
# GPT-2 fine-tuning steps that follow.
train_datas, val_datas, test_datas = map(
    Dataset.from_pandas, (train_df, val_df, test_df)
)


In [9]:
def tokenize(example):
    """Tokenize the ``text`` field with the notebook-level GPT-2 ``tokenizer``.

    Sequences are truncated to, and padded up to, a fixed length of 256 tokens.

    Args:
        example: A mapping with a ``"text"`` entry (a batch of texts when the
            function is applied with ``batched=True``).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize every split in batches (train, then val, then test).
train_datas, val_datas, test_datas = (
    split.map(tokenize, batched=True)
    for split in (train_datas, val_datas, test_datas)
)

# Fine-tuning runs on PyTorch, so expose the model inputs as torch tensors
# instead of the default Python objects. Only the labelled splits include "label".
model_inputs = ["input_ids", "attention_mask"]
for labelled_split in (train_datas, val_datas):
    labelled_split.set_format(type="torch", columns=model_inputs + ["label"])
test_datas.set_format(type="torch", columns=list(model_inputs))



Map:   0%|          | 0/776 [00:00<?, ? examples/s]

Map:   0%|          | 0/194 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [10]:
# Sentiment task with three classes (negative, neutral, positive), so the
# classification head is created with three outputs.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)

# GPT-2 has no padding token of its own; tell the model which id the tokenizer pads with.
model.config.pad_token_id = tokenizer.pad_token_id

# Run on the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
model.to(torch.device(device_name))

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

In [11]:
# Hyperparameters for the run; these mirror the values recorded in the W&B
# run config at the top of the notebook (batch size 8, 5 epochs, lr 5e-5).
batch_size = 8
num_epochs = 5
learning_rate = 5e-5

# Standard Hugging Face training configuration.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/DI725_Assignment1/models/gpt2-finetuned",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,                # number of training epochs
    learning_rate=learning_rate,                # passed explicitly so it cannot drift from the logged config
    evaluation_strategy="epoch",                # evaluate on the validation split after every epoch
    save_strategy="epoch",                      # write a checkpoint after every epoch
    save_total_limit=2,                         # cap the checkpoints kept on Drive (best one is still retained)
    logging_dir="./logs",                       # local log directory
    logging_steps=10,
    load_best_model_at_end=True,                # reload the best checkpoint once training finishes
    report_to="wandb",                          # send training logs to Weights & Biases
    metric_for_best_model="eval_loss"           # "best" means lowest validation loss
)

def compute_metrics(eval_pred):
    """Turn evaluation logits into accuracy and weighted-F1 scores.

    NOTE(review): this function is defined a second time in the training cell
    further down; that later definition is the one handed to the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(scores, axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted, average="weighted")
    return metrics




**Train the model, save it, and predict on the test set**

In [12]:
def compute_metrics(eval_pred):
    """Score validation predictions with accuracy and weighted F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict mapping ``"accuracy"`` and ``"f1"`` to their scores.
    """
    raw_logits, true_labels = eval_pred
    # Pick the highest-scoring class for every example.
    predicted_labels = np.argmax(raw_logits, axis=-1)
    accuracy = accuracy_score(true_labels, predicted_labels)
    weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Output locations on the mounted Drive.
final_model_dir = "/content/drive/MyDrive/DI725_Assignment1/models/gpt2-finetuned/final"
results_dir = "/content/drive/MyDrive/DI725_Assignment1/results"

# Hugging Face Trainer wired to the model, data splits and metrics defined above.
# NOTE(review): recent transformers releases warn that `tokenizer=` is deprecated
# in favour of `processing_class=` - confirm against the installed version.
trainer = Trainer(
    model=model,                            # pre-loaded GPT-2 classifier
    args=training_args,
    train_dataset=train_datas,              # preprocessed train split
    eval_dataset=val_datas,                 # preprocessed validation split
    tokenizer=tokenizer,                    # tokenizer used for preprocessing
    compute_metrics=compute_metrics         # accuracy / weighted F1 on validation
)

# Fine-tune; because training_args sets load_best_model_at_end, the trainer
# holds the checkpoint with the lowest validation loss afterwards.
trainer.train()

trainer.save_model(final_model_dir)

# Test-set inference. The test split has no labels, so there is nothing to
# score: run one batched prediction pass instead of evaluate() followed by a
# per-example forward loop.
print("\n test set eval:")
test_output = trainer.predict(test_datas)
test_results = test_output.metrics  # only runtime statistics, since no labels exist

# Some models return a tuple of outputs; the logits are the first element.
test_logits = test_output.predictions
if isinstance(test_logits, tuple):
    test_logits = test_logits[0]
all_preds = np.argmax(test_logits, axis=-1).tolist()

# Map class ids back to sentiment names.
label_map = {0: "negative", 1: "neutral", 2: "positive"}
pred_labels = [label_map[p] for p in all_preds]

# The dataset still carries the original text, in the same row order as the predictions.
test_texts = list(test_datas["text"])

# Save text / predicted-label pairs as a CSV on Drive.
df_results = pd.DataFrame({
    "text": test_texts,
    "predicted_label": pred_labels
})
os.makedirs(results_dir, exist_ok=True)
df_results.to_csv(os.path.join(results_dir, "gpt2_test_predictions.csv"), index=False)
print("preds saved to gpt2_test_predictions.csv")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5443,0.535465,0.78866,0.779585
2,0.5195,0.417293,0.865979,0.856482
3,0.2753,0.550726,0.876289,0.865649
4,0.1137,0.540304,0.886598,0.87632
5,0.1023,0.528002,0.902062,0.892496



 test set eval:


  inputs = {k: torch.tensor(v).unsqueeze(0).to(model.device) for k, v in batch.items() if k != 'label'}


preds saved to gpt2_test_predictions.csv
