In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
from torch.optim import Adam
from transformers import GPT2Model, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModelWithLMHead
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import random
import torch.nn.init as init

In [2]:
# Seeting seed for PyTorch, Python i Numpy
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

In [3]:
def transform_sentiment(value):
    if value.startswith('+'):
        return 'positive'
    elif value.startswith('-'):
        return 'negative'
    else:
        return value

In [4]:
df = pd.read_csv("podaci_trening.csv")
df['label'] = df['label'].apply(transform_sentiment)

In [5]:
df.head()

Unnamed: 0,label,id,text
0,positive,1-1,♥ Znao sam da će ovaj biti prvi! :)
1,positive,1-2,pa mora... The Dude Abides! :)
2,positive,1-6,Film gledam već godinama i svaki put otkrijem ...
3,positive,1-7,Svaki faktor je podjednako krucijalan i savrše...
4,positive,1-8,john goodman je obeležio ovaj film sa svojim p...


In [6]:
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# tokenizer = AutoTokenizer.from_pretrained('macedonizer/sr-gpt2')
# tokenizer = GPT2Tokenizer.from_pretrained('JeRTeh/sr-gpt2-large')
tokenizer = GPT2Tokenizer.from_pretrained("datatab/gpt2-serbian-base")
# tokenizer = GPT2Tokenizer.from_pretrained("procesaur/gpt2-srlat")
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# tokenizer = AutoTokenizer.from_pretrained('macedonizer/sr-gpt2')
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
# tokenizer = AutoTokenizer.from_pretrained("JeRTeh/sr-gpt2-large")
tokenizer = GPT2Tokenizer.from_pretrained("datatab/gpt2-serbian-base")
#tokenizer = GPT2Tokenizer.from_pretrained("procesaur/gpt2-srlat")
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
labels = { "positive": 0, "negative": 1 }

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [tokenizer(text,
                                padding='max_length',
                                max_length=128,
                                truncation=True,
                                return_tensors="pt") for text in df['text']]
        
    def classes(self):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        batch_texts = self.texts[idx]
        batch_y = torch.tensor(self.labels[idx], dtype=torch.long)  # Konvertujemo u torch.long
        return batch_texts, batch_y


In [8]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=35),
                                     [int(0.8*len(df)), int(0.9*len(df))])

print(len(df_train), len(df_val), len(df_test))

2792 349 349


In [9]:
np.random.seed(112)
df_train, df_val = np.split(df.sample(frac=1, random_state=35),
                            [int(0.8*len(df))])

df_test = pd.read_csv("podaci_test.csv")
df_test = df_test[['1', 'comment']]
df_test = df_test.rename(columns = {'1' : 'label', 'comment' : 'text'})
df_test['label'] = df_test['label'].apply(transform_sentiment)

print(len(df_train), len(df_val), len(df_test))

2792 698 464


In [11]:
class SimpleGPT2SequenceClassifier(nn.Module):
    def __init__(self, hidden_size: int, num_classes:int ,max_seq_len:int, gpt_model_name:str):
        super(SimpleGPT2SequenceClassifier,self).__init__()
        self.gpt2model = GPT2Model.from_pretrained(gpt_model_name)
        # self.gpt2model = AutoModelForCausalLM.from_pretrained(gpt_model_name)
        
        self.fc1 = nn.Linear(hidden_size*max_seq_len, num_classes)
        
    def forward(self, input_id, mask):
        """
        Args:
                input_id: encoded inputs ids of sent.
        """
        gpt_out, _ = self.gpt2model(input_ids=input_id, attention_mask=mask, return_dict=False)
        batch_size = gpt_out.shape[0]
        linear_output = self.fc1(gpt_out.view(batch_size,-1))
        return linear_output

In [11]:
def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)
    
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)
    
    # use_cuda = torch.cuda.is_available()
    # device = torch.device("cuda" if use_cuda else "cpu")
    device = 'cuda'
    use_cuda = True
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    
    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input["input_ids"].squeeze(1).to(device)
            
            model.zero_grad()

            output = model(input_id, mask)
            
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1)==train_label).sum().item()
            total_acc_train += acc

            batch_loss.backward()
            optimizer.step()
            
        total_acc_val = 0
        total_loss_val = 0
        
        with torch.no_grad():
            
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                
                output = model(input_id, mask)
                
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                acc = (output.argmax(dim=1)==val_label).sum().item()
                total_acc_val += acc
                
            print(
            f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train/len(train_data): .3f} \
            | Train Accuracy: {total_acc_train / len(train_data): .3f} \
            | Val Loss: {total_loss_val / len(val_data): .3f} \
            | Val Accuracy: {total_acc_val / len(val_data): .3f}")

In [65]:
EPOCHS = 1
model = SimpleGPT2SequenceClassifier(hidden_size=768, num_classes = 2, max_seq_len=128, gpt_model_name="gpt2")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

  0%|                                                                                         | 0/1396 [00:00<?, ?it/s]


IndexError: index out of range in self

In [12]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

        
    # Tracking variables
    predictions_labels = []
    true_labels = []
    
    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
            
            # add original labels
            true_labels += test_label.cpu().numpy().flatten().tolist()
            # get predicitons to list
            predictions_labels += output.argmax(dim=1).cpu().numpy().flatten().tolist()
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    return true_labels, predictions_labels

In [17]:
true_labels, pred_labels = evaluate(model, df_test)

Test Accuracy:  0.608


In [100]:
EPOCHS = 1
model = SimpleGPT2SequenceClassifier(hidden_size = 768, num_classes=2, max_seq_len = 128, gpt_model_name="datatab/gpt2-serbian-base")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [05:55<00:00,  3.92it/s]


Epochs: 1 | Train Loss:  0.344             | Train Accuracy:  0.586             | Val Loss:  0.310             | Val Accuracy:  0.653


In [103]:
true_labels, pred_labels = evaluate(model, df_test)

Test Accuracy:  0.573


In [104]:
EPOCHS = 5
model = SimpleGPT2SequenceClassifier(hidden_size = 768, num_classes=2, max_seq_len = 128, gpt_model_name="datatab/gpt2-serbian-base")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [05:48<00:00,  4.01it/s]


Epochs: 1 | Train Loss:  0.349             | Train Accuracy:  0.585             | Val Loss:  0.347             | Val Accuracy:  0.640


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [05:33<00:00,  4.19it/s]


Epochs: 2 | Train Loss:  0.276             | Train Accuracy:  0.718             | Val Loss:  0.323             | Val Accuracy:  0.653


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [04:49<00:00,  4.83it/s]


Epochs: 3 | Train Loss:  0.199             | Train Accuracy:  0.822             | Val Loss:  0.310             | Val Accuracy:  0.685


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:54<00:00,  5.94it/s]


Epochs: 4 | Train Loss:  0.102             | Train Accuracy:  0.915             | Val Loss:  0.411             | Val Accuracy:  0.662


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:54<00:00,  5.94it/s]


Epochs: 5 | Train Loss:  0.052             | Train Accuracy:  0.961             | Val Loss:  0.469             | Val Accuracy:  0.666


In [105]:
true_labels, pred_labels = evaluate(model, df_test)

Test Accuracy:  0.642


In [26]:
EPOCHS = 10
model = SimpleGPT2SequenceClassifier(hidden_size = 768, num_classes=2, max_seq_len = 128, gpt_model_name="datatab/gpt2-serbian-base")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:58<00:00,  5.86it/s]


Epochs: 1 | Train Loss:  0.343             | Train Accuracy:  0.596             | Val Loss:  0.300             | Val Accuracy:  0.650


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:55<00:00,  5.92it/s]


Epochs: 2 | Train Loss:  0.265             | Train Accuracy:  0.739             | Val Loss:  0.310             | Val Accuracy:  0.655


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:55<00:00,  5.92it/s]


Epochs: 3 | Train Loss:  0.157             | Train Accuracy:  0.864             | Val Loss:  0.417             | Val Accuracy:  0.679


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:52<00:00,  6.01it/s]


Epochs: 4 | Train Loss:  0.070             | Train Accuracy:  0.947             | Val Loss:  0.499             | Val Accuracy:  0.669


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:52<00:00,  6.01it/s]


Epochs: 5 | Train Loss:  0.036             | Train Accuracy:  0.974             | Val Loss:  0.576             | Val Accuracy:  0.679


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:52<00:00,  6.00it/s]


Epochs: 6 | Train Loss:  0.017             | Train Accuracy:  0.989             | Val Loss:  0.625             | Val Accuracy:  0.688


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.97it/s]


Epochs: 7 | Train Loss:  0.021             | Train Accuracy:  0.989             | Val Loss:  0.571             | Val Accuracy:  0.659


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.98it/s]


Epochs: 8 | Train Loss:  0.017             | Train Accuracy:  0.988             | Val Loss:  0.625             | Val Accuracy:  0.656


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:54<00:00,  5.96it/s]


Epochs: 9 | Train Loss:  0.006             | Train Accuracy:  0.997             | Val Loss:  0.791             | Val Accuracy:  0.668


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.97it/s]


Epochs: 10 | Train Loss:  0.001             | Train Accuracy:  1.000             | Val Loss:  0.838             | Val Accuracy:  0.689


In [29]:
true_labels, pred_labels = evaluate(model, df_test)

Test Accuracy:  0.631


In [30]:
EPOCHS = 3
model = SimpleGPT2SequenceClassifier(hidden_size = 768, num_classes=2, max_seq_len = 128, gpt_model_name="datatab/gpt2-serbian-base")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.99it/s]


Epochs: 1 | Train Loss:  0.351             | Train Accuracy:  0.603             | Val Loss:  0.310             | Val Accuracy:  0.645


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:52<00:00,  6.00it/s]


Epochs: 2 | Train Loss:  0.270             | Train Accuracy:  0.727             | Val Loss:  0.384             | Val Accuracy:  0.632


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:52<00:00,  6.00it/s]


Epochs: 3 | Train Loss:  0.181             | Train Accuracy:  0.829             | Val Loss:  0.361             | Val Accuracy:  0.660


In [31]:
true_labels, pred_labels = evaluate(model, df_test)

Test Accuracy:  0.644


<h3>Dodavanje pooling sloja</h3>

In [11]:
import torch.nn.functional as F

class SimpleGPT2SequenceClassifier(nn.Module):
    def __init__(self, hidden_size: int, num_classes:int ,max_seq_len:int, gpt_model_name:str):
        super(SimpleGPT2SequenceClassifier,self).__init__()
        self.gpt2model = GPT2Model.from_pretrained(gpt_model_name)
        self.pooling = nn.AdaptiveAvgPool1d(1)  # Global average pooling

        self.fc1 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, input_id, mask):
        """
        Args:
            input_id: encoded input ids of sent.
        """
        gpt_out, _ = self.gpt2model(input_ids=input_id, attention_mask=mask, return_dict=False)
        gpt_out = gpt_out.permute(0, 2, 1)  # Permute to (batch_size, hidden_size, seq_len)
        pooled_output = self.pooling(gpt_out).squeeze(-1)  # Apply pooling and remove singleton dimension
        linear_output = self.fc1(pooled_output)
        return linear_output

In [13]:
EPOCHS = 3
model = SimpleGPT2SequenceClassifier(hidden_size = 768, num_classes=2, max_seq_len = 128, gpt_model_name="datatab/gpt2-serbian-base")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [05:04<00:00,  4.59it/s]


Epochs: 1 | Train Loss:  0.320             | Train Accuracy:  0.621             | Val Loss:  0.299             | Val Accuracy:  0.649


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [05:10<00:00,  4.50it/s]


Epochs: 2 | Train Loss:  0.276             | Train Accuracy:  0.704             | Val Loss:  0.287             | Val Accuracy:  0.663


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [05:09<00:00,  4.51it/s]


Epochs: 3 | Train Loss:  0.222             | Train Accuracy:  0.789             | Val Loss:  0.305             | Val Accuracy:  0.673


<h3>Dodavanje dropout sloja</h3>

In [10]:
class SimpleGPT2SequenceClassifier(nn.Module):
    def __init__(self, hidden_size: int, num_classes:int ,max_seq_len:int, gpt_model_name:str):
        super(SimpleGPT2SequenceClassifier,self).__init__()
        self.gpt2model = GPT2Model.from_pretrained(gpt_model_name)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(0.2)  # Add dropout with 20% probability

        self.fc1 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, input_id, mask):
        """
        Args:
            input_id: encoded input ids of sent.
        """
        gpt_out, _ = self.gpt2model(input_ids=input_id, attention_mask=mask, return_dict=False)
        gpt_out = gpt_out.permute(0, 2, 1)
        pooled_output = self.pooling(gpt_out).squeeze(-1)
        pooled_output = self.dropout(pooled_output)  # Apply dropout
        linear_output = self.fc1(pooled_output)
        return linear_output


In [25]:
EPOCHS = 3
model = SimpleGPT2SequenceClassifier(hidden_size = 768, num_classes=2, max_seq_len = 128, gpt_model_name="datatab/gpt2-serbian-base")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [04:52<00:00,  4.77it/s]


Epochs: 1 | Train Loss:  0.326             | Train Accuracy:  0.613             | Val Loss:  0.305             | Val Accuracy:  0.629


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [04:52<00:00,  4.77it/s]


Epochs: 2 | Train Loss:  0.279             | Train Accuracy:  0.691             | Val Loss:  0.291             | Val Accuracy:  0.663


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [05:05<00:00,  4.58it/s]


Epochs: 3 | Train Loss:  0.229             | Train Accuracy:  0.779             | Val Loss:  0.289             | Val Accuracy:  0.678


In [26]:
true_labels, pred_labels = evaluate(model, df_test)

Test Accuracy:  0.675


In [13]:
EPOCHS = 10
model = SimpleGPT2SequenceClassifier(hidden_size = 768, num_classes=2, max_seq_len = 128, gpt_model_name="datatab/gpt2-serbian-base")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:59<00:00,  5.84it/s]


Epochs: 1 | Train Loss:  0.326             | Train Accuracy:  0.613             | Val Loss:  0.305             | Val Accuracy:  0.629


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.97it/s]


Epochs: 2 | Train Loss:  0.279             | Train Accuracy:  0.691             | Val Loss:  0.291             | Val Accuracy:  0.663


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.99it/s]


Epochs: 3 | Train Loss:  0.229             | Train Accuracy:  0.779             | Val Loss:  0.289             | Val Accuracy:  0.678


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.99it/s]


Epochs: 4 | Train Loss:  0.150             | Train Accuracy:  0.870             | Val Loss:  0.338             | Val Accuracy:  0.679


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.99it/s]


Epochs: 5 | Train Loss:  0.082             | Train Accuracy:  0.935             | Val Loss:  0.491             | Val Accuracy:  0.688


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.97it/s]


Epochs: 6 | Train Loss:  0.048             | Train Accuracy:  0.962             | Val Loss:  0.632             | Val Accuracy:  0.698


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.97it/s]


Epochs: 7 | Train Loss:  0.034             | Train Accuracy:  0.977             | Val Loss:  0.655             | Val Accuracy:  0.673


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.98it/s]


Epochs: 8 | Train Loss:  0.037             | Train Accuracy:  0.974             | Val Loss:  0.575             | Val Accuracy:  0.702


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.99it/s]


Epochs: 9 | Train Loss:  0.021             | Train Accuracy:  0.987             | Val Loss:  0.957             | Val Accuracy:  0.665


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.98it/s]


Epochs: 10 | Train Loss:  0.032             | Train Accuracy:  0.980             | Val Loss:  0.783             | Val Accuracy:  0.666


In [14]:
true_labels, pred_labels = evaluate(model, df_test)

Test Accuracy:  0.662


<h3>Dodavanje strategije za inicijalizaciju tezina</h3>

In [12]:
class SimpleGPT2SequenceClassifier(nn.Module):
    def __init__(self, hidden_size: int, num_classes:int ,max_seq_len:int, gpt_model_name:str):
        super(SimpleGPT2SequenceClassifier,self).__init__()
        self.gpt2model = GPT2Model.from_pretrained(gpt_model_name)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(0.2)

        self.fc1 = nn.Linear(hidden_size, num_classes)
        # Initialize the weights of the fully connected layer with Xavier initialization
        init.xavier_uniform_(self.fc1.weight)
        
    def forward(self, input_id, mask):
        """
        Args:
            input_id: encoded input ids of sent.
        """
        gpt_out, _ = self.gpt2model(input_ids=input_id, attention_mask=mask, return_dict=False)
        gpt_out = gpt_out.permute(0, 2, 1)
        pooled_output = self.pooling(gpt_out).squeeze(-1)
        pooled_output = self.dropout(pooled_output)
        linear_output = self.fc1(pooled_output)
        return linear_output

In [15]:
EPOCHS = 3
model = SimpleGPT2SequenceClassifier(hidden_size = 768, num_classes=2, max_seq_len = 128, gpt_model_name="datatab/gpt2-serbian-base")
LR = 1e-5

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:55<00:00,  5.94it/s]


Epochs: 1 | Train Loss:  0.403             | Train Accuracy:  0.595             | Val Loss:  0.326             | Val Accuracy:  0.626


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:52<00:00,  6.00it/s]


Epochs: 2 | Train Loss:  0.316             | Train Accuracy:  0.667             | Val Loss:  0.431             | Val Accuracy:  0.626


100%|██████████████████████████████████████████████████████████████████████████████| 1396/1396 [03:53<00:00,  5.98it/s]


Epochs: 3 | Train Loss:  0.276             | Train Accuracy:  0.719             | Val Loss:  0.295             | Val Accuracy:  0.658


In [16]:
true_labels, pred_labels = evaluate(model, df_test)

Test Accuracy:  0.642
