In [None]:
import os
import itertools
import random
import regex as re
import string
import pickle

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Subset

import gensim.downloader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,KFold
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ast

In [None]:
path = "/content/drive/MyDrive"
# Should we load it from kaggle each time?
drive.mount('/content/drive', force_remount=True)

! ls

In [None]:
def build_vocab_from_docs(list_of_documents):
    temp = []
    for document in list_of_documents:
        temp.extend(document)
        temp = list(set(temp))
    vocabulary = []
    return vocabulary

def word_frequency_dictionary(list_of_documents):
    frequency_dictionary = {}
    for document in list_of_documents:
        for word in document.split(" "):
            try:
                frequency_dictionary[word] += 1
            except:
                frequency_dictionary[word] = 1
    return frequency_dictionary

def build_vocab_from_dictionary_atleast_N(frequency_dictionary, N):
    temp = [word for word, count in frequency_dictionary.items() if count > N]
    vocabulary = []
    for word in temp:
        if word not in stopwords.words('english'):
            vocabulary.append(word)

    return vocabulary

def build_vocab_from_dictionary_top_N(frequency_dictionary, N):

    # Use sorted with a custom key to sort by counts in descending order
    sorted_words = sorted(frequency_dictionary.items(), key=lambda x: x[1], reverse=True)

    # Get the top N words from the sorted list
    vocabulary = [word for word, count in sorted_words[:N]]
    return vocabulary

def remove_stop_words(documents):
    clean_documents = []
    current_document = []

    stop_words = set(stopwords.words("english"))
    for document in documents:
        current_document = [word for word in document if word not in stop_words]
        clean_documents.append(current_document)
    return clean_documents

def clean(sentance):
    '''
    Input:
        review: a string containing a review.
    Output:
        review_cleaned: a processed review.
    '''
    sentance_lower = sentance.lower()

    sentance_links_removed = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentance_lower, flags=re.MULTILINE)

    sentance_stopwords_removed = ' '.join([word for word in sentance_links_removed.split() if word not in (stopwords.words('english'))])

    regex = re.compile('<([^>]+)>')
    sentance_escape = regex.sub("",sentance_stopwords_removed)

    modified_string = ''.join([i for i in sentance_escape if not i.isdigit()])

    sentance_punctuation_space = modified_string.replace("-"," ")
    sentance_punctuation_space = sentance_punctuation_space.replace("_"," ")
    sentance_punctuation_space = sentance_punctuation_space.replace("."," ")

    regex = re.compile('[%s]' % re.escape(string.punctuation))
    sentance_punctuation_remove = regex.sub('', sentance_punctuation_space)

    #porter = nltk.stem.PorterStemmer()
    #sentance_cleaned = " ".join([porter.stem(word = word) for word in sentance_punctuation_remove.split(" ")])

    return sentance_punctuation_remove

def remove_nonvocab_words(documents):
    clean_documents = []
    current_document = []

    vocab = set(vocabulary)
    for document in documents:
        current_document = [word for word in document.split() if word in vocab]
        clean_documents.append(" ".join(current_document))
    return clean_documents

def encode_and_pad(doc, length):
    sos = [word2index["<SOS>"]]
    eos = [word2index["<EOS>"]]
    pad = [word2index["<PAD>"]]

    if len(doc) < length - 2: # -2 for SOS and EOS
        n_pads = length - 2 - len(doc)
        encoded = [word2index[w] for w in doc]
        return sos + encoded + eos + pad * n_pads
    else: # tweet is longer than possible; truncating
        encoded = [word2index[w] for w in doc]
        truncated = encoded[:length - 2]
        return sos + truncated + eos
def convert_to_one_hot(lst):
    # Find the index of the maximum element
    max_index = max(range(len(lst)), key=lst.__getitem__)

    # Create a new list with all zeros
    one_hot = [0] * len(lst)

    # Set the maximum element's index to 1
    one_hot[max_index] = 1

    return one_hot

def labels_to_one_hot(labels, lowest, highest):
    """
    Convert labels to one-hot encoded labels based on the given lowest and highest values.

    Parameters:
    - labels (numpy array or list): The input labels.
    - lowest (int): The lowest value in the labels.
    - highest (int): The highest value in the labels.

    Returns:
    - one_hot_labels (numpy array): The one-hot encoded labels.
    """
    num_classes = highest - lowest + 1
    one_hot_labels = np.eye(num_classes)[np.array(labels) - lowest]
    return one_hot_labels

def train(model,train_x,train_y):
    epochs = 50
    train_losses = []
    val_accuracy = []

    h0, c0 = model.init_hidden(batch_size)

    h0 = h0.to(device)
    c0 = c0.to(device)

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_x,train_y)):

        train_data, val_data = train_x[train_idx], train_x[val_idx]
        train_target, val_target = train_y[train_idx], train_y[val_idx]


        # Convert to PyTorch tensors
        train_data_tensor = torch.from_numpy(train_data)
        val_data_tensor = torch.from_numpy(val_data)
        train_target_tensor = torch.from_numpy(train_target)
        val_target_tensor = torch.from_numpy(val_target)


        # Create TensorDatasets with Long indices
        train_ds = TensorDataset(train_data_tensor.long(), train_target_tensor)
        val_ds = TensorDataset(val_data_tensor.long(), val_target_tensor)

        # Create DataLoaders
        train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
        val_dl = DataLoader(val_ds, shuffle=False, batch_size=batch_size, drop_last=True)

        for e in range(epochs):

            for batch_idx, batch in enumerate(train_dl):

                input = batch[0].to(device)
                target = batch[1].float().to(device)  # Convert target to float

                optimizer.zero_grad()
                with torch.set_grad_enabled(True):
                    out = model(input, (h0, c0))
                    loss = criterion(out, target)
                    loss.backward()
                    optimizer.step()

            batch_acc = []

            for batch_idx, batch in enumerate(val_dl):
                input = batch[0].to(device)
                target = batch[1].to(device)

                optimizer.zero_grad()
                with torch.set_grad_enabled(False):
                    out = model(input, (h0, c0))
                    _, preds = torch.max(out, 1)
                    preds = preds.to("cpu").tolist()
                    target = target.to("cpu").tolist()
                    preds = labels_to_one_hot(preds, 0, 2)

                    batch_acc.append(accuracy_score(preds, target))
            val_acc = sum(batch_acc) / len(batch_acc)
            train_losses.append(loss.item())
            val_accuracy.append(val_acc)
    return train_losses, val_accuracy, model

def test(model,test_dl):

    batch_acc = []
    h0, c0 = model.init_hidden(batch_size)

    h0 = h0.to(device)
    c0 = c0.to(device)


    for batch_idx, batch in enumerate(test_dl):
        input = batch[0].to(device)
        target = batch[1].to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(False):
            out = model(input, (h0, c0))
            _, preds = torch.max(out, 1)
            preds = preds.to("cpu").tolist()
            target = target.to("cpu").tolist()
            preds = labels_to_one_hot(preds, 0, 2)

            batch_acc.append(accuracy_score(preds, target))

    return sum(batch_acc) / len(batch_acc)


#Data Cleaning

In [None]:
if "dataset_clean.csv" not in os.listdir(path):
    dataset = pd.read_csv(path+'/data.csv')
    dataset_X = dataset['Sentence']
    dataset_Y = dataset['Sentiment']
    Y_hot_encoded = []
    for i in dataset_Y:
      if i == 'positive':
        Y_hot_encoded.append([0,0,1])
      if i == 'neutral':
        Y_hot_encoded.append([0,1,0])
      if i == 'negative':
        Y_hot_encoded.append([1,0,0])
    X_cleaned = [clean(data_point) for data_point in dataset_X]
    frequency_dictionary = word_frequency_dictionary(X_cleaned)
    vocabulary = build_vocab_from_dictionary_top_N(frequency_dictionary, 10000)
    vocabulary = sorted(vocabulary)
    vocabulary = list(filter(lambda el: len(el) > 2, vocabulary))
    X_cleaned = remove_nonvocab_words(X_cleaned)
    dataset_clean = pd.DataFrame({"Data":X_cleaned,"Labels":Y_hot_encoded})
    dataset_clean['Data'] = remove_nonvocab_words(dataset_clean['Data'])
    dataset_clean.to_csv(path+ "/dataset_clean.csv")
    with open(path+"/vocabulary.txt","w+") as file:
      for word in vocabulary:
        file.write(word)
        file.write("\n")

else:
    dataset_clean = pd.read_csv(path+'/dataset_clean.csv').dropna()
    dataset_clean['Labels'] = dataset_clean['Labels'].apply(ast.literal_eval)
    dataset_clean = dataset_clean.drop(dataset_clean.columns[0],axis=1)
    with open(path+"/vocabulary.txt","r") as file:
      vocabulary = file.read().split("\n")


# Dataset Loader

# LSTM

Consider adding a CNN layer

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenize the text data
max_len = 36  # Largest sentence is 36 tokens long
tokenizer = lambda x: x.split()[:max_len]  # Use a tokenizer that limits the length
dataset_clean['Data'] = dataset_clean['Data'].apply(tokenizer)


train_clean_df, test_clean_df = train_test_split(dataset_clean, test_size=0.2)

train_clean_df[:3]

index2word = ["<PAD>", "<SOS>", "<EOS>"]
index2word.extend(vocabulary)
word2index = {token: idx for idx, token in enumerate(index2word)}

train_encoded = [(encode_and_pad(row['Data'], max_len), row['Labels']) for index, row in train_clean_df.iterrows()]
test_encoded = [(encode_and_pad(row['Data'], max_len), row['Labels']) for index, row in test_clean_df.iterrows()]

batch_size = 20

train_x = np.array([doc for doc, label in train_encoded])
train_y = np.array([label for doc, label in train_encoded])

test_x = np.array([doc for doc, label in test_encoded])
test_y = np.array([label for doc, label in test_encoded])

test_ds = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

test_dl = DataLoader(test_ds, shuffle=True, batch_size=batch_size, drop_last=True)


In [None]:
LSTM_path = path + "/LSTM_Model_Weights_History"
if not os.path.exists(LSTM_path):
    # Folder doesn't exist, so create it
    os.makedirs(LSTM_path)
print(os.listdir(path))

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    def __init__(self, scale_factor, attn_dropout=0.1):
        super(ScaledDotProductAttention, self).__init__()
        self.scale_factor = scale_factor
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, query, key, value, mask=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) / self.scale_factor

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        output = torch.matmul(attn_weights, value)

        return output, attn_weights




class BiLSTM_SentimentAnalysis(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, bidirectional=False, attention=False):
        super(BiLSTM_SentimentAnalysis, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=bidirectional)
        self.attention = attention
        if self.attention :
            self.attn = ScaledDotProductAttention(scale_factor=(hidden_dim * 2) ** 0.5 if bidirectional else (hidden_dim) ** 0.5)
            self.attn_dropout = nn.Dropout(dropout)
            self.lstm_k = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=bidirectional)
            self.lstm_v = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=bidirectional)
            self.lstm_q = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=bidirectional)


        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(self, x, hidden):
        embedded = self.embedding(x)

        if self.attention:
            lstm_out_k, (hidden_k, cell_k) = self.lstm_k(embedded)
            lstm_out_v, (hidden_v, cell_v) = self.lstm_v(embedded)
            lstm_out_q, (hidden_q, cell_q) = self.lstm_q(embedded)

            lstm_out_q = lstm_out_q.permute(1, 0, 2)
            lstm_out_k = lstm_out_k.permute(1, 0, 2)
            lstm_out_v = lstm_out_v.permute(1, 0, 2)
            attn_output, _ = self.attn(lstm_out_q, lstm_out_k, lstm_out_v)
            attn_output = self.attn_dropout(attn_output)
            lstm_out = attn_output.permute(1, 0, 2)
            # Aggregate hidden and cell, max pooling/average pooling or averaging or weighted sum(could learn weighinh)
            #Maxpooling rn
            hidden, _ = torch.max(torch.stack([hidden_k, hidden_v, hidden_q]), dim=0)
            cell, _ = torch.max(torch.stack([cell_k, cell_v, cell_q]), dim=0)

        else:
            lstm_out, (hidden, cell) = self.lstm(embedded)

        hidden = hidden[-1, :, :] if not self.lstm.bidirectional else torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = self.dropout(hidden)

        return self.fc(hidden)

    def init_hidden(self, batch_size):
        num_directions = 2 if self.lstm.bidirectional else 1
        return (torch.zeros(num_directions, batch_size, hidden_dim),
                torch.zeros(num_directions, batch_size, hidden_dim))


In [None]:
embedding_dim = 100
hidden_dims = [16,32,64,128,256,512]
output_dim = 3
dropouts = [0.0,0.25,0.5]
vocab_size = len(word2index)
num_epochs = 3
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:

history = {}
model_configs = {"BiLSTM_History_WO_Attention":[True,False],"LSTM_History_W_Scaled_Dot_Attention":[False,True],"BiLSTM_History_W_Scaled_Dot_Attention":[True,True]}

for model_config_name,model_config in model_configs.items():
  print(model_config_name)
  for hidden_dim in hidden_dims:
    for dropout in dropouts:
      model = BiLSTM_SentimentAnalysis(vocab_size, embedding_dim, hidden_dim, output_dim, dropout, bidirectional=model_config[0],attention = model_config[1])
      model = model.to(device)

      criterion = nn.CrossEntropyLoss()
      optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

      train_losses, val_accuracy, model = train(model,train_x,train_y)
      model_name = model_config_name +"hidden_dim_"+str(hidden_dim)+"_dropout"+str(dropout)
      history[model_name] = (train_losses, val_accuracy, test(model,test_dl))
      print("_________________________________________________________________________")

      print(model_name," Test Accuracy: ",test(model,test_dl))

      plt.plot(train_losses)
      plt.title("Training Losses")
      plt.show()
      plt.plot(val_accuracy)
      plt.title("Validation Accuracy")
      plt.show()

      print("_________________________________________________________________________")
      torch.save(model.state_dict(),LSTM_path + "/" + model_name +".pt")

  with open(LSTM_path + "/" + model_config_name + '.pkl', 'wb') as file:
    pickle.dump(history, file)

In [None]:

history = {}
model_config_name = "LSTM_History_W_Scaled_Dot_Attention"
model_config = [False,True]

print(model_config_name)
for hidden_dim in hidden_dims:
    for dropout in dropouts:
        model = BiLSTM_SentimentAnalysis(vocab_size, embedding_dim, hidden_dim, output_dim, dropout, bidirectional=model_config[0],attention = model_config[1])
        model = model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

        train_losses, val_accuracy, model = train(model,train_x,train_y)
        model_name = model_config_name +"hidden_dim_"+str(hidden_dim)+"_dropout"+str(dropout)
        history[model_name] = (train_losses, val_accuracy, test(model,test_dl))
        print("_________________________________________________________________________")

        print(model_name," Test Accuracy: ",test(model,test_dl))

        plt.plot(train_losses)
        plt.title("Training Losses")
        plt.show()
        plt.plot(val_accuracy)
        plt.title("Validation Accuracy")
        plt.show()

        print("_________________________________________________________________________")
        torch.save(model.state_dict(),LSTM_path + "/" + model_name +".pt")

with open(LSTM_path + "/" + model_config_name + '.pkl', 'wb') as file:
    pickle.dump(history, file)

In [None]:

history = {}
model_config_name = "LSTM_History_WO_Attention"
model_config = [False,False]

print(model_config_name)
for hidden_dim in hidden_dims:
    for dropout in dropouts:
        model = BiLSTM_SentimentAnalysis(vocab_size, embedding_dim, hidden_dim, output_dim, dropout, bidirectional=model_config[0],attention = model_config[1])
        model = model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

        train_losses, val_accuracy, model = train(model,train_x,train_y)
        model_name = model_config_name +"hidden_dim_"+str(hidden_dim)+"_dropout"+str(dropout)
        history[model_name] = (train_losses, val_accuracy, test(model,test_dl))
        print("_________________________________________________________________________")

        print(model_name," Test Accuracy: ",test(model,test_dl))

        plt.plot(train_losses)
        plt.title("Training Losses")
        plt.show()
        plt.plot(val_accuracy)
        plt.title("Validation Accuracy")
        plt.show()

        print("_________________________________________________________________________")
        torch.save(model.state_dict(),LSTM_path + "/" + model_name +".pt")

with open(LSTM_path + "/" + model_config_name + '.pkl', 'wb') as file:
    pickle.dump(history, file)

In [None]:
# Pick the ideal parameters based on the previous set of tests and use them to train the different types of models on multiple epochs
# NOT DONE YET, RAN BY MISTAKE, DISREGARD TILL THE HYPERPARAMS ARE SET TO ONE VALUE
embedding_dim = 100
hidden_dims = [16,32,64,128,256,512]
output_dim = 3
dropouts = [0.0,0.25,0.5]
vocab_size = len(word2index)
num_epochs = 3
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

history = {}
model_configs = {"BiLSTM_History_WO_Attention_3_epochs":[True,False],"LSTM_History_W_Scaled_Dot_Attention_3_epochs":[False,True],"BiLSTM_History_W_Scaled_Dot_Attention_3_epochs":[True,True]}

for model_config_name,model_config in model_configs.items():
  for epoch in range(num_epochs):
      print(model_config_name)
      model = BiLSTM_SentimentAnalysis(vocab_size, embedding_dim, hidden_dim, output_dim, dropout, bidirectional=model_config[0],attention = model_config[1])
      model = model.to(device)

      criterion = nn.CrossEntropyLoss()
      optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

      train_losses, val_accuracy, model = train(model,train_x,train_y)
      model_name = model_config_name+"_" + str(epoch) +"_epoch"
      history[model_name] = (train_losses, val_accuracy, test(model,test_dl))
      print("_________________________________________________________________________")

      print(model_name," Test Accuracy: ",test(model,test_dl))

      plt.plot(train_losses)
      plt.title("Training Losses")
      plt.show()
      plt.plot(val_accuracy)
      plt.title("Validation Accuracy")
      plt.show()

      print("_________________________________________________________________________")
      torch.save(model.state_dict(),LSTM_path + "/" + model_name +".pt")

  with open(LSTM_path + "/" + model_config_name + '.pkl', 'wb') as file:
    pickle.dump(history, file)

# Transformer

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score

In [None]:
# Read dataset into a DataFrame
df = pd.read_csv('dataset_clean.csv')

df['Labels'] = df['Labels'].apply(lambda x: eval(x)).apply(lambda x: x.index(1))

# Split the data into a training set and a hold-out test set
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from torch.optim import AdamW  # Use the PyTorch AdamW optimizer

# Define the training process
def train_model(model, train_loader, val_loader, device, optimizer, scheduler, num_epochs=3):
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            model.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            train_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_acc = 0.0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                val_loss += loss.item()

                # Convert logits to predictions
                preds = torch.argmax(outputs.logits, dim=1)

                # Calculate accuracy
                val_acc += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        avg_val_acc = val_acc / len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_acc}")


In [None]:
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
num_epochs = 1

# Run the K-Fold cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Create dataset for the current fold
    train_dataset = CustomDataset(
        texts=df.iloc[train_idx]['Data'].to_numpy(),
        labels=df.iloc[train_idx]['Labels'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )

    val_dataset = CustomDataset(
        texts=df.iloc[val_idx]['Data'].to_numpy(),
        labels=df.iloc[val_idx]['Labels'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )

    # Create DataLoaders for the current fold
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Initialize the model for the current fold
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=3,
        output_attentions=False,
        output_hidden_states=False
    )
    model.to(device)

    # Initialize optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=5e-5)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Train the model on the current fold
    train_model(model, train_loader, val_loader, device, optimizer, scheduler, num_epochs=3)

FOLD 0
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 0.6744642976811315
Epoch 1/3, Validation Loss: 0.5109011908439366, Validation Accuracy: 0.7660472972972973
Epoch 2/3, Train Loss: 0.4453526675167141
Epoch 2/3, Validation Loss: 0.5109011908439366, Validation Accuracy: 0.7660472972972973
Epoch 3/3, Train Loss: 0.45124847894540826
Epoch 3/3, Validation Loss: 0.5109011908439366, Validation Accuracy: 0.7660472972972973
FOLD 1
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 0.7722194909337438
Epoch 1/3, Validation Loss: 0.5854998283289574, Validation Accuracy: 0.7305743243243243
Epoch 2/3, Train Loss: 0.5113978568502661
Epoch 2/3, Validation Loss: 0.5854998283289574, Validation Accuracy: 0.7305743243243243
Epoch 3/3, Train Loss: 0.5123804962268869
Epoch 3/3, Validation Loss: 0.5854998283289574, Validation Accuracy: 0.7305743243243243
FOLD 2
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 0.7310387798340248
Epoch 1/3, Validation Loss: 0.5351066613850528, Validation Accuracy: 0.7671232876712328
Epoch 2/3, Train Loss: 0.4782958377038253
Epoch 2/3, Validation Loss: 0.5351066613850528, Validation Accuracy: 0.7671232876712328
Epoch 3/3, Train Loss: 0.474815690781883
Epoch 3/3, Validation Loss: 0.5351066613850528, Validation Accuracy: 0.7671232876712328
FOLD 3
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 0.6929047598722852
Epoch 1/3, Validation Loss: 0.5681205738897193, Validation Accuracy: 0.7303082191780822
Epoch 2/3, Train Loss: 0.45392714695108627
Epoch 2/3, Validation Loss: 0.5681205738897193, Validation Accuracy: 0.7303082191780822
Epoch 3/3, Train Loss: 0.461759096373877
Epoch 3/3, Validation Loss: 0.5681205738897193, Validation Accuracy: 0.7303082191780822
FOLD 4
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 0.7161020683347161
Epoch 1/3, Validation Loss: 0.5810992050252549, Validation Accuracy: 0.7200342465753424
Epoch 2/3, Train Loss: 0.4491735412500824
Epoch 2/3, Validation Loss: 0.5810992050252549, Validation Accuracy: 0.7200342465753424
Epoch 3/3, Train Loss: 0.44975869113471323
Epoch 3/3, Validation Loss: 0.5810992050252549, Validation Accuracy: 0.7200342465753424

In [None]:
# Save the model's state dictionary
torch.save(model.state_dict(), 'first_model.pth')

In [None]:
test_dataset = CustomDataset(
    texts=test_df['Data'].to_numpy(),
    labels=test_df['Labels'].tolist(),
    tokenizer=tokenizer,
    max_len=128
)

# Create the DataLoader for the test dataset
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
model.eval()

# Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.2f}')

Test Accuracy: 0.83


# Second BERT

In [None]:
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
num_epochs = 3

# Run the K-Fold cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Create dataset for the current fold
    train_dataset = CustomDataset(
        texts=df.iloc[train_idx]['Data'].to_numpy(),
        labels=df.iloc[train_idx]['Labels'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )

    val_dataset = CustomDataset(
        texts=df.iloc[val_idx]['Data'].to_numpy(),
        labels=df.iloc[val_idx]['Labels'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )

    # Create DataLoaders for the current fold
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Initialize the model for the current fold
    model3 = BertForSequenceClassification.from_pretrained(
        'bert-large-uncased',
        num_labels=3,
        output_attentions=False,
        output_hidden_states=False
    )
    model3.to(device)

    # Initialize optimizer and scheduler
    optimizer = AdamW(model3.parameters(), lr=5e-5)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Train the model on the current fold
    train_model(model3, train_loader, val_loader, device, optimizer, scheduler, num_epochs=3)

FOLD 0
--------------------------------
model.safetensors: 100%
1.34G/1.34G [00:05<00:00, 150MB/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 0.7998394565370709
Epoch 1/3, Validation Loss: 0.5376094245427364, Validation Accuracy: 0.7559121621621622
Epoch 2/3, Train Loss: 0.5190620001774192
Epoch 2/3, Validation Loss: 0.4861597678548581, Validation Accuracy: 0.7820945945945946
Epoch 3/3, Train Loss: 0.3650102964745447
Epoch 3/3, Validation Loss: 0.4956785092583379, Validation Accuracy: 0.7694256756756757
FOLD 1
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 0.7596423810049129
Epoch 1/3, Validation Loss: 0.5687227275323223, Validation Accuracy: 0.7356418918918919
Epoch 2/3, Train Loss: 0.4679378428505956
Epoch 2/3, Validation Loss: 0.5206806018344454, Validation Accuracy: 0.7736486486486487
Epoch 3/3, Train Loss: 0.30033265167383205
Epoch 3/3, Validation Loss: 0.5763120387957709, Validation Accuracy: 0.7677364864864865
FOLD 2
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 1.0044639240759632
Epoch 1/3, Validation Loss: 0.9736260396160491, Validation Accuracy: 0.5522260273972602
Epoch 2/3, Train Loss: 1.0000703308362602
Epoch 2/3, Validation Loss: 0.9667051214061372, Validation Accuracy: 0.5522260273972602
Epoch 3/3, Train Loss: 0.9998794724916842
Epoch 3/3, Validation Loss: 0.9657683421487677, Validation Accuracy: 0.5522260273972602
FOLD 3
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 1.001695082049321
Epoch 1/3, Validation Loss: 0.9773599366619162, Validation Accuracy: 0.553082191780822
Epoch 2/3, Train Loss: 0.9990754469262862
Epoch 2/3, Validation Loss: 0.9743413541414966, Validation Accuracy: 0.553082191780822
Epoch 3/3, Train Loss: 0.9943690568513838
Epoch 3/3, Validation Loss: 0.9745499067110558, Validation Accuracy: 0.553082191780822
FOLD 4
--------------------------------
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3, Train Loss: 0.791604362962189
Epoch 1/3, Validation Loss: 0.6261102246911558, Validation Accuracy: 0.7106164383561644
Epoch 2/3, Train Loss: 0.45570625481116284
Epoch 2/3, Validation Loss: 0.5671376418577482, Validation Accuracy: 0.7268835616438356
Epoch 3/3, Train Loss: 0.25945606787050135
Epoch 3/3, Validation Loss: 0.6489546046885726, Validation Accuracy: 0.7388698630136986

In [None]:
test_dataset = CustomDataset(
    texts=test_df['Data'].to_numpy(),
    labels=test_df['Labels'].tolist(),
    tokenizer=tokenizer,
    max_len=128
)

# Create the DataLoader for the test dataset
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
model3.eval()

# Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model3(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.2f}')

Test Accuracy: 0.91


# Third BERT

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup


In [None]:
# Customize BERT's configuration
config = BertConfig(
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=16,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
num_epochs = 1

# Run the K-Fold cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Create dataset for the current fold
    train_dataset = CustomDataset(
        texts=df.iloc[train_idx]['Data'].to_numpy(),
        labels=df.iloc[train_idx]['Labels'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )

    val_dataset = CustomDataset(
        texts=df.iloc[val_idx]['Data'].to_numpy(),
        labels=df.iloc[val_idx]['Labels'].tolist(),
        tokenizer=tokenizer,
        max_len=128
    )

    # Create DataLoaders for the current fold
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Initialize the model for the current fold
    model4 = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        config=config
    )
    model4.to(device)

    # Initialize optimizer and scheduler
    optimizer = AdamW(model4.parameters(), lr=5e-5)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Train the model on the current fold
    train_model(model4, train_loader, val_loader, device, optimizer, scheduler, num_epochs=3)

FOLD 0
--------------------------------
Epoch 1/3, Train Loss: 0.759006376476774
Epoch 1/3, Validation Loss: 0.5358082576579338, Validation Accuracy: 0.7618243243243243
Epoch 2/3, Train Loss: 0.5042485594240879
Epoch 2/3, Validation Loss: 0.5358082576579338, Validation Accuracy: 0.7618243243243243
Epoch 3/3, Train Loss: 0.5057545585348032
Epoch 3/3, Validation Loss: 0.5358082576579338, Validation Accuracy: 0.7618243243243243

FOLD 1
--------------------------------
Epoch 1/3, Train Loss: 0.7794085784017429
Epoch 1/3, Validation Loss: 0.6341053394449724, Validation Accuracy: 0.7119932432432432
Epoch 2/3, Train Loss: 0.5506814068291375
Epoch 2/3, Validation Loss: 0.6341053394449724, Validation Accuracy: 0.7119932432432432
Epoch 3/3, Train Loss: 0.5513873748201559
Epoch 3/3, Validation Loss: 0.6341053394449724, Validation Accuracy: 0.7119932432432432

FOLD 2
--------------------------------
Epoch 1/3, Train Loss: 0.7698235646551379
Epoch 1/3, Validation Loss: 0.5582803849079837, Validation Accuracy: 0.7602739726027398
Epoch 2/3, Train Loss: 0.5208293152750556
Epoch 2/3, Validation Loss: 0.5582803849079837, Validation Accuracy: 0.7602739726027398
Epoch 3/3, Train Loss: 0.5221011580155571
Epoch 3/3, Validation Loss: 0.5582803849079837, Validation Accuracy: 0.7602739726027398

FOLD 3
--------------------------------
Epoch 1/3, Train Loss: 0.6842181886015491
Epoch 1/3, Validation Loss: 0.5422686889563522, Validation Accuracy: 0.761986301369863
Epoch 2/3, Train Loss: 0.4475179272401862
Epoch 2/3, Validation Loss: 0.5422686889563522, Validation Accuracy: 0.761986301369863
Epoch 3/3, Train Loss: 0.44771099100747613
Epoch 3/3, Validation Loss: 0.5422686889563522, Validation Accuracy: 0.761986301369863

FOLD 4
--------------------------------
Epoch 1/3, Train Loss: 0.7059079891534795
Epoch 1/3, Validation Loss: 0.5996909766164544, Validation Accuracy: 0.7217465753424658
Epoch 2/3, Train Loss: 0.47543662216883065
Epoch 2/3, Validation Loss: 0.5996909766164544, Validation Accuracy: 0.7217465753424658
Epoch 3/3, Train Loss: 0.468779429335643
Epoch 3/3, Validation Loss: 0.5996909766164544, Validation Accuracy: 0.7217465753424658

In [None]:
test_dataset = CustomDataset(
    texts=test_df['Data'].to_numpy(),
    labels=test_df['Labels'].tolist(),
    tokenizer=tokenizer,
    max_len=128
)

# Create the DataLoader for the test dataset
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
model4.eval()

# Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model4(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.2f}')

*Test* Accuracy: 0.80
