In [1]:
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score
import pandas as pd
import ast

In [2]:
# Read the dataset CSV from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='final_db.csv')

In [3]:

# The 'tokenized' column is read from CSV as text; turn each stringified list
# back into a real Python list.
# Only str values are parsed: once the column already holds lists, calling
# ast.literal_eval on them raises ValueError, which made this cell fail when
# it was re-run. Non-string values are passed through unchanged.
data['tokenized'] = data['tokenized'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)


In [4]:
# Preview the first five rows as the cell's displayed output.
data.head(n=5)

Unnamed: 0,sentimate,tokenized
0,4,"[101, 14163, 12789, 4819, 1030, 2310, 2099, 19..."
1,0,"[101, 8869, 16147, 2620, 19841, 2575, 1998, 20..."
2,4,"[101, 3441, 11253, 22911, 1030, 6097, 21756, 2..."
3,0,"[101, 3763, 7474, 2854, 18827, 1030, 13451, 90..."
4,0,"[101, 23371, 8095, 15864, 1045, 4299, 1045, 20..."


In [5]:
# Features: one token-id sequence per row.
tokenized_data = data['tokenized'].to_list()
# Targets: raw label 4 is remapped to class 1; every other value is kept as-is.
labels = [1 if raw == 4 else raw for raw in data['sentimate']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    tokenized_data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Turn the Python lists produced by the split into PyTorch tensors.
# NOTE(review): this presumably relies on every token-id sequence having the
# same length (i.e. already padded/truncated upstream) — confirm.
train_inputs, train_labels = torch.tensor(train_inputs), torch.tensor(train_labels)
test_inputs, test_labels = torch.tensor(test_inputs), torch.tensor(test_labels)

In [7]:
# Pair each token-id tensor with its label tensor.
train_data = TensorDataset(train_inputs, train_labels)
test_data = TensorDataset(test_inputs, test_labels)

# Batches of 32: training data is shuffled, test data keeps dataset order.
train_dataloader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=32, shuffle=False)

In [8]:
# Fetch the tokenizer and the sequence-classification model from the same
# 'bert-base-uncased' checkpoint. The two loads are independent of each other.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Cross-entropy criterion (the training loop below reads the loss from the
# model outputs instead, so this object is not used there).
loss_fn = torch.nn.CrossEntropyLoss()
# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [17]:
# Move the model onto the selected device before training.
model = model.to(device=device)

In [19]:
# Training loop: fine-tune the model for a fixed number of epochs and report
# the mean per-batch loss after each epoch.
# NOTE(review): no attention_mask is passed to the model, so if the token-id
# sequences are padded, the padding positions are presumably treated like real
# tokens. Consider passing a mask — and doing so identically at evaluation
# time — TODO confirm against how the CSV was tokenized.
epochs = 3
for epoch in range(epochs):
    model.train()  # switch the model to training mode
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()  # clear gradients left from the previous step
        input_ids = batch[0].to(device)
        # Bound to `batch_labels`, not `labels`, so the full label list built
        # in the split cell is not overwritten by a single batch tensor.
        batch_labels = batch[1].to(device)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Average training loss: {avg_train_loss:.4f}")

Epoch 1/3 - Average training loss: 0.4773
Epoch 2/3 - Average training loss: 0.3896
Epoch 3/3 - Average training loss: 0.2891


In [27]:
# Evaluation: run the model over the test set and collect the predicted class
# and the true label for every example.
# NOTE(review): as in training, no attention_mask is passed — keep the two
# consistent if a mask is ever introduced.
input_ids = None  # drop the reference to whatever batch an earlier cell left here
model.eval()  # switch the model to evaluation mode
all_preds = []
all_labels = []
# Gradients are not needed for inference; enter the context once for the
# whole pass instead of once per batch.
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch[0].to(device)
        outputs = model(input_ids)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the class dimension.
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        # The labels are only compared on the host, so they are not sent to
        # the device and back. They are also not bound to `labels`, which
        # would overwrite the full label list from the split cell.
        all_labels.extend(batch[1].cpu().numpy())

In [21]:
# Fraction of test examples whose predicted class equals the true label.
accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.7846


In [32]:
# Persist the tokenizer files together with the fine-tuned model so the output
# directory can be reloaded on its own with from_pretrained(); saving only the
# model leaves the directory without any vocabulary/tokenizer files.
# NOTE(review): assumes this 'bert-base-uncased' tokenizer is the one that
# produced the token ids in the CSV — confirm.
# The tokenizer is saved first so the cell's last expression is the model
# save, which keeps the cell free of displayed output as before.
tokenizer.save_pretrained('fine_tuned_bert_model_directory')
model.save_pretrained('fine_tuned_bert_model_directory')