In [None]:
!pip install tensorflow transformers






In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Location of the raw CSV (presumably the Sentiment140 dump, judging by the file name).
# The file is read without a header row, so the column names are supplied explicitly.
file_path = '/content/training.1600000.processed.noemoticon (1).csv'
columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
data = pd.read_csv(file_path, encoding='ISO-8859-1', names=columns)

# Keep only the label and the text, and draw a reproducible 5,000-row subsample
# so the fine-tuning run stays short.
data = data[['sentiment', 'text']].sample(n=5000, random_state=42)

# Recode the raw labels to the class ids the classifier expects (0 -> 0, 4 -> 1).
data['sentiment'] = data['sentiment'].map({0: 0, 4: 1})

# Series.map yields NaN for any value missing from the mapping; fail loudly here
# instead of letting NaN labels reach the training loop.
if data['sentiment'].isna().any():
    raise ValueError(
        "Found sentiment codes other than 0 and 4; cannot map them to binary labels."
    )
data['sentiment'] = data['sentiment'].astype('int64')

# 80/20 train/test split with a fixed seed for reproducibility.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every item access.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that accepts
            the keyword arguments used in ``__getitem__`` and returns a mapping
            containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=32):
        # A length mismatch would otherwise only surface as an IndexError mid-epoch,
        # or silently ignore the surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (both flattened to
            1-D) and ``'labels'`` (scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # The tokenizer output is flattened to 1-D so the DataLoader can
            # stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap each split; plain Python lists are passed so items are indexed by position.
train_dataset = SentimentDataset(
    train_data['text'].tolist(),
    train_data['sentiment'].tolist(),
    tokenizer,
)
test_dataset = SentimentDataset(
    test_data['text'].tolist(),
    test_data['sentiment'].tolist(),
    tokenizer,
)

# Only the training batches are shuffled; evaluation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [None]:
from transformers import BertForSequenceClassification
# `transformers.AdamW` is deprecated and no longer shipped in recent releases;
# the PyTorch implementation is the supported replacement. The name `AdamW`
# stays in scope for any later cell that refers to it.
from torch.optim import AdamW
import torch


# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Pretrained BERT encoder with a two-class classification head on top.
# Attention maps and hidden states are not needed, so they are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)


model.to(device)


# eps and weight_decay are set explicitly to mirror the defaults of the legacy
# transformers optimizer (torch's own defaults are eps=1e-8, weight_decay=1e-2).
# NOTE(review): confirm these are the values you want for fine-tuning.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from tqdm.auto import tqdm
import torch


# Switch the network to training mode before fine-tuning.
model.train()

epochs = 5

for epoch in range(epochs):
    # Per-epoch accumulators for the summary line printed after the epoch.
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}')

    for batch in progress_bar:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Move the batch tensors to the device the model lives on.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        # Backpropagate and update the weights.
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest logit in each row.
        predictions = logits.argmax(dim=1)

        batch_loss = loss.item()
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.shape[0]
        running_loss += batch_loss

        progress_bar.set_postfix({'training_loss': f'{batch_loss:.3f}'})

    # Accuracy is averaged over examples, loss over batches.
    epoch_accuracy = correct_predictions / total_predictions
    average_loss = running_loss / len(train_loader)

    print(f'Epoch {epoch+1}: Training Loss: {average_loss:.3f}, Accuracy: {epoch_accuracy:.3f}')


Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 1: Training Loss: 0.522, Accuracy: 0.748


Epoch 2:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 2: Training Loss: 0.299, Accuracy: 0.878


Epoch 3:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 3: Training Loss: 0.142, Accuracy: 0.951


Epoch 4:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 4: Training Loss: 0.078, Accuracy: 0.977


Epoch 5:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 5: Training Loss: 0.066, Accuracy: 0.979


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report


def get_predictions(model, data_loader, device=None):
    """Run the model over a data loader and collect predicted and true labels.

    Args:
        model: Module whose forward call accepts ``input_ids`` and
            ``attention_mask`` and returns an object with a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so the
            function does not depend on a notebook-level ``device`` variable.

    Returns:
        Tuple ``(predictions, real_values)`` of two lists holding one predicted
        class index and one true label per example, in loader order.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions = []
    real_values = []

    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            # Labels are never fed to the model here, so they are not moved to
            # the compute device; .cpu() only guards against pre-placed tensors.
            real_values.extend(batch['labels'].cpu().numpy())

    return predictions, real_values


# Score the fine-tuned model on the held-out split.
test_preds, test_labels = get_predictions(model, test_loader)

# Compute both summaries first, then print them in the same order as before.
test_accuracy = accuracy_score(test_labels, test_preds)
test_report = classification_report(test_labels, test_preds)

print("Test Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)


Test Accuracy: 0.784

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.83      0.79       496
           1       0.82      0.73      0.77       504

    accuracy                           0.78      1000
   macro avg       0.79      0.78      0.78      1000
weighted avg       0.79      0.78      0.78      1000

