<a href="https://colab.research.google.com/github/nandini-n-123/DAALEETCODE/blob/main/Sentiment_Analysis_using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



# Load the first 4000 rows of the IMDB reviews CSV to keep fine-tuning fast.
df = pd.read_csv('/content/IMDB Dataset.csv').head(4000)
print(df.head())

# Encode the string sentiment labels as integers. LabelEncoder assigns ids in
# sorted order of the class names; the sentiment values in this data are
# 'negative' and 'positive', so 0 = negative and 1 = positive.
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])

# Split into training and validation sets (80/20). A fixed random_state makes
# the split reproducible across runs, and stratifying on the label keeps the
# class proportions the same in both subsets.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'],
    df['sentiment_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment_encoded'],
)

# Initialize the BERT tokenizer that matches the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_texts(texts, tokenizer, max_length=512):
    """Tokenize and encode a collection of texts as padded PyTorch tensors.

    Args:
        texts: The texts to encode. May be an object exposing ``tolist()``
            (e.g. a pandas Series), any other iterable of strings, or a
            single string (treated as a one-element batch).
        tokenizer: Callable invoked as
            ``tokenizer(list_of_texts, padding=True, truncation=True,
            max_length=max_length, return_tensors='pt')``.
        max_length: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        texts = [texts]
    elif hasattr(texts, 'tolist'):
        texts = texts.tolist()
    else:
        texts = list(texts)
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Tokenize both splits up front so batches can be served directly as tensors.
train_encodings = preprocess_texts(train_texts, tokenizer)
val_encodings = preprocess_texts(val_texts, tokenizer)

# Wrap (input_ids, attention_mask, label) triples in Torch datasets.
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels.values))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels.values))

# DataLoaders for batching; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Size the classification head from the classes the encoder actually saw,
# rather than hard-coding a count that may not match the data.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# NOTE(review): the AdamW exported by `transformers` has been deprecated in
# favor of `torch.optim.AdamW` -- confirm against the installed version and
# consider migrating (the two have different default hyperparameters).
optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# The training loop
for epoch in range(3):  # You can increase the number of epochs based on dataset size
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()  # Clear gradients left over from the previous step
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # Forward pass
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()  # Backward pass
        optimizer.step()  # Update model parameters

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}')

    # Validation after each epoch: gradients are disabled and the model is in
    # eval mode for the whole pass over the validation set.
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Report accuracy, precision, recall and F1 for this epoch so the
    # predictions gathered above are actually used every epoch. After the
    # loop these names hold the final epoch's metrics.
    accuracy = accuracy_score(true_labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='weighted')
    print(f'Validation Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')




                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training Loss: 0.3782
Epoch 2, Training Loss: 0.1424
Epoch 3, Training Loss: 0.0661
Validation Accuracy: 0.9062, Precision: 0.9063, Recall: 0.9062, F1-Score: 0.9062
