This notebook can also be accessed through Google Colab:

https://colab.research.google.com/drive/1W0zMqxQjc6-wOs0PoUiX9oq6CApkOona?usp=sharing

In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m117.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
Co

In [40]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.metrics import fbeta_score

import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split




In [41]:
# 1. Read the dataset from a CSV file.
# The path is relative to the notebook's working directory. Later cells rely on the
# 'reviews_processed' column (model input) and the 'true_sentiment' column (label).
df = pd.read_csv('data/cleaned_training.csv')


In [42]:
# List the column names to confirm the expected columns were loaded
df.columns

Index(['review_text', 'reviews_processed', 'true_sentiment'], dtype='object')

In [43]:
# Preview the first five rows to sanity-check the raw and preprocessed review text
df.head(5)

Unnamed: 0,review_text,reviews_processed,true_sentiment
0,Love this lip mask nothing else will do Aquaph...,love lip mask nothing will aquaphor vaseline a...,positive
1,I really enjoy this product! Amazing smell and...,really enjoy amazing smell perfect consistency...,positive
2,The best. I have nothing else to really say ot...,best nothing really say run buy,positive
3,I had chapped lips suddenly (normally not a pr...,chapped lips suddenly normally problem several...,positive
4,i love this so much its perfect for winter dry...,love much perfect winter dry lips smells amazi...,positive


In [44]:
# Convert sentiment labels from strings to integers
sentiment_mapping = {"positive": 1, "negative": 0}

# Guard on dtype so that re-running this cell is a no-op: pushing already-converted
# integers through the dict would silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['true_sentiment']):
    mapped_sentiment = df['true_sentiment'].map(sentiment_mapping)

    # Series.map yields NaN for any value missing from the dict; fail loudly here
    # instead of letting NaN labels reach the training loop.
    unexpected_labels = df.loc[mapped_sentiment.isna(), 'true_sentiment'].unique()
    if len(unexpected_labels) > 0:
        raise ValueError(f"Unexpected sentiment labels: {list(unexpected_labels)}")

    df['true_sentiment'] = mapped_sentiment.astype(int)

In [46]:
# 2. Split the data into training and test sets
train_data, test_data = train_test_split(df, test_size=0.4, random_state=2023)

# Work on explicit copies: assigning a column on the frames returned by
# train_test_split can trigger pandas' SettingWithCopyWarning.
train_data = train_data.copy()
test_data = test_data.copy()

# The tokenizer only accepts str inputs. Cast BOTH splits so that a missing review
# (NaN) cannot crash tokenization of the test set later on. After astype(str) every
# entry is a str, so no further isinstance filtering is needed.
train_data['reviews_processed'] = train_data['reviews_processed'].astype(str)
test_data['reviews_processed'] = test_data['reviews_processed'].astype(str)

In [48]:
# train
def train_bert_classifier(data, num_labels=2, epochs=3, batch_size=16, learning_rate=3e-5):
    """Fine-tune DistilBERT ('distilbert-base-uncased') as a review classifier.

    Args:
        data: DataFrame with a 'reviews_processed' column (str) holding the input
            text and a 'true_sentiment' column holding the integer class ids.
        num_labels: Number of classes for the classification head.
        epochs: Number of full passes over ``data``.
        batch_size: Mini-batch size of the (shuffled) training DataLoader.
        learning_rate: Learning rate handed to the AdamW optimizer.

    Returns:
        The fine-tuned DistilBertForSequenceClassification model, left on the
        device it was trained on (CUDA when available, otherwise CPU).

    Raises:
        AssertionError: If any entry of 'reviews_processed' is not a str.

    Note:
        The per-epoch precision/recall/F2 are computed on the training batches
        while the model is in train mode, using scikit-learn's default binary
        averaging, so they are only meaningful for ``num_labels=2``.
    """
    # Load the pre-trained DistilBERT model and tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

    # # Load the pre-trained BERT model and tokenizer
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    # Check if all entries in 'reviews_processed' are strings
    assert all(isinstance(review, str) for review in data['reviews_processed']), "All reviews should be strings"

    # Tokenize input data (truncated to 128 tokens, padded to the longest sequence)
    encodings = tokenizer(list(data['reviews_processed']), truncation=True, padding=True, max_length=128)
    labels = data['true_sentiment'].tolist()

    # Custom dataset class for PyTorch DataLoader
    class SentimentDataset(Dataset):
        """Pairs each tokenized review with its label as tensors."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    # Create dataset and data loader
    dataset = SentimentDataset(encodings, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Move the model to the target device before building the optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # transformers.AdamW is deprecated; use the PyTorch implementation instead.
    # eps and weight_decay are set explicitly to the old transformers defaults
    # (1e-6 and 0.0) so the optimization behaviour stays the same.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        all_true_labels = []
        all_predictions = []
        for batch in tqdm(dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Separate name so the full label list above is not shadowed by a batch tensor
            batch_labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            # Collect predictions to report running training metrics for this epoch
            predictions = torch.argmax(outputs.logits, dim=1)
            all_true_labels.extend(batch_labels.cpu().tolist())
            all_predictions.extend(predictions.cpu().tolist())

        avg_loss = total_loss / len(dataloader)
        precision = precision_score(all_true_labels, all_predictions)
        recall = recall_score(all_true_labels, all_predictions)
        conf_matrix = confusion_matrix(all_true_labels, all_predictions)
        f2 = fbeta_score(all_true_labels, all_predictions, beta=2)

        print(f'Epoch {epoch + 1}/{epochs} - Average training loss: {avg_loss:.4f} - Precision: {precision:.4f} - Recall: {recall:.4f} - F2 Score: {f2:.4f}')
        print('Confusion Matrix:\n', conf_matrix)

    return model

We use a batch size of 16 and a learning rate of 3e-5 (see the BERT vs. DistilBERT comparison linked below for reference):

https://wandb.ai/jack-morris/david-vs-goliath/reports/Does-Model-Size-Matter-A-Comparison-of-BERT-and-DistilBERT--VmlldzoxMDUxNzU

In [49]:
# 3. Train the BERT model on the training set
trained_model = train_bert_classifier(train_data, num_labels=2, epochs=3, batch_size=16, learning_rate=3e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 5901/5901 [16:55<00:00,  5.81it/s]


Epoch 1/3 - Average training loss: 0.1056 - Precision: 0.9720 - Recall: 0.9849 - F2 Score: 0.9823
Confusion Matrix:
 [[ 6450  2423]
 [ 1289 84249]]


Epoch 2/3: 100%|██████████| 5901/5901 [17:05<00:00,  5.76it/s]


Epoch 2/3 - Average training loss: 0.0553 - Precision: 0.9875 - Recall: 0.9907 - F2 Score: 0.9901
Confusion Matrix:
 [[ 7803  1070]
 [  793 84745]]


Epoch 3/3: 100%|██████████| 5901/5901 [17:04<00:00,  5.76it/s]


Epoch 3/3 - Average training loss: 0.0298 - Precision: 0.9937 - Recall: 0.9949 - F2 Score: 0.9947
Confusion Matrix:
 [[ 8335   538]
 [  436 85102]]


In [50]:
# Reload the tokenizer for the same checkpoint the model was fine-tuned from,
# then encode every test review into padded PyTorch tensors (max 128 tokens).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encodings = tokenizer(
    test_data['reviews_processed'].tolist(),
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors="pt",
)


In [51]:
# Number of test reviews scored per forward pass. Only one chunk at a time is moved
# to the device; this is independent of the training batch size.
EVAL_BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model.to(device)
trained_model.eval()  # evaluation mode (disables dropout)

all_predictions = []

with torch.no_grad():  # gradients are not needed for inference
    for i in range(0, len(encodings['input_ids']), EVAL_BATCH_SIZE):
        input_ids = encodings['input_ids'][i:i + EVAL_BATCH_SIZE].to(device)
        attention_mask = encodings['attention_mask'][i:i + EVAL_BATCH_SIZE].to(device)
        outputs = trained_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class id = index of the largest logit
        predictions = torch.argmax(logits, dim=1)
        all_predictions.extend(predictions.cpu().tolist())


In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Ground-truth labels of the held-out split, in the same row order that was tokenized
true_labels = test_data['true_sentiment'].tolist()

# Score the predictions against the ground truth
conf_matrix = confusion_matrix(y_true=true_labels, y_pred=all_predictions)
recall = recall_score(y_true=true_labels, y_pred=all_predictions)
precision = precision_score(y_true=true_labels, y_pred=all_predictions)
accuracy = accuracy_score(y_true=true_labels, y_pred=all_predictions)

# Report the test-set metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print('Confusion Matrix:', conf_matrix, sep='\n')


Accuracy: 0.9732
Precision: 0.9838
Recall: 0.9868
Confusion Matrix:
[[ 4888   929]
 [  756 56368]]


In [37]:
# Colab-only: mount Google Drive at /content/drive so the save cell below can write to it
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [39]:
# Save the fine-tuned model and its tokenizer side by side; both paths point to the
# same Drive folder.
# NOTE(review): the recorded execution counts of this cell and the Drive mount
# (In [37] / In [39]) are lower than the training cell's (In [49]). Re-run this cell
# after training so that the saved weights are the ones evaluated above.
model_save_path = "/content/drive/My Drive/sentiment/"
tokenizer_save_path = "/content/drive/My Drive/sentiment/"

trained_model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)


('/content/drive/My Drive/sentiment/tokenizer_config.json',
 '/content/drive/My Drive/sentiment/special_tokens_map.json',
 '/content/drive/My Drive/sentiment/vocab.txt',
 '/content/drive/My Drive/sentiment/added_tokens.json')