## DistilBERT - A Lighter, Faster, and Computationally Friendlier Version of BERT

In [2]:
# Run this cell to install the required libraries
%pip install transformers torch sklearn pandas
%pip install transformers

Collecting transformersNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/12/dd/f17b11a93a9ca27728e12512d167eb1281c151c4c6881d3ab59eb58f4127/transformers-4.35.2-py3-none-any.whl.metadata
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
     ---------------------------------------- 0.0/123.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/123.5 kB ? eta -:--:--
     --- ------------------------------------ 10.2/123.5 kB ? eta -:--:--
     --------- --------------------------- 30.7/123.5 kB 262.6 kB/s eta 0:00:01
     ------------ ------------------------ 41.0/123.5 kB 279.3 kB/s eta 0:00:01
     ------------------------ ------------ 81.9/123.5 kB 416.7 kB/s eta 0:00:01
     ------------------------------------ 123.5/123.5 kB 557.9 kB/s eta 0:00:00
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Installing build depe

  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-

## Importing necessary libraries and methods

In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


## Loading and Data Preprocessing

In [3]:
# Read the raw CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
data = pd.read_csv(
    r'/Users/abdulrabbani/Desktop/SEM-2/Information Storage and retrieval/Project/complete_work/complete_work/data.csv'
)

# Function to preprocess the data
def preprocess_data(data):
    """Drop incomplete rows and attach an integer sentiment label.

    Args:
        data: DataFrame with 'selected_text' and 'sentiment' columns.

    Returns:
        A new DataFrame (the input frame is not modified) holding only the
        rows where both 'selected_text' and 'sentiment' are present, plus a
        'sentiment_label' column mapped as negative -> 0, neutral -> 1,
        positive -> 2. Sentiment values outside that mapping become NaN.
    """
    sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    # assign() builds the labelled frame as a fresh object instead of writing a
    # column into the dropna() result in place, which is what triggered
    # pandas' SettingWithCopyWarning.
    return (
        data
        .dropna(subset=['selected_text', 'sentiment'])
        .assign(sentiment_label=lambda frame: frame['sentiment'].map(sentiment_mapping))
    )

# Clean the raw frame and attach the integer sentiment labels
processed_data = preprocess_data(data)

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(
    processed_data,
    test_size=0.2,
    random_state=42,
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sentiment_label'] = data['sentiment'].map(sentiment_mapping)


## Initializing the DistilBERT Tokenizer to Encode Text

In [4]:
# DistilBERT tokenizer, loaded from the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)

# Creating a custom dataset for PyTorch
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: pandas Series of raw strings (read positionally via ``.iloc``).
        labels: pandas Series of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer; invoked with the text plus padding and
            truncation options and expected to return a mapping holding
            'input_ids' and 'attention_mask' tensors.
        max_length: Fixed sequence length every example is padded or
            truncated to. Defaults to 64.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the idx-th text and return model-ready tensors.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors and a
            scalar int64 'labels' tensor.
        """
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        # Call the tokenizer directly: encode_plus is deprecated in favour of
        # __call__, which accepts the same options for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # int() makes a missing (NaN) label fail loudly instead of being
            # cast to an arbitrary integer.
            'labels': torch.tensor(int(label), dtype=torch.long)
        }




## Data set and data loaders with batch sizes for the model

In [None]:
# Wrap each split in a SentimentDataset (text column + integer label column)
train_dataset = SentimentDataset(
    texts=train_data['selected_text'],
    labels=train_data['sentiment_label'],
    tokenizer=tokenizer,
)
test_dataset = SentimentDataset(
    texts=test_data['selected_text'],
    labels=test_data['sentiment_label'],
    tokenizer=tokenizer,
)

# Batches of 8; only the training batches are shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

## Training and Evaluation methods for the Model

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained DistilBERT with a 3-class sequence-classification head, moved to the chosen device
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)
model = model.to(device)

# AdamW over all model parameters with a 2e-5 learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)

# Function for training and evaluation
def train_eval_model(model, train_loader, test_loader, optimizer, device, epochs=1):
    """Train ``model`` for ``epochs`` epochs, evaluating on the test set after each.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...,
            labels=...)``; its output is indexed as ``[0]`` for the loss and
            ``[1]`` for the logits.
        train_loader: Iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors) used for optimisation.
        test_loader: Iterable of batches in the same format, used for evaluation.
        optimizer: Optimizer already bound to the model's parameters.
        device: Device each batch tensor is moved to before the forward pass.
        epochs: Number of train/evaluate rounds. Defaults to 1.

    Returns:
        The classification report of the final epoch as a DataFrame (one row
        per class/aggregate), or None when ``epochs`` is 0.
    """
    report_df = None

    for epoch in range(epochs):
        # Training
        model.train()
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        predictions, true_labels = [], []
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # Labels are passed so the logits stay at index 1 of the output.
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                logits = outputs[1]
                predictions.extend(torch.argmax(logits, dim=1).tolist())
                true_labels.extend(labels.tolist())

        # Calculate metrics
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
        print(f'Epoch {epoch + 1}/{epochs} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')
        report = classification_report(true_labels, predictions, output_dict=True)
        report_df = pd.DataFrame(report).transpose()

    # Return only after every epoch has run; returning inside the loop would
    # stop training after the first epoch regardless of `epochs`.
    return report_df




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training the DistilBERT model on our dataset

In [9]:
# Kick off training followed by evaluation on the test split (default: one epoch)
train_eval_model(
    model,
    train_loader,
    test_loader,
    optimizer,
    device,
)

Epoch 1/1 - Accuracy: 0.8905, Precision: 0.8906, Recall: 0.8905, F1: 0.8904


## Classification Report of DistilBERT model

In [6]:
# NOTE: train_eval_model trains the model again before evaluating, so the
# report below describes the model after this additional training pass.
classification_report_df = train_eval_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    device=device,
)

# Show the per-class and aggregate metrics
print(classification_report_df)

Epoch 1/1 - Accuracy: 0.8816, Precision: 0.8834, Recall: 0.8816, F1: 0.8816
              precision    recall  f1-score     support
0              0.831585  0.907761  0.868005  1572.00000
1              0.901335  0.845707  0.872635  2236.00000
2              0.907848  0.904621  0.906231  1688.00000
accuracy       0.881550  0.881550  0.881550     0.88155
macro avg      0.880256  0.886029  0.882290  5496.00000
weighted avg   0.883385  0.881550  0.881629  5496.00000


## THE END