In [1]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from torch.nn import CrossEntropyLoss
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### Loading the Dataset

In [2]:
df = pd.read_csv('dataset.csv')

### Converting entries in Comment column to Strings

In [3]:
print(df["Comment"].head())
print(df["Comment"].apply(type).value_counts())

0    when modi promised a minimum government maximu...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: Comment, dtype: object
Comment
<class 'str'>      269652
<class 'float'>       105
Name: count, dtype: int64


In [4]:
df["Comment"] = df["Comment"].fillna("").astype(str)

In [5]:
print(df["Comment"].head())
print(df["Comment"].apply(type).value_counts())

0    when modi promised a minimum government maximu...
1    talk all the nonsense and continue all the dra...
2    what did just say vote for modi  welcome bjp t...
3    asking his supporters prefix chowkidar their n...
4    answer who among these the most powerful world...
Name: Comment, dtype: object
Comment
<class 'str'>    269757
Name: count, dtype: int64


In [6]:
dataset = Dataset.from_pandas(df)

In [7]:
print(df['Sentiment'].isnull().sum())  # Count missing values

7


In [8]:
df = df.dropna(subset=['Sentiment'])

In [9]:
print(df['Sentiment'].isnull().sum())  # Count missing values

0


In [10]:
dataset = Dataset.from_pandas(df)

### Mapping the Data

In [11]:
def map_labels(example):
    if example["Sentiment"] == -1:
        example["Sentiment"] = 0  # Negative
    elif example["Sentiment"] == 0:
        example["Sentiment"] = 1  # Neutral
    elif example["Sentiment"] == 1:
        example["Sentiment"] = 2  # Positive
    return example

mapped_dataset = dataset.map(map_labels)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 269750/269750 [00:07<00:00, 34152.28 examples/s]


### Tokenize the Data

In [12]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the dataset
def tokenize_function(example):
    return tokenizer(example["Comment"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = mapped_dataset.map(tokenize_function, batched=True)

# Ensure the labels are properly added
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

# Map the labels to ensure they are integers
train_dataset = train_dataset.map(lambda x: {'labels': torch.tensor(x['Sentiment'], dtype=torch.long)}, batched=True)
val_dataset = val_dataset.map(lambda x: {'labels': torch.tensor(x['Sentiment'], dtype=torch.long)}, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 269750/269750 [01:22<00:00, 3272.37 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 215800/215800 [00:06<00:00, 33309.00 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53950/53950 [00:01<00:00, 34073.22 examples/s]


### Loading BERT (DistilBERT)

In [14]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Defining our Evaluation Metrics

In [15]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

### Defining the Training Arguments

In [16]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True
)

### Initializing the Trainer

In [17]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Get the labels from the inputs
        labels = inputs.get("labels")
        
        # Convert labels to long type as CrossEntropyLoss expects long tensor for labels
        labels = labels.long()

        # Forward pass
        outputs = model(**inputs)  # This gets the logits
        logits = outputs.logits  # Extract logits from the model's output
        
        # Use CrossEntropyLoss, which takes logits of shape [batch_size, num_classes] and labels of shape [batch_size]
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)  # Pass logits and labels directly to the loss function
        
        if return_outputs:
            return loss, outputs
        return loss

In [18]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,  
    data_collator=data_collator, 
    compute_metrics=compute_metrics
)

### Training the Model

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2212,0.21043,0.929175,0.929472,0.930209,0.929175
2,0.1949,0.188503,0.942428,0.9426,0.943038,0.942428
3,0.0466,0.20512,0.947322,0.947287,0.947268,0.947322


TrainOutput(global_step=40464, training_loss=0.19069658165266196, metrics={'train_runtime': 84066.529, 'train_samples_per_second': 7.701, 'train_steps_per_second': 0.481, 'total_flos': 2.14402308217344e+16, 'train_loss': 0.19069658165266196, 'epoch': 3.0})

### Evaluating the Model

In [21]:
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

Evaluation results: {'eval_loss': 0.18850284814834595, 'eval_accuracy': 0.9424281742354031, 'eval_f1': 0.9426000387059595, 'eval_precision': 0.9430379900320718, 'eval_recall': 0.9424281742354031, 'eval_runtime': 1727.906, 'eval_samples_per_second': 31.223, 'eval_steps_per_second': 1.951, 'epoch': 3.0}


### Saving the Model & Tokenizer

In [22]:
model.save_pretrained("./db_model")
tokenizer.save_pretrained("./db_model")

('./db_model\\tokenizer_config.json',
 './db_model\\special_tokens_map.json',
 './db_model\\vocab.txt',
 './db_model\\added_tokens.json')