In [1]:
# Install required libraries
%pip install datasets transformers huggingface_hub -q
%pip install torch accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Import key libraries
import numpy as np
import os
import pandas as pd
import re

from datasets import load_dataset

from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import AutoTokenizer, RobertaModel
from sklearn.metrics import mean_squared_error

import warnings
warnings.simplefilter('ignore')

In [4]:
# Disable Weights & Biases
os.environ["WANDB_DISABLED"] = "true"

In [5]:
# Loading the datasets
train_df = pd.read_csv('/content/drive/MyDrive/P5-Sentiment-Analysis/data/Train.csv').dropna(axis = 0)
test_df = pd.read_csv('/content/drive/MyDrive/P5-Sentiment-Analysis/data/Test.csv').fillna("")

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9999 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   9999 non-null   object 
 1   safe_text  9999 non-null   object 
 2   label      9999 non-null   float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 390.6+ KB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5177 entries, 0 to 5176
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   5177 non-null   object
 1   safe_text  5177 non-null   object
dtypes: object(2)
memory usage: 81.0+ KB


## Lets check null values

In [8]:
train_df.isna().sum()
     

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [9]:
train_df[train_df.isna().any(axis=1)]

Unnamed: 0,tweet_id,safe_text,label,agreement


In [10]:
test_df[test_df.isna().any(axis=1)]

Unnamed: 0,tweet_id,safe_text


In [11]:
# Drop the rows with nulls from the training data
train_df.dropna(inplace = True)
     

In [12]:
#Rechecking the data types
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9999 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   9999 non-null   object 
 1   safe_text  9999 non-null   object 
 2   label      9999 non-null   float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 390.6+ KB


# Training

Spliting train data into training and validation sets

In [13]:
train, eval = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'],shuffle=True)

# Fine-tuning the DistilBERT Model










In [14]:
# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", num_labels=3)

In [15]:
# Save split data subsets
train.to_csv('/content/drive/MyDrive/P5-Sentiment-Analysis/data/train_subset.csv', index=False)
eval.to_csv('/content/drive/MyDrive/P5-Sentiment-Analysis/data/eval_subset.csv', index=False)

In [16]:
# Load the subsetted data
data = load_dataset ("csv", 
                    data_files={"train": '/content/drive/MyDrive/P5-Sentiment-Analysis/data/train_subset.csv',
                                "eval": '/content/drive/MyDrive/P5-Sentiment-Analysis/data/eval_subset.csv'}, encoding = "ISO-8859-1")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6261323f75965a69/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6261323f75965a69/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# Define helper functions
## Function to transform labels
def transform_labels(label):

    label = label['label']
    num = 0
    if label == -1: #'Negative'
        num = 0
    elif label == 0: #'Neutral'
        num = 1
    elif label == 1: #'Positive'
        num = 2

    return {'labels': num}

## Function to tokenize data
def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length')

In [18]:
# Tokenize the tweets
dataset = data.map(tokenize_data, batched=True)

# Transform	labels and limit the columns
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [19]:
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_model", 
                                  num_train_epochs=3, 
                                  load_best_model_at_end=True, 
                                  save_strategy='epoch',
                                  evaluation_strategy='epoch',
                                  logging_strategy='epoch',
                                  logging_steps=100,
                                  per_device_train_batch_size=16,
                                  )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
# Load the pretrained model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
     

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [21]:

def compute_metrics_regression(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"rmse": mean_squared_error(labels, predictions, squared=False)}

In [22]:
# Formed  train and evaluation datasets
train_dataset = dataset["train"].shuffle(seed=24) 
eval_dataset = dataset["eval"].shuffle(seed=24)

In [23]:
#converting training data to PyTorch tensors 
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
trainer = Trainer(
    model,
    training_args, 
    train_dataset=train_dataset, 
    eval_dataset=eval_dataset,
    # data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_regression,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rmse
1,0.7025,0.626143,0.73212
2,0.5401,0.578013,0.65
3,0.3603,0.634734,0.638749


TrainOutput(global_step=1500, training_loss=0.5342643229166667, metrics={'train_runtime': 1208.5317, 'train_samples_per_second': 19.856, 'train_steps_per_second': 1.241, 'total_flos': 3178876855292928.0, 'train_loss': 0.5342643229166667, 'epoch': 3.0})

In [25]:

# Launch the final evaluation 
trainer.evaluate()

{'eval_loss': 0.5780133605003357,
 'eval_rmse': 0.65,
 'eval_runtime': 33.8909,
 'eval_samples_per_second': 59.013,
 'eval_steps_per_second': 7.377,
 'epoch': 3.0}

In [26]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
# Push model and tokenizer to HugginFace 
model.push_to_hub("muiga-mwangi/Finetuned-sieBert-base-model")
tokenizer.push_to_hub("muiga-mwangi/Finetuned-sieBert-base-model")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/muiga-mwangi/Finetuned-sieBert-base-model/commit/4cc3710d192af38110075f01cc755cacdf64003e', commit_message='Upload tokenizer', commit_description='', oid='4cc3710d192af38110075f01cc755cacdf64003e', pr_url=None, pr_revision=None, pr_num=None)