<a href="https://colab.research.google.com/github/nebyu08/data_sc/blob/main/experiment_with_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'hate-speech-detection-curated-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4205998%2F7257995%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240415%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240415T061035Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D23a25ea99ee47508531cafc6d5e42b7160ca70110a7b2b6ece355162f705cebf21ab56ffcba0f187ee3e9603d4a4704c826c055ab73fdcfd2c5c56720ed06001401de58a4076f6f70b7801327f502812291d22ddaf0eaaeb5d6db7580282d9294af1a252f1d549fb09a07ec1703f0595888f9d801aa339d9cad1e895623b6620222d6674fccdb513e09366563561b6822afb1e2b884f8e4b2d4ff406de3f8cbcf825493881578acaf4d095a3c028d3bab9d40214558a2ec72707c543cef81827e5ad986fef9482a8e69e6a208d447d788f6e5e7394e42adcb92186df741d38236d47708f3dd40402bdcb6f3157902c363c713dc7b72baa72c61078c5393525e5'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading hate-speech-detection-curated-dataset, 119628445 bytes compressed
Downloaded and uncompressed: hate-speech-detection-curated-dataset
Data source import complete.


In [2]:
import torch
import re
import string
import pandas as pd
import numpy as np
from __future__ import division
from tqdm import tqdm
from torch.utils.data import SequentialSampler,SequentialSampler
from torch.optim import lr_scheduler
from torch.utils.data import Dataset,DataLoader,Subset
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer,RobertaForSequenceClassification,RobertaConfig
from transformers import TrainingArguments,Trainer
from sklearn.model_selection import train_test_split

In [3]:
data_frame=pd.read_csv("/kaggle/input/hate-speech-detection-curated-dataset/HateSpeechDatasetBalanced.csv")

In [4]:
label2id={'hate':1,'no_hate':0}
id2label={ids:label for label,ids in label2id.items()}

In [5]:
df=data_frame.copy()

In [6]:
def remove_urls(text):
  return re.sub(r'http\S+',"",text)

df.loc[:,"Content"]=df["Content"].apply(remove_urls)

In [7]:
#remove emojies
def remove_emojies(text):
  patterns=re.compile(
    "["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "]+"
)

  return re.sub(patterns,"",text)
df["Content"]=df["Content"].apply(remove_emojies)

In [8]:
#lets remove non enlgish
def remove_non_english(text):
  pattern=re.compile(r'\b(?![a-zA-Z]+\b)\w+\b')
  result=re.sub(pattern,"",text)
  if len(result)==0:
    return None
  else:
    return result

df["Content"]=df["Content"].apply(remove_non_english)

In [9]:
def remove_puctuations(text):
    if text is None:
        return None
    else:
        return text.translate(str.maketrans("","",string.punctuation))

df["Content"]=df["Content"].apply(remove_puctuations)

In [10]:
df.isna().sum()/len(df)

Content    0.000017
Label      0.000000
dtype: float64

In [11]:
#remove the na rows
df.dropna(inplace=True,axis=0)

In [12]:
model_name="cardiffnlp/twitter-roberta-base-sentiment-latest"

In [13]:
tokenizer=AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [14]:
class Custom_Dataset(Dataset):
  def __init__(self,data,tokenizer,max_length=512):
    super().__init__()
    self.text=data["Content"]
    self.label=data["Label"]
    self.tokenizer=tokenizer
    self.max_length=max_length

  def __len__(self):
    return len(self.text)

  def __getitem__(self,idx):
    text=self.text.iloc[idx]
    label=self.label.iloc[idx]
    #assert that the data input is of string
    assert type(text)==str,f"the expected data type is str but found {type(text)}"

    encoded_text=self.tokenizer.encode_plus(
        text,
        padding="max_length",
        truncation=True,
        max_length=self.max_length,
        return_tensors="pt",
    )
    label=torch.tensor([label],dtype=torch.long)


    return {
        "input_ids":encoded_text["input_ids"].detach(),
        "attention_mask":encoded_text["attention_mask"].detach(),
        "labels":label
    }

In [15]:
train_df,test_df=train_test_split(df,random_state=42)

In [16]:
train_dataset=Custom_Dataset(train_df,tokenizer)
test_dataset=Custom_Dataset(test_df,tokenizer)

In [17]:
input_ids=train_dataset[0]["input_ids"].shape
attention_mask=train_dataset[0]["attention_mask"].shape
labels=train_dataset[0]["labels"]
print(type(input_ids))
print(type(attention_mask))
print(type(labels))
print(input_ids)
print(attention_mask)

<class 'torch.Size'>
<class 'torch.Size'>
<class 'torch.Tensor'>
torch.Size([1, 512])
torch.Size([1, 512])


In [18]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
def custom_collate_fn(batch):
    input_ids=[item["input_ids"] for item in batch]
    attention_mask=[item["attention_mask"] for item in batch]
    labels=[item["labels"] for item in batch]

      #lets pad the sequence
    padded_input_ids=pad_sequence(input_ids,batch_first=True,padding_value=0)
    padded_attention_mask=pad_sequence(attention_mask,batch_first=True,padding_value=0)
    labels=torch.stack(labels,dim=0)
    return {
          "input_ids":padded_input_ids.to("cuda"),
          "attention_mask":padded_attention_mask.to("cuda"),
          "labels":labels.to("cuda")
  }

In [20]:
batch_size=32

#make the data loader
train_dataloader=DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(train_dataset),
    collate_fn=custom_collate_fn,
)
#doing the same thing for test data loader
test_dataloader=DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset),
    collate_fn=custom_collate_fn,
)

In [21]:
#lets check for the shapes of the values
for batch in train_dataloader:
    input_ids,attention_mask,labels=batch["input_ids"],batch["attention_mask"],batch["labels"]
    print(f"shape of the input ids is :{input_ids.shape}")
    print(f"shape of the attention mask:{attention_mask.shape}")
    print(f"shape of the lables is:{labels.shape}")
    #device
    print(f"shape of the input ids is :{input_ids.device}")
    print(f"shape of the attention mask:{attention_mask.device}")
    print(f"shape of the lables is:{labels.device}")
    break

shape of the input ids is :torch.Size([32, 1, 512])
shape of the attention mask:torch.Size([32, 1, 512])
shape of the lables is:torch.Size([32, 1])
shape of the input ids is :cuda:0
shape of the attention mask:cuda:0
shape of the lables is:cuda:0


In [22]:
#lets set up the config
config=RobertaConfig.from_pretrained(model_name)
config.label2id=label2id
config.id2label=id2label
config.num_labels=len(id2label)

In [23]:
model=RobertaForSequenceClassification.from_pretrained(model_name,config=config,ignore_mismatched_sizes=True)

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

In [24]:
#lets change some architurectural stuff
for layer in model.roberta.embeddings.parameters():
  layer.requires_grad=False

for layer in model.roberta.encoder.layer[:8].parameters():
  layer.requires_grad=False

model.classifier.out_proj=torch.nn.Linear(in_features=model.config.hidden_size,out_features=len(id2label))

In [25]:
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [26]:
optimizer=torch.optim.AdamW(params=model.parameters(),lr=0.01)
loss_fn=torch.nn.CrossEntropyLoss()

todo:
* try to over fit the model and study the model
* add learning rate shceduler
* early stopping and make number of epoch

# lets try to overfit the model over one batch of data

In [27]:
for batch in train_dataloader:
    sample=batch
    break

In [28]:
for batch in test_dataloader:
    test_sample=batch
    break

In [29]:
sample["input_ids"][0].is_cuda,sample["attention_mask"][0].is_cuda,sample["labels"].is_cuda

(True, True, True)

In [30]:
len(sample['labels'])

32

In [31]:
#!nvidia-smi --gpu-reset

In [32]:
# # Release all tensors from GPU memory
# torch.cuda.empty_cache()

# # Reset GPU context (this effectively clears the entire GPU memory)
# torch.cuda.reset_max_memory_allocated()
# torch.cuda.empty_cache()


In [33]:
# epoch_nums=300
# for epoch in tqdm(range(epoch_nums)):
#     optimizer.zero_grad()
#     input_ids,attention_mask,label=sample["input_ids"].squeeze(dim=1),sample["attention_mask"].squeeze(dim=1),batch["labels"].squeeze(dim=1)
#     logits=model(input_ids,attention_mask).logits
#     loss=loss_fn(logits,label)

#     #backprop
#     loss.backward()
#     #update the parameter
#     optimizer.step()
#     #extract the loss from the loss
#     loss_item=loss.item()
#     if epoch%10==0:
#         print(f"{epoch}/{epoch_nums} loss:{loss_item:.2f}")

#     #lets evalaute on testing dataset
#     model.eval()
#     with torch.no_grad():
#         test_input_ids,test_attention_mask,test_labels=test_sample["input_ids"].squeeze(dim=1),test_sample["attention_mask"].squeeze(dim=1),test_sample["labels"].squeeze(dim=1)
#         test_logits=model(test_input_ids,test_attention_mask).logits
#         loss=loss_fn(test_logits,test_labels).item()
#         if epoch%10==0:

#             print(f"eval_loss:{epoch}/{epoch_nums} loss:{loss:.2f}")

# new approach:
* adding a new learning rate scheduler
* early stoping(manually)

In [34]:
model2=model=RobertaForSequenceClassification.from_pretrained(model_name,config=config,ignore_mismatched_sizes=True)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

In [35]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [36]:
model2=model2.to(device)

In [37]:
#lets change some architurectural stuff
for layer in model2.roberta.embeddings.parameters():
  layer.requires_grad=False

for layer in model2.roberta.encoder.layer[:8].parameters():
  layer.requires_grad=False

model2.classifier.out_proj=torch.nn.Linear(in_features=model.config.hidden_size,out_features=len(id2label))

In [38]:
#traingin setup..
learning_rate=1e-3
num_epochs=1000
optimizer=torch.optim.AdamW(model2.parameters(),lr=learning_rate)
scheduler=lr_scheduler.ReduceLROnPlateau(optimizer,mode='min',factor=0.1,patience=2)
loss_fn=torch.nn.CrossEntropyLoss()
best_loss=float("inf")
patience=3

## lets make a new data loader

In [39]:
len(train_dataset)//3

181526

In [40]:
slice_1=list(range(len(train_dataset)//2))
slice_2=list(range(len(train_dataset)//2,len(train_dataset)))

In [41]:
train_set1=Subset(train_dataset,slice_1)
train_set2=Subset(train_dataset,slice_2)

In [42]:
train_dataloader_1=DataLoader(
    dataset=train_set1,
    batch_size=batch_size,
    collate_fn=custom_collate_fn,
)

train_dataloader_2=DataLoader(
    dataset=train_set2,
    batch_size=batch_size,
    collate_fn=custom_collate_fn
)

In [43]:
#move model tinto gpu
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2=model2.to(device)

In [44]:
# # Release all tensors from GPU memory
# torch.cuda.empty_cache()

# # Reset GPU context (this effectively clears the entire GPU memory)
# torch.cuda.reset_max_memory_allocated()
# torch.cuda.empty_cache()

In [45]:
#accumulation the gradient
accumulate_gradient=4

In [46]:
len(train_dataloader),len(test_dataloader)

(17019, 5673)

In [47]:
# for epoch in tqdm(range(num_epochs)):
#     model2.train()
#     #lets train it on the first half of the dataset
#     train_loss=0.0
#     for idx,batch in enumerate(train_dataloader):
#         input_ids,attention_mask,label=batch["input_ids"].squeeze(dim=1),batch["attention_mask"].squeeze(dim=1),batch["labels"].squeeze(dim=1)

#         #batchifying our input over 4 mini batches
#         with torch.set_grad_enabled(True):
#             logits=model2(input_ids,attention_mask).logits
#             loss=loss_fn(logits,label)

#             #normalize our loss
#             loss/=accumulate_gradient

#             loss.backward()
#             train_loss+=loss.item()

#             #checking for condition
#             if (((idx+1)%accumulate_gradient==0) or (idx+1==len(train_dataloader))):
#                 optimizer.step()  #change the gradient
#                 optimizer.zero_grad()

#             #lets display the logits
#             if (idx%1000==0):
#                 print(f"{epoch}/{num_epochs}: loss:{loss.item()}")

#     #lets accumulate the training loss
#     train_loss/=len(train_dataloader.dataset)

#     eval_loss=0.0
#     #lets evaluate it
#     model.eval()
#     with torch.no_grad():
#         for idx,batch in enumerate(test_dataloader):
#             input_ids_test,attention_mask_test,label_test=batch["input_ids"].squeeze(dim=1),batch["attention_mask"].squeeze(),batch["labels"].squeeze(dim=1)
#             logits=model2(input_ids_test,attention_mask_test).logits
#             loss=loss_fn(logits,label_test)
#             loss/=accumulate_gradient

#             eval_loss+=loss.item()*input_ids_test.size(0)

#             if ((idx+1)%accumulate_gradient==0 or (idx+1==len(train_dataloader))):
#                 optimizer.step()  #change the gradient
#                 optimizer.zero_grad()



#     eval_loss/=len(test_dataloader.dataset)
#     scheduler.step(eval_loss)  #looks close at the evaluation metrics

#      #the early stoping part of the code
#     if eval_loss<best_loss:
#         best_loss=eval_loss
#         counter=0
#     else:
#         counter+=1
#         if counter>patience:
#             break

## ToDo:
* train on one batch
* using mixed precision training
* gradient accumulation
* adjusting the learning rate

# experiment with one batch training compiled model

In [48]:
for batch in train_dataloader:
  one_batch=batch
  break

In [49]:
type(one_batch)

dict

In [50]:
print(f"input ids:{one_batch['input_ids'].size()}")
print(f"attention mask: {one_batch['attention_mask'].size()}")
print(f"labels:{one_batch['labels'].size()}")

input ids:torch.Size([32, 1, 512])
attention mask: torch.Size([32, 1, 512])
labels:torch.Size([32, 1])


In [51]:
compiled_model=torch.compile(model2)
compiled_model.to("cuda")

  self.pid = os.fork()


OptimizedModule(
  (_orig_mod): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (de

In [52]:
# Release all tensors from GPU memory
torch.cuda.empty_cache()

# Reset GPU context (this effectively clears the entire GPU memory)
torch.cuda.reset_max_memory_allocated()
torch.cuda.empty_cache()



In [54]:
# num_epoch=300
# compiled_model.train()
# for epoch in range(num_epoch):
#   optimizer.zero_grad()
#   input_ids,attention_mask,labels=one_batch["input_ids"].squeeze(dim=1),one_batch["attention_mask"].squeeze(dim=1),one_batch["labels"].squeeze(dim=1)
#   logits=compiled_model(input_ids,attention_mask).logits
#   loss=loss_fn(logits,labels)
#   #back-prop
#   loss.backward()
#   if epoch%10==0:
#     loss_item=loss.item()
#     print(f"{epoch}/{num_epoch} the loss: {loss_item}")

# lets try using torch's mixed precision:
* auto-cast  
* grad-scaler
* and accumulated gradient

In [55]:
model_name

'cardiffnlp/twitter-roberta-base-sentiment-latest'

In [57]:
model3=RobertaForSequenceClassification.from_pretrained(model_name)

#lets change some architurectural stuff
for layer in model3.roberta.embeddings.parameters():
  layer.requires_grad=False

for layer in model3.roberta.encoder.layer[:8].parameters():
  layer.requires_grad=False

model3.classifier.out_proj=torch.nn.Linear(in_features=model.config.hidden_size,out_features=len(id2label))

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [76]:
optimizer=torch.optim.AdamW(params=model3.parameters(),lr=0.01)
loss_fn=torch.nn.CrossEntropyLoss()

In [77]:
compiled_model3=torch.compile(model3,backend="eager")
compiled_model3=compiled_model3.to(device)

In [78]:
device="cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [79]:
use_amp=True
scaler=torch.cuda.amp.GradScaler()  #this clips the gradient during the backprop
accum_iter=4
num_epochs=100

for epoch in range(num_epochs):
  input_ids,attention_mask,labels=one_batch['input_ids'].squeeze(dim=1),one_batch['attention_mask'].squeeze(dim=1),one_batch['labels'].squeeze(dim=1)
  with torch.autocast(device_type=device,dtype=torch.float16,enabled=use_amp):
    logits=compiled_model3(input_ids,attention_mask).logits
    loss=loss_fn(logits,labels)
    if epoch%10==0:
      print(f"{epoch}/{num_epochs}:loss {loss.item()}")


  #calcualte the gradient and scale it
  scaler.scale(loss).backward()

  if epoch%accum_iter==0:
    scaler.step(optimizer)
    #update the scaler
    scaler.update()
    optimizer.zero_grad()

0/100:loss 2.809880495071411
10/100:loss 3.9845128059387207
20/100:loss 1.0869827270507812
30/100:loss 0.8290252685546875
40/100:loss 0.6970367431640625
50/100:loss 0.7324676513671875
60/100:loss 0.72857666015625
70/100:loss 0.707061767578125
80/100:loss 0.699188232421875
90/100:loss 0.691925048828125


## same thing as above but with more epochs

In [82]:
# use_amp=True
# scaler=torch.cuda.amp.GradScaler()  #this clips the gradient during the backprop
# accum_iter=4
# num_epochs=10000

# for epoch in range(num_epochs):
#   input_ids,attention_mask,labels=one_batch['input_ids'].squeeze(dim=1),one_batch['attention_mask'].squeeze(dim=1),one_batch['labels'].squeeze(dim=1)
#   with torch.autocast(device_type=device,dtype=torch.float16,enabled=use_amp):
#     logits=compiled_model3(input_ids,attention_mask).logits
#     loss=loss_fn(logits,labels)
#     if epoch%1000==0:
#       print(f"{epoch}/{num_epochs}:loss {loss.item()}")


#   #calcualte the gradient and scale it
#   scaler.scale(loss).backward()

#   if epoch%accum_iter==0:
#     scaler.step(optimizer)
#     #update the scaler
#     scaler.update()
#     optimizer.zero_grad()


# #the output of this model were mostly 0.7.... nothing less than it and it stopped after like 7000 epochs

# same as above:
* added learing rate scheduler.
* changing the optimizer from adama into adam
* this all done by adding a new validation data into the mix

In [83]:
model4=RobertaForSequenceClassification.from_pretrained(model_name)

#lets change some architurectural stuff
for layer in model4.roberta.embeddings.parameters():
  layer.requires_grad=False

for layer in model4.roberta.encoder.layer[:8].parameters():
  layer.requires_grad=False

model4.classifier.out_proj=torch.nn.Linear(in_features=model.config.hidden_size,out_features=len(id2label))

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [84]:
model4=model4.to(device)

In [87]:
opt=torch.optim.Adam(params=model4.parameters(),lr=1e4)
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=opt,
                                                    mode='min',
                                                    factor=0.1,
                                                    patience=2)

In [88]:
for batch in test_dataloader:
  one_val=batch
  break

In [106]:
compiled_model4=torch.compile(model4,backend="eager")
compiled_model4.to(device)

OptimizedModule(
  (_orig_mod): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (de

In [101]:
len(train_dataloader)

17019

In [None]:
use_amp=True
scaler=torch.cuda.amp.GradScaler()  #this clips the gradient during the backprop
accum_iter=4
num_epochs=10000

for epoch in tqdm(range(num_epochs)):
  for idx,batch in enumerate(train_dataloader):
    input_ids,attention_mask,labels=batch["input_ids"].squeeze(dim=1),batch["attention_mask"].squeeze(dim=1),batch["labels"].squeeze(dim=1)
    with torch.autocast(device_type=device,dtype=torch.float16,enabled=use_amp):
      logits=compiled_model4(input_ids,attention_mask).logits
      loss=loss_fn(logits,labels)
      if idx%10==0:
        print(f"{epoch}/{num_epochs}:loss {loss.item()}")


    #calcualte the gradient and scale it
    scaler.scale(loss).backward()



    if idx%accum_iter==0:
      scaler.step(opt)
      #update the scaler
      scaler.update()
      optimizer.zero_grad()

    #lets evaluate the model
  model.eval()
  with torch.no_grad():
    for idx,batch in enumerate(test_dataloader):
      test_inputs,test_attention,test_labels=batch["input_ids"].squeeze(dim=1),batch["attention_mask"].squeeze(dim=1),batch["labels"].squeeze(dim=1)
      with torch.autocast(device_type=device,dtype=torch.float16,enabled=True):
        logits=compiled_model4(test_inputs,test_attention).logits
        val_loss=loss_fn(logits,test_labels)
      #check for the learning rate scheduler
      scheduler.step(val_loss)
      if idx%1000==0:
        print(f"{epoch}/{num_epochs}:loss {val_loss.item()}")

# new todo:
* use new model:
-> new model:xlm roberta base

In [None]:
new_model_name="FacebookAI/xlm-roberta-base"