<a href="https://colab.research.google.com/github/mridul-eecs/Transformers/blob/main/transformerSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets transformers wandb



In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import wandb
model_name= "distilbert-base-uncased"
tokenizer= AutoTokenizer.from_pretrained(model_name)
model= AutoModelForSequenceClassification.from_pretrained(model_name, num_labels= 2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import load_dataset
data= load_dataset("imdb", revision= "main")

In [4]:
import copy
tokenized_data= copy.deepcopy(data)
def tokenizer_function(examples):
    return tokenizer(examples['text'], truncation= True, max_length=512)
def format_labels(batch):
    batch["label"] = torch.tensor(batch["label"])  # Convert int → Tensor
    return batch

tokenized_data= tokenized_data.map(tokenizer_function, batched= True, batch_size= 1024)
# tokenized_data= tokenized_data.map(format_labels)

tokenized_data.set_format(type= 'torch', columns= ['input_ids', 'attention_mask', 'label'])

data_collator= DataCollatorWithPadding(
    tokenizer= tokenizer,
    padding= "longest",
    return_tensors= 'pt'
)

tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [5]:
trainer= Trainer(


    model= model,
    train_dataset= tokenized_data['train'],
    eval_dataset= tokenized_data['test'],
    data_collator= data_collator,
    args= TrainingArguments(
        output_dir= '.',
        num_train_epochs= 100,
        logging_steps=10,
        learning_rate= 0.0001,
        per_device_train_batch_size=60,   # Batch size
        per_device_eval_batch_size=60,
        logging_strategy= "steps",
        report_to= "wandb",
        no_cuda=False                    # Force usage of GPU if available

        )

    )

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmridul-eecs[0m ([33mmridul-eecs-individual[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
10,0.6202
20,0.4367
30,0.3807
40,0.3471
50,0.3526
60,0.3029
70,0.3763
80,0.291
90,0.3101
100,0.2432


KeyboardInterrupt: 

In [9]:
tokenizer.save_pretrained("/content/checkpoint-1000/")


('/content/checkpoint-1000/tokenizer_config.json',
 '/content/checkpoint-1000/special_tokens_map.json',
 '/content/checkpoint-1000/vocab.txt',
 '/content/checkpoint-1000/added_tokens.json',
 '/content/checkpoint-1000/tokenizer.json')

In [12]:
import numpy as np
model_path = "/content/checkpoint-1000"

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer= AutoTokenizer.from_pretrained(model_path)



text= "top gun is a bad movie"
inputs= tokenizer(text=text, padding= True, truncation= True, max_length= 512, return_tensors= 'pt')

print(np.shape(inputs))
print(inputs)
import torch

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Get prediction scores
logits = outputs.logits

# Convert logits to probabilities (for classification tasks)
probs = torch.nn.functional.softmax(logits, dim=-1)

# Get predicted class
predicted_class = torch.argmax(probs, dim=-1).item()

print(f"Predicted Class: {predicted_class}")
print(f"Confidence Scores: {probs}")


(2,)
{'input_ids': tensor([[ 101, 2327, 3282, 2003, 1037, 2919, 3185,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
Predicted Class: 0
Confidence Scores: tensor([[0.9929, 0.0071]])


In [8]:
!ls /content/checkpoint-1000

config.json	   optimizer.pt   scheduler.pt	      training_args.bin
model.safetensors  rng_state.pth  trainer_state.json


In [20]:
from huggingface_hub import login
from google.colab import userdata
HUGGINGFACE_ACCESS_TOKEN= userdata.get('HUGGINGFACE_ACCESS_TOKEN')
login(token=HUGGINGFACE_ACCESS_TOKEN)

In [21]:
model.push_to_hub(
    "mriduleecs/mriduleecs-imdb-distilbert-uncased"
)
tokenizer.push_to_hub("mriduleecs/mriduleecs-imdb-distilbert-uncased")


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mriduleecs/mriduleecs-imdb-distilbert-uncased/commit/5ae72cd16f8c9571b290ed42b766d95e1ade8ef7', commit_message='Upload tokenizer', commit_description='', oid='5ae72cd16f8c9571b290ed42b766d95e1ade8ef7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mriduleecs/mriduleecs-imdb-distilbert-uncased', endpoint='https://huggingface.co', repo_type='model', repo_id='mriduleecs/mriduleecs-imdb-distilbert-uncased'), pr_revision=None, pr_num=None)