In [1]:
import os
from dotenv import load_dotenv

# Load variables from the .env file into the process environment so that
# os.getenv()/os.environ lookups below can see them.
load_dotenv()

# Access environment variables.
# os.environ only accepts str values: assigning the None that os.getenv()
# returns for a missing variable raises TypeError. Re-export the token only
# when it is actually defined (in the shell or via the .env file loaded above)
# and otherwise continue with a clear message instead of crashing the cell.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token
else:
    print("HF_TOKEN is not set (checked the environment and .env); continuing without it.")

### Let's try the original model using a pipeline

In [2]:
from transformers import pipeline

In [3]:
# Masked-language-model pipeline backed by the base (not fine-tuned) DistilBERT checkpoint.
pipe = pipeline(
    task="fill-mask",
    model="distilbert/distilbert-base-uncased",
)

Device set to use mps:0


In [4]:
# Ask the fill-mask pipeline for its candidate tokens at the [MASK] position.
pipe("Today I will [MASK] you.")

[{'score': 0.11076898872852325,
  'token': 5914,
  'token_str': 'marry',
  'sequence': 'today i will marry you.'},
 {'score': 0.039211247116327286,
  'token': 3113,
  'token_str': 'meet',
  'sequence': 'today i will meet you.'},
 {'score': 0.031348153948783875,
  'token': 2393,
  'token_str': 'help',
  'sequence': 'today i will help you.'},
 {'score': 0.030156586319208145,
  'token': 2156,
  'token_str': 'see',
  'sequence': 'today i will see you.'},
 {'score': 0.02977384440600872,
  'token': 2425,
  'token_str': 'tell',
  'sequence': 'today i will tell you.'}]

### Load IMDb dataset

In [5]:
from datasets import load_dataset

In [6]:
# Shuffle with a fixed seed: an unseeded shuffle yields a different row order on
# every run, so the 10-example subsets selected below (and every metric and
# output derived from them) would not be reproducible under Restart & Run All.
imdb = load_dataset("imdb").shuffle(seed=42)
imdb

Reusing dataset imdb (/Users/lucky/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [7]:
# Keep only the first 10 rows of each labelled split so the demo stays small.
for split_name in ('train', 'test'):
    imdb[split_name] = imdb[split_name].select(range(10))

In [8]:
# Check the class balance of the 10-row test subset.
imdb['test'].to_pandas()['label'].value_counts()

label
1    7
0    3
Name: count, dtype: int64

In [9]:
# Peek at the last example of the test subset (raw text and label).
imdb['test'][-1]

{'text': '"Bride of Chucky" is one of the better horror movies to come out in the past ten years and could be one of the best horror films of the 90\'s.<br /><br />**SPOILERS**<br /><br />Chucky\'s girlfriend, Tiffany (Jennifer Tilly) manages to find his battered remains after being sucked into the fan at the end of part 3 and brings him to life in her trailer park. Her neighbor, Jessie (Nick Stabile) and his girlfriend Jade (Katherine Heigl) are being tormented by her uncle. (John Ritter) Tiffany upsets Chucky when he refuses to marry her, so she buys a doll for him to play with. Chucky kills Tiffany, and then transfers her soul into the doll she got him. In order for them to be placed back into human bodies, they have to travel to New Jersey to retrieve an amulet to do so. Jessie sees this as an opportunity to escape from Ritter, and they set out on the journey, but not before Ritter is killed by Chucky and Tiffany. Along the way, several bizarre incidents force them to stop at a bed

### Let's preprocess the text

In [10]:
from transformers import AutoTokenizer

In [11]:
# Tokenizer for the same distilbert-base-uncased checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

In [12]:
def preprocess_fxn(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Uses the module-level ``tokenizer`` with truncation enabled; no padding
    is requested here.

    Args:
        examples: Mapping with a ``'text'`` entry (a batch of strings when
            called through ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [13]:
# Tokenize every split in batches.
# NOTE(review): this also processes the 50,000-row 'unsupervised' split, which
# no later cell shown here uses — confirm whether it can be dropped first.
tokenized_imdb = imdb.map(
    function=preprocess_fxn,
    batched=True,
)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [14]:
# Print the last test example again, now from the tokenized dataset.
print(tokenized_imdb['test'][-1])

{'text': '"Bride of Chucky" is one of the better horror movies to come out in the past ten years and could be one of the best horror films of the 90\'s.<br /><br />**SPOILERS**<br /><br />Chucky\'s girlfriend, Tiffany (Jennifer Tilly) manages to find his battered remains after being sucked into the fan at the end of part 3 and brings him to life in her trailer park. Her neighbor, Jessie (Nick Stabile) and his girlfriend Jade (Katherine Heigl) are being tormented by her uncle. (John Ritter) Tiffany upsets Chucky when he refuses to marry her, so she buys a doll for him to play with. Chucky kills Tiffany, and then transfers her soul into the doll she got him. In order for them to be placed back into human bodies, they have to travel to New Jersey to retrieve an amulet to do so. Jessie sees this as an opportunity to escape from Ritter, and they set out on the journey, but not before Ritter is killed by Chucky and Tiffany. Along the way, several bizarre incidents force them to stop at a bed

### Let's pad the input sequences so that all inputs in a batch have equal length (dynamic padding)

In [15]:
# Number of token ids in the last test example (tokenized with truncation=True).
len(tokenized_imdb['test'][-1]['input_ids'])

512

In [16]:
from transformers import DataCollatorWithPadding

In [17]:
# Collator that pads the examples of a batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

To train the model with `Trainer`, we need an evaluation metric.

In [18]:
import evaluate
# Accuracy metric; compute_metrics() below calls accuracy.compute(...).
accuracy = evaluate.load("accuracy")

In [19]:
import numpy as np

def compute_metrics(eval_pred):
    """Turn model outputs into an accuracy score.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is
            expected to be 2-D, one row per example (presumably per-class
            scores — confirm against the caller).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        versus the reference labels.
    """
    scores, references = eval_pred
    # Column index of the largest score in each row is the predicted class id.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [20]:
# Class-id -> name mapping, plus its inverse derived from it so the two
# dictionaries cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: class_id for class_id, name in id2label.items()}

In [21]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [22]:
# Base DistilBERT with a new two-class classification head; the label mappings
# are stored on the model config so predictions are reported by name.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert/distilbert-base-uncased',
    num_labels=len(id2label),  # 2: NEGATIVE / POSITIVE
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
training_args = TrainingArguments(
    # Directory that receives the checkpoints.
    output_dir="my_awesome_model",
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; both strategies are set to the
    # same value, as load_best_model_at_end needs a saved checkpoint to reload.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [24]:
# Wire the model, data, batching and metric together for fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: the 10-row train subset for training, the 10-row test subset for evaluation.
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    # Batching: the collator pads the examples of each batch.
    data_collator=data_collator,
    processing_class=tokenizer,
    # Metric reported at every evaluation.
    compute_metrics=compute_metrics,
)

In [25]:
# Run fine-tuning with the arguments configured above (one epoch, evaluated per epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.691493,0.6


TrainOutput(global_step=1, training_loss=0.6994994878768921, metrics={'train_runtime': 66.4071, 'train_samples_per_second': 0.151, 'train_steps_per_second': 0.015, 'total_flos': 1324673986560.0, 'train_loss': 0.6994994878768921, 'epoch': 1.0})

In [40]:
# Sample review used to compare the freshly trained model with a Hub checkpoint.
text = (
    "This was a masterpiece. "
    "Not completely faithful to the books, but enthralling from beginning to end. "
    "Might be my favorite of the three."
)

In [41]:
# Wrap the in-memory model trained above (and its tokenizer) in a sentiment pipeline.
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)
classifier(text)

Device set to use mps:0


[{'label': 'POSITIVE', 'score': 0.5024219155311584}]

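Because we barely trained the model (a single optimisation step on 10 examples), its predictions are close to chance: the score stays around 0.50 regardless of the input text.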

## Let's try an already fine-tuned DistilBERT from the Hub

In [42]:
# Text-classification pipeline backed by a DistilBERT checkpoint already fine-tuned on SST-2.
# NOTE(review): this rebinds `pipe`, which earlier held the fill-mask pipeline.
pipe = pipeline(
    task="text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use mps:0


In [43]:
# Classify the same sample review with the SST-2 fine-tuned checkpoint.
pipe(text)

[{'label': 'POSITIVE', 'score': 0.9994964599609375}]

In [44]:
# Sanity check with a clearly positive sentence.
pipe('I love it')

[{'label': 'POSITIVE', 'score': 0.9998799562454224}]

In [45]:
# Sanity check with a clearly negative sentence.
pipe('I hate it')

[{'label': 'NEGATIVE', 'score': 0.9996398687362671}]