# Fine_tune_pretrained_model_using_Trainer_API for supervised Task

## 1. Prepare the Dataset

In [10]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets_tweets = load_dataset("tweet_eval","emoji")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_train_datasets_small = raw_datasets_tweets["train"].select(range(8000))
raw_validaion_datasets_small = raw_datasets_tweets["validation"].select(range(2000))
raw_test_datasets_small = raw_datasets_tweets["test"].select(range(2000))

def tokenize_function(example):
    return tokenizer(example["text"],padding=True, truncation=True)


tokenized_train_datasets_tweets = raw_train_datasets_small.map(tokenize_function, batched=True)
tokenized_validaion_datasets_tweets = raw_validaion_datasets_small.map(tokenize_function, batched=True)
tokenized_test_datasets_tweets = raw_test_datasets_small.map(tokenize_function, batched=True)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## 3.using the Trainer API of the transformer Library

### 3.1 Define Training args

In [17]:
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer",report_to="none")

### 3.2 Define the Model

In [18]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=20)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 3.3 Define a Trainer

In [19]:
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_datasets_tweets,
    eval_dataset=tokenized_validaion_datasets_tweets,
    tokenizer=tokenizer,
)

### 3.4 Launch the training

In [20]:
trainer.train()



Step,Training Loss
500,2.5782
1000,2.1234
1500,1.6893




TrainOutput(global_step=1500, training_loss=2.1302825520833335, metrics={'train_runtime': 304.7252, 'train_samples_per_second': 78.759, 'train_steps_per_second': 4.922, 'total_flos': 773901778653696.0, 'train_loss': 2.1302825520833335, 'epoch': 3.0})

### 3.5 inference from the trainer 

In [58]:
from datasets import Dataset
def predict(element, trainer, tokenizer, datasetlabel):
    # Tokenize the input
    inputs = tokenizer(element, padding=True, truncation=True, return_tensors="pt")
    # Convert to a format compatible with the Trainer API (wrap as a dataset)
    dataset = Dataset.from_dict({key: value.numpy() for key, value in inputs.items()})
    predictions = trainer.predict(dataset)
    predicted_label_index = predictions.predictions.argmax(-1)
    return datasetlabel[predicted_label_index[0]]

value = predict("I will take a photo.",trainer, tokenizer, raw_train_datasets_small.features['label'].names)
print(value)

📸


## 4.Saving and Sharing your model

### 4.1 Save the Model Locally

In [59]:
# Save the model and the tokenizer to a directory
model_save_path = "/kaggle/working/models"
trainer.save_model(model_save_path)

### 4.2 Load a local Model 

In [60]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set the path to your model directory
model_directory = "/kaggle/working/models" 

# Load the trained model
model = AutoModelForSequenceClassification.from_pretrained(model_directory)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_directory)


### 4.3 Apply inference using the transformer pipeline class

In [61]:
from transformers import pipeline

# Set the path to your model directory
model_directory = "/kaggle/working/models"  # Update this to your directory path

# Create a pipeline for text classification
# Make sure to specify the correct task if it's different (e.g., sentiment-analysis, token-classification, etc.)
text_classification_pipeline = pipeline(
    "text-classification",
    model=model_directory,
    tokenizer=model_directory
)

num_labels = model.config.num_labels
print(f"This model can predict {num_labels} different labels.")

# Text to classify
text = "I will take a photo."
# Perform inference
result = text_classification_pipeline(text)
# The result will be a list with a dictionary for each text classified
# If you provided a single text, you can just access the first result
print(result)
# To get the predicted label and score
predicted_label = result[0]['label']
confidence_score = result[0]['score']
print(f"Predicted label: {predicted_label}")
print(f"Confidence score: {confidence_score}")


This model can predict 20 different labels.
[{'label': 'LABEL_18', 'score': 0.30742329359054565}]
Predicted label: LABEL_18
Confidence score: 0.30742329359054565


In [62]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [63]:
trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1708190350.7b303a7d903d.34.0:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/msehsah/test-trainer/commit/4e8e71cc347419bd73a4e341320723f0c86d91e1', commit_message='End of training', commit_description='', oid='4e8e71cc347419bd73a4e341320723f0c86d91e1', pr_url=None, pr_revision=None, pr_num=None)