# Custom Training

In [None]:
# Basic imports

import torch
from datasets import load_dataset
from transformers import(
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
)
from sklearn.metrics import accuracy_score,precision_recall_fscore_support

# Base checkpoint to fine-tune.
# The classification model is instantiated once, later in this cell, with the
# `num_labels` read from the dataset. Loading it here as well only built a
# throwaway copy (and printed the "newly initialized" warning twice).

ModelName = 'distilbert-base-uncased'

# Load the IMDb dataset; the 'train' and 'test' splits and the 'text' / 'label'
# columns are used further down in this cell.
# A bare `DataSet` expression in the middle of a cell renders nothing (only a
# cell's last expression is displayed), so it is omitted here.

DataSet = load_dataset('imdb')

# Tokenizer belonging to the base checkpoint

tokeny = AutoTokenizer.from_pretrained(ModelName)


def TokenizerData(data):
    """Tokenize the ``text`` field of ``data`` with truncation enabled.

    Used as the mapping function for ``Dataset.map(..., batched=True)``.
    No padding argument is passed here; padding is left to the data
    collator created later in this cell.
    """
    texts = data['text']
    return tokeny(texts, truncation=True)

# Sub-sample sizes and shuffle seed for this quick fine-tuning run
SHUFFLE_SEED = 69
N_TRAIN_SAMPLES = 2000
N_TEST_SAMPLES = 1000


def _subsample(split, n_samples):
    """Return the first ``n_samples`` rows of ``split`` after a seeded shuffle."""
    return split.shuffle(seed=SHUFFLE_SEED).select(range(n_samples))


def _tokenize_for_trainer(split):
    """Tokenize ``split`` and shape its columns for the Trainer.

    Runs ``TokenizerData`` in batched mode, drops the raw ``text`` column,
    renames ``label`` to ``labels`` and sets the output format to torch.
    """
    tokenized = split.map(TokenizerData, batched=True)
    tokenized = tokenized.remove_columns(['text']).rename_column('label', 'labels')
    tokenized.set_format('torch')
    return tokenized


# Shuffled subsets of the raw splits
train_ds = _subsample(DataSet['train'], N_TRAIN_SAMPLES)
test_ds = _subsample(DataSet['test'], N_TEST_SAMPLES)

# Tokenized, cleaned and torch-formatted versions used for training/evaluation
train_token = _tokenize_for_trainer(train_ds)
test_token = _tokenize_for_trainer(test_ds)

# Padding collator built around the tokenizer

data_collator = DataCollatorWithPadding(tokenizer=tokeny)


# Read the number of classes from the dataset's label feature and build the
# sequence-classification model with that many labels

label_feature = DataSet["train"].features["label"]
num_labels = label_feature.num_classes
print(f"Number of labels: {num_labels}")
MainModel = AutoModelForSequenceClassification.from_pretrained(
    ModelName,
    num_labels=num_labels,
)

# Evaluation metrics: accuracy, precision, recall and F1 (support is skipped)

def eva_met(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision/recall/F1 use ``average="binary"``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    # zero_division=0 yields the same score as the default when the positive
    # class is never predicted, but without an UndefinedMetricWarning per epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Training arguments
# Evaluation, checkpointing and logging all happen once per epoch. Without an
# explicit logging strategy the step-based logging interval was never reached
# in this short run, which is why the results table showed "No log" for the
# training loss.

training_args = TrainingArguments(
    output_dir="./results-distilbert-imdb",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    # Keep the checkpoint with the best F1 (reported by eva_met) at the end
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    # Mixed precision only when a CUDA device is present
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    report_to="none",
)

# Wire the model, arguments, datasets, collator and metric function into the
# Trainer. The tokenizer is supplied through `processing_class`.

trainer = Trainer(
    model=MainModel,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_token,
    eval_dataset=test_token,
    processing_class=tokeny,
    compute_metrics=eva_met,
)


# Attach human-readable label names BEFORE training, so the config saved with
# each per-epoch checkpoint carries them as well, not only the final export.
MainModel.config.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
MainModel.config.label2id = {"NEGATIVE": 0, "POSITIVE": 1}

# Fine-tune the model

trainer.train()

# Save model and tokenizer side by side in one local directory
FINETUNED_DIR = "./imdb-distilbert-finetuned-custom"
trainer.save_model(FINETUNED_DIR)
tokeny.save_pretrained(FINETUNED_DIR)

print(f"Training done. Model saved to {FINETUNED_DIR}")

# Pushing to the Hugging Face Hub is done in a later section of this notebook.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of labels: 2


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.292344,0.886,0.89662,0.879142,0.887795
2,No log,0.345771,0.881,0.848057,0.935673,0.889713
3,No log,0.333557,0.897,0.886792,0.916179,0.901246


Training done. Model saved to ./imdb-distilbert-finetuned-custom


# Inference Model

In [None]:
# Model and tokenizer were saved into the same local directory

mnameinf = mtokeninf = './imdb-distilbert-finetuned-custom'

# Reload both from disk, simulating a fresh session after training

ModelNameInf = AutoModelForSequenceClassification.from_pretrained(mnameinf)
ModelTokenInf = AutoTokenizer.from_pretrained(mtokeninf)

# Put the model into evaluation mode

ModelNameInf.eval()

# Example sentences to classify

InputInf = ['this is a bad movie','I love the movie','its a okayish movie']

# Tokenize all sentences as one padded, truncated batch of PyTorch tensors

TokenizedInfData = ModelTokenInf(InputInf, padding=True, truncation=True, return_tensors="pt")

# Forward pass with gradient tracking disabled to save memory

with torch.no_grad():
    TokenWithLogits = ModelNameInf(**TokenizedInfData)

# Raw class scores from the model output

Logits = TokenWithLogits.logits

# Softmax over the last dimension turns the scores into values in the 0-1 range

InfModelProb = Logits.softmax(dim=-1)

# Index of the highest-probability class for each sentence

MaxIndInf = InfModelProb.argmax(dim=-1)

# Translate class indices into label names using the mapping stored in the model config

id2label = ModelNameInf.config.id2label
pred_labels = [id2label[class_id] for class_id in MaxIndInf.tolist()]

# Print one report per sentence

for sentence, probs, label in zip(InputInf, InfModelProb, pred_labels):
    print(
        f"Text: {sentence}",
        f"Logits/Probs: {probs}",
        f"Predicted Label: {label}",
        "-" * 50,
        sep="\n",
    )

Text: this is a bad movie
Logits/Probs: tensor([0.9719, 0.0281])
Predicted Label: NEGATIVE
--------------------------------------------------
Text: I love the movie
Logits/Probs: tensor([0.0306, 0.9694])
Predicted Label: POSITIVE
--------------------------------------------------
Text: its a okayish movie
Logits/Probs: tensor([0.1683, 0.8317])
Predicted Label: POSITIVE
--------------------------------------------------


# Pipeline

In [None]:
from transformers import pipeline

# Text-classification pipeline built from the locally saved model and tokenizer
classifier = pipeline(
    "text-classification",
    model="./imdb-distilbert-finetuned-custom",
    tokenizer="./imdb-distilbert-finetuned-custom",
)
# Named `sample_texts` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
sample_texts = ['damn what a movie']
output = classifier(sample_texts)

output



Device set to use cuda:0


[{'label': 'NEGATIVE', 'score': 0.8044580817222595}]

# Hugging Face Upload & Usage using Pipeline


In [None]:
# Empty placeholder string; no visible cell in this notebook reads it.
# NOTE(review): if this was meant to hold a Hugging Face token, do not paste
# one here -- authentication is handled by notebook_login() in the next cell.
hf_code = ""

In [None]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget; run this before the push_to_hub calls
# later in the notebook so they can authenticate.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned model and tokenizer back from the local export directory
local_dir = "./imdb-distilbert-finetuned-custom"
model = AutoModelForSequenceClassification.from_pretrained(local_dir)
tokenizer = AutoTokenizer.from_pretrained(local_dir)

# Target Hub repository, written as "<username>/<repo_name>"
repo_name = "Noobhacker69/imdb-distilbert-finetuned"  # you choose the name

# Upload the model, then the tokenizer; the tokenizer call stays last so its
# return value is what the cell displays.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Noobhacker69/imdb-distilbert-finetuned/commit/b2459a1d2cdcb274d8605e152f71888ff7745395', commit_message='Upload tokenizer', commit_description='', oid='b2459a1d2cdcb274d8605e152f71888ff7745395', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Noobhacker69/imdb-distilbert-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='Noobhacker69/imdb-distilbert-finetuned'), pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline

# Build the pipeline directly from the Hub repository
classifier = pipeline("text-classification", model="Noobhacker69/imdb-distilbert-finetuned")

# Smoke test on two example reviews, printing each prediction
for review in (
    "This movie was absolutely amazing!",
    "Terrible, I hated every moment of it.",
):
    print(classifier(review))


config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'POSITIVE', 'score': 0.973459005355835}]
[{'label': 'NEGATIVE', 'score': 0.9235556721687317}]
