In [None]:
import torch

In [1]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sentence = "Hello, how are you doing today?"
encoded_input = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
print(f"Original sentence: '{sentence}'")
print(f"Input IDs: {encoded_input['input_ids']}")
print(f"Attention Mask: {encoded_input['attention_mask']}")

decoded_tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])
print(f"Decoded Tokens: {decoded_tokens}")


Original sentence: 'Hello, how are you doing today?'
Input IDs: tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 2725, 2651, 1029,  102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Decoded Tokens: ['[CLS]', 'hello', ',', 'how', 'are', 'you', 'doing', 'today', '?', '[SEP]']


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#Define a model name and load both tokenizer and model
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Text for analysis
text_negative = 'This movie is very bad, I hate it.'

#Tokenize the input text
# The tokenizer must be the one corresponding to the model
negative_token = tokenizer(text_negative, return_tensors='pt')

#Perform inference
# Call the loaded 'model' object, not the class
with torch.no_grad():
    output = model(**negative_token)

#Get predictions and confidence score
predictions = torch.argmax(output.logits, dim=-1)

#Access the config from the loaded 'model' object
prediction_label = model.config.id2label[predictions.item()]

confidence_score = torch.softmax(output.logits, dim=-1)[0][predictions.item()].item() # type: ignore

#Print the results
print(f"Text: '{text_negative}'")
print(f"Predicted Label: {prediction_label}")
print(f"Confidence Score: {confidence_score:.4f}")


Text: 'This movie is very bad, I hate it.'
Predicted Label: NEGATIVE
Confidence Score: 0.9998


In [None]:
from transformers import pipeline #type: ignore

classifier=pipeline('text-classification',model='nlptown/bert-base-multilingual-uncased-sentiment')
print(classifier("i am so happy today"))



config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


[{'label': '5 stars', 'score': 0.8002934455871582}]


In [10]:
translate=pipeline('translation_en_to_fr') # type: ignore
print(translate('hi who are you?'))

No model was supplied, defaulted to google-t5/t5-base and revision a9723ea (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'translation_text': 'hi qui êtes-vous?'}]


In [1]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained('bert-base-uncased')

text='hugging face makes nlp easy for everyone'

encoded_input=tokenizer(text,padding=True,truncation=True,return_tensors='pt')

print("Original Text:", text)
print("Input IDs:", encoded_input["input_ids"])
print("Attention Mask:", encoded_input["attention_mask"])
print("Decoded Tokens (for understanding):",tokenizer.decode(encoded_input["input_ids"][0]))
print("Tokens (raw):", tokenizer.tokenize (text))

Original Text: hugging face makes nlp easy for everyone
Input IDs: tensor([[  101, 17662,  2227,  3084, 17953,  2361,  3733,  2005,  3071,   102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Decoded Tokens (for understanding): [CLS] hugging face makes nlp easy for everyone [SEP]
Tokens (raw): ['hugging', 'face', 'makes', 'nl', '##p', 'easy', 'for', 'everyone']


In [4]:
# batch

sentence=[
    'this is a short sentence',
    'hugging face api is very good',
    'hugging face makes nlp easy for everyone'
]

encoded_sentences=tokenizer(sentence,padding=True,truncation=True,return_tensors='pt')


In [7]:
print("\nEncoded Sentences (with padding & truncation):")
print("Input IDs shape:", encoded_sentences["input_ids"].shape)
print("Input IDs for sentences:", encoded_sentences["input_ids"])
print("Attention Mask for sentences:", encoded_sentences["attention_mask"])
print("Input IDs for sentences:", encoded_sentences["input_ids"])
print("Attention Mask for sentences:", encoded_sentences["attention_mask"])


Encoded Sentences (with padding & truncation):
Input IDs shape: torch.Size([3, 10])
Input IDs for sentences: tensor([[  101,  2023,  2003,  1037,  2460,  6251,   102,     0,     0,     0],
        [  101, 17662,  2227, 17928,  2003,  2200,  2204,   102,     0,     0],
        [  101, 17662,  2227,  3084, 17953,  2361,  3733,  2005,  3071,   102]])
Attention Mask for sentences: tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Input IDs for sentences: tensor([[  101,  2023,  2003,  1037,  2460,  6251,   102,     0,     0,     0],
        [  101, 17662,  2227, 17928,  2003,  2200,  2204,   102,     0,     0],
        [  101, 17662,  2227,  3084, 17953,  2361,  3733,  2005,  3071,   102]])
Attention Mask for sentences: tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [8]:
# custome tokenizer

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer=AutoTokenizer.from_pretrained('bert-base-uncased')
model=AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

# Check initial vocabulary size
print(f"Initial vocabulary size: {len(tokenizer)}")
# Define new tokens specific to a medical domain
new_medical_tokens = ["MyocardialInfarction", "HypertensionCrisis", "[PHI]"]
# Add new tokens to the tokenizer
tokenizer.add_tokens(new_medical_tokens)
# Resize the model's token embeddings to account for new tokens
model.resize_token_embeddings (len(tokenizer))
print(f"New vocabulary size: {len(tokenizer)}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initial vocabulary size: 30522


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


New vocabulary size: 30525


In [9]:
#Test the new tokenizer
text_with_medical_term = "Patient presents with MyocardialInfarction and a severe Hypertension Crisis."
encoded_custom = tokenizer(text_with_medical_term, return_tensors="pt")
print("\nText with new medical terms:", text_with_medical_term)
print("Encoded Input IDs with new tokens:", encoded_custom["input_ids"])
print("Decoded (to see if new tokens are recognized):", tokenizer.decode(encoded_custom["input_ids"][0]))
#Verify that the new tokens are treated as single units
# If they were split, the decoded output would show subwords or special tokens like ##


Text with new medical terms: Patient presents with MyocardialInfarction and a severe Hypertension Crisis.
Encoded Input IDs with new tokens: tensor([[  101,  5776,  7534,  2007, 30522,  1998,  1037,  5729, 23760, 29048,
          5325,  1012,   102]])
Decoded (to see if new tokens are recognized): [CLS] patient presents with MyocardialInfarction and a severe hypertension crisis. [SEP]


In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer
#1. Load a dataset from the Hugging Face Hub (e.g., IMDB for sentiment analysis)
# For a local CSV, you'd use: load_dataset('csv', data_files='your_reviews.csv')
dataset = load_dataset("imdb", split="train[:1000]") # Load a small subset for demonstration
print(f"Dataset features: {dataset.features}")
print(f"First example: {dataset[0]}")
#2. Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Dataset features: {'text': Value('string'), 'label': ClassLabel(names=['neg', 'pos'])}
First example: {'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is tha

In [13]:
def tokenize_function(examples):
    return tokenizer(examples['text'],padding='max_length',truncation=True)

tokenized_dataset=dataset.map(tokenize_function,batched=True)
print(tokenized_dataset)
print(tokenized_dataset[0]['input_ids'][:10]) # type: ignore
print(tokenized_dataset[0]['attention_mask'][:10]) # type: ignore

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})
[101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [14]:
#5. Rename and select columns to match model expectations (e.g., 'label' to 'labels')
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = tokenized_dataset.remove_columns(["text"]) # Remove original text column
tokenized_dataset.set_format("torch") # Set format for PyTorch training
print("\nFinal processed dataset features:", tokenized_dataset.features)
print("First processed example (PyTorch tensor format):")
print(tokenized_dataset[0]["input_ids"].shape)


Final processed dataset features: {'labels': ClassLabel(names=['neg', 'pos']), 'input_ids': List(Value('int32')), 'token_type_ids': List(Value('int8')), 'attention_mask': List(Value('int8'))}
First processed example (PyTorch tensor format):
torch.Size([512])
