<a href="https://colab.research.google.com/github/muraleee/collab-stuff/blob/main/Random.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  #1
model = BertModel.from_pretrained("bert-base-uncased")   #1

input_text = '''
After a long day at work, Sarah decided to relax by taking her
dog for a walk in the park. As they strolled along the
tree-lined paths, Sarah's dog, Max, eagerly sniffed around,
chasing after squirrels and birds. Sarah smiled as she watched
Max enjoy himself, feeling grateful for the companionship and
joy that her furry friend brought into her life.'''

tokens = tokenizer(input_text, return_tensors="pt")

with torch.no_grad():
    outputs = model(**tokens)  #2

last_hidden_states = outputs.last_hidden_state  #3
print("Token embeddings:")  #4
for token, embedding in zip(tokens["input_ids"][0],
                            last_hidden_states[0]):
    word = tokenizer.decode(int(token))
    print(f"{word}: {embedding}")

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

tsne = TSNE(n_components=2, perplexity=5, random_state=42)  #1
embeddings_tsne = tsne.fit_transform(last_hidden_states[0])   #1

plt.figure(figsize=(10, 8))  #2
plt.scatter(embeddings_tsne[:, 0],   #2
            embeddings_tsne[:, 1], marker='o')   #2
for i, word in enumerate(tokenizer.convert_ids_to_tokens(   #2
    tokens["input_ids"][0])):   #2
    plt.annotate(word, xy=(embeddings_tsne[i, 0],   #2
                           embeddings_tsne[i, 1]),   #2
                 fontsize=10)   #2
plt.xlabel('t-SNE Dimension 1') #2
plt.ylabel('t-SNE Dimension 2') #2
plt.title('t-SNE Visualization of Token Embeddings')
plt.show()

In [None]:
a = [1,2,3,4,5,6,7,8,9]
b = map(lambda x: x*x, a)
b = list(b)
for i, j in zip(a,b):
  print(i, j)

In [None]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

input_text = '''
After a long day at work, Sarah decided to relax by taking her
dog for a walk in the park. As they strolled along the
tree-lined paths, Sarah's dog, Max, eagerly sniffed around,
chasing after squirrels and birds. Sarah smiled as she watched
Max enjoy himself, feeling grateful for the companionship and
joy that her furry friend brought into her life.'''

tokens = tokenizer(input_text, return_tensors="pt")
embeddings = model.embeddings  #1
positional_embeddings = embeddings.position_embeddings.weight  #2
position_ids = torch.arange(tokens['input_ids'].size(1),
                            dtype=torch.long).unsqueeze(0)  #3
input_positional_embeddings = positional_embeddings[position_ids]  #4

print("Positional embeddings shape:", input_positional_embeddings.shape)
print("Positional embeddings for each token:")

for token_id, pos_embedding in zip(tokens['input_ids'][0],
                                   input_positional_embeddings[0]):
    token = tokenizer.decode([token_id])
    print(f"{token}: {pos_embedding}")

In [9]:
import math

x = [4.5, 6, 3.2]
y = list(map(lambda x: math.exp(x), x ))
a = 0
for i in y:
  a = a + i
z = list(map(lambda x: x/a, y))
z

[0.17378547395797767, 0.7788524592209889, 0.04736206682103336]

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [26]:
!du -h  ~/.cache/huggingface/hub

4.0K	/root/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english/.no_exist/714eb0fa89d2f80546fda750413ed43d93601a13
8.0K	/root/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english/.no_exist
8.0K	/root/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english/refs
240K	/root/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english/blobs
4.0K	/root/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english/snapshots/714eb0fa89d2f80546fda750413ed43d93601a13
8.0K	/root/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english/snapshots
268K	/root/.cache/huggingface/hub/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english
4.0K	/root/.cache/huggingface/hub/.locks/models--distilbert--distilbert-base-uncased-finetuned-sst-2-english
4.0K	/root/.cache/huggingface/hub/.locks/mod

In [27]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [32]:
import torch

text = "I loved the movie, it was fantastic!"

inputs = tokenizer(text, return_tensors = "pt")  #1

outputs = model(**inputs)  #1
# print(outputs)
predicted_label = torch.argmax(outputs.logits)
predicted_label

tensor(1)

In [34]:
from transformers import pipeline

classifier = pipeline(task = 'text-classification',
    model = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english')

Device set to use cpu


In [35]:
review1 = '''From the warm welcome to the exquisite dishes and impeccable
 service, dining at Gourmet Haven is an unforgettable experience that
 leaves you eager to return.'''

review2 = '''Despite high expectations, our experience at Savor Bistro
 fell short; the food was bland, service was slow, and the overall
 atmosphere lacked charm, leaving us disappointed and unlikely to
 revisit.'''

In [37]:
print(classifier(review2))

[{'label': 'NEGATIVE', 'score': 0.9997773766517639}]


In [38]:
from transformers import pipeline

question_classifier = pipeline("text-classification",
                               model="huaen/question_detection")

config.json:   0%|          | 0.00/853 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [42]:
response = question_classifier(
    '''
    started from this or that but where
    ''')
print(response)

[{'label': 'non_question', 'score': 0.9984527826309204}]


In [43]:
language_classifier = pipeline("text-classification",
    model="papluca/xlm-roberta-base-language-detection")

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


In [48]:
response = language_classifier("lampa")
print(response)

[{'label': 'sw', 'score': 0.7419531941413879}]


In [1]:
from transformers import pipeline

translator = pipeline("translation",
                      model = "google-t5/t5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [4]:
translator = pipeline(task = 'translation_en_to_hi',
                      model = "google-t5/t5-base")

Device set to use cpu


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

model_name_or_path = "tencent/HY-MT1.5-1.8B"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto")  # You may want to use bfloat16 and/or move to GPU here
messages = [
    {"role": "user", "content": "Translate the following segment into Hindi, without additional explanation.\n\nRead this book and tell me what it discusses."},
]
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=False,
    return_tensors="pt"
)

outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=2048)
output_text = tokenizer.decode(outputs[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/488 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/654 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/4.08G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

In [3]:
outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=2048)
output_text = tokenizer.decode(outputs[0])
output_text

'<｜hy_begin▁of▁sentence｜><｜hy_User｜>Translate the following segment into Hindi, without additional explanation.\n\nRead this book and tell me what it discusses.<｜hy_place▁holder▁no▁8｜>कृपया इस पुस्तक को पढ़ें, और बताइए कि इसमें क्या चर्चा की गई है।<｜hy_place▁holder▁no▁2｜>'

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip install sentencepiece
!pip install protobuf



In [4]:
from transformers import pipeline

zero_shot_classifier = pipeline("zero-shot-classification",
                                model='joeddav/xlm-roberta-large-xnli')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


In [6]:
text1 = '''
"In the intricate realm of global affairs, the interplay of power,
diplomacy, and governance stands as a defining force in the
trajectory of nations. Amidst fervent debates in legislative
chambers and pivotal dialogues among world leaders, ideologies
clash and policies take shape, shaping the course of societies.
Issues such as economic disparity, environmental stewardship, and
human rights take precedence, driving conversations and shaping
public sentiment. In an age of digital interconnectedness, social
media platforms have emerged as influential channels for discourse
and activism, amplifying voices and reshaping narratives with
remarkable speed and breadth. As citizens grapple with the
complexities of contemporary governance, the pursuit of accountable
and transparent leadership remains paramount, reflecting an
enduring quest for fairness and inclusivity in societal governance."
'''

text2 = '''
In the tender tapestry of human connection, romance weaves its
delicate threads, binding hearts in a dance of passion and longing.
From the flutter of a first glance to the warmth of an intimate
embrace, love blooms in the most unexpected places, transcending
barriers of time and circumstance. In the gentle caress of a hand
and the whispered promises of affection, two souls find solace in
each other's embrace, navigating the complexities of intimacy with
tender care. As the sun sets and stars illuminate the night sky,
lovers share stolen moments of intimacy, lost in the intoxicating
rhythm of each other's presence. In the symphony of love, every
glance, every touch, speaks volumes of a shared bond that defies
explanation, leaving hearts entwined in an eternal embrace.
'''

In [7]:
candidate_labels = ["technology", "politics", "business", "romance"]
prediction = zero_shot_classifier(text1,
                                  candidate_labels,
                                  multi_label = True)

In [9]:
prediction = zero_shot_classifier([text1, text2],
                                  candidate_labels,
                                  multi_label = True)
display(pd.DataFrame(prediction).drop(["sequence"], axis=1))

Unnamed: 0,labels,scores
0,"[politics, technology, romance, business]","[0.9909884333610535, 0.8283571004867554, 0.465..."
1,"[romance, business, politics, technology]","[0.9982308149337769, 0.11879903078079224, 0.01..."


In [10]:
from transformers import pipeline

classifier = pipeline("zero-shot-image-classification",
                      model = "openai/clip-vit-large-patch14-336")

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


In [16]:
labels_for_classification =  ["airplane", "car", "train"]
scores = classifier("Emirates.png",
                    candidate_labels = labels_for_classification)
pd.DataFrame(scores)

Unnamed: 0,score,label
0,0.997713,airplane
1,0.001646,car
2,0.000641,train
