In [None]:
# | echo:false

# Note: this `logging` is the helper module shipped with transformers, not the
# standard-library `logging` module.
from transformers.utils import logging

# Raise the transformers log threshold to ERROR so that the library's
# warning/info messages do not clutter the cell outputs.
logging.set_verbosity_error()

# Lesson 2: Natural Language Processing (NLP)


### Build the `chatbot` pipeline using 🤗 Transformers Library

In [7]:
from transformers import Conversation, pipeline

# Conversational pipeline backed by the facebook/blenderbot-400M-distill checkpoint.
chatbot = pipeline("conversational", model="facebook/blenderbot-400M-distill")

# Opening user turn (the surrounding newlines are part of the message text).
user_message = """
What is the speed of light?
"""

# Wrap the message in a Conversation and run it through the chatbot; the
# conversation that comes back also carries the assistant's reply.
conversation = chatbot(Conversation(user_message))

# Show the full exchange.
print(conversation)

Conversation id: 5ebf1054-f1c6-4092-a3a1-3ffcc2ca312d
user: 
What is the speed of light?

assistant:  Light speeds vary widely depending on the type of light and speed of the object being used.



- You can continue the conversation with the chatbot with:
```
print(chatbot(Conversation("What else do you recommend?")))
```
- However, the chatbot may provide an unrelated response because it does not have memory of any prior conversations.

- To include prior conversations in the LLM's context, add the new user message to the existing `Conversation` object, which already holds the previous chat history, and pass that object back to the chatbot.

In [8]:
# Follow-up user turn. It is appended to the existing conversation object, so
# the earlier exchange stays part of what the chatbot receives.
follow_up = {
    "role": "user",  # this turn is spoken by the user
    "content": """
        Do you think that I think you have consciousness?
        """,
}
conversation.add_message(follow_up)

# Hand the extended conversation to the chatbot and keep the returned result.
conversation = chatbot(conversation)

# Show the whole dialogue so far.
print(conversation)

Conversation id: 5ebf1054-f1c6-4092-a3a1-3ffcc2ca312d
user: 
What is the speed of light?

assistant:  Light speeds vary widely depending on the type of light and speed of the object being used.
user: 
        Do you think that I think you have consciousness?
        
assistant:  I don't think I have any consciousness, but I do believe that I have a good sense of self-awareness.



# Lesson 3: Translation and Summarization

### Build the `translation` pipeline using 🤗 Transformers Library

NLLB: No Language Left Behind: ['nllb-200-distilled-600M'](https://huggingface.co/facebook/nllb-200-distilled-600M).


In [10]:
import torch
from transformers import pipeline

# Translation pipeline on the facebook/nllb-200-distilled-600M checkpoint,
# with bfloat16 requested as the torch dtype.
translator = pipeline(
    "translation", model="facebook/nllb-200-distilled-600M", torch_dtype=torch.bfloat16
)

# Sample passage. A trailing backslash joins a line to the next one, so the
# resulting string contains only two newline characters.
text = """\
My puppy is adorable, \
Your kitten is cute.
Her panda is friendly.
His llama is thoughtful. \
We all have nice pets!"""

# Translate from English (eng_Latn) to French (fra_Latn).
text_translated = translator(text, src_lang="eng_Latn", tgt_lang="fra_Latn")

text_translated

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

[{'translation_text': 'Mon chiot est adorable, ton chaton est mignon, son panda est ami, sa lamme est attentive, nous avons tous de beaux animaux de compagnie.'}]

To translate to or from other languages, look up their language codes on the page: [Languages in FLORES-200](https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200)

For example:
- Afrikaans: afr_Latn
- Chinese: zho_Hans
- Egyptian Arabic: arz_Arab
- French: fra_Latn
- German: deu_Latn
- Greek: ell_Grek
- Hindi: hin_Deva
- Indonesian: ind_Latn
- Italian: ita_Latn
- Japanese: jpn_Jpan
- Korean: kor_Hang
- Persian: pes_Arab
- Portuguese: por_Latn
- Russian: rus_Cyrl
- Spanish: spa_Latn
- Swahili: swh_Latn
- Thai: tha_Thai
- Turkish: tur_Latn
- Vietnamese: vie_Latn
- Zulu: zul_Latn

In [11]:
# The passage is defined again here so this cell does not depend on whatever
# value `text` currently holds in the kernel.
text = """\
My puppy is adorable, \
Your kitten is cute.
Her panda is friendly.
His llama is thoughtful. \
We all have nice pets!"""

# Source stays English (eng_Latn); the target is now Korean (kor_Hang).
language_pair = {"src_lang": "eng_Latn", "tgt_lang": "kor_Hang"}
text_translated = translator(text, **language_pair)

text_translated

[{'translation_text': '내 강아지는 사랑스럽고, 당신의 새끼 고양이는 귀여운데, 그녀의 팬다는 친절하고, 그의 라마는 신중합니다. 우리 모두는 좋은 애완동물들을 가지고 있습니다.'}]

In [None]:
# Free up some memory before continuing
import gc

# Drop the notebook's reference to the translation pipeline first, then run a
# garbage-collection pass so that the memory it held can be reclaimed.
del translator
gc.collect()

### Build the `summarization` pipeline using 🤗 Transformers Library

Model info: ['bart-large-cnn'](https://huggingface.co/facebook/bart-large-cnn)

In [None]:
# Summarization pipeline on the facebook/bart-large-cnn checkpoint, with
# bfloat16 requested as the torch dtype.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    torch_dtype=torch.bfloat16,
)

# Passage to condense. The leading spaces on the continuation lines are part
# of the string that is passed to the model.
text = """Paris is the capital and most populous city of France, with
          an estimated population of 2,175,601 residents as of 2018,
          in an area of more than 105 square kilometres (41 square
          miles). The City of Paris is the centre and seat of
          government of the region and province of Île-de-France, or
          Paris Region, which has an estimated population of
          12,174,880, or about 18 percent of the population of France
          as of 2017."""

# Bound the length of the generated summary via min_length / max_length.
summary = summarizer(text, max_length=100, min_length=10)

summary



config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

[{'summary_text': 'Paris is the capital and most populous city of France, with an estimated population of 2,175,601 residents as of 2018. The City of Paris is the centre and seat of the government of the region and province of Île-de-France.'}]

In [None]:
# Free up some memory before continuing
import gc

# Drop the notebook's reference to the summarization pipeline, then run a
# garbage-collection pass. Because gc.collect() is the cell's last expression,
# its integer return value is shown as the cell output.
del summarizer
gc.collect()

543

# Lesson 4: Sentence Embeddings

### Build the `sentence embedding` pipeline using the 🤗 Sentence Transformers Library

More info on [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2).

In [16]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# First group of sentences to embed.
sentences1 = ["The cat sits outside", "A man is playing guitar", "The movies are awesome"]

# convert_to_tensor=True makes encode() hand back a tensor; leaving the name
# as the last expression displays it as the cell output.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings1

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

tensor([[ 0.1392,  0.0030,  0.0470,  ...,  0.0641, -0.0163,  0.0636],
        [ 0.0227, -0.0014, -0.0056,  ..., -0.0225,  0.0846, -0.0283],
        [-0.1043, -0.0628,  0.0093,  ...,  0.0020,  0.0653, -0.0150]],
       device='cuda:0')

In [17]:
# Second group of sentences, embedded with the same model and options as the first.
sentences2 = ["The dog plays in the garden", "A woman watches TV", "The new movie is so great"]
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
print(embeddings2)

tensor([[ 0.0163, -0.0700,  0.0384,  ...,  0.0447,  0.0254, -0.0023],
        [ 0.0054, -0.0920,  0.0140,  ...,  0.0167, -0.0086, -0.0424],
        [-0.0842, -0.0592, -0.0010,  ..., -0.0157,  0.0764,  0.0389]],
       device='cuda:0')


Calculate the cosine similarity between two sentences as a measure of how similar they are to each other.

In [18]:
from sentence_transformers import util

# Cosine similarity of every embedding in embeddings1 against every embedding
# in embeddings2; entry [i][j] pairs sentences1[i] with sentences2[j].
cosine_scores = util.cos_sim(embeddings1, embeddings2)
print(cosine_scores)

# Report only the diagonal: each sentence against its same-index counterpart.
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for i, first in enumerate(sentences1):
    print(row_template.format(first, sentences2[i], cosine_scores[i][i]))

tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0327, -0.0136],
        [-0.0124, -0.0465,  0.6571]], device='cuda:0')
The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The movies are awesome 		 The new movie is so great 		 Score: 0.6571
