# Install environments

In [1]:
!pip install -q transformers
!pip install -q "pinecone-client[grpc]"

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires cloudpickle~=2.2.1, but you have cloudpickle 3.0.0 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires protobuf<4,>3.12.2, but you have protobuf 4.25.5 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 16.1.0 which is incompatible.
google-cloud-aiplatform 0.6.0a1 requires google-api-core[grpc]<2.0.0dev,>=1.22.2, but you have google-api-core 2.11.1 which is incompatible.
google-cloud-automl 1.0.1 requires google-api-core[grpc]<2.0.0dev,>=1.14.0, but you have google-api-core 2.11.1 which is incompatible.
google-cloud-bigquery 2.34.4 requires prot

In [2]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key='b52dac1e-0eb8-47d3-b5ca-ef64ab2dbfcd')
index_name = "vn-news"
index = pc.Index(index_name)

In [8]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {device}")

Running on cuda


## **Model Translate**

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_name = "VietAI/envit5-translation"
tokenizer_translate = AutoTokenizer.from_pretrained(model_name)  
model_translate = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)



## **Model embedding**

In [10]:
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

model_path = 'Alibaba-NLP/gte-large-en-v1.5'
tokenizer_embedding = AutoTokenizer.from_pretrained(model_path)
model_embedding = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(device)


## **Model LLM**

In [5]:
from huggingface_hub import notebook_login

notebook_login()  # same as before

## use this token
## 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
# Load model directly
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer_LLM = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model_LLM = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## **Function core**

**Function translate**

In [None]:
def translate_vi2eng(input_text):
    input_text = [f"vi: {input_text}"]
    output_encodes = model_translate.generate(tokenizer_translate(input_text, return_tensors="pt", padding=True).input_ids.to(device), max_length=1024)
    output = tokenizer_translate.batch_decode(output_encodes, skip_special_tokens=True)    
    return output[0].split(":", 1)[1]

In [36]:
def translate_eng2vi(input_text):
    input_text = [f"en: {input_text}"]
    output_encodes = model_translate.generate(tokenizer_translate(input_text, return_tensors="pt", padding=True).input_ids.to(device), max_length=1024)
    output = tokenizer_translate.batch_decode(output_encodes, skip_special_tokens=True)    
    return output[0].split(":", 1)[1]

**Function embedding**

In [23]:
def embedding_text(input_text):
    # Tokenize the input texts
    batch_dict = tokenizer_embedding(input_text, max_length=8192, padding=True, truncation=True, return_tensors='pt').to(device)

    outputs = model_embedding(**batch_dict)
    embeddings = outputs.last_hidden_state[:, 0]
    embeddings = F.normalize(embeddings, p=2, dim=1).cpu().detach().numpy()[0].tolist()
    
    return embeddings

**Function retrieval**

In [27]:
def retrieval_context(vector_embedding,topk): 
    query_results = index.query(
    #namespace="example-namespace",
    vector=vector_embedding,
    include_metadata=True, 
    top_k=topk,
    include_values=False
    )
    list_id = []
    list_url = []
    for item in query_results['matches']:
        list_id.append(int(item["id"]))
        list_url.append(item["metadata"]["url"])
    return list_id,list_url

In [30]:
# retrieval_context(embedding_text("Tôi muốn tìm kiếm một bản tin về kiện nhau do manga"), topk = 5)
retrieval_context(embedding_text("tin tức về Tottenham"), topk = 5)

([130, 194, 114, 366, 300],
 ['https://vnexpress.net/nuot-hai-goi-heroine-khi-bi-bat-qua-tang-mua-ban-4797709.html\n',
  'https://vnexpress.net/con-trai-kien-me-vi-vut-bo-suu-tap-truyen-tranh-manga-4797697.html\n',
  'https://vnexpress.net/co-nen-bo-nganh-dieu-duong-de-thi-lai-rang-ham-mat-4796055.html\n',
  'https://vnexpress.net/lam-thanh-my-dien-vien-gen-z-duoc-san-don-4795187.html\n',
  'https://vnexpress.net/ong-trump-doa-truy-to-google-neu-dac-cu-4797915.html\n'])

**Function mapping data**

In [31]:
import pickle
def mapping_data(list_id,list_url):
    
    with open('/kaggle/input/llm-chatbot/total_output_clean.pkl', 'rb') as file:
        total_output_clean = pickle.load(file)
        
    total_text_with_link = []
    for index,url in zip(list_id,list_url): 
        total_text_with_link.append(f"{total_output_clean[index]}, link:{url}")
    
#     with open('/kaggle/input/llm-chatbot/total_chunks.pkl', 'rb') as file:
#         total_chunks = pickle.load(file)
    # Turn list to string
    sentence_list = total_text_with_link

    # Convert the list to a string in the desired format
    formatted_string = '; '.join([f'"{sentence}"' for sentence in sentence_list])

    # Add brackets around the final string
    result_context = f"[{formatted_string}]"
    
#     print(result_context)
    return result_context

**Function answer question**

In [32]:
import re
def chatbot(question,context):
    from datetime import date

    # Get the current date
    current_date = date.today()
#     print(f"Date: {current_date}")  # Output: YYYY-MM-DD (e.g., 2024-08-02)

    # Define the chat template using Role 1 (Prompting Specialist)
    messages = [
        {"role": "user", "content": f"You are an expert in understanding user queries and rephrasing them. The original question is: {question}. Rephrase it clearly and concisely in 2 sentences for a QA chatbot to answer. Only return the rephrased question, no extra content or answers."},
    ]

    input_ids_1 = tokenizer_LLM.apply_chat_template(conversation=messages, return_tensors="pt", return_dict=True).to("cuda")

    outputs_1 = model_LLM.generate(**input_ids_1, max_new_tokens=256)
    decoded_output_1 = tokenizer_LLM.decode(outputs_1[0], skip_special_tokens=False)
    answer_query_1 = decoded_output_1.rsplit("<end_of_turn>", 2)[1].strip().strip('*') # Because the output include the answer between 2 "<end_of_turn>"

#     print(f"Rephrase question: {answer_query_1}")

    ###############################################################

    # Define the chat template using Role 2 (QA Chatbot)
    messages = [
        {"role": "user", "content": f"The current date is {current_date} (YYYY-MM-DD format). You are a friendly AI chatbot that looks through the news article and provide answer for user. Answer the question in a natural and friendly tone under 200 words. Have to use Chain of Thought reasoning with no more than three steps but dont include it in the response to user. Here are the new article {context}, the user asks {answer_query_1}. YOU MUST INCLUDE THE LINK TO THE ARTICLE AT THE END OF YOUR ANSWER"},
    ]

    input_ids_2 = tokenizer_LLM.apply_chat_template(conversation=messages, return_tensors="pt", return_dict=True).to("cuda")

    outputs_2 = model_LLM.generate(**input_ids_2, max_new_tokens=1024)
    decoded_output_2 = tokenizer_LLM.decode(outputs_2[0], skip_special_tokens=False)
    answer_query_2 = decoded_output_2.rsplit("<end_of_turn>", 2)[1].strip().strip('*') # Because the output include the answer between 2 "<end_of_turn>"
    
    # Regular expression pattern to extract URLs
    url_pattern = r'https?://[^\s]+'

    # Find the URL in the text
    answer_without_url = re.sub(url_pattern, '', answer_query_2)
    urls = re.findall(url_pattern, answer_query_2)

#     print(f"Answer: {answer_query_2}")
    return answer_without_url,urls[0]

## **Full Pipeline**

In [33]:
def pipeline(question):
    question_translate = translate_vi2eng(question)
    question_embedding = embedding_text(question_translate)
    list_id,list_url = retrieval_context(question_embedding,3)
    context = mapping_data(list_id,list_url)
    result,url = chatbot(question_translate,context)
    answer = translate_eng2vi(result)
#     print(question_translate)
#     print("----")
#     print(question_embedding)
#     print("----")
#     print(list_id,list_url)
#     print("----")
#     print(context)
#     print("----")
#     print(result)
#     print("----")
#     print(answer)
#     print("----")    

    return answer,url   
    

In [34]:
answer,url = pipeline("Tôi cần tin tức về đá bóng")
print(f"answer:{answer}")
print(f"url:{url}")

NameError: name 'translate_vi2eng' is not defined