# Install environments

In [1]:
# !set CMAKE_ARGS=-DLLAMA_CUBLAS=on
# !set FORCE_CMAKE=1
# !pip install -q llama-cpp-python langchain langchain-community langchain-core transformers
!pip install -q transformers

# Set working path and select GPU or CPU to run on

In [1]:
import torch
import os

if torch.cuda.is_available():
    !nvidia-smi
    !nvcc --version
else:
    print("GPU is not available")


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {device}")

working_path = "/kaggle/working"
crawl_result_path = f"{working_path}/result"

print(f"Working path: {working_path}")
print(f"Result path: {crawl_result_path}")

GPU is not available
Running on cpu
Working path: /kaggle/working
Result path: /kaggle/working/result


## Crawl

In [2]:
# Delete all files and folders in the working directory. Use with caution!
# This removes `working_path` itself with everything under it; because of
# ignore_errors=True, removal failures (including a missing directory) are not reported.
import shutil
shutil.rmtree(working_path, ignore_errors=True)

In [3]:
# Fetch the VNNewsCrawler repository into {working_path}/VNNewsCrawler.
!git clone https://github.com/egliette/VNNewsCrawler.git {working_path}/VNNewsCrawler

Cloning into '/kaggle/working/VNNewsCrawler'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (231/231), done.[K
remote: Compressing objects: 100% (166/166), done.[K
remote: Total 231 (delta 137), reused 150 (delta 61), pack-reused 0 (from 0)[K
Receiving objects: 100% (231/231), 39.12 KiB | 9.78 MiB/s, done.
Resolving deltas: 100% (137/137), done.


In [4]:
!pip install -q -r {working_path}/VNNewsCrawler/requirements.txt

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires cloudpickle~=2.2.1, but you have cloudpickle 3.0.0 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 17.0.0 which is incompatible.
beatrix-jupyterlab 2024.66.154055 requires jupyterlab~=3.6.0, but you have jupyterlab 4.2.5 which is incompatible.
bigframes 0.22.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.10.0, but you have google-cloud-bigquery 2.34.4 which is incompatible.
bigframes 0.22.0 requires google-cloud-storage>=2.0.0, but you have google-cloud-storage 1.44.0 which is incompatible.
bigframes 0.22.0 requires pandas<2.1.4,>=1.5

## Create config file

In [6]:
# Crawler settings rendered as YAML text, with this notebook's paths substituted in.
config_content = f"""
# Web that want to crawls (vnexpress, dantri, but only supports vnexpress now)
webname: "vnexpress"

# tasks = ["url", "type"]
task: "type"

#logger config file path
logger_fpath: {working_path}/VNNewsCrawler/logger/logger_config.yml
urls_fpath: {working_path}/VNNewsCrawler/urls.txt
output_dpath: {crawl_result_path}
num_workers: 1

# if task == "type": 
# article_type == "all" to crawl all of types
article_type: "the-gioi"
total_pages: 1
"""

# Store the settings inside the cloned repository, where the crawl command reads them from.
config_fpath = f"{working_path}/VNNewsCrawler/crawler_config.yml"
with open(config_fpath, "w") as config_file:
    config_file.write(config_content)

## Crawling process

In [7]:
# Run the crawler script with the config file written by this notebook.
!python {working_path}/VNNewsCrawler/VNNewsCrawler.py --config {working_path}/VNNewsCrawler/crawler_config.yml

Crawl articles type the-gioi
Getting urls of the-gioi...
Pages: 100%|██████████████████████████████████████| 1/1 [00:01<00:00,  1.56s/it]
Crawling from urls of the-gioi...
Start crawling urls from /kaggle/working/result/urls/the-gioi.txt file...
URLs: 100%|█████████████████████████████████████| 47/47 [01:29<00:00,  1.90s/it]
Saving crawling result into /kaggle/working/result/the-gioi directory...
The number of failed URL: 0


## Check subfolders list

In [8]:
import os
from tqdm import tqdm

def list_subfolders(directory):
    """Return the paths of the sub-directories of ``directory``, sorted by path.

    Entries whose name contains ``"urls"`` are skipped, as are plain files.

    Args:
        directory: Path of the folder to scan.

    Returns:
        list[str]: Paths of the matching sub-directories, in sorted order so that
        repeated runs process them in the same sequence.
    """
    # The context manager closes the scandir iterator as soon as the listing is built.
    with os.scandir(directory) as entries:
        # Logical `and` (not bitwise `&`) so the name test only runs for directories.
        return sorted(
            entry.path
            for entry in entries
            if entry.is_dir() and "urls" not in entry.name
        )

# Collect the sub-directories of the crawl result folder (entries named like "urls" are excluded).
subfolders = list_subfolders(crawl_result_path)
subfolders

['/kaggle/working/result/the-gioi']

In [9]:
# Gather, for every crawled category, its URL list, its article file paths and
# the article texts. The three `total_*` lists are extended category by category.
total_urls = []
total_content_paths = []
total_contents = []
for category_path in subfolders:
    # The folder name is the category name; the matching URL list lives under urls/.
    category_name = os.path.basename(os.path.normpath(category_path))
    url_path = f"{crawl_result_path}/urls/{category_name}.txt"
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding (the crawled articles are Vietnamese text).
    with open(url_path, "r", encoding="utf-8") as f:
        url_lst = f.readlines()

    # Lexicographic order of the file paths, so it is stable across runs.
    content_paths = sorted(
        os.path.join(category_path, name) for name in os.listdir(category_path)
    )

    category_contents = []
    for path in content_paths:
        with open(path, "r", encoding="utf-8") as f:
            category_contents.append(f.read())

    total_urls += url_lst
    total_content_paths += content_paths
    total_contents += category_contents

In [14]:
# Rebuild `total_urls` so that entry i is the URL of `total_content_paths[i]`.
# Each content path is expected to look like ".../<category>/<prefix><N>.txt",
# and line N (1-based) of "urls/<category>.txt" is taken as its source URL.
total_urls = []
urls_by_category = {}  # category name -> lines of its URL file, read once per category
for path in total_content_paths:
    cate = path.split("/")[-2]
    # Drop the first 4 characters of the file stem and read the rest as a 1-based index
    # (assumes a 4-character prefix such as "url_" -- TODO confirm against the crawler output).
    order = path.split("/")[-1].split(".")[0][4:]
    order = int(order) - 1

    # Use this path's own category; relying on a variable left over from an
    # earlier loop would read the wrong URL file once there are several categories.
    if cate not in urls_by_category:
        url_txt_path = f"{crawl_result_path}/urls/{cate}.txt"
        with open(url_txt_path, "r", encoding="utf-8") as f:
            urls_by_category[cate] = f.readlines()

    total_urls.append(urls_by_category[cate][order])

print(f"Total url: {len(total_urls)}")
print(f"Total content path: {len(total_content_paths)}")
print(f"Total content: {len(total_contents)}")

Total url: 47
Total content path: 47
Total content: 47


# Chunking

In [15]:
import re

def chunk_text_by_sentence(text, max_words=1000):
    """Split ``text`` into chunks of whole sentences of at most ``max_words`` words.

    Sentences are delimited by ``.``, ``!`` or ``?`` followed by one or more
    spaces. Sentences are packed greedily into the current chunk; a sentence
    that would push the chunk past ``max_words`` starts a new chunk. A single
    sentence longer than ``max_words`` is kept intact as its own chunk.

    Args:
        text: The text to split.
        max_words: Maximum number of whitespace-separated words per chunk.

    Returns:
        list[str]: The chunks in order. Never contains an empty string; the
        list is empty when ``text`` holds no words.
    """
    # Split the text into sentences using regular expressions
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        word_count = len(sentence.split())
        if word_count == 0:
            # Empty fragments (empty input, trailing separator) carry no content.
            continue

        # Flush only a non-empty chunk: when the very first sentence is already
        # over the limit there is nothing to flush yet, and flushing would emit "".
        if current_chunk and current_word_count + word_count > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_word_count = 0

        current_chunk.append(sentence)
        current_word_count += word_count

    # Add the last chunk if there is any remaining content
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def batch_split_list(input_list, n):
    """Cut ``input_list`` into consecutive slices of length ``n``.

    The final slice holds whatever is left over and may be shorter than ``n``.
    An empty input yields an empty list.
    """
    batches = []
    # Walk the start offsets 0, n, 2n, ... and take one slice per offset.
    for start in range(0, len(input_list), n):
        stop = start + n
        batches.append(input_list[start:stop])
    return batches

In [16]:
import pickle

# Chunk every article and keep, in parallel, the URL each chunk came from.
total_chunks = []
total_url_chunks = []
for url, content in zip(total_urls, total_contents):
    chunks = chunk_text_by_sentence(content, 800) # Chunking every 800 words
    total_chunks.extend(chunks)
    # One URL entry per chunk so both lists stay index-aligned.
    total_url_chunks.extend(url for _ in chunks)

print(f"Total chunk: {len(total_chunks)}")

# Persist both lists for the later search cells.
with open(f"{working_path}/total_url_chunks.pkl", "wb") as url_chunks_file:
    pickle.dump(total_url_chunks, url_chunks_file)

with open(f"{working_path}/total_chunks.pkl", "wb") as chunks_file:
    pickle.dump(total_chunks, chunks_file)

Total chunk: 62


In [17]:
# Prefix every chunk with the "vi: " tag before it is passed to the translation model.
input_total_chunks = ["vi: " + chunk for chunk in total_chunks]

# Group the tagged chunks into batches of 48 for generation.
batch_input_chunks = batch_split_list(input_list=input_total_chunks, n=48)

# Translating

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Checkpoint used by the translation step; tokenizer and seq2seq model are
# loaded from the same name and the model is moved to `device`.
model_name_translate = "VietAI/envit5-translation"
tokenizer_translate = AutoTokenizer.from_pretrained(model_name_translate)  
model_translate = AutoModelForSeq2SeqLM.from_pretrained(model_name_translate).to(device)



tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.49M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

In [18]:
%time
total_output = []
for batch_input in tqdm(batch_input_chunks):
    output_encodes = model_translate.generate(tokenizer_translate(batch_input, return_tensors="pt", padding=True).input_ids.to(device), max_length=1024)
    outputs = tokenizer_translate.batch_decode(output_encodes, skip_special_tokens=True)
    total_output += outputs

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.68 µs


100%|██████████| 2/2 [01:07<00:00, 34.00s/it]


In [19]:
## Remove the leading "en: " flag from each translation.
## The prefix is only stripped when it is really there, so an output that does
## not start with the flag keeps its first four characters.
EN_PREFIX = "en: "
total_output_clean = [
    out[len(EN_PREFIX):] if out.startswith(EN_PREFIX) else out
    for out in total_output
]
print(f"Total len of total output clean: {len(total_output_clean)}")

Total len of total output clean: 62


In [20]:
import pickle

# Persist the cleaned translations and the per-article URL list.
with open(f"{working_path}/total_output_clean.pkl", "wb") as f:
    pickle.dump(total_output_clean, f)

with open(f"{working_path}/total_urls.pkl", "wb") as f:
    pickle.dump(total_urls, f)

# Spot-check one translated chunk with its source URL. `total_output_clean` has
# one entry per chunk, so it must be paired with the per-chunk URL list
# (`total_url_chunks`), not the per-article `total_urls`.
print(total_output_clean[5])
print(total_url_chunks[5])

Trump's deputy secretary of state, Harris, debates 43 million viewers The debate between U.S.president JD Vance and Tim Walz attracted about 43 million viewers, mostly over the age of 55. Senator JD Vance and Minnesota Governor Tim Walz, the Democratic and Republican vice presidential candidates, had their first and only live debate on October 1 before the November election. American viewers watched the confrontation between the two deputy generals on fifteen news channels. The Nielsen Media Data Analysis Company released statistics on October 2 that the debate attracted about 43 million viewers, including more than 9 million people between the ages of 35 and 54 and more than 3 million between the ages of 18 and 34. The largest audience over 55 was 29 million. This was lower than the 57.9 million viewers watching the 2020 confrontation between vice presidential candidates Mike Pence and Kamala Harris. Vance and Walz focused primarily on policy, talking about immigration, abortion right

## Embedding

In [3]:
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

# Checkpoint used to embed the English text; the model is moved to `device`.
model_name_embedding = "Alibaba-NLP/gte-large-en-v1.5"
tokenizer_embedding = AutoTokenizer.from_pretrained(model_name_embedding)
# NOTE(review): trust_remote_code=True is set without a pinned revision; the
# download log recommends pinning one so newer code files are not fetched silently.
model_embedding = AutoModel.from_pretrained(model_name_embedding, trust_remote_code=True).to(device)

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [5]:
import pickle
# Reload the cleaned English translations written earlier in this notebook.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load files this notebook produced.
with open(f"{working_path}/total_output_clean.pkl", "rb") as f:  # Open in binary read mode
    total_output_clean = pickle.load(f)

In [6]:
def embed_script(input_texts):
    """Embed one text or a list of texts with the notebook's embedding model.

    Uses the module-level ``tokenizer_embedding``, ``model_embedding`` and
    ``device``, plus ``torch`` imported in the setup cell.

    Args:
        input_texts: A string or a list of strings to embed.

    Returns:
        numpy.ndarray: One L2-normalised row per input text, taken from the
        first token position of the model's last hidden state.
    """
    # Tokenize the input texts
    batch_dict = tokenizer_embedding(input_texts, max_length=8192, padding=True, truncation=True, return_tensors='pt').to(device)

    # Inference only: skip autograd bookkeeping so no graph is kept in memory.
    with torch.no_grad():
        outputs = model_embedding(**batch_dict)

    embeddings = outputs.last_hidden_state[:, 0]
    embeddings = F.normalize(embeddings, p=2, dim=1).cpu().numpy()

    return embeddings

In [24]:
import numpy as np
from tqdm import tqdm

# NOTE(review): these two lists are not used anywhere in the visible notebook;
# kept in case a later cell reads them.
sub_video_name = []
sub_scene_range = []

# Embed every translated chunk, then stack once. Growing the matrix with
# np.concatenate inside the loop copies it on every step, and saving inside
# the loop rewrites the file once per chunk.
embedding_batches = [embed_script(en_txt) for en_txt in tqdm(total_output_clean)]

# The leading empty (0, 1024) block keeps the result a float64 array of width
# 1024 even when there is nothing to embed.
total_embeddings = np.concatenate([np.empty([0, 1024])] + embedding_batches, axis=0)

np.save(f"{working_path}/total_text_embeddings.npy", total_embeddings)

100%|██████████| 62/62 [00:05<00:00, 12.28it/s]


In [25]:
# Sanity check: reload the saved matrix from disk and report its shape.
print(f"Shape of total_text_embeddings.npy: {np.load(f'{working_path}/total_text_embeddings.npy').shape}")

Shape of total_text_embeddings.npy: (62, 1024)


# Database search

In [8]:
import random
import numpy as np
import pickle


## Randomly show one chunk in Vietnamese
with open(f"{working_path}/total_chunks.pkl", "rb") as f:
    vi_total_chunks = pickle.load(f)

# randrange excludes the upper bound; randint(0, len(...)) includes it and can
# therefore pick an index one past the end of the list.
idx_rand = random.randrange(len(vi_total_chunks))
print(f"idx_rand: {idx_rand} \n\n")
print(vi_total_chunks[idx_rand],"\n\n")


## Show the same chunk in English
with open(f"{working_path}/total_output_clean.pkl", "rb") as f:
    en_total_chunks = pickle.load(f)
print(en_total_chunks[idx_rand],"\n\n")


## Show the URL the chunk came from
with open(f"{working_path}/total_url_chunks.pkl", "rb") as f:
    total_url_chunks = pickle.load(f)
print(total_url_chunks[idx_rand])

idx_rand: 29 


Vợ ông Trump ủng hộ quyền phá thai của phụ nữ 
Trái với quan điểm của chồng, người ủng hộ lệnh cấm phá thai, bà Melania Trump cho rằng phụ nữ cần có quyền tự quyết với cơ thể của mình.
"Cần đảm bảo phụ nữ có quyền tự chủ, tự quyết vấn đề sinh đẻ dựa trên quan điểm của bản thân mà không bị chính quyền can thiệp hay gây áp lực", Guardian ngày 2/10 dẫn đoạn trích trong hồi ký sắp xuất bản của bà Melania Trump, vợ của cựu tổng thống Donald Trump.
Quan điểm này của bà khác với ông Trump, người đang là ứng viên tổng thống đảng Cộng hòa. Trong chiến dịch tranh cử, ông Trump thường tự hào giới thiệu quyết định đề cử ba thẩm phán vào Tòa án Tối cao của ông đã mở đường để cơ quan này năm 2022 đảo ngược phán quyết công nhận quyền phá thai của phụ nữ.
Ít nhất 20 bang ở Mỹ từ đó đã ban hành quy định cấm hoàn toàn hoặc hạn chế một phần độ tuổi phá thai, trong đó bang Georgia cấm các ca phá thai trên 6 tuần tuổi.
Phá thai là chủ đề nóng trước thềm cuộc bầu cử tổng thống Mỹ năm nay. Cá

In [9]:
# Peek at the first five per-chunk URLs (each entry still carries its trailing newline).
print(total_url_chunks[:5])

['https://vnexpress.net/tai-xe-bo-tron-khi-xe-buyt-cho-hoc-sinh-thai-lan-boc-chay-4799250.html\n', 'https://vnexpress.net/so-nguoi-chet-o-my-do-bao-helene-tang-len-189-4799578.html\n', 'https://vnexpress.net/israel-iran-dau-khau-tai-hoi-dong-bao-an-4799587.html\n', 'https://vnexpress.net/phat-hien-nhieu-binh-khi-nen-trai-phep-tren-xe-buyt-boc-chay-4799689.html\n', 'https://vnexpress.net/tan-thu-tuong-nhat-sap-cong-bo-chi-so-hanh-phuc-4799939.html\n']


In [15]:
# Load the saved chunk embeddings; this matrix is what the question is compared against.
vector_database = np.load(f"{working_path}/total_text_embeddings.npy")
question = """Who launch the missle to israel recently"""

# embed_script returns one row per input text; take the single row for the question.
question_embedding = embed_script(question)[0]
print(question_embedding)

[-0.04899779  0.0121     -0.01829863 ...  0.06795684 -0.0185944
 -0.00973391]


In [16]:
def cosine_similarity(query_vector, vectors):
    """Cosine similarity between one query vector and every row of ``vectors``.

    Args:
        query_vector: 1-D array of length ``d``.
        vectors: 2-D array of shape ``(n, d)``.

    Returns:
        numpy.ndarray: ``n`` similarities in ``[-1, 1]``. A zero-length row (or
        a zero-length query) gets similarity 0 instead of NaN, so it can never
        be ranked above real matches by a later sort.
    """
    # Lower bound for the norms; only affects (near-)zero vectors.
    eps = 1e-12

    # Normalize the vectors
    vector_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    vectors_norm = vectors / np.maximum(vector_norms, eps)
    query_vector_norm = query_vector / max(np.linalg.norm(query_vector), eps)

    # Compute cosine similarity (dot product between query and vectors)
    cosine_sim = np.dot(vectors_norm, query_vector_norm)

    return cosine_sim

def top_k_similar_vectors(query_vector, vectors, k):
    """Return the ``k`` rows of ``vectors`` most similar to ``query_vector``.

    Args:
        query_vector: 1-D query embedding.
        vectors: 2-D array with one candidate embedding per row.
        k: Number of results wanted. Values larger than the number of rows
            return every row; ``k <= 0`` returns no rows.

    Returns:
        tuple: ``(indices, similarities)``, both ordered from most to least
        similar.
    """
    # Calculate cosine similarities
    similarities = cosine_similarity(query_vector, vectors)

    # Sort descending first and then cut: slicing with [-k:] would return the
    # whole array for k == 0 because -0 is 0.
    top_k_indices = np.argsort(similarities)[::-1][:max(k, 0)]

    return top_k_indices, similarities[top_k_indices]

In [17]:
# Rank the stored chunk embeddings against the question embedding.
k = 5  # Top 5 most similar vectors
top_k_indices, top_k_similarities = top_k_similar_vectors(question_embedding, vector_database, k)

print(f"Top {k} indices: {top_k_indices}")
print(f"Cosine similarities of top {k}: {top_k_similarities}")

Top 5 indices: [ 8 21 20 43 28]
Cosine similarities of top 5: [0.74986695 0.74777352 0.71452605 0.70339389 0.69696549]


In [31]:
# URLs of the articles that own the best-matching chunks, de-duplicated.
total_url_matchs = list({total_url_chunks[index] for index in top_k_indices})

# Every chunk index that belongs to one of those articles (not only the top-k hits).
url_idxs = [
    position
    for position, chunk_url in enumerate(total_url_chunks)
    if chunk_url in total_url_matchs
]

# English text of those chunks, plain and with the source link appended.
total_text = [en_total_chunks[position] for position in url_idxs]

total_text_with_link = [
    f"{en_total_chunks[position]}, link: {total_url_chunks[position].strip()}"
    for position in url_idxs
]

In [32]:
# Display the retrieved chunks together with their source links.
total_text_with_link

['Houthi announced the launch of a series of cruise missiles into Israel The Houthi Force announced the launch of three long-range cruise missiles deep into Israeli territory shortly after the large-scale Iranian strike. "The Yemeni missile force has launched an offensive against military installations deep within Israeli territory, using three Quds-5 cruise missiles. The missiles have successfully reached their targets and the enemy remains secretive about the results of the strike," Yahya Saree, the Yemeni Houthi Force spokeswoman, said today. The announcement was made almost a day after the Islamic Revolutionary Guard Corps (IRGC) launched nearly two hundred ballistic missiles against a series of military bases around Tel Aviv and other Israeli areas, in retaliation for the deaths of Hezbollah leader Hassan Nasrallah, Hamas leader Ismail Haniyeh, and IRGC Brigadier General Abbas Nilforoushan. The Houthi spokesman congratulated Iran on the Israeli raid, stating he was willing to part

# LLM model

## Log in to the Hugging Face Hub

In [33]:
# from huggingface_hub import notebook_login
# notebook_login()

import os

from huggingface_hub import login

# Never paste an access token into the notebook: it ends up in the saved file
# and in version control. Any token that was previously written in this cell
# must be treated as leaked and revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable instead.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # No token configured: fall back to the interactive login prompt.
    login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Gemma 2 2B

### Using Hugging Face Transformers

In [35]:
# pip install accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the instruction-tuned chat model and its tokenizer from the same checkpoint.
tokenizer_LLM = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model_LLM = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    device_map="auto",  # device placement is left to the library (see the accelerate note at the top of this cell)
    torch_dtype=torch.bfloat16,  # weights are loaded in bfloat16
)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

#### Using two chatbot roles

##### Context 1

In [37]:
# Turn list to string
def list_to_string(input_list):
    """Render the items of ``input_list`` as one string.

    Each item is wrapped in double quotes and the quoted items are joined
    with ``"; "``. An empty list gives an empty string.
    """
    wrap_in_quotes = '"{}"'.format
    return '; '.join(map(wrap_in_quotes, input_list))

# Flatten the retrieved chunks (text + link) into one prompt-ready string.
context_with_url_list = list_to_string(total_text_with_link)
# Same flattening for the per-chunk URL list (an article's URL appears once per
# chunk, and each entry still carries its trailing newline).
url_list = list_to_string(total_url_chunks)

# print(result_context)

In [42]:
# Context 1: use the chunks retrieved from the crawled news as the LLM context.
context = context_with_url_list
# NOTE(review): self-assignment -- it has no effect beyond requiring that `url_list` already exists.
url_list = url_list

# context = """
# ["The spotlight collapse in HCMC for the Miss Cosmo 2024 pageant at Phu Tho Gymnasium in District 11 suddenly caused an injury to a technician. Nguyen Dong Hoa, director of Phu Tho Gymnasium, said on September 29 that the incident happened yesterday afternoon when technicians completed the main stage. At this time, the spotlight system from a height of over 5 meters suddenly collapsed. Many square, interlocking steel frames fell onto the stage, seats in the audience area. According to Tran Viet Bao Hoang, head of the organizing committee for Miss Cosmo 2024, the spotlight collapsed, causing a technician minor injury, while many others had external bruises, but no serious injury. The organizing committee is repairing to finish the stage for the rehearsal and the final night of the contest, which took place on October 2-5. Miss Cosmo 2024 is an international beauty contest organized by Vietnamese people for the first time, with the participation of representatives of over 50 countries and territories in the world. Phu Tho Gymnasium located on Ly Thuong Kiet Street, opened in 2003, has a capacity of up to 8,000 people. It often hosts large-scale sports, entertainment, music, and exhibition events. Dinh Van , link: https://vnexpress.net/sap-san-khau-cuoc-thi-hoa-hau-o-tp-hcm-4798164.html"; "The troops and military branches coordinated the launching of S-125 Pechora 2TM surface-to-air missiles, controlling Su-30 to drop bombs, Mi-17 helicopters to shoot rockets to destroy targets in Hanoi defense area exercises. On October 2, at the National Rifle School of Region 1 (TB1) Bac Giang province, 28 contacts of the troops and military branches coordinated the Hanoi defense area exercises. On October 2, at the National Rifle School of Region 1 (TB1) Bac Giang province, 28 contacts of the troops and military branches coordinated the Hanoi defense area exercises. Artillery soldiers prepared the shells for the first shot. 
The maneuvers consist of four stages, with the military and military cooperation with various forms of combat, using various types of weapons, equipment and techniques manufactured by the Army; using the application of science, information technology and direct video. Artillery soldiers prepare the shells for the first shot. The maneuvers consist of four stages, with the military and military cooperation with various forms of combat, using various types of weapons, equipment and techniques manufactured by the Army; using the application of science, information technology and direct video. Anti-aircraft artillery men deployed the high-issued artillery commanders. Each machine needs 5 people to perform the tasks. The general and defense area exercises are the highest training forms to improve the leadership, command, advisory and cooperation of the levels and forces; assessing the level, strength and combat readiness of the armed forces. Anti-aircraft artillery men deployed the high-issued artillery commanders. Each machine needs 5 people to perform the tasks. The general and defense area exercises are the highest training forms to improve the leadership, command, advisory and cooperation of the levels and forces; assessing the level, strength and combat readiness of the armed forces. After the order of the command of the exercise, the artillery soldiers fired 14 rounds of 57 mm guns at the target on the hill. After the order of the command of the exercise, the artillery soldiers fired 14 rounds of 57 mm guns at the target on the hill. After seeing the target, the two Mi-17 helicopters entered and fired 32 rockets, destroying the designated target. After seeing the target, the two Mi-17 helicopters entered and fired 32 rockets, destroying the designated target. The two Cobra Su-30MK2 of the 371st Air Division dropped two bombs at the target below. The two Cobra Su-30MK2 of the 371st Air Division dropped two bombs at the target below. 
After dropping the bombs, the Su-30 dropped the heat lead to avoid the missile and exited in two directions. After dropping the bomb, the Su-30 dropped the heat lead to avoid the missile and exited in two directions. The S-125 Pechora 2TM surface-to-air missile was launched to destroy the aerial target. Immediately after that, the artillery soldiers controlled the optical equipment to observe the target to see if it has been shot down. The Capital High Command coordinated many troops and military branches to fire combat maneuvers at the TB1 shooting school (Can Son commune, Luc Ngan district, Bac Giang province; Hoa Thach shooting school (Hoa Thach commune, Quoc Oai district); Dong Doi shooting school (Van Hoa commune, Ba Vi district); Xuan Bang shooting school (Bac Son commune, Soc Son district, Hanoi city)., link: https://vnexpress.net/phong-ten-lua-tha-bom-trong-dien-tap-khu-vuc-phong-thu-tp-ha-noi-4799547.html"; "Immediately after, the combatant maneuvers optical equipment to observe the target to see if it has been shot down or not. The Capital High Command coordinated many troops and branches to practice combat fire at the locations: TB1 shooting field (Cam Son commune, Luc Ngan district, Bac Giang province; Hoa Thach shooting field (Hoa Thach commune, Quoc Oai district); Dong Doi shooting field (Van Hoa commune, Ba Vi district); Xuan Bang shooting field (Bac Son commune, Soc Son district, Hanoi city) At the same time, the armored soldiers deploy the rehearsal formation. According to the Ministry of Defense, Hanoi City is in the general defense posture of the country, holding a particularly important strategic position. The exercise aims to build the provinces and centrally-run cities into solid defense areas in accordance with the strategy to protect the Fatherland in the new situation. At the same time, the armored soldiers deploy the rehearsal formation. 
According to the Ministry of Defense, Hanoi City is in the general defense posture of the country, holding a particularly important strategic position. The exercise aims to build the provinces and centrally-run cities into solid defense areas in accordance with the strategy to protect the Fatherland in the new situation. Many targets were destroyed. General Phan Van Giang, Minister of Defense, evaluated the successful exercise, the services, branches and infantry on the battlefield were coordinated well. The process of combat practice ensures absolute safety. Many targets were destroyed. General Phan Van Giang, Minister of Defense, evaluated the successful exercise, the services, branches and infantry on the battlefield were coordinated well. The process of combat practice ensures absolute safety. Giang Huy, link: https://vnexpress.net/phong-ten-lua-tha-bom-trong-dien-tap-khu-vuc-phong-thu-tp-ha-noi-4799547.html"; "Hanoi City defense exercises On October 2, the Capital High Command coordinated with many troops and branches to practice combat fire at the field in Quoc Oai, Ba Vi, Soc Son (Hanoi) and Bac Giang. According to the Capital High Command, the exercises were performed simultaneously at four shooting ranges: TB1 (Guang Ngan district, Bac Giang province), Hoa Thach (Hoa Thach commune, Quoc Oai district), Dong Doi (Dong Hoa commune, Ba Vi district) and Xuan Bang (Bac Son commune, Soc Son district)., link: https://vnexpress.net/dien-tap-khu-vuc-phong-thu-tp-ha-noi-4799519.html"; "Category 15 storm is about to enter the East Sea Typhoon Krathon with wind speed of 183 km/h (grade 15) will enter the East Sea in the next few hours, but does not affect the coast and mainland of Vietnam. The National Center for Hydro-Meteorological Forecasting said the storm is in the sea south of Taiwan Island (China), the strongest wind speed of 15, gusting above 17. In the next few hours, the storm is heading west-northwest at 10-15 km/h, hitting the northern South China Sea. 
The storm then moved slowly, changed direction to northwest then north-northeast, and made landfall on Taiwan island. "The storm is unlikely to affect the coastal areas and mainland of Vietnam," the Vietnamese meteorological agency said. International stations such as Japan, US Navy and Hong Kong generally said that Krathon hit Taiwan on October 2 and did not affect the mainland of Vietnam. The northeastern sea area of the North East Sea had strong winds at 8-9, waves 2-4 m high, then increased to 10-12, the area near the storm center at 13-15, gusting above 17, waves 7-9 m high. Vessels operating in the above dangerous areas are likely to be affected by thunderstorms, whirlwinds, strong winds, and high waves. The Institute of Hydro-Meteorological and Climate Change said that in the remaining months of this year, the number of storms and tropical depressions in the East Sea and affecting Vietnam is approximately higher than the average of many years, 4-5 in the East Sea and 2-3 affecting the mainland of Vietnam. Since the beginning of the year, there have been four storms in the East Sea, in which Yagi hit Quang Ninh on September 7, causing heavy rains to the entire northern region, triggering flash floods and landslides. Typhoon Yagi has killed 344 people and left them missing, causing economic damage of more than VND81 trillion. Particularly Lao Cai recorded 132 deaths, 19 missing; Cao Bang 55 deaths and 2 missing; Yen Bai 54 deaths; Quang Ninh 29 deaths. Gia Chinh , link: https://vnexpress.net/bao-cap-15-sap-vao-bien-dong-4798702.html"]
# """

# Only the last assignment takes effect; the first question is overwritten immediately.
question = "What country launches the missle to Israel?"
question = "Give me the news today"

##### Context 2

In [None]:
# Context 2: a fixed, hand-written article and a question about it. Running this
# cell replaces the `context` and `question` chosen in the Context 1 cells.
context = """Truong My Lan, chairwoman of property developer Van Thinh Phat, has denied using for her personal needs or for her company the VND445.75 trillion ($17.5 billion) she allegedly misappropriated from Saigon Commercial Bank. At her ongoing trial at the People's Court of Ho Chi Minh City, the property tycoon has been accused of using the money and raised by issuing fraudulent bonds for paying off debts, personal expenses and making investments. But she told the court she spent all the money on restructuring the bank. She also denied the accusation that she ordered the SCB management to falsify documents to help her withdraw money from the bank. The executives did it on their own, she said. She said all her family credit card expenses were paid using her money and not the money embezzled from SCB. Her husband, Eric Chu, who is also in the dock for "money laundering," admitted he spent VND33 billion on credit cards and Lan paid the bills. He claimed to be surprised when investigators said the money came from illegal sources. He said his family has turned in VND19 billion to pay for the losses caused to SCB. Lan faces charges of fraudulent appropriation of assets through the issuance of bonds, money laundering and illegal trans-border movement of money. Earlier this year she was sentenced to death for embezzling money from SCB between 2012 and 2022. She and her accomplices owe VND677 trillion to the bank they had obtained through 1,300 different loans. The court also found her guilty of bribery and violating banking regulations."""

question = "This woman and her accomplices owe the bank through how many loans, and her name?"

### LLM Magic

In [40]:
# Two-stage pipeline run against the module-level `question` and `context`:
#   1. Role 1 rewrites the user's question into a cleaner query.
#   2. Role 2 answers the rewritten query from the article in `context`.
# Results are left in `answer_query_1` (rephrased question) and
# `answer_query_2` (final answer).
from datetime import date

# Get the current date
current_date = date.today()
print(f"Date: {current_date}")  # Output: YYYY-MM-DD (e.g., 2024-08-02)

# Define the chat template using Role 1 (Prompting Specialist)
messages = [
    {"role": "user", "content": f"You are an expert in understanding user queries and rephrasing them. The original question is: {question}. Rephrase it clearly and concisely in 2 sentences for a QA chatbot to answer. Only return the rephrased question, no extra content or answers."},
]

# add_generation_prompt=True appends the assistant-turn header to the prompt so
# the model writes a reply instead of continuing from the end of the user turn.
input_ids_1 = tokenizer_LLM.apply_chat_template(conversation=messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(device)

outputs_1 = model_LLM.generate(**input_ids_1, max_new_tokens=128)
# generate() returns prompt + continuation; slice off the prompt tokens so only
# the reply is decoded. This avoids counting "<end_of_turn>" markers, which
# breaks as soon as the prompt or the reply contains a different number of them.
prompt_length_1 = input_ids_1["input_ids"].shape[-1]
decoded_output_1 = tokenizer_LLM.decode(outputs_1[0][prompt_length_1:], skip_special_tokens=True)
# The split is a safeguard in case the tokenizer does not treat "<end_of_turn>"
# as a special token; strip('*') drops markdown bold markers around the reply.
answer_query_1 = decoded_output_1.split("<end_of_turn>", 1)[0].strip().strip('*').strip()

print(f"Rephrase question: {answer_query_1}")

###############################################################

# Define the chat template using Role 2 (QA Chatbot)
messages = [
    {"role": "user", "content": f"The current date is {current_date} (YYYY-MM-DD format). You are a friendly AI chatbot that looks through the news article and provide answer for user. Answer the question in a natural and friendly tone under 200 words. Have to use Chain of Thought reasoning with no more than three steps but dont include it in the response to user. Here are the new article {context}, the user asks {answer_query_1}. YOU MUST INCLUDE THE LINK TO THE ARTICLE AT THE END OF YOUR ANSWER"},
]

# messages = [
#     {"role": "user", "content": f"The current date is {current_date} (YYYY-MM-DD format). You are a friendly AI chatbot that looks through the news article and provide answer for user. Answer the question in a natural and friendly tone under 200 words. Have to use Chain of Thought reasoning with no more than three steps but dont include it in the response to user. There are 2 type of questions you need to be aware of and decide what approach to take. **Type 1**: if the user don't know what news to read, with question likes 'Give me the summary of today news' or 'Is there any interesting news today',etc, you take the total urls from this file {url_list[:10]} and suggest users what news to read or what happen today. **Type 2**: If user ask a specific question that is not Type 1, looks through this news article {context} for an answer, and for Type 2 YOU MUST INCLUDE THE LINK TO THE ARTICLE AT THE END OF YOUR ANSWER. Here is what the user asks {answer_query_1}."},
# ]

input_ids_2 = tokenizer_LLM.apply_chat_template(conversation=messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(device)

outputs_2 = model_LLM.generate(**input_ids_2, max_new_tokens=1024)
# Same reply extraction as for Role 1: decode only the tokens after the prompt.
prompt_length_2 = input_ids_2["input_ids"].shape[-1]
decoded_output_2 = tokenizer_LLM.decode(outputs_2[0][prompt_length_2:], skip_special_tokens=True)
answer_query_2 = decoded_output_2.split("<end_of_turn>", 1)[0].strip().strip('*').strip()

print(f"Answer: {answer_query_2}")

###############################################################

# # Define the chat template using Role 3 (Format fixer)
# messages = [
#     {"role": "user", "content": f"Read this sentence and correct any wrong format then return me the result in Vietnamese. ONLY RETURN THE RESULT SENTENCES, the sentence is {answer_query_2}."},
# ]

# input_ids_3 = tokenizer_LLM.apply_chat_template(conversation=messages, return_tensors="pt", return_dict=True).to(device)

# outputs_3 = model_LLM.generate(**input_ids_3, max_new_tokens=150)
# decoded_output_3 = tokenizer_LLM.decode(outputs_3[0], skip_special_tokens=False)
# answer_query_3 = decoded_output_3.rsplit("<end_of_turn>", 2)[1].strip() # Because the output include the answer between 2 "<end_of_turn>"

# print(f"Formatted sentence: {answer_query_3}")

Date: 2024-10-03
Rephrase question: Which country is responsible for launching missiles at Israel?
Answer:  Iran is responsible for launching missiles at Israel. 

https://vnexpress.net/khoanh-khac-israel-danh-chan-loat-ten-lua-dan-dao-iran-4799221.html


##### Test another approach

In [53]:
from datetime import date


def ask_llm(prompt, max_new_tokens):
    """Send `prompt` as a single user turn and return the model's reply text.

    Uses the module-level `tokenizer_LLM`, `model_LLM` and `device`.

    Args:
        prompt: Full text of the user message.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The generated reply with surrounding whitespace and leading/trailing
        '*' (markdown bold markers) removed.
    """
    chat = [{"role": "user", "content": prompt}]
    # add_generation_prompt=True appends the assistant-turn header so the model
    # writes a reply instead of continuing from the end of the user turn.
    inputs = tokenizer_LLM.apply_chat_template(
        conversation=chat,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)
    generated = model_LLM.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation so
    # the reply does not have to be located by counting "<end_of_turn>" markers.
    prompt_length = inputs["input_ids"].shape[-1]
    reply = tokenizer_LLM.decode(generated[0][prompt_length:], skip_special_tokens=True)
    # The split is a safeguard in case the tokenizer does not treat
    # "<end_of_turn>" as a special token.
    return reply.split("<end_of_turn>", 1)[0].strip().strip('*').strip()


# Get the current date
current_date = date.today()
print(f"Date: {current_date}")  # Output: YYYY-MM-DD (e.g., 2024-08-02)

# Step 1: Role 1 (Prompting Specialist) rewrites the user's question.
rephrase_prompt = f"You are an expert in understanding user queries and rephrasing them. The original question is: {question}. Rephrase it clearly and concisely in 2 sentences for a QA chatbot to answer. Only return the rephrased question, no extra content or answers."
answer_query_1 = ask_llm(rephrase_prompt, max_new_tokens=128)

print(f"Rephrase question: {answer_query_1}")

###############################################################

# Step 2: Ask the model whether the rephrased question is Type 1 or Type 2.
classify_prompt = f"""You are a chat bot that will have to decide what type of question it is. There are 2 type of questions you need to be aware of and decide what approach to take. **Type 1**: if the user don't know what news to read, with question likes 'Give me the summary of today news' or 'Is there any interesting news today',etc. **Type 2**: If user ask a specific question about news that is not Type 1. The question can only be either type 1 or type 2, and it can't be both type. RESPOND "TYPE 1" IF THE QUESTION IS TYPE 1, RESPOND "TYPE 2" IF THE QUESTION IS TYPE 2. The question is: {answer_query_1}"""
question_type = ask_llm(classify_prompt, max_new_tokens=128)

print(f"Question Type: {question_type}")
print("-------------------------------")

###############################################################

# Substring match on purpose: the model may wrap the label in extra words.
if "type 1" in question_type.lower():
    # Step 3: Type 1 question -> news suggestion built from the crawled URL list.
    type_1_prompt = f"The current date is {current_date}. You are a friendly AI chatbot that looks through the urls file and summary the articles for users. The user asks for a summary of today's news. Here are the link of the top 20 articles with titles in Vietnamese : {url_list[:20]}. Give me the news summary of them."
    answer_type_1 = ask_llm(type_1_prompt, max_new_tokens=512)

    print(f"Type 1 Answer: {answer_type_1}")
else:
    # Step 4: anything else is treated as Type 2 -> answer from `context`.
    type_2_prompt = f"The current date is {current_date}. You are a friendly AI chatbot that looks through news articles to answer user questions. The user asks a specific question about this news: {answer_query_1}. Here's the news context: {context}. Please answer the question and include the link to the article at the end."
    answer_type_2 = ask_llm(type_2_prompt, max_new_tokens=512)

    print(f"Type 2 Answer: {answer_type_2}")


Date: 2024-10-03
Rephrase question: What are the latest news headlines today?
Question Type: Answer:** TYPE 1
-------------------------------
Type 1 Answer: Please note that I cannot access external websites or specific files, including URLs.* 

I can, however, provide you with a general summary of common news topics that are likely to be covered on a day like today. 

**Here are some potential news topics for today's news:**

* **Politics:**  Local and national elections, political scandals, government policies, and international relations.
* **Economy:**  Stock market fluctuations, inflation, unemployment rates, business news, and economic forecasts.
* **Society:**  Social issues, crime rates, education, healthcare, and cultural events.
* **Technology:**  New gadgets, software updates, artificial intelligence, cybersecurity, and space exploration.
* **Sports:**  Major sporting events, athlete news, team performance, and sports analysis.
* **Entertainment:**  Movie releases, music new

### Translate back to Vietnamese

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint used to translate the English answer back to Vietnamese.
model_name_translate = "VietAI/envit5-translation"

tokenizer_translate = AutoTokenizer.from_pretrained(model_name_translate)

# Load the seq2seq weights, then move them to the device chosen at the top of
# the notebook.
model_translate = AutoModelForSeq2SeqLM.from_pretrained(model_name_translate)
model_translate = model_translate.to(device)

In [None]:
import re

# Split the LLM answer into its prose (`answer_without_url`) and the links it
# cites (`urls`) so that only the prose is sent to the translation model.
text = answer_query_2

# Regular expression pattern to extract URLs
url_pattern = r'https?://[^\s]+'

# The pattern runs up to the next whitespace, so it also swallows punctuation
# written right after a link, e.g. "(https://...)." — trim that from each URL.
trailing_punctuation = '.,;:!?)]}>"\''

# Remove every URL from the prose and drop the whitespace it leaves behind
answer_without_url = re.sub(url_pattern, '', text).strip()
urls = [found.rstrip(trailing_punctuation) for found in re.findall(url_pattern, text)]

# Print the first extracted URL; the model is asked to include a link but does
# not always comply, so an empty list must not raise IndexError here.
if urls:
    print(urls[0])
else:
    print("No URL found in the answer")

In [None]:
###############################################################
# TRANSLATE BACK TO VIETNAMESE

# The source text is sent with an "en: " language tag in front of it.
input_translate = f"en: {answer_without_url}"
translate_input_ids = tokenizer_translate(input_translate, return_tensors="pt", padding=True).input_ids.to(device)
output_encodes = model_translate.generate(translate_input_ids, max_length=1024)
output = tokenizer_translate.batch_decode(output_encodes, skip_special_tokens=True)

# Drop a leading "vi:" language tag when present. If the output has no such tag,
# keep the full text instead of raising IndexError (no colon at all) or cutting
# the sentence at an unrelated colon.
translated_text = output[0]
language_tag, separator, remainder = translated_text.partition(":")
if separator and language_tag.strip().lower() == "vi":
    vietnamese_answer = remainder.strip()
else:
    vietnamese_answer = translated_text.strip()

# The answer may not contain a link, so guard the lookup.
link = urls[0] if urls else "(no link found)"
print(f"Vietnamese answer: {vietnamese_answer}\n\nLink: {link}")

# Evaluation

In [5]:
# Rebind the shared path variables for the evaluation section.
# `crawl_result_path` now points at a Kaggle input dataset directory instead of
# the `result` folder under the working directory.
# NOTE(review): `working_path` ends with "/" here but has no trailing slash
# where it is first set at the top of the notebook — confirm that later path
# joins expect this form.
crawl_result_path = "/kaggle/input/newsqa-200/stories"
working_path = "/kaggle/working/"

## Chunking

## Embedding