In [1]:
!pip install llama-index-core
!pip install llama_index
!pip install llama-index-llms-openai
!pip install matplotlib
!pip install spacy
!pip install pyvi



In [2]:
!pip install python-dotenv



In [23]:
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
    SemanticDoubleMergingSplitterNodeParser,
    LanguageConfig,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import SimpleDirectoryReader
import os
from dotenv import load_dotenv

# Take environment variables from a local .env file and merge them into
# os.environ.
load_dotenv()

# Fail fast with an actionable message when the key is missing.
# Copying os.getenv(...) straight back into os.environ is a no-op when the
# variable exists and raises an opaque "TypeError: str expected, not NoneType"
# when it does not, so an explicit check is used instead.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [4]:
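# Install the Vietnamese spaCy model `vi_core_news_lg` (v3.6.0, ~233 MB download).
# Uncomment and run the line below once per environment; the `spacy.load('vi_core_news_lg')` call later in this notebook requires it.
# !pip install https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz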

Collecting https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz
  Using cached https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz (233.3 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [5]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK resources this notebook relies on: the `stopwords` corpus
# and the `punkt_tab` resource. The value displayed under the cell is the
# return value of the last call.
# NOTE(review): presumably both are consumed internally by the llama_index
# splitter used further down — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /Users/ngocp/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /Users/ngocp/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [24]:
import spacy

# Sanity check: load the Vietnamese spaCy pipeline and run it on a short phrase.
nlp = spacy.load('vi_core_news_lg')
doc = nlp('Cộng đồng xử lý ngôn ngữ tự nhiên')

# One output line per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, orthographic shape, is-alphabetic flag, is-stop-word flag.
for token in doc:
    token_fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(*token_fields)

Cộng đồng   V ROOT Xxxx xxxx False False
xử lý   N obj xx xx False False
ngôn ngữ   V xcomp xxxx xxx False False
tự nhiên   N compound xx xxxx False False


In [25]:
from typing import Dict, List

# Load the source documents from the local ./data directory.
data_reader = SimpleDirectoryReader(input_dir="./data")
documents = data_reader.load_data()

# spaCy model names accepted for each supported language name.
# The "vietnamese" entry is the addition that makes the Vietnamese model usable.
LANGUAGE_MODELS: Dict[str, List[str]] = dict(
    english=["en_core_web_md", "en_core_web_lg"],
    german=["de_core_news_md", "de_core_news_lg"],
    spanish=["es_core_news_md", "es_core_news_lg"],
    vietnamese=["vi_core_news_lg"],
)

class VietnameseLanguageConfig(LanguageConfig):
    """Language configuration validated against this notebook's ``LANGUAGE_MODELS``.

    Behaves like ``LanguageConfig`` but checks the requested language and
    spaCy model against the module-level ``LANGUAGE_MODELS`` table, which
    includes a ``"vietnamese"`` entry.
    """

    def __init__(
        self,
        language: str = "english",
        spacy_model: str = "en_core_web_md",
        model_validation: bool = True,
    ) -> None:
        """Store the configuration and optionally validate the model name.

        Args:
            language: Key into ``LANGUAGE_MODELS`` (e.g. ``"vietnamese"``).
            spacy_model: Name of the spaCy model to use for ``language``.
            model_validation: When true, ``validate_model`` is run immediately.

        Raises:
            ValueError: If ``language`` is not a key of ``LANGUAGE_MODELS``, or
                if validation is enabled and ``spacy_model`` is not listed for
                that language.
        """
        # The base initialiser is intentionally not invoked here.
        # NOTE(review): presumably because it validates against the library's
        # own language table, which has no Vietnamese entry — confirm.
        if language not in LANGUAGE_MODELS:
            raise ValueError(
                f"{language} language is not supported yet! Available languages: {list(LANGUAGE_MODELS.keys())}"
            )
        self.language = language
        self.spacy_model = spacy_model
        self.model_validation = model_validation
        # Because the base __init__ is skipped, create the instance state it
        # would normally set up so the object is never half-initialised.
        # NOTE(review): upstream LanguageConfig is understood to define `nlp`
        # and `stopwords`; verify against the installed llama-index-core.
        self.nlp = None
        self.stopwords: List[str] = []
        if self.model_validation:
            self.validate_model()

    def validate_model(self) -> None:
        """Check that ``self.spacy_model`` is listed for ``self.language``.

        Raises:
            ValueError: If the model is not in ``LANGUAGE_MODELS[self.language]``.
        """
        if self.spacy_model not in LANGUAGE_MODELS[self.language]:
            raise ValueError(
                f"{self.spacy_model} is not a valid model for {self.language}. Available models: {LANGUAGE_MODELS[self.language]}"
            )

# Language settings for the Vietnamese spaCy pipeline.
config = VietnameseLanguageConfig(
    language="vietnamese",
    spacy_model="vi_core_news_lg",
)

# Similarity thresholds and chunk-size cap forwarded to the splitter; see the
# llama_index documentation for the exact meaning of each parameter.
splitter_settings = dict(
    initial_threshold=0.4,
    appending_threshold=0.5,
    merging_threshold=0.5,
    max_chunk_size=5000,
)

splitter = SemanticDoubleMergingSplitterNodeParser(
    language_config=config,
    **splitter_settings,
)

In [26]:
# Report how many Document objects were loaded.
document_total = len(documents)
print(document_total)

2


In [27]:
# Split the loaded documents into nodes with the configured splitter and
# report how many nodes were produced.
nodes = splitter.get_nodes_from_documents(documents)
node_total = len(nodes)
print(node_total)

  ).similarity(
  ).similarity(
  current_nlp.similarity(
  and current_nlp.similarity(
  and current_nlp.similarity(


89


In [29]:
# Build a vector index over the split nodes, then expose it through a query
# engine created with the index's default settings.
vector_index = VectorStoreIndex(nodes=nodes)
query_engine = vector_index.as_query_engine()

In [30]:
# Question put to the index (in Vietnamese). Roughly: "Which ages are
# auspicious for xông đất (first visitor of the Lunar New Year)? Which birth
# years do they include?"
text = "Tuổi xông đất tốt ? Gồm những năm nào ?"
# Alternative query kept for experimentation:
# text = "đánh giá: Tuổi xông đất 1939 (Kỷ Mão - mệnh Hỏa) ?"

# Run the query and show the synthesized answer.
response = query_engine.query(text)
print(response)

Tuổi xông đất tốt là những năm sau đây: 1976 (Bính Thìn), 1947 (Đinh Hợi), 1972 (Nhâm Tý), 2003 (Quý Mùi), 2002 (Nhâm Ngọ), 1960 (Canh Tý), 1942 (Nhâm Ngọ), 1943 (Quý Mùi), 1951 (Tân Mão), và 1939 (Kỷ Mão).
