In [1]:
from llama_index.core.node_parser import (
    SemanticSplitterNodeParser,
    HierarchicalNodeParser
)
from llama_index.core import (
    Settings,
    VectorStoreIndex,
    StorageContext,
    SimpleDirectoryReader, load_index_from_storage,
)
from llama_index.core.node_parser import get_leaf_nodes, get_root_nodes
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.bedrock import BedrockEmbedding
from llama_index.llms.bedrock import Bedrock
from llama_index.core import Settings

from llama_index.core import Document
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.question_gen.prompts import (
    DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from qdrant_client import QdrantClient
from llama_index.vector_stores.qdrant import QdrantVectorStore

from llama_index.core.ingestion import IngestionPipeline
from dotenv import load_dotenv

import json
import os
import boto3
import docx, jaconv, re

In [2]:
load_dotenv()

# llm = Bedrock(
#     model="us.anthropic.claude-3-haiku-20240307-v1:0",
#     region_name="us-west-2", 
#     max_tokens=512,
#     temperature=0
# )

llm = GoogleGenAI(
    model_name="gemini-2.0-flash",
    temperature=0.1,
)

embed_model = BedrockEmbedding(
    model_name="amazon.titan-embed-text-v2:0",
    region_name=os.getenv("AWS_REGION"),
)

Settings.llm = llm
Settings.embed_model = embed_model

## Data preprocessing

In [3]:
docx_1_path = r'client data\JCAI３０１ＭＫ.docx'
docx_2_path = r'client data\ＪＣＡＩ３０２ＭＫ構造.docx'
docx_3_path = r'client data\ＪＣＡＩ２０２デシル顧客.docx'
docx_4_path = r'client data\ＪＣＡＩ２０４CRM.doc'
docx_5_path = r'client data\example.docx'

In [10]:
def clean_docx_text(text):
    text = text.replace('\ufeff', '')

    text = text.replace('\t', ' ').replace('\u3000', ' ')

    text = re.sub(r'[ ]{2,}', ' ', text)

    text = text.replace('。', '.').replace('、', ',').replace('「', '"').replace('」', '"')

    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
        line = line.strip()
        if line != '':
            cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)

def read_doc_file(file_path):
    try:
        doc = docx.Document(file_path)
        full_text = []

        for para in doc.paragraphs:
            text = para.text.strip()
            # Convert full-width to half-width
            text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
            full_text.append(text)

        raw_text = '\n'.join(full_text)
        final_text = clean_docx_text(raw_text)

        print("--- Nội dung file DOCX ---")
        print(final_text)
        print("--------------------------")

        return final_text

    except FileNotFoundError:
        print(f"Lỗi: Không tìm thấy file tại đường dẫn '{file_path}'")
    except Exception as e:
        print(f"Đã xảy ra lỗi: {e}")

In [11]:
doc1 = read_doc_file(docx_1_path)
doc5 = read_doc_file(docx_5_path)

--- Nội dung file DOCX ---
マーケティングとは何か 用語 用語 書籍から
マーケティングって何か? What
なぜマーケティング? Why
どうしたらマーケターになれる? How
What とは マーケティングは需要を創造すること
Why とは マーケティングは会社を成長するため
How とは 需要を創造するため 顧客ニーズを理解すること
知っておきたい基本的なマーケティング用語
セグメンテーション
セグメンテーションとは,市場や顧客を,ニーズや特性に基づいてグループ分け,細分化,識別区分すること.マーケティング戦略を立てる場合どの顧客タイプに区分し,そこに焦点を当てる.
その活動を効果的に実施するために,マーケットのセグメンテーションを行う.多様化する消費者の価値観に,効果効率的に対応するためのマーケティングである.
具体的には,年齢,性別,職業,趣味嗜好,購買履歴,居住地などの属性や,顧客ニーズ,価値観,などに基づいて,顧客をグループ化しセグメンテーションする.セグメンテーションは,顧客タイプセグメンテーションと地域セグメンテーションがある.他にライフステージやライフスタイルというセグメンテーションがある.
ターゲティング
ターゲティングとは,セグメンテーションした個々のセグメントのどのセグメント,どのセグメントとどのセグメントにターゲットするか,狙いを定めること.
ターゲティングするメリットは,1つは費用対効果の向上.ターゲットを絞ることで,無駄な広告費や営業コストを削減し,より効率的に売上を伸ばすことができる.
メリット2つは顧客ニーズへの対応.顧客ニーズに的確に対応できること.
メリット3つは競争優位性の確立ができること.競合他社が狙っていない,または十分に対応できていない市場を狙うことで,競争優位性を確立できること.
ポジショニング
ポジショニングとは,マーケットで自社の商品やサービスが,顧客の心の中にどんなメッセージで記憶させるかのキーワード,自社の立つ位置を明確にすること.石鹸販売なら女性に"美肌効果",男性に"体臭予防"など.競合他社と比較してどのような位置づけで顧客に記憶させるか,その草案がポジショニングという.販売している商品の価値を,一言で脳に刻み込むことができるように.
リレーションシップ
リレーションシップとは,

In [12]:
reader = SimpleDirectoryReader(
    input_dir=str(r"C:\Users\Admin\Desktop\Advanced-RAG\client data"),
    recursive=True,
    required_exts=[".docx", ".doc"]
)

full_documents = reader.load_data()

## Chunking

### Semantic Chunking

In [None]:
# Create the semantic splitter with Bedrock embeddings
splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=2,
    breakpoint_percentile_threshold=80,
)

documents = [Document(text=doc1)]

# Split the documents into nodes
nodes = splitter.get_nodes_from_documents(documents)

print(f"Created {len(nodes)} semantic nodes")
for i, node in enumerate(nodes):
    print(f"Node {i+1}:")
    print(f"Text: {node.text}")
    print(f"Length: {len(node.text)} characters")
    print("-" * 50)

## Metadata Extraction

In [None]:
# Title extraction templates
DEFAULT_TITLE_NODE_TEMPLATE = """\
You are an expert at analyzing marketing documents.
Your task is to extract a clear, concise title from the provided content.

The prompt is related to marketing documents:
{context_str}
"""

DEFAULT_TITLE_COMBINE_TEMPLATE = """\
You are an expert at synthesizing document titles from multiple candidate suggestions.
Your task is to create the most comprehensive and accurate title for this document 
based on the candidate titles and the provided content.

{context_str}
Title: """

# Summary extraction template
DEFAULT_SUMMARY_EXTRACT_TEMPLATE = """\
You are an expert at summarizing and identifying the most important topics in marketing-related content.
Your task is to summarize the key topics and entities from the following section:

{context_str}

Summary: """

# Question generation template
DEFAULT_QUESTION_GEN_TMPL = """\
You are an expert at generating precise, context-aware questions from text.
Your task is to create {num_questions} questions that this content can answer specifically,
and that are unlikely to be answerable from generic knowledge.

Here is the context:
{context_str}

Higher-level summaries of surrounding context may also be provided.
Use them to generate better, more relevant questions.
"""

# Keyword extraction template
DEFAULT_KEYWORD_EXTRACT_TEMPLATE = """\
You are an expert at extracting meaningful and unique keywords from documents.
Your task is to provide {keywords} unique keywords for this document.

{context_str}
Format as comma-separated values.
Keywords: """


extractors = [
    TitleExtractor(nodes=1, llm=llm, 
                   node_template=DEFAULT_TITLE_NODE_TEMPLATE, 
                   combine_template=DEFAULT_TITLE_COMBINE_TEMPLATE),

    QuestionsAnsweredExtractor(questions=3, llm=llm,
                               prompt_template=DEFAULT_QUESTION_GEN_TMPL),

    SummaryExtractor(summaries=["self"], llm=llm,
                     prompt_template=DEFAULT_SUMMARY_EXTRACT_TEMPLATE),
                     
    KeywordExtractor(keywords=3, llm=llm,
                     prompt_template=DEFAULT_KEYWORD_EXTRACT_TEMPLATE),
]

transformations = [splitter] + extractors

pipeline = IngestionPipeline(transformations=transformations)

knowledges_nodes = pipeline.run(documents=documents)

100%|██████████| 1/1 [00:02<00:00,  2.46s/it]
100%|██████████| 57/57 [01:59<00:00,  2.10s/it]  
100%|██████████| 57/57 [02:54<00:00,  3.06s/it]  
100%|██████████| 57/57 [02:44<00:00,  2.89s/it]  


In [None]:
knowledges_nodes

[TextNode(id_='f9e0f3bc-8de7-4d35-835e-e8a54c4e1717', embedding=None, metadata={'document_title': 'Based on the provided content, the most comprehensive and accurate title for this document would be:\n\n"The Significance of Market Segmentation in Developing Effective Marketing Strategies"\n\nThis title effectively captures the core focus of the content, which is to explain the concept of market segmentation and its importance in designing and implementing successful marketing strategies. The title highlights the key aspects covered, including the fundamentals of market segmentation and its crucial role in developing effective marketing approaches.', 'questions_this_excerpt_can_answer': 'Based on the provided content, here are 3 context-aware questions that are unlikely to be answerable from generic knowledge:\n\n1. What are the key aspects of market segmentation that the document highlights as crucial for developing effective marketing strategies?\n\n2. How does the document define mar

## VectorIndex

In [None]:
question_gen = LLMQuestionGenerator.from_defaults(
    llm=llm,
    prompt_template_str="""
        Follow the example, but instead of giving a question, always prefix the question 
        with: 'By first identifying and quoting the most relevant sources, '. 
        """
    + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)

index = VectorStoreIndex(nodes=knowledges_nodes)

engine = index.as_query_engine(similarity_top_k=3, llm=llm, verbose=True)

## Retriever

In [139]:
final_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name="marketing_knowledge_base",
                description="Provides answers and insights from a diverse set of documents.",
            ),
        )
    ],
    question_gen=question_gen,
    use_async=True,
)

response = final_engine.query(
    """
    What are the Six conditions for relationship marketing
    Give your answer as a JSON.
    """
)

data = json.loads(response.response)
data

Generated 1 sub questions.
[1;3;38;2;237;90;200m[marketing_knowledge_base] Q: What are the six conditions for relationship marketing according to the marketing knowledge base?
[0m[1;3;38;2;237;90;200m[marketing_knowledge_base] A: According to the provided context, the six key conditions or characteristics of relationship marketing are:

1. Mutual cooperation and shared joy between customers and retailers
2. Developing personnel who can realize customer needs
3. Preparing a system environment to realize customer needs 
4. Continuous collaborative efforts with customers
5. Maximizing the total purchase amount (lifetime value) of customers
6. Realizing the desired customer needs (wants)

The text states that these six conditions or features define what relationship marketing entails in the context of this marketing knowledge base.
[0m

{'conditions': ['Mutual cooperation and shared joy between customers and retailers',
  'Developing personnel who can realize customer needs',
  'Preparing a system environment to realize customer needs',
  'Continuous collaborative efforts with customers',
  'Maximizing the total purchase amount (lifetime value) of customers',
  'Realizing the desired customer needs (wants)']}

In [140]:
for node in response.source_nodes:
    print("Node ID:", node.node.node_id)
    print("Text:", node.node.text)
    print("-" * 50)

Node ID: a175cbb9-e6fc-4fa3-ab85-2ee90cc757c7
Text: Sub question: What are the six conditions for relationship marketing according to the marketing knowledge base?
Response: According to the provided context, the six key conditions or characteristics of relationship marketing are:

1. Mutual cooperation and shared joy between customers and retailers
2. Developing personnel who can realize customer needs
3. Preparing a system environment to realize customer needs 
4. Continuous collaborative efforts with customers
5. Maximizing the total purchase amount (lifetime value) of customers
6. Realizing the desired customer needs (wants)

The text states that these six conditions or features define what relationship marketing entails in the context of this marketing knowledge base.
--------------------------------------------------
Node ID: 2a743435-d855-45de-8a1b-41813710ef9f
Text: 2つは,顧客との会話,顧客の意見収集,買上データから顧客ニーズの想像.
3つは,以上を実現するためにレシートデータを蓄積すること.
リレーションシップ･マーケティングとは,
顧客から顧客の生涯価値(=ライフタイムバリュー)をで

### Recursive Retriever

In [None]:
metadata_dicts = []
for extractor in extractors:
    metadata_dicts.extend(extractor.extract(knowledges_nodes))

In [None]:
metadata_dicts

In [55]:
# cache metadata dicts
def save_metadata_dicts(path):
    with open(path, "w") as fp:
        for m in metadata_dicts:
            fp.write(json.dumps(m) + "\n")


def load_metadata_dicts(path):
    with open(path, "r") as fp:
        metadata_dicts = [json.loads(l) for l in fp.readlines()]
        return metadata_dicts

save_metadata_dicts("data/llama2_metadata_dicts.jsonl")
metadata_dicts = load_metadata_dicts("data/llama2_metadata_dicts.jsonl")


In [None]:
import copy
from llama_index.core.schema import IndexNode

all_nodes = copy.deepcopy(knowledges_nodes)
for idx, d in enumerate(metadata_dicts):
    inode_d = IndexNode(
        text=d["document_title"], index_id=knowledges_nodes[idx].node_id
    )
    inode_q = IndexNode(
        text=d["questions_this_excerpt_can_answer"], index_id=knowledges_nodes[idx].node_id,
    )
    inode_s = IndexNode(
        text=d["section_summary"], index_id=knowledges_nodes[idx].node_id
    )
    inode_k = IndexNode(
        text=d["excerpt_keywords"], index_id=knowledges_nodes[idx].node_id
    )
    all_nodes.extend([inode_q, inode_s, inode_k])

In [None]:
from llama_index.core.retrievers import RecursiveRetriever

retriever = RecursiveRetriever(
    retriever=index.as_retriever(similarity_top_k=2),  # base retriever
    verbose=True
)

# Retrieve results
results = retriever.retrieve("What are the terms i need to know to succeed in CRM")
for r in results:
    print(r.node.text)

### Auto Merging Recursive

In [16]:
# documents_1 = [Document(text=doc5)]

node_parser = HierarchicalNodeParser.from_defaults(
                    # chunk_sizes=[8192, 4096, 1024],
                    chunk_sizes=[2048, 512, 128],
                    chunk_overlap=20
                )
nodes = node_parser.get_nodes_from_documents(full_documents)

In [None]:
leaf_nodes = get_leaf_nodes(nodes)
root_nodes = get_root_nodes(nodes)

docstore = SimpleDocumentStore()

# insert nodes into docstore
docstore.add_documents(nodes)

# define storage context (will include vector store by default too)
storage_context = StorageContext.from_defaults(docstore=docstore)

base_index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
)

base_retriever = base_index.as_retriever(similarity_top_k=10)
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine

query_str = (
    "What are the six conditions for relationship marketing"
)

query_engine = RetrieverQueryEngine.from_args(retriever)

response = query_engine.query(query_str)
print(str(response))


> Merging 2 nodes into parent node.
> Parent node id: 6a876534-8a43-4e25-aaee-c3c527e93a41.
> Parent node text: その他マーケティングに関し貴重な知識を与えてくれたマーケティング書籍
『カスタマー･ロイヤルティ』 ジル・グリフィン著書
原書は『Customer Loyalty』
邦訳タイトルは『顧客はなぜ,...

> Merging 3 nodes into parent node.
> Parent node id: ddc663a1-92f7-45d6-b77e-b3effed8342f.
> Parent node text: CRMはソフトウエアの話ではない.マーケティング話である.会社の営業体制の改革であり,マーケティングのパ大転換の話だ.
他社に差別化し,成長する顧客マーケティング=CRMを実現するためには,成功...

> Merging 1 nodes into parent node.
> Parent node id: 541055ef-b085-4715-abf8-edeafc2409dd.
> Parent node text: その他マーケティングに関し貴重な知識を与えてくれたマーケティング書籍
『カスタマー･ロイヤルティ』 ジル・グリフィン著書
原書は『Customer Loyalty』
邦訳タイトルは『顧客はなぜ,...

The six conditions for relationship marketing are:
1. Customers and retailers cooperate to create and share joy.
2. Developing human resources that can realize the customer needs that customers desire.
3. Preparing a system environment that can realize the customer needs that customers desire.
4. Continuous cooperative efforts with customers.
5. Maximizing

In [15]:
nodess = retriever.retrieve(query_str)

> Merging 2 nodes into parent node.
> Parent node id: 6a876534-8a43-4e25-aaee-c3c527e93a41.
> Parent node text: その他マーケティングに関し貴重な知識を与えてくれたマーケティング書籍
『カスタマー･ロイヤルティ』 ジル・グリフィン著書
原書は『Customer Loyalty』
邦訳タイトルは『顧客はなぜ,...

> Merging 3 nodes into parent node.
> Parent node id: ddc663a1-92f7-45d6-b77e-b3effed8342f.
> Parent node text: CRMはソフトウエアの話ではない.マーケティング話である.会社の営業体制の改革であり,マーケティングのパ大転換の話だ.
他社に差別化し,成長する顧客マーケティング=CRMを実現するためには,成功...

> Merging 1 nodes into parent node.
> Parent node id: 541055ef-b085-4715-abf8-edeafc2409dd.
> Parent node text: その他マーケティングに関し貴重な知識を与えてくれたマーケティング書籍
『カスタマー･ロイヤルティ』 ジル・グリフィン著書
原書は『Customer Loyalty』
邦訳タイトルは『顧客はなぜ,...



In [None]:
from llama_index.core.response.notebook_utils import display_source_node

for node in nodess:
    display_source_node(node, source_length=10000)

## Database

### MongoDB

In [None]:
import pymongo
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.storage.docstore.mongodb import MongoDocumentStore

In [None]:
mongo_uri = os.environ["MONGO_URI"]

# Docstore
docstore = MongoDocumentStore.from_uri(uri=mongo_uri)
docstore.add_documents(nodes)

mongodb_client = pymongo.MongoClient(mongo_uri)

# Vectorstore
store = MongoDBAtlasVectorSearch(mongodb_client)

store.create_vector_search_index(
    dimensions=1536, path="embedding", similarity="cosine"
)


storage_context = StorageContext.from_defaults(
    docstore=docstore,
    vector_store=store
)


### Qdrant Vectorstore

In [5]:
from qdrant_client import QdrantClient

from llama_index.vector_stores.qdrant import QdrantVectorStore

In [8]:
qdrant_url = os.getenv("QDRANT_URL")
qdrant_api_key = os.getenv("QDRANT_API_KEY")

client = QdrantClient(
    url=qdrant_url, 
    api_key=qdrant_api_key,
    timeout=60
)

print(client.get_collections())


collections=[CollectionDescription(name='sailing_project'), CollectionDescription(name='sailing_project_without_metadata'), CollectionDescription(name='sailing_project_eng')]


In [None]:
# Create the Qdrant vector store from existing nodes
node_parser = HierarchicalNodeParser.from_defaults(
                    chunk_sizes=[8192, 4096, 1024],
                    # chunk_sizes=[2048, 512, 128],
                    chunk_overlap=20
                )
nodes = node_parser.get_nodes_from_documents(full_documents)
leaf_nodes = get_leaf_nodes(nodes)

docstore = SimpleDocumentStore()

# insert nodes into docstore
docstore.add_documents(nodes)

vector_store = QdrantVectorStore(client=client, collection_name="sailing_project_without_metadata")
storage_context = StorageContext.from_defaults(vector_store=vector_store,
                                               docstore=docstore)
index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
    insert_batch_size=20
)

In [None]:
# Reset to use Qdrant Vectorstore Cloud
vector_store = QdrantVectorStore(
    client=client,
    collection_name="sailing_project_without_metadata"
)

storage_context = StorageContext.from_defaults(
    vector_store=vector_store
)

index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store
    
)

base_retriever = index.as_retriever(similarity_top_k=10)
retriever = AutoMergingRetriever(base_retriever, storage_context)

query_engine = RetrieverQueryEngine.from_args(retriever)

# query_engine = index.as_query_engine()
# response = query_engine.query("What are the six conditions for relationship marketing")
# print(response)

In [11]:
query_str = (
    "What is the facts about AI ?"
)
response = query_engine.query(query_str)
print(str(response))

ValueError: doc_id b1183cad-4b09-4efc-87f8-83c1ca7b0573 not found.