In [1]:
# third-party packages
import os
import re
import tqdm
import array
import numpy as np
import pandas as pd
from dotenv import load_dotenv

from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser#,TokenTextSplitter
from llama_index.core import Document
from llama_index.readers.wikipedia import WikipediaReader # wikipedia package should also be installed.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from sentence_transformers import CrossEncoder

from langchain_core.prompts import ChatPromptTemplate

from langchain_community.chat_models import ChatOCIGenAI
from langchain_community.embeddings import OCIGenAIEmbeddings

# Oracle sdk
import ads
import oci
import oracledb


In [2]:
# Configure the Oracle ADS SDK to authenticate with the "resource_principal" mode.
ads.set_auth("resource_principal")

# Data Import and cleaning

- First we import the page "Artificial Intelligence" from wikipedia.
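- The first 15,000 characters (a character slice, not tokens) are saved to `demo_text.txt` as a demo excerpt; the cleaning and splitting steps below use the full page.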
- The imported page includes many format-related characters, which are removed using the `clean_text` function.
- "\n\n", which indicates the start of a paragraph, is not removed.

In [3]:
# Fetch the Wikipedia page with the given title as a list of documents.
# auto_suggest=False is passed through to the reader (presumably so the exact
# title is used instead of a suggested one — confirm against the reader docs).
loader = WikipediaReader()
documents = loader.load_data(
    pages=["Artificial Intelligence"],
    auto_suggest=False
)

In [4]:
# Take the first 15,000 characters of the raw page text (a character slice, not tokens).
text = documents[0].text[:15000]

# Persist that excerpt to demo_text.txt.
# NOTE(review): `text` is not read again in the cells visible here; the cleaning
# step operates on documents[0].text in full.
with open("demo_text.txt", "w", encoding="utf-8") as f:
    f.write(text)

In [5]:
def clean_text(text):
    """Strip wiki markup from ``text`` and normalise it into clean paragraphs.

    Steps:
      1. Remove heading markers (``=``), ``[[links]]``, ``{{templates}}``,
         bold/italic quote runs (``''``) and ``File:`` lines.
      2. Unescape backslash-escaped quotes (``\\'`` -> ``'``, ``\\"`` -> ``"``).
      3. Collapse runs of spaces/tabs and limit consecutive newlines to two.
      4. Drop empty lines and join the remaining lines with a blank line.

    Args:
        text: Raw page text (str).

    Returns:
        The cleaned text, with every non-empty line separated by ``"\\n\\n"``.
    """
    # 1. Wiki markup
    text = re.sub(r"=+", "", text)              # == Heading == and any stray =
    text = re.sub(r"\[\[.*?\]\]", "", text)     # links
    text = re.sub(r"\{\{.*?\}\}", "", text)     # templates
    text = re.sub(r"''+", "", text)             # bold/italic
    text = re.sub(r"File:[^\n]*", "", text)     # images

    # 2. Fix escaped quotes and apostrophes.
    # A utf-8 -> unicode_escape round trip must not be used here: it decodes the
    # UTF-8 bytes as Latin-1, corrupting every non-ASCII character (accents,
    # dashes, curly quotes), and it raises on a trailing backslash.
    text = re.sub(r"\\(['\"])", r"\1", text)

    # 3. Clean up whitespace but preserve paragraphs
    text = re.sub(r"[ \t]+", " ", text)         # multiple spaces -> one
    text = re.sub(r"\n{3,}", "\n\n", text)      # max two newlines

    # 4. Remove empty lines and rebuild clean paragraphs
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    return "\n\n".join(lines)

# Clean the full page text and preview the first 1,000 characters.
# NOTE: this rebinds the name `clean_text` from the function to the cleaned string,
# so the function is only callable again after its `def` is re-run.
clean_text = clean_text(documents[0].text)
clean_text[:1000]

'Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.\n\nHigh-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., language models and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being c

# Creating the Knowledge Base

## Splitting

- We will compare two splitting approaches: rule-based and semantic-based splitting.
- Note that each category contains many types of splitters, suited to different data formats (JSON, TXT, etc.) as well as to different methods.
- We use LlamaIndex here; LangChain also provides many splitting methods.

### Rule-based Splitting

- These methods use transparent logic to split the data.
- Here we use the `sentence_splitter` function (built on LlamaIndex's `SentenceSplitter`), which aims to split the text into chunks of X tokens.
- We also add overlap between the chunks in order to avoid information loss.
- We also use the `paragraph_separator` argument, so that the splitter keeps a whole paragraph together when the paragraph size is within some buffer around X.

In [6]:
def sentence_splitter(text, chunk_size=300, chunk_overlap=20, paragraph_separator='\n\n'):
    """Split ``text`` into chunks with LlamaIndex's ``SentenceSplitter``.

    Args:
        text: The text to split.
        chunk_size: Value passed to ``SentenceSplitter(chunk_size=...)``.
        chunk_overlap: Value passed to ``SentenceSplitter(chunk_overlap=...)``.
        paragraph_separator: Value passed to
            ``SentenceSplitter(paragraph_separator=...)``.

    Returns:
        Whatever ``SentenceSplitter.split_text`` returns for ``text``
        (the notebook uses it as a list of chunk strings).
    """
    # The defaults reproduce the values that were previously hard-coded.
    splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        paragraph_separator=paragraph_separator,
    )
    return splitter.split_text(text)

In [7]:
# Rule-based split of the cleaned article text.
chunks = sentence_splitter(clean_text)

In [10]:
# Number of chunks produced by the rule-based splitter.
len(chunks)

71

In [11]:
# Preview the first five rule-based chunks.
chunks[:5]

['Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.\n\nHigh-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., language models and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being 

### Semantic-based Splitting

- Semantic-based splitting uses language models to split the text based on its meaning.
- There are several approaches to semantic-based splitting; the most common one uses embedding models.
- The SemanticSplitterNodeParser computes embedding distances between consecutive sentences and splits the text whenever the distance exceeds a threshold. Here, we define that threshold using the X percentile.

In [12]:
# Module-level semantic splitter, reused by semantic_splitter().
# - embed_model: local HuggingFace sentence-embedding model used by the parser.
# - breakpoint_percentile_threshold=95: percentile used as the split threshold.
# - buffer_size=1: presumably the number of neighbouring sentences grouped when
#   embedding — TODO confirm against the LlamaIndex docs.
splitter = SemanticSplitterNodeParser(
    embed_model=HuggingFaceEmbedding("sentence-transformers/all-MiniLM-L6-v2"),
    breakpoint_percentile_threshold=95,
    buffer_size=1
)

In [13]:
def semantic_splitter(text):
    nodes = splitter.get_nodes_from_documents([Document(text=text)])
    return [n.text for n in nodes]

In [14]:
# Semantic split of the same cleaned article text, for comparison with `chunks`.
chunks_semantic = semantic_splitter(clean_text)

In [15]:
# Preview the first five semantic chunks.
chunks_semantic[:5]

['Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.\n\nHigh-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., language models and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being 

## Establish Connection to Oracle DB and Writing the Data to the DB

### Establish Connection to the DB

In [None]:
# Load the DB settings from the dotenv file and report whether loading succeeded.
# Replace the placeholder path with your own config file.
print(
    'Environment Variables Loaded Successfully.'
    if load_dotenv('<your_db_config_file>')
    else 'Environment Variables Failed to Load.'
)

Environment Variables Loaded Successfully.


In [17]:
# Connection settings read from the environment; each is None when the variable is unset.
user_name = os.getenv('DB_USER')
pwd = os.getenv('DB_PWD')
dns_option = os.getenv('DB_DNS_MEDIUM')      # DSN (or TNS alias) passed to oracledb.connect
wallet_path = os.getenv('DB_WALLET_PATH')

In [18]:
# Open the database connection using the wallet directory for both the
# network configuration and the wallet itself.
# NOTE(review): the DB password is reused as the wallet password — confirm
# that both really are the same for your wallet.
connection = oracledb.connect(
    user=user_name,
    password=pwd,
    # The full DSN string can be given here instead of the DSN name.
    dsn=dns_option,
    # Directory should include ewallet.pem; if only the DSN name is used,
    # it must also include tnsnames.ora.
    config_dir=wallet_path,
    wallet_location=wallet_path,
    wallet_password=pwd,
)

In [19]:
# Cursor used for every SQL statement in this notebook (note: `database` is a cursor, not a connection).
database = connection.cursor()

### Writing the Data to the DB

In [20]:
# Drop table if exists
database.execute("""
                BEGIN
                   EXECUTE IMMEDIATE 'DROP TABLE IF EXISTS WIKIPEDIA_AI';
                END;
                """)
connection.commit()

In [21]:
# Create New Table
database.execute("""
    CREATE TABLE WIKIPEDIA_AI (
        ID_NUMBER NUMBER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
        TEXT_SPLIT VARCHAR2(32000),
        TEXT_SPLIT_V VECTOR
    )
""")
connection.commit()

In [22]:
# Insert every rule-based chunk as its own row; the vector column stays NULL
# until the embedding step fills it in.
# executemany sends all rows in one batch instead of one round trip per chunk.
if chunks:  # skip the call entirely when there is nothing to insert
    database.executemany(
        """
        INSERT INTO WIKIPEDIA_AI (TEXT_SPLIT, TEXT_SPLIT_V)
        VALUES (:1, NULL)
        """,
        [(chunk,) for chunk in chunks]
    )

connection.commit()

### Printing the data from the DB

In [23]:
# Read back every row (id, chunk text, vector) from the table.
# The query has no ORDER BY, so the row order is whatever the database returns.
database.execute("""
                SELECT 
                    ID_NUMBER, 
                    TEXT_SPLIT,
                    TEXT_SPLIT_V
                FROM 
                    WIKIPEDIA_AI
                """)

# Extract Data - List
rows = database.fetchall()
print(rows[0])

(6, 'In classical planning, the agent knows exactly what the effect of any action will be. In most real-world problems, however, the agent may not be certain about the situation they are in (it is "unknown" or "unobservable") and it may not know for certain what will happen after each possible action (it is not "deterministic"). It must choose an action by making a probabilistic guess and then reassess the situation to see if the action worked.\n\nIn some problems, the agent\'s preferences may be uncertain, especially if there are other agents or humans involved. These can be learned (e.g., with inverse reinforcement learning), or the agent can seek information to improve its preferences. Information value theory can be used to weigh the value of exploratory or experimental actions. The space of possible future actions and situations is typically intractably large, so the agents must take actions and evaluate situations while being uncertain of what the outcome will be.\n\nA Markov dec

## Vector Embedding

In [None]:
# OCI Generative AI settings shared by the embedding model and the chat model.
ENDPOINT = "https://inference.generativeai.eu-frankfurt-1.oci.oraclecloud.com"  # service endpoint URL
COMPARTMENT_ID = '<your compartment ocid>'  # placeholder: replace with your compartment OCID
MODEL_NAME = "cohere.embed-english-v3.0"  # model id used for embeddings

In [25]:
# Embedding client used for both the stored chunks and the search queries.
embed_model = OCIGenAIEmbeddings(
    model_id = MODEL_NAME,
    service_endpoint = ENDPOINT,
    compartment_id= COMPARTMENT_ID,
    auth_type='RESOURCE_PRINCIPAL')

In [26]:
# Embedding and adding the vectors to the database
for row in tqdm.tqdm(rows):
    id_val = row[0]
    try:
        data = row[1]
        vec = embed_model.embed_documents([data])[0]
        vec_array = array.array('d', vec)
        database.execute("UPDATE WIKIPEDIA_AI SET TEXT_SPLIT_V = :1 WHERE ID_NUMBER = :2", [vec_array, id_val])
    except Exception as e:
        print(e)
    
connection.commit()

100%|██████████| 71/71 [00:05<00:00, 12.13it/s]


In [27]:
# Re-read the table, now including the stored vectors, into `knowledge_base`.
# The query has no ORDER BY, so the row order is whatever the database returns.
database.execute("""
                SELECT 
                    ID_NUMBER, 
                    TEXT_SPLIT,
                    TEXT_SPLIT_V
                FROM 
                    WIKIPEDIA_AI
                """)

# Extract Data - List
knowledge_base = database.fetchall()
print(knowledge_base[0])

(6, 'In classical planning, the agent knows exactly what the effect of any action will be. In most real-world problems, however, the agent may not be certain about the situation they are in (it is "unknown" or "unobservable") and it may not know for certain what will happen after each possible action (it is not "deterministic"). It must choose an action by making a probabilistic guess and then reassess the situation to see if the action worked.\n\nIn some problems, the agent\'s preferences may be uncertain, especially if there are other agents or humans involved. These can be learned (e.g., with inverse reinforcement learning), or the agent can seek information to improve its preferences. Information value theory can be used to weigh the value of exploratory or experimental actions. The space of possible future actions and situations is typically intractably large, so the agents must take actions and evaluate situations while being uncertain of what the outcome will be.\n\nA Markov dec

# Perform Similarity Search using Oracle AI Vector Search

## Similarity Search

1. Embed the query
2. Execute similarity search
3. Organize the result 

In [28]:
def similarity_search(query, top_n):
    """Return the ``top_n`` chunks closest to ``query`` by vector distance.

    Uses the module-level ``embed_model`` to embed the query and the
    module-level ``database`` cursor to run the search.

    Args:
        query: The question text to search for.
        top_n: Maximum number of chunks to return.

    Returns:
        list: The ``TEXT_SPLIT`` values of the matching rows, in the order the
        database returned them (ascending ``VECTOR_DISTANCE`` with the DOT metric).
    """
    # Embed the query with the embeddings interface's dedicated query method.
    input_vec = embed_model.embed_query(query)
    input_array = array.array('d', input_vec)

    # ':1' is bound to the query vector and ':2' to the row limit at execution time.
    sql_query = """
        SELECT TEXT_SPLIT
        FROM WIKIPEDIA_AI
        ORDER BY VECTOR_DISTANCE(TEXT_SPLIT_V, :1, DOT)
        FETCH FIRST :2 ROWS ONLY
        """

    database.execute(sql_query, [input_array, top_n])

    # Each fetched row is a 1-tuple holding the chunk text.
    return [row[0] for row in database.fetchall()]

In [29]:
# Example question reused by the search, rerank and RAG cells below.
query='What is classification in AI?'

In [30]:
# Retrieve the 5 closest chunks for the example query and display them.
results_string=similarity_search(query, 5)
results_string

['As a result of the many circulating definitions scholars have started to critically analyze and order the AI discourse itself including discussing the many AI narratives and myths to be found within societal, political and academic discourses. Similarly, in practice, some authors have suggested that the term \'AI\' is often used too broadly and vaguely. This raises the question of where the line should be drawn between AI and classical algorithms, with many companies during the early 2020s AI boom using the term as a marketing buzzword, often even if they did "not actually use AI in a material way".\n\nThere has been debate over whether large language models exhibit genuine intelligence or merely simulate it by imitating human text.\n\nEvaluating approaches to AI\n\nNo established unifying theory or paradigm has guided AI research for most of its history. The unprecedented success of statistical machine learning in the 2010s eclipsed all other approaches (so much so that some sources

## Similarity + Keyword Search

In [31]:
def similarity_search_with_keyword(query, keyword, top_n):
    """Vector search restricted to chunks that contain ``keyword``.

    Uses the module-level ``embed_model`` and ``database`` cursor. The keyword
    is matched case-insensitively as a literal substring of ``TEXT_SPLIT``.

    Args:
        query: The question text to search for.
        keyword: Substring that every returned chunk must contain.
        top_n: Maximum number of chunks to return.

    Returns:
        list: The ``TEXT_SPLIT`` values of the matching rows, in the order the
        database returned them (ascending ``VECTOR_DISTANCE`` with the DOT metric).
    """
    input_vec = embed_model.embed_query(query)
    input_array = array.array('d', input_vec)

    # Escape LIKE wildcards so '%' and '_' in the keyword are matched literally
    # (the backslash itself is escaped first because it is the ESCAPE character).
    escaped_keyword = (
        keyword.replace("\\", "\\\\")
               .replace("%", "\\%")
               .replace("_", "\\_")
    )
    pattern = f"%{escaped_keyword}%"

    # The WHERE filter must come before ORDER BY.
    sql_query = """
        SELECT TEXT_SPLIT
        FROM WIKIPEDIA_AI
        WHERE LOWER(TEXT_SPLIT) LIKE LOWER(:1) ESCAPE '\\'
        ORDER BY VECTOR_DISTANCE(TEXT_SPLIT_V, :2, DOT)
        FETCH FIRST :3 ROWS ONLY
    """
    database.execute(sql_query, [pattern, input_array, top_n])

    rows = database.fetchall()
    return [row[0] for row in rows]

In [32]:
# Same example query, restricted to chunks containing "class"; keep the top 3.
results_with_keyword = similarity_search_with_keyword(query=query,keyword="class",top_n=3)
results_with_keyword

['As a result of the many circulating definitions scholars have started to critically analyze and order the AI discourse itself including discussing the many AI narratives and myths to be found within societal, political and academic discourses. Similarly, in practice, some authors have suggested that the term \'AI\' is often used too broadly and vaguely. This raises the question of where the line should be drawn between AI and classical algorithms, with many companies during the early 2020s AI boom using the term as a marketing buzzword, often even if they did "not actually use AI in a material way".\n\nThere has been debate over whether large language models exhibit genuine intelligence or merely simulate it by imitating human text.\n\nEvaluating approaches to AI\n\nNo established unifying theory or paradigm has guided AI research for most of its history. The unprecedented success of statistical machine learning in the 2010s eclipsed all other approaches (so much so that some sources

# Reranker

- Adding an extra filtering step for retrieved context is highly recommended.
- This can be done with rerankers, which score how relevant each chunk is to the query.
- Some rerankers output probabilities, while others produce scores on an open scale (e.g., -inf to inf).
- These models are lightweight and typically run efficiently without a GPU.

## Applying the Reranker model

In [34]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') # the reranker model

def rerank_with_scores(query, chunks):
    """Score each chunk against ``query`` and sort from most to least relevant.

    Uses the module-level ``cross_encoder`` to score every (query, chunk) pair.

    Args:
        query: The question text.
        chunks: Iterable of candidate chunk strings.

    Returns:
        tuple: ``(reranked_chunks, scores_list)`` — the chunks ordered by
        descending score, and their scores in the same order. Both are empty
        lists when ``chunks`` is empty.
    """
    chunks = list(chunks)
    # Nothing retrieved (e.g. a keyword filter matched no rows): skip the model
    # call, which is not guaranteed to accept an empty batch.
    if not chunks:
        return [], []

    scores = cross_encoder.predict([(query, chunk) for chunk in chunks])

    # sorted() is stable, so chunks with equal scores keep their retrieval order.
    ranked = sorted(zip(chunks, scores), key=lambda pair: pair[1], reverse=True)

    reranked_chunks = [chunk for chunk, _ in ranked]
    scores_list = [score for _, score in ranked]

    return reranked_chunks, scores_list

In [35]:
# Rerank the 5 retrieved chunks for the example query and display them in the new order.
results_string_reranked, scores=rerank_with_scores(query, results_string)
results_string_reranked

['Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.\n\nHigh-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., language models and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being 

In [36]:
# Reranker scores, in the same (descending) order as results_string_reranked.
scores

[3.4497643, 1.0053312, -0.29014003, -2.8727384, -7.462182]

## Filter out all non relevant chunks

In [37]:
def top_positive_chunks(reranked_chunks, scores, k):
    """Keep at most ``k`` chunks whose score is strictly greater than zero.

    Chunks and scores are paired positionally; the input order is preserved.

    Args:
        reranked_chunks: Chunk strings.
        scores: Score for each chunk, aligned with ``reranked_chunks``.
        k: Maximum number of chunks to return.

    Returns:
        list: The first ``k`` chunks that have a positive score.
    """
    positive = []
    for text, relevance in zip(reranked_chunks, scores):
        if relevance > 0:
            positive.append(text)
    return positive[:k]

In [38]:
# Keep at most 3 reranked chunks with a positive score and display them.
results_string_reranked_filtered=top_positive_chunks(results_string_reranked, scores, 3)
results_string_reranked_filtered

['Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.\n\nHigh-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., language models and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being 

# RAG

* To implement RAG, we need to add an LLM that answers the query given the retrieved chunks
* Here we are going to use cohere.command-a-03-2025

In [39]:
# Chat model used to generate the final answer; it shares ENDPOINT and
# COMPARTMENT_ID with the embedding model.
chat = ChatOCIGenAI(
    model_id="cohere.command-a-03-2025",
    service_endpoint=ENDPOINT,
    compartment_id=COMPARTMENT_ID,
    model_kwargs={"temperature": 0.7, "max_tokens": 500},
    auth_type='RESOURCE_PRINCIPAL'
)

In [40]:
def generate_response_llm(chat_model, retrieved_docs, query):
    """Ask ``chat_model`` to answer ``query`` using only ``retrieved_docs``.

    Args:
        chat_model: The chat model to pipe the prompt into.
        retrieved_docs: The context, either a single string or an iterable of
            chunk strings.
        query: The question to answer.

    Returns:
        The object returned by invoking the ``prompt | chat_model`` chain.
    """
    template = """Answer the question based only on the following context:
    {context}

    Question: {question}
    """

    # Render the chunks as plain text separated by blank lines; passing the
    # list itself would put its Python repr (brackets, quotes, escaped
    # newlines) into the prompt.
    if isinstance(retrieved_docs, str):
        context = retrieved_docs
    else:
        context = "\n\n".join(str(doc) for doc in retrieved_docs)

    prompt = ChatPromptTemplate.from_template(template)
    # Use the model that was passed in, not a module-level one.
    chat_chain = prompt | chat_model

    response = chat_chain.invoke({'context': context, 'question': query})

    return response

In [41]:
def run_rag(chat_model, query, top_n):
    """Run retrieval, reranking and answer generation for ``query``.

    Args:
        chat_model: Chat model forwarded to ``generate_response_llm``.
        query: The question to answer.
        top_n: Number of chunks to retrieve with ``similarity_search``.

    Returns:
        str: ``'Answer: '`` followed by the ``content`` of the model response.
    """
    candidates = similarity_search(query, top_n)
    # Only the reranked order is used here; the scores are discarded.
    ranked_chunks, _ = rerank_with_scores(query, candidates)
    llm_reply = generate_response_llm(chat_model, ranked_chunks, query)
    return 'Answer: {}'.format(llm_reply.content)

In [42]:
# Run the full RAG pipeline for the example query with 3 retrieved chunks.
result = run_rag(chat,query, 3)
print(result)

Answer: In the context provided, classification in AI is mentioned in relation to **artificial neural networks**, which are used as classifiers. Classification is a fundamental task in machine learning and AI where the goal is to assign input data to specific categories or classes based on learned patterns. For example, a neural network can be trained to recognize patterns in data (e.g., images, text, or numerical data) and then classify new, unseen data into predefined categories. This process involves training the network on labeled data, where the correct class for each input is known, and then using the trained model to predict the class of new, unlabeled data. The neural network achieves this by adjusting its internal weights through algorithms like backpropagation to minimize prediction errors. In essence, classification in AI is about teaching a model to make accurate predictions about the category or class of input data.
