<a href="https://colab.research.google.com/github/nova0816/RAG_pipeline/blob/main/RAG_Pipeline_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install Libraries (Run this cell in Colab)
!pip install --upgrade langchain pypdf sentence-transformers faiss-cpu transformers langchain-text-splitters langchain-huggingface langchain-community
# Install the necessary libraries
!pip install -q transformers accelerate bitsandbytes sentence-transformers langchain torch
!pip install -U bitsandbytes



In [None]:
# Install libraries (Run this in your Colab notebook)
!pip install beautifulsoup4 pandas html5lib langchain



Read file

In [None]:
from google.colab import drive

# Attach Google Drive at the default mount point so the notebook can read files from it
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

print("Google Drive mounted successfully.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [None]:
# Install libraries (Run this cell in Colab)
#!pip install beautifulsoup4 pandas html5lib langchain

# Import necessary modules
import pandas as pd
from bs4 import BeautifulSoup, Tag
from io import StringIO
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os


# --- FILE INGESTION (Direct Drive Read) ---

# **IMPORTANT**: Full path to the saved HTML page on the mounted Drive.
# Adjust it to match your own folder structure.
file_path = "/content/drive/MyDrive/Colab Notebooks/RAG Material/Sixt Rental Information France.html"

# Read the page into html_content. Every branch assigns html_content, so the
# variable always holds an HTML string afterwards: either the real page or a
# small placeholder page naming the failure.
try:
    # The file is decoded as 'windows-1252', the encoding suggested for this file type.
    with open(file_path, 'r', encoding='windows-1252') as f:
        html_content = f.read()
except FileNotFoundError:
    # Opening the file directly (instead of checking os.path.exists first)
    # avoids a gap between the check and the read.
    print(f"Error: File not found at the expected path: {file_path}")
    print("Please ensure the Drive is mounted and the path is exactly correct.")
    html_content = "<html><body>File is missing.</body></html>"
except (OSError, UnicodeError) as e:
    # Only I/O and decoding problems are expected here; anything else is a
    # programming error and is allowed to surface instead of being swallowed.
    print(f"An error occurred while reading the file: {e}")
    html_content = "<html><body>Error reading content.</body></html>"
else:
    print(f"File '{file_path}' successfully loaded.")
    print(f"Size of loaded HTML content: {len(html_content)} characters.")


print("\n--- Setup Complete ---")

File '/content/drive/MyDrive/Colab Notebooks/RAG Material/Sixt Rental Information France.html' successfully loaded.
Size of loaded HTML content: 102149 characters.

--- Setup Complete ---


In [None]:
# Show only the beginning of the page; echoing the full string floods the cell output.
html_content[:500]

'<!DOCTYPE html>\n<!-- saved from url=(0088)https://www.sixt.com/php/terms/view?language=en_US&liso=FR&rtar=000&view=EPP&tlang=en_US -->\n<html><head><meta http-equiv="Content-Type" content="text/html; charset=windows-1252">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=yes">\n\n\t<title>Sixt Rental Information France</title>\n\n<meta name="Description" content="Terms and conditions France">\n<meta name="Keywords" content="terms, conditions, agb, France, sixt">\n<meta name="Author" content="JW">\n\n\n    \n        \n<style type="text/css">\n\n    /*\n        ATTN !!!\n        deletet if wrapped in typo3\n        edit styles there!!!\n    */\n\n    body {\n        padding: 10px 5px 5px 10px;\n        margin: 0;\n        font-size: 11px;\n        font-family: arial,helvetica,verdana,sans-serif;\n    }\n\n    h1 {\n        font-size: 16px;\n        padding: 3px;\n        margin: 0 0 6px 0;\n       

Read HTML Document

In [None]:
import pandas as pd
from bs4 import BeautifulSoup, Tag
from io import StringIO
import os
from google.colab import drive


## 2. Table Extraction and Text Linearization (FIXED)

def _element_text(element):
    """Return the element's text with its original spacing kept and whitespace collapsed."""
    # A <br> contributes no text node, so the sentences on either side of it
    # would otherwise be glued together ("translation.For ...").
    for line_break in element.find_all('br'):
        line_break.replace_with(' ')
    # get_text() without strip=True keeps the spaces that surround inline tags
    # such as links; split()/join then collapses newlines and indentation.
    return " ".join(element.get_text().split())


def _table_to_segment(table):
    """Render one <table> tag as a delimited Markdown table, or a placeholder if pandas cannot parse it."""
    try:
        # Use pandas to extract the table data
        dfs = pd.read_html(StringIO(str(table)), flavor='bs4')
        if not dfs:
            return None
        # Convert the first DataFrame to a Markdown string
        markdown_table = dfs[0].to_markdown(index=False)
    except ValueError:
        return "\n\n[Could not parse complex table]\n\n"
    return f"\n\n--- Start Table ---\n{markdown_table}\n--- End Table ---\n\n"


def parse_html_for_rag(html_content: str) -> str:
    """
    Linearize the rental-information HTML page into structured plain text.

    The function looks up the <div id="sx-gc-content"> element, emits the
    <h3 class="sx-terms-header-selection"> header (if present) as a "# " title,
    and then walks the direct children of every immediate child <div> in
    document order:

    * <table> -> Markdown table wrapped in "--- Start Table ---" / "--- End Table ---"
    * <h2>    -> "## <text>"
    * <h3>    -> "### <text>"
    * <p>     -> plain text (link text is already part of the paragraph text)

    Other tags are ignored. Text is whitespace-normalized and elements with no
    text are skipped.

    Args:
        html_content: Raw HTML of the page.

    Returns:
        The segments joined with newlines and stripped. Returns "" when
        html_content is empty, and the string
        "Error: Could not find main content div (sx-gc-content)." when the
        content div is absent.
    """
    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, 'html.parser')

    # Target the main content area
    content_div = soup.find('div', id='sx-gc-content')
    if not content_div:
        return "Error: Could not find main content div (sx-gc-content)."

    processed_text_segments = []

    # Page header that sits directly in the main div, before the section divs
    header_text = content_div.find('h3', class_='sx-terms-header-selection')
    if header_text:
        processed_text_segments.append(f"\n# {_element_text(header_text)}\n")

    # Prefix written in front of each supported text element
    text_prefixes = {'h2': "\n\n## ", 'h3': "\n### ", 'p': ""}

    # The actual content is nested inside the immediate child <div> tags
    for section in content_div.find_all('div', recursive=False):
        # Iterate over the section's children to keep document order
        for element in section.children:
            if not isinstance(element, Tag):
                continue

            if element.name == 'table':
                segment = _table_to_segment(element)
                if segment is not None:
                    processed_text_segments.append(segment)

            elif element.name in text_prefixes:
                text = _element_text(element)
                if text:  # skip empty paragraphs/headings
                    processed_text_segments.append(f"{text_prefixes[element.name]}{text}")

    # Combine all segments into one large, structured string
    return "\n".join(processed_text_segments).strip()

# Run the parser on the HTML loaded earlier, then report the size of the
# result and show the beginning of the cleaned text
cleaned_rag_text = parse_html_for_rag(html_content)

print("\n--- Parsing Results ---")
print("Total characters in cleaned text: {}".format(len(cleaned_rag_text)))
print("\n--- Example of Cleaned Text (First 1500 chars) ---")
print(cleaned_rag_text[0:1500])
print("\n--- Parsing and Linearization Complete ---")


## 3. Chunking and LangChain Document Creation

# Convert the cleaned text into a single LangChain Document
document = Document(page_content=cleaned_rag_text, metadata={"source": "Sixt Rental Info France"})

# Define the text splitter for RAG chunking (sizes are measured with len())
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

# Split the document into chunks for embedding
final_chunks = text_splitter.split_documents([document])

print(f"\nTotal text chunks created: {len(final_chunks)}")
if final_chunks:
    print("\n--- Example Chunk Content with Table Data ---")
    # Print the first chunk to show structure
    print(final_chunks[0].page_content)
    print("\n--- Another Chunk (Searching for a table) ---")
    # Print the first chunk that contains a table, if there is one
    for chunk in final_chunks:
        if 'Start Table' in chunk.page_content:
            print(chunk.page_content)
            break
else:
    # Indexing final_chunks[0] would raise IndexError when nothing was produced
    print("No chunks were created; check that the cleaned text is not empty.")


--- Parsing Results ---
Total characters in cleaned text: 54940

--- Example of Cleaned Text (First 1500 chars) ---
# France -
                        Passenger vehicle



## General Rental Information

### Important documents
The lessee and the driver must present a valid driver's license and an identity card or passport while receiving delivery of the vehicle.
To verify the validity of a driving licenses, the original driving license must be presented. No digital driving licence will be accepted.
Driver's license printed with non Roman Alphabet (Arabic, Chinese, Japanese, Cyrillic etc) must be complemented by an international driver's licence or an official translation.For driver's licenses from countries not part of the international driver's license treaty an official translation is required.
Sixt supports Chinese customers with the necessary translations via Sixt WeChat Account or via following link:Link.This or any other kind of valid translation of the driving license can only 

In [None]:
# Convert the cleaned text into a single LangChain Document
document = Document(page_content=cleaned_rag_text, metadata={"source": "Sixt Rental Info France"})

# Define the text splitter for RAG chunking (sizes are measured with len())
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

# Split the document into chunks for embedding
final_chunks = text_splitter.split_documents([document])

print(f"\nTotal text chunks created: {len(final_chunks)}")
print("\n--- Example Chunk Content with Table Data ---")
# Show the first chunk that contains a table
for chunk in final_chunks:
    if 'Start Table' in chunk.page_content:
        print(chunk.page_content)
        break
# The else branch runs only when the loop finished without finding a table in ANY chunk
else:
    if final_chunks:
        print(final_chunks[0].page_content)
    else:
        # Indexing final_chunks[0] would raise IndexError when nothing was produced
        print("No chunks were created; check that the cleaned text is not empty.")


Total text chunks created: 82

--- Example Chunk Content with Table Data ---
### Age restrictions
Minimum age in France is 18 and the driver has to be in possession of a valid driving license for at least 1 year.
Apart from this, the following restrictions for the possession of a valid driving licence are valid:


--- Start Table ---
| 2 years for vehicles in groups   |
|:---------------------------------|
| C***, I***, S***, F**, P**, L**  |
--- End Table ---




--- Start Table ---
| 5 years for vehicles in groups   |
|:---------------------------------|
| H***, D***, J***, G***, U***     |
--- End Table ---




--- Start Table ---
| 7 years for vehicles in groups                               |
|:-------------------------------------------------------------|
| W***, X***, Special Cars (**S*), Sports & Luxury Cars (**L*) |
--- End Table ---


A young driver surcharge applies for drivers under 25 years.


RAG Pipeline

In [None]:
# Import necessary modules
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document # Used to convert text to LangChain Document format

# Import embedding and vector store modules
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [None]:
# --- EMBEDDING & RETRIEVAL (Vector Store) ---

# Open-source embedding model used for the chunks
# (alternative: sentence-transformers/all-MiniLM-L6-v2)
embedding_model_name = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Build the FAISS vector store from the chunks using the embedding model above
vector_store = FAISS.from_documents(documents=final_chunks, embedding=embeddings)

print(
    "Vector Store created and documents embedded.\n"
    "--- Embedding and Indexing Complete ---"
)


Vector Store created and documents embedded.
--- Embedding and Indexing Complete ---


In [None]:

# Build a retriever on the vector store; k=10 asks for the top 10 chunks per query
retriever = vector_store.as_retriever(search_kwargs={"k": 10})

# Example retrieval for a sample user question
query = " I just booked and want to cancel, can I get a refund?"
retrieved_docs = retriever.invoke(query)

print(f"\n--- Retrieved Context (Top {len(retrieved_docs)} Chunks) ---")
for position, doc in enumerate(retrieved_docs, start=1):
    print(f"Chunk {position}: {doc.page_content}...")


--- Retrieved Context (Top 10 Chunks) ---
Chunk 1: A booking can be cancelled before the start of the rental. In the event of a cancellation, a cancellation fee will be charged to the rental advance payment already made, depending on the time of the cancellation. Cancellation fees will in no case exceed the total amount paid in advance.Cancellations prior to the scheduled start of the rental period will incur a cancellation fee of EUR99. Any remaining prepaid amount in excess of EUR99 will be refunded.
Cancellations or changes made within 24 hours of booking can be completed free of charge, provided the scheduled pick-up time is more than 48 hours away.
If the Client does not cancel and does not arrive to take delivery of the vehicle rented at the prepaid rate on the agreed date and no later than sixty (60) minutes after the time stated at the time of reservation, the rental price already paid will accrue to the Rental Company in full, subject to a limit of seven (7) days rental....
C

Test Model 1: Generate an answer with google/flan-t5-small, a text-to-text model

In [None]:
# Import LLM and chain modules
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_core.prompts import ChatPromptTemplate # Using ChatPromptTemplate for modern RAG
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch # Required for device management

# New imports for LCEL
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

# --- GENERATION ---

# Pipeline device index: 0 (first GPU) if CUDA is available, otherwise -1 (CPU)
device = 0 if torch.cuda.is_available() else -1

# Load a small LLM (FLAN-T5 is a good choice for text2text generation)
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create a text2text-generation pipeline
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=150,
    device=device # Use GPU if available
)
llm = HuggingFacePipeline(pipeline=pipe)

# Define the prompt template
prompt = ChatPromptTemplate.from_template("""Answer the question based only on the following context.
{context}
If there's ambiguity there, encourge the user to be more specific about the questions.
Question: {question}""")

# Construct the RAG chain using LCEL.
# The retriever returns a list of Document objects. Their page_content is
# joined into plain text before it reaches the prompt; interpolating the list
# directly would put the Document(...) repr (ids, metadata) into the prompt.
# NOTE(review): with k=10 retrieved chunks the tokenizer warned that the prompt
# exceeds this model's 512-token maximum — consider retrieving fewer chunks here.
qa_chain = (
    RunnableParallel({
        "context": retriever | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": RunnablePassthrough(),
    })
    | prompt
    | llm
    | StrOutputParser()
)

# --- Final Question and Answer ---
result = qa_chain.invoke(query) # LCEL invoke directly with question string

print(f"\n--- User Query ---\n{query}")
print(f"\n--- Generated Answer ---\n{result}")

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (2246 > 512). Running this sequence through the model will result in indexing errors



--- User Query ---
 I just booked and want to cancel, can I get a refund?

--- Generated Answer ---
Yes


Test Model 2: Generate an answer with Meta-Llama-3-8B-Instruct

In [None]:


# Import necessary modules
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch

# --- Model Selection and Configuration ---

# Model used for this test. Alternatively, use a smaller but capable model:
# model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device: {}".format(device))

# 4-bit quantization settings (REQUIRED to fit 8B/7B models on Colab T4/P100):
# loading the weights in 4-bit precision reduces the memory footprint.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

Using device: cuda


In [None]:
# Install huggingface_hub if not already installed
!pip install -q huggingface_hub

In [None]:
print(f"Loading model: {model_name}...")

# NOTE(review): the Hugging Face login cell appears later in this notebook and
# refers to this model as gated — on a fresh runtime authentication presumably
# has to happen before this cell; confirm the cell order.

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token # Set pad token for batching (best practice)

# Load Model with Quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.bfloat16, # Use bfloat16 for computation
)

# Create the text-generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # Set a higher limit for comprehensive answers
    temperature=0.1,     # Low temperature for factual RAG answers
    do_sample=True,      # Sampling is enabled; the low temperature keeps it close to greedy
    top_p=0.95,
    repetition_penalty=1.1,
    # Return only the newly generated text. By default the pipeline returns
    # prompt + completion, which echoes the whole retrieved context back
    # into the "answer".
    return_full_text=False,
)

# Wrap the pipeline in a LangChain LLM object
llm = HuggingFacePipeline(pipeline=pipe)

print("LLM and Pipeline Loaded Successfully.")

Loading model: meta-llama/Meta-Llama-3-8B-Instruct...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


LLM and Pipeline Loaded Successfully.


In [None]:
from huggingface_hub import login
from google.colab import userdata

# Read the token stored under the 'HF_TOKEN' key in Colab secrets
hf_token = userdata.get('HF_TOKEN')

# Log in to Hugging Face with that token
login(hf_token)

print("Hugging Face authentication complete. You can now try loading the gated model.")

Hugging Face authentication complete. You can now try loading the gated model.


In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

# Define the prompt template
prompt = ChatPromptTemplate.from_template("""Answer the question based only on the following context.
{context}
If there's ambiguity there, encourge the user to be more specific about the questions.
Also, please quote the title of the section where the answer is found.
Question: {question}""")

# Construct the RAG chain using LCEL.
# The retriever returns a list of Document objects. Their page_content is
# joined into plain text before it reaches the prompt; interpolating the list
# directly puts the Document(...) repr (ids, metadata) into the prompt, which
# is what the removed string-splitting post-processing was trying to hide.
qa_chain = (
    RunnableParallel({
        "context": retriever | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        "question": RunnablePassthrough(),
    })
    | prompt
    | llm
    | StrOutputParser()
)

# Example Query
result = qa_chain.invoke(query) # LCEL invoke directly with question string

print(f"\n--- User Query ---\n{query}")
print(f"\n--- Generated Answer ---\n{result}")


--- User Query ---
 I just booked and want to cancel, can I get a refund?

--- Generated Answer ---
Human: Answer the question based only on the following context.
[Document(id='868df46d-0768-48f4-a849-4a8948dd1fe3', metadata={'source': 'Sixt Rental Info France'}, page_content='A booking can be cancelled before the start of the rental. In the event of a cancellation, a cancellation fee will be charged to the rental advance payment already made, depending on the time of the cancellation. Cancellation fees will in no case exceed the total amount paid in advance.Cancellations prior to the scheduled start of the rental period will incur a cancellation fee of EUR99. Any remaining prepaid amount in excess of EUR99 will be refunded.\nCancellations or changes made within 24 hours of booking can be completed free of charge, provided the scheduled pick-up time is more than 48 hours away.\nIf the Client does not cancel and does not arrive to take delivery of the vehicle rented at the prepaid rat