<a href="https://colab.research.google.com/github/mikolaura/impleting_basic_rag_pipeline/blob/main/rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install all libraries from pip

In [None]:
# Core dependencies: LangChain, its Google GenAI and community integrations,
# FAISS (CPU build) and the Hugging Face Hub client.
%pip install langchain langchain-google-genai faiss-cpu  langchain-community huggingface_hub

In [None]:
# Install `datasets`, upgrading it (-U) if an older version is already present.
%pip install -U datasets

## Loading all libraries into a notebook

In [None]:
import os
import getpass
from datasets import load_dataset
import pandas as pd
import numpy as np
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
import faiss
import unittest

# Create vector store and loading dataset from scratch

**You can also use an already-built FAISS vector store.**

*To do so, skip ahead to `Alternative: Load vector store`.*


## Loading and structuring dataset

`To complete this part you need a Hugging Face API token.`




In [None]:
# Authenticate with the Hugging Face Hub through its CLI (needs the API token mentioned above).
!huggingface-cli login

In [None]:
# Load the Wikipedia RAG dataset by its Hugging Face Hub repository id.
ds = load_dataset(path="uran050311/wikipedia_rag")

In [None]:
import pandas as pd

# Turn the training split into a DataFrame, then pull its `text` column out
# as a plain Python list for the text splitter.
train_split = ds['train']
train_data = pd.DataFrame.from_dict(train_split)
texts = train_data['text'].tolist()


## Vector store

***You need a Google API key for this part.***

In this part:

* We create documents from the texts
* We split the documents into chunks
* We create the vector store
* We add the chunks to the vector store

In [None]:
# Preprocess text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=30)
documents = text_splitter.create_documents(texts)
splited_documents = text_splitter.split_documents(documents)

### Create embeddings. Convert from text to embeddings

In [None]:
# Read the key with getpass so it is never echoed into the notebook output.
# The prompt is labelled explicitly (the default is just "Password:"), pasted
# whitespace is stripped, and an empty value is rejected here instead of
# failing later inside the first API call.
google_api_key = getpass.getpass("Google API key: ").strip()
if not google_api_key:
    raise ValueError("Google API key must not be empty")
os.environ['GOOGLE_API_KEY'] = google_api_key

In [None]:
# Embedding client; used below to size the FAISS index and as the vector
# store's embedding function.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

In [None]:
def from_text_to_embeddings(embeddings, text):
    """Return whatever ``embeddings.embed_query`` produces for ``text``.

    Args:
        embeddings: Object exposing an ``embed_query(text)`` method.
        text: The string to embed.
    """
    vector = embeddings.embed_query(text)
    return vector

### Setup Faiss

In [None]:
# Embed a throwaway string once to find out the embedding dimensionality,
# which the flat L2 index needs at construction time.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: empty in-memory docstore and empty id mapping.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [None]:
# add_documents() returns the list of IDs assigned to the stored chunks. Keep it
# in a variable and report only its length, so the cell output is not flooded
# with one ID per chunk.
document_ids = vector_store.add_documents(documents=splited_documents)
print(f"Indexed {len(document_ids)} chunks")

## Creating LLM and retriever

This part:

* Initializes the LLM
* Creates the retriever
* Creates the RAG chain
* Makes the first prediction

In [None]:
# Chat model used by the RAG chain. temperature=0 and a fixed seed are set to
# keep answers as repeatable as possible.
# NOTE(review): confirm that "seed" passed through model_kwargs actually reaches the Gemini API.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    model_kwargs={"seed": 42},
)

In [None]:
# Expose the vector store as a retriever, wire it into a "stuff" RetrievalQA
# chain, and run one query as a smoke test.
retriever = vector_store.as_retriever()
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
)
response = rag_chain.invoke("What is ASCII")
result = response['result']
print("Answer:", result)

## Unit testing

In [None]:
class TestStringMethods(unittest.TestCase):
    """Smoke tests for the RAG chain.

    The LLM's wording can change between runs, so each test checks that the
    answer mentions the key facts instead of requiring the whole answer to be
    a verbatim substring of one fixed reference paragraph.
    """

    def _assert_answer_mentions(self, question, *phrases):
        """Ask ``question`` and require every phrase to appear in the answer.

        The comparison is case-insensitive; each phrase is reported as its own
        sub-test so one missing fact does not hide the others.
        """
        answer = rag_chain.invoke(question)['result'].casefold()
        for phrase in phrases:
            with self.subTest(question=question, phrase=phrase):
                self.assertIn(phrase.casefold(), answer)

    def test_what_is_ascii(self):
        self._assert_answer_mentions(
            "What is ASCII",
            "American Standard Code for Information Interchange",
        )

    def test_where_is_andorra(self):
        self._assert_answer_mentions("Where is Andorra", "Pyrenees", "France", "Spain")

In [None]:
if __name__ == '__main__':
    # The placeholder argv keeps unittest from parsing the kernel's own command
    # line; exit=False stops it from calling sys.exit() once the run finishes.
    unittest.main(exit=False, argv=['first-arg-is-ignored'])

## Saving dataset with vector

In [None]:
# Export every text as one CSV row. The "vector" column is deliberately left
# empty: embedding each text in 100-character windows works, but takes far too
# long to run against the Gemini embedding API. The disabled logic was:
#
#     for q in range(100, len(texts[i]) - 100, 100):
#         vector.extend(embeddings.embed_query(texts[i][q - 100:q]))
#
# Titles are read from the DataFrame once, instead of rebuilding the whole list
# on every loop iteration.
titles = train_data['title'].tolist()

rows = [
    {"id": i, "title": title, "text": text, "vector": []}
    for i, (title, text) in enumerate(zip(titles, texts))
]

df = pd.DataFrame(rows)
df.to_csv("text_with_empty_vector.csv", index=False)

## Saving faiss vector store

In [None]:
# Persist the vector store to the local "my_faiss_index" folder.
vector_store.save_local(folder_path="my_faiss_index")

# Alternative: Load vector store

***For this you need a Google API key.***

In [None]:
# Download the zip archive with the already-built FAISS index.
!wget https://github.com/mikolaura/faiss_index/raw/refs/heads/main/my_faiss_index.zip

In [None]:
# -n: never overwrite files that already exist, so re-running this cell does not
# stop at unzip's interactive "replace?" prompt (and an index that is already on
# disk is left untouched).
!unzip -n my_faiss_index.zip

## OPTIONAL: USING CLI INTERFACE

In [None]:
# Read the key with getpass so it is never echoed into the notebook output.
# The prompt is labelled explicitly (the default is just "Password:"), pasted
# whitespace is stripped, and an empty value is rejected here instead of
# failing later inside the first API call.
google_api_key = getpass.getpass("Google API key: ").strip()
if not google_api_key:
    raise ValueError("Google API key must not be empty")
os.environ['GOOGLE_API_KEY'] = google_api_key

In [None]:
# Loading embeddings and vector store.
# NOTE(security): allow_dangerous_deserialization=True lets load_local unpickle
# data stored next to the index; only load index folders from a source you trust.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_store = FAISS.load_local(
    folder_path="my_faiss_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Init chat model: temperature 0, up to 2 retries, no explicit token or time limit.
llm = ChatGoogleGenerativeAI(
    max_retries=2,
    timeout=None,
    max_tokens=None,
    temperature=0,
    model="gemini-2.0-flash",
)

In [None]:
# Creating the retriever and a "stuff" RetrievalQA chain on top of the loaded store.
retriever = vector_store.as_retriever()
rag_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=llm,
)

In [None]:
# Predicting on own input: keep asking until a non-blank question is entered.
# (A single `if` re-prompt would still let a second empty line, or a
# whitespace-only line, through to the chain.)
question = input().strip()
while not question:
    print("Input cannot be empty")
    question = input().strip()
result = rag_chain.invoke(question)['result']
print("Answer:", result)

## OPTIONAL: CREATE WEB-GUI USING STREAMLIT

In [None]:
!pip install -q streamlit

In [None]:
%%writefile app.py
import os
import numpy as np
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
import faiss
import streamlit as st
with st.sidebar:
    google_gemini_api_key = st.text_input(
        "Google Gemini API Key",
        key="api_key",
        type='password'
    )

st.title('RAG')

if google_gemini_api_key:
    os.environ['GOOGLE_API_KEY'] = google_gemini_api_key
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vector_store = FAISS.load_local("my_faiss_index",
                                embeddings,
                                allow_dangerous_deserialization=True
                                )
    # Init chat model
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

    # Creating retriver and rag chain
    retriever = vector_store.as_retriever()
    rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")
    text = st.text_input(
        "Write your question",
        key="text"
    )
    # Predicting on own input
    if text:
        result = rag_chain.invoke(text)['result']
        print("Answer:", result)
        st.write(result)

In [None]:
# Install the localtunnel npm package, which the `npx localtunnel` cell below runs.
!npm install localtunnel

In [None]:
# Fetch https://loca.lt/mytunnelpassword and print the response in the cell
# output (-q: quiet, -O -: write to stdout).
# NOTE(review): presumably this is the password the tunnel landing page asks for — confirm.
!wget -q -O - https://loca.lt/mytunnelpassword

In [None]:
# Start the app in the background and send its output to logs.txt.
# Paths are relative to the working directory — where `%%writefile app.py`
# put the script — instead of being hard-coded to Colab's /content.
!streamlit run app.py &>logs.txt &

In [None]:
# Open a localtunnel for local port 8501.
# NOTE(review): 8501 is presumably the port the Streamlit app is served on (no port is set explicitly above) — confirm.
!npx localtunnel --port 8501