# Streamlit-based RAG with Wikipedia Data

In [5]:
!pip install streamlit sentence-transformers faiss-cpu pyngrok pandas    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




# Data Preprocess

In [1]:
import pandas as pd

file_path = "archive/test.csv" 
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,ID,TITLE,ABSTRACT
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...
2,20975,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...
4,20977,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...


In [9]:
# Display dataset info
df.info()

# Check for missing values
print(df.isnull().sum())

# Check total number of rows
print(f"Total number of rows: {len(df)}")

# Remove missing and duplicate values
df_cleaned = df.dropna().drop_duplicates().reset_index(drop=True)

# Check cleaned data
print(f"Total rows after cleaning: {len(df_cleaned)}")
df_cleaned.head()

df_cleaned.to_csv("cleaned_test.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8989 entries, 0 to 8988
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        8989 non-null   int64 
 1   TITLE     8989 non-null   object
 2   ABSTRACT  8989 non-null   object
dtypes: int64(1), object(2)
memory usage: 210.8+ KB
ID          0
TITLE       0
ABSTRACT    0
dtype: int64
Total number of rows: 8989
Total rows after cleaning: 8989


# Implement the Retrieval-Augmented Generation Pipeline

In [3]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")  # Small and efficient model

# Example: Convert a sample text into an embedding
sample_text = "This is a test sentence."
sample_embedding = model.encode(sample_text)

# Print the embedding shape
print(f"Sample embedding shape: {sample_embedding.shape}")


  from .autonotebook import tqdm as notebook_tqdm


Sample embedding shape: (384,)


In [4]:
import numpy as np

# Convert the 'TITLE' and 'ABSTRACT' columns into embeddings
corpus = df_cleaned["TITLE"] + " " + df_cleaned["ABSTRACT"]
corpus_embeddings = model.encode(corpus.tolist(), convert_to_numpy=True)

# Print the shape of the embeddings
print(f"Corpus embeddings shape: {corpus_embeddings.shape}")


Corpus embeddings shape: (8989, 384)


In [6]:
import faiss

# Define the dimension of the embeddings
embedding_dim = corpus_embeddings.shape[1]

# Create a FAISS index
index = faiss.IndexFlatL2(embedding_dim)  # L2 (Euclidean) distance

# Add embeddings to the FAISS index
index.add(corpus_embeddings)

# Check the number of indexed vectors
print(f"Total vectors in FAISS index: {index.ntotal}")

Total vectors in FAISS index: 8989


In [7]:
def retrieve_similar_documents(query, top_k=5):
    """
    Given a query, retrieve the most relevant documents from FAISS index.
    
    Parameters:
        query (str): The input query text.
        top_k (int): The number of results to retrieve.
    
    Returns:
        list of (title, abstract, score)
    """
    # Convert query to embedding
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Search FAISS index for the top-k closest vectors
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the corresponding titles and abstracts
    results = []
    for idx, score in zip(indices[0], distances[0]):
        title = df_cleaned.iloc[idx]["TITLE"]
        abstract = df_cleaned.iloc[idx]["ABSTRACT"]
        results.append((title, abstract, score))

    return results

# Test the retrieval function
query_text = "Neural networks for image processing"
results = retrieve_similar_documents(query_text)

# Display results
for title, abstract, score in results:
    print(f"Title: {title}\nAbstract: {abstract}\nScore: {score}\n")


Title: Provably efficient neural network representation for image classification
Abstract:   The state-of-the-art approaches for image classification are based on neural
networks. Mathematically, the task of classifying images is equivalent to
finding the function that maps an image to the label it is associated with. To
rigorously establish the success of neural network methods, we should first
prove that the function has an efficient neural network representation, and
then design provably efficient training algorithms to find such a
representation. Here, we achieve the first goal based on a set of assumptions
about the patterns in the images. The validity of these assumptions is very
intuitive in many image classification problems, including but not limited to,
recognizing handwritten digits.

Score: 0.8415294885635376

Title: Vector Field Based Neural Networks
Abstract:   A novel Neural Network architecture is proposed using the mathematically and
physically rich idea of vector fiel

# Build the Streamlit Interface

In [10]:
!streamlit run app.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://10.10.68.98:8502[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
^C
[34m  Stopping...[0m
Exception ignored in: <module 'threading' from '/Users/nanxuan/miniconda3/envs/dscapstone/lib/python3.9/threading.py'>
Traceback (most recent call last):
  File "/Users/nanxuan/miniconda3/envs/dscapstone/lib/python3.9/threading.py", line 1477, in _shutdown
    lock.acquire()
  File "/Users/nanxuan/miniconda3/envs/dscapstone/lib/python3.9/site-packages/streamlit/web/bootstrap.py", line 44, in signal_handler
    server.stop()
  File "/Users/nanxuan/miniconda3/envs/dscapstone/lib/python3.9/site-packages/streamlit/web/server/server.py", line 470, in stop
    self._runtime.stop()
  File "/Users/nanxuan/miniconda3/envs/dscapstone/lib/python3.9

# Deploy the Application Using ngrok

In [12]:
import os
import time
import threading
import streamlit as st
from pyngrok import ngrok

# Ensure Streamlit app is written in `app.py`
streamlit_script = "app.py"

# Run Streamlit as a background process
def run_streamlit():
    os.system(f"streamlit run {streamlit_script}")

# Start Streamlit in a separate thread
thread = threading.Thread(target=run_streamlit)
thread.daemon = True
thread.start()

# Authenticate Ngrok (Replace with your token)
NGROK_AUTH_TOKEN = "2sufQy1aFtFgngqGBdhpTMIZ95Y_3wjyKE1hhwykQszxurVyL" 
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Ensure no previous Ngrok tunnels are running
ngrok.kill()

# Create a public URL using Ngrok
public_url = ngrok.connect(8501, "http")
print(f"Public URL: {public_url}")

# Keep the notebook running to prevent termination
while True:
    time.sleep(10)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Public URL: NgrokTunnel: "https://5115-134-193-197-212.ngrok-free.app" -> "http://localhost:8501"
  Stopping...


KeyboardInterrupt: 


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8502
  Network URL: http://10.10.68.98:8502

  For better performance, install the Watchdog module:

  $ xcode-select --install
  $ pip install watchdog
            
