<a href="https://colab.research.google.com/github/noambassat/RAG_Agent_GITHUB_Rep.project/blob/main/ChromaDB_RAG_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install -q pandas chromadb langchain langchain-community openai sentence-transformers faiss-cpu --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import pandas as pd

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

import chromadb
from chromadb.config import Settings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from tqdm import tqdm

from langchain.agents import initialize_agent, Tool
from langchain.agents.agent_types import AgentType
from langchain_community.llms import OpenAI
from google.colab import userdata

from sentence_transformers import CrossEncoder

import os
os.environ["OPENAI_API_KEY"] = userdata.get('open_ai_key')

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:

# Path to your file (replace with your actual path)
path = "/content/drive/MyDrive/GitHubRepositoriesProject/clean_df.xlsx"

# Load only relevant columns
df = pd.read_excel(path, usecols=["Name", "Description", "URL", "Topics"])

# Drop rows with missing Description or Topics
df.dropna(subset=["Description", "Topics"], inplace=True)

# Ensure Topics is a string
df["Topics"] = df["Topics"].astype(str)

# Combine into single text column
df["Full_Text"] = df["Description"] + " " + df["Topics"]




Unnamed: 0,Name,Description,URL,Topics,Full_Text
0,PyPOTS,toolboxlibrary data mining partially observed ...,https://github.com/WenjieDu/PyPOTS,"classification, clustering, data mining, forec...",toolboxlibrary data mining partially observed ...
1,changedetection.io,best simplest free open source website change ...,https://github.com/dgtlmoon/changedetection.io,"back in stock, change alert, change detection,...",best simplest free open source website change ...


In [24]:
# Preview
df.head(2)

Unnamed: 0,Name,Description,URL,Topics,Full_Text
0,PyPOTS,toolboxlibrary data mining partially observed time series including sota models supporting tasks forecasting incomplete irregularly sampled multivariate time series missing values,https://github.com/WenjieDu/PyPOTS,"classification, clustering, data mining, forecasting, imputation, incomplete data, incomplete time series, irregularly sampled time series, machine learning, missing data, missing values, partially observed time series, pytorch, time series, time series analysis, time series classification, time series clustering, time series forecasting, time series imputation, time series with missing values","toolboxlibrary data mining partially observed time series including sota models supporting tasks forecasting incomplete irregularly sampled multivariate time series missing values classification, clustering, data mining, forecasting, imputation, incomplete data, incomplete time series, irregularly sampled time series, machine learning, missing data, missing values, partially observed time series, pytorch, time series, time series analysis, time series classification, time series clustering, time series forecasting, time series imputation, time series with missing values"
1,changedetection.io,best simplest free open source website change restock monitor notification service restock change detection designed simplicity simply monitor websites text change free free open source web page change website defacement price change price drop notification,https://github.com/dgtlmoon/changedetection.io,"back in stock, change alert, change detection, change monitoring, changedetection, monitoring, notifications, restock, restock monitor, self hosted, url monitor, web scraping, website change detection, website change detector, website change monitor, website change notification, website change tracker, website defacement monitoring, website monitor, website monitoring","best simplest free open source website change restock monitor notification service restock change detection designed simplicity simply monitor websites text change free free open source web page change website defacement price change price drop notification back in stock, change alert, change detection, change monitoring, changedetection, monitoring, notifications, restock, restock monitor, self hosted, url monitor, web scraping, website change detection, website change detector, website change monitor, website change notification, website change tracker, website defacement monitoring, website monitor, website monitoring"


In [23]:
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [21]:
df.loc[0,'Full_Text']

'toolboxlibrary data mining partially observed time series including sota models supporting tasks forecasting incomplete irregularly sampled multivariate time series missing values classification, clustering, data mining, forecasting, imputation, incomplete data, incomplete time series, irregularly sampled time series, machine learning, missing data, missing values, partially observed time series, pytorch, time series, time series analysis, time series classification, time series clustering, time series forecasting, time series imputation, time series with missing values'

In [11]:
df.shape

(11663, 5)

In [26]:


# Load embedding model
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Generate embeddings
embeddings = embedding_model.encode(df["Full_Text"].tolist(), show_progress_bar=True)

# Convert to numpy array
embeddings = np.array(embeddings).astype("float32")

# Build FAISS index
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)

# Save the index and dataframe (optional)
faiss.write_index(faiss_index, "faiss_index.index")
df.to_pickle("projects_df.pkl")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/365 [00:00<?, ?it/s]

In [13]:


# Initialize tqdm
tqdm.pandas()

# Initialize the embedding function
embedding_fn = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Start Chroma client with in-memory DB
chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))

# Create collection
collection = chroma_client.create_collection(
    name="github_projects",
    embedding_function=embedding_fn
)

# Prepare data
documents = df["Full_Text"].tolist()
metadatas = df[["Name", "Description", "URL", "Topics"]].to_dict(orient="records")
ids = [str(i) for i in range(len(df))]

# Add to collection in batches
batch_size = 5000
for i in tqdm(range(0, len(df), batch_size)):
    collection.add(
        documents=documents[i:i + batch_size],
        metadatas=metadatas[i:i + batch_size],
        ids=ids[i:i + batch_size]
    )


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
  0%|          | 0/3 [00:00<?, ?it/s]ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
100%|██████████| 3/3 [00:47<00:00, 15.82s/it]


In [14]:
def hybrid_search(user_query: str, top_k: int = 10):
    """
    Perform hybrid search:
    - semantic search using ChromaDB
    - keyword filtering on topics
    """
    # Semantic search (ChromaDB)
    semantic_results = collection.query(query_texts=[user_query], n_results=top_k)
    semantic_ids = set(semantic_results["ids"][0])

    # Keyword search on Topics
    keyword_ids = set()
    lowered_query = user_query.lower()
    for i, topics in enumerate(df["Topics"]):
        if isinstance(topics, str) and any(term.strip().lower() in lowered_query for term in topics.split(",")):
            keyword_ids.add(str(i))

    # Combine unique results
    combined_ids = list(semantic_ids.union(keyword_ids))

    # Get the corresponding rows
    combined_results = df.loc[df.index.isin(map(int, combined_ids))].copy()
    combined_results["id"] = combined_results.index.astype(str)

    return combined_results.reset_index(drop=True)


In [15]:
results_df = hybrid_search("deep learning optimization", top_k=5)
results_df[["Name", "Topics", "URL"]]

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Unnamed: 0,Name,Topics,URL
0,accel-brain-code,"auto encoder, automatic summarization, combina...",https://github.com/accel-brain/accel-brain-code
1,Object_Detection_Tracking,"activity detection, computer vision, deep lear...",https://github.com/JunweiLiang/Object_Detectio...
2,pytorch-a2c-ppo-acktr-gail,"a2c, acktr, actor critic, advantage actor crit...",https://github.com/ikostrikov/pytorch-a2c-ppo-...
3,sparse-evolutionary-artificial-neural-networks,"adaptive sparse connectivity, artificial neura...",https://github.com/dcmocanu/sparse-evolutionar...
4,ODISE,"deep learning, diffusion models, instance segm...",https://github.com/NVlabs/ODISE
...,...,...,...
2771,librespot-python,"librespot, librespot , music streaming, roadmap",https://github.com/kokarare1212/librespot-python
2772,SLAM-application,"lidar, lidar inertial odometry, lidar odometry...",https://github.com/engcang/SLAM-application
2773,Uni-Mol,"deep learning, molecular modeling, pre trained...",https://github.com/dptech-corp/Uni-Mol
2774,onthespot,"audio player,",https://github.com/casualsnek/onthespot


In [16]:
semantic_results = collection.query(query_texts=["neural networks"], n_results=3)
for i, doc in enumerate(semantic_results['documents'][0]):
    print(f"\nResult {i+1}:")
    print(doc)



Result 1:
simple neural network autocompletion autocomplete, lstm, machine learning, 

Result 2:
shufflenet pytorch artificial intelligence, convolution, deep learning, neural network, pytorch

Result 3:
closed form continuous time neural networks deep learning, neural ode, pytorch, recurrent neural networks, sequence models, tensorflow


In [17]:


# Load reranker model (optimized for passage ranking)
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# User query
query = "neural networks"

# Get semantic results (already done above)
documents = semantic_results["documents"][0]
metadatas = semantic_results["metadatas"][0]

# Prepare input pairs for the reranker
reranker_inputs = [[query, doc] for doc in documents]

# Get relevance scores
scores = reranker.predict(reranker_inputs)

# Combine metadata and scores
ranked_results = sorted(zip(scores, metadatas), key=lambda x: x[0], reverse=True)

# Print reranked top 3
for i, (score, meta) in enumerate(ranked_results):
    print(f"\nRank {i+1} - Score: {score:.4f}")
    print(f"Name: {meta['Name']}")
    print(f"Description: {meta['Description']}")
    print(f"URL: {meta['URL']}")


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]


Rank 1 - Score: 4.7498
Name: CfC
Description: closed form continuous time neural networks
URL: https://github.com/raminmh/CfC

Rank 2 - Score: 3.0199
Name: python_autocomplete
Description: simple neural network autocompletion
URL: https://github.com/vpj/python_autocomplete

Rank 3 - Score: 0.7259
Name: ShuffleNet
Description: shufflenet pytorch
URL: https://github.com/jaxony/ShuffleNet


In [18]:

# Load CrossEncoder for reranking
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def hybrid_search(query, k=5):
    """
    Perform semantic search with reranking using ChromaDB and CrossEncoder.
    Returns top-k projects as list of dicts.
    """
    # Step 1: Search in ChromaDB
    results = collection.query(query_texts=[query], n_results=15)
    docs = results["documents"][0]
    metas = results["metadatas"][0]

    # Step 2: Rerank using CrossEncoder
    pairs = [(query, doc) for doc in docs]
    scores = cross_encoder.predict(pairs)

    # Combine metadata with scores
    combined = list(zip(metas, scores))
    sorted_results = sorted(combined, key=lambda x: x[1], reverse=True)

    # Return top-k
    return sorted_results[:k]


In [19]:


# Define a function that wraps hybrid_search and formats the output
def search_and_format(query: str) -> str:
    results = hybrid_search(query)
    output = ""
    for i, (meta, score) in enumerate(results):
        output += f"{i+1}. {meta['Name']}\n"
        output += f"   Description: {meta['Description']}\n"
        output += f"   URL: {meta['URL']}\n\n"
    return output.strip()

tools = [
    Tool(
        name="GitHubProjectSearch",
        func=search_and_format,
        description="Useful for answering questions about relevant open-source GitHub projects. Input should be a natural language query like 'I want a library for image segmentation'."
    )
]

llm = OpenAI(temperature=0)
agent = initialize_agent(tools=tools, llm=llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)


  llm = OpenAI(temperature=0)
  agent = initialize_agent(tools=tools, llm=llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)


In [20]:
agent.run("I want a library for image segmentation")


  agent.run("I want a library for image segmentation")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I should use GitHubProjectSearch to find relevant open-source GitHub projects
Action: GitHubProjectSearch
Action Input: 'I want a library for image segmentation'[0m
Observation: [36;1m[1;3m1. albumentations
   Description: fast image augmentation easy wrapper libraries documentation httpsalbumentationsaidocs
   URL: https://github.com/albumentations-team/albumentations

2. ISAT_with_segment_anything
   Description: interactive semi automatic image segmentation annotation supports samsegment sam mobilesam etcsamsam mobilesam
   URL: https://github.com/yatengLG/ISAT_with_segment_anything

3. keras-unet
   Description: helper package multiple u net implementations keras useful utility tools helpful working image semantic segmentation tasks underlying tools come multiple projects performed working semantic segmentation tasks
   URL: https://github.com/karolzak/keras-unet

4. CRIS.pytorch
   Description: official pytorch imple

'Based on the observations, it seems that the best option for a library for image segmentation would be Pytorch-UNet, as it is a high quality implementation specifically for image semantic segmentation.'