In [1]:
!pip install pymilvus pymilvus[model] pymupdf



In [2]:
from pymilvus import MilvusClient
import numpy as np
from pymilvus import model

client = MilvusClient("./milvus_demo18.db")
client.create_collection(
    collection_name="demo_collection",
    dimension=768  # The vectors we will use in this demo has 384 dimensions
)


embedding_fn =  model.dense.SentenceTransformerEmbeddingFunction(
    model_name='cornstack/CodeRankEmbed', # Specify the model name
    device='cuda:0',
    trust_remote_code=True
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# modified ip search to also filter by library metadata

from pymilvus import MilvusClient
from pymilvus import model
import re

DISTANCE_THRESHOLD = 0.25

# def ip_search(
#     queries,
#     output_field,
#     collection_name,
#     distance_threshold=DISTANCE_THRESHOLD,
#     embedding_fn=embedding_fn,
# ):
#     """Searches for relevant context and filters by library metadata."""

#     query_vectors = embedding_fn.encode_documents(queries)
#     res = client.search(
#         collection_name=collection_name,
#         data=query_vectors,
#         limit=5,
#         output_fields=["text", "metadata"],  # Retrieve metadata along with text
#     )

#     # Extract the target library from the query
#     query_text = " ".join(queries).lower()
#     relevant_libraries = ["matplotlib", "numpy", "pandas", "pytorch", "scipy", "sklearn", "tensorflow"]
#     detected_library = next((lib for lib in relevant_libraries if lib in query_text), None)

#     filtered_results = []
#     for table in res[0]:
#         if table["distance"] < distance_threshold:
#             retrieved_library = table.get("metadata", {}).get("library", "").lower()

#             # Ensure retrieved context matches the detected library
#             if detected_library and retrieved_library == detected_library:
#                 filtered_results.append(table["entity"]["text"])

#     return filtered_results if filtered_results else []


# ip search function with keyword matching
def ip_search(
    queries,
    output_field,
    collection_name,
    distance_threshold=0.5,
    embedding_fn=embedding_fn,
):
    """Searches for relevant context and filters by library metadata."""

    # Library keyword mappings
    library_keywords = {
        "pandas": ["pd", "dataframe", "df", "iloc", "loc", "series", "groupby", "merge", "concat", "pivot", "melt"],
        "numpy": ["np", "array", "ndarray", "linspace", "arange", "zeros", "ones", "random", "linalg", "fft"],
        "matplotlib": ["plt", "pyplot", "figure", "subplot", "plot", "scatter", "hist", "bar", "boxplot", "imshow"],
        "sklearn": ["scikit", "scikit-learn", "classifier", "regressor", "cluster", "pipeline", "grid_search", "cross_val", "metrics"],
        "scipy": ["stats", "interpolate", "optimize", "signal", "sparse", "spatial", "integrate", "ode"],
        "pytorch": ["torch", "nn", "optim", "cuda", "tensor", "autograd", "module", "dataloader", "dataset"],
        "tensorflow": ["tf", "keras", "estimator", "layers", "variable", "session", "placeholder", "eager", "tpu", "gpu"]
    }

    query_vectors = embedding_fn.encode_documents(queries)
    res = client.search(
        collection_name=collection_name,
        data=query_vectors,
        limit=5,
        output_fields=["text", "metadata"],
    )

    # Extract the target library from the query
    query_text = " ".join(queries).lower()
    detected_libraries = set()

    # Check for direct library mentions
    relevant_libraries = list(library_keywords.keys())
    for lib in relevant_libraries:
        if lib in query_text:
            detected_libraries.add(lib)

    # Check for keyword matches
    for lib, keywords in library_keywords.items():
        for keyword in keywords:
            # Check for whole word matches to avoid partial matches
            if re.search(r'\b' + re.escape(keyword) + r'\b', query_text):
                detected_libraries.add(lib)
                break

    filtered_results = []

    for table in res[0]:
        if table["distance"] < distance_threshold:
            retrieved_library = table.get("entity", {}).get("metadata", {}).get("library", "").lower()

            # Include results if they match any detected library
            if detected_libraries and retrieved_library in detected_libraries:
                filtered_results.append(table["entity"]["text"])

    return filtered_results if filtered_results else []

In [4]:
# Adding library metadata when reading the files and websites

import os
import fitz
from transformers import GPT2TokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests
from bs4 import BeautifulSoup

tokenizer_gpt = GPT2TokenizerFast.from_pretrained("gpt2")
def count_tokens(text: str) -> int:
    return len(tokenizer_gpt.encode(text))

# Step 1: Read PDF using PyMuPDF
def read_pdf(file_path: str) -> str:
    """Extract text from the PDF file and return it as a single string."""
    text = ""
    with fitz.open(file_path) as pdf:
        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)
            text += page.get_text("text")  # Extract text from the page
    return text

def process_pdfs_in_folder(folder_path: str):
    """Process PDFs, extract text, split into chunks, and store with library metadata."""

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=256,
        chunk_overlap=24,
        length_function=count_tokens,
    )

    for file_name in os.listdir(folder_path):
        if file_name.endswith(".pdf"):
            file_path = os.path.join(folder_path, file_name)

            # Extract library name from filename (e.g., "numpy.pdf" → "numpy")
            library_name = file_name.replace(".pdf", "").lower()

            text = read_pdf(file_path)

            # Split the text into chunks
            chunks = text_splitter.create_documents([text])
            chunks = [chunk.page_content for chunk in chunks]

            print(f"Processed {file_name}: {len(chunks)} chunks")

            # Store with metadata
            vectors = embedding_fn.encode_documents(chunks)

            data = [
                {"id": i, "vector": vectors[i], "text": chunks[i], "metadata": {"library": library_name}}
                for i in range(len(vectors))
            ]

            print("Data has", len(data), "entities, each with fields: ", data[0].keys())
            print("Vector dim:", len(data[0]["vector"]))

            res = client.insert(collection_name="demo_collection", data=data)
            print(res)



# Process data from websites
def read_webpage(url: str, class_type: str, class_name: str) -> str:
    """Extract text from a web page and return it as a single string."""
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page: {url}")

    soup = BeautifulSoup(response.text, "html.parser")
    main_content = soup.find(class_type, class_=class_name)

    if main_content:
        return main_content.get_text(separator="\n", strip=True)
    else:
        return ""

def process_webpage(url: str, page_name: str, class_type: str, class_name: str, library_name: str):
    """Process webpage text, split into chunks, and store with metadata."""

    text = read_webpage(url, class_type, class_name)

    if not text:
        print(f"No content extracted from {url}")
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=256,
        chunk_overlap=24,
        length_function=count_tokens,
    )

    # Split text into chunks
    chunks = text_splitter.create_documents([text])
    chunks = [chunk.page_content for chunk in chunks]

    print(f"Processed {page_name}: {len(chunks)} chunks")

    vectors = embedding_fn.encode_documents(chunks)

    data = [
        {"id": i, "vector": vectors[i], "text": chunks[i], "metadata": {"library": library_name}}
        for i in range(len(vectors))
    ]

    print("Data has", len(data), "entities, each with fields:", data[0].keys())
    print("Vector dim:", len(data[0]["vector"]))

    res = client.insert(collection_name="demo_collection", data=data)
    print(res)


def read_csv(file_path: str):
    """Read CSV file and extract relevant columns."""
    df = pd.read_csv(file_path)

    # Ensure necessary columns exist
    required_columns = {"QuestionTitle", "QuestionBody", "AnswerBody"}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"CSV file must contain columns: {required_columns}")

    return df

def process_csv(file_path: str):
    """Process CSV data and split it into text chunks."""
    df = read_csv(file_path)

    # Initialize text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=256,  # Size of chunks (in tokens)
        chunk_overlap=24,  # Overlap between chunks
        length_function=count_tokens,
    )

    all_chunks = []

    for _, row in df.iterrows():
        # Combine Question and Answer
        combined_text = f"Title: {row['QuestionTitle']}\n\nQuestion: {row['QuestionBody']}\n\nAnswer: {row['AnswerBody']}"

        # Split the text into chunks
        chunks = text_splitter.create_documents([combined_text])
        chunks = [chunk.page_content for chunk in chunks]

        all_chunks.extend(chunks)

    print(f"Processed {len(df)} Q&A pairs into {len(all_chunks)} chunks")

    # Embed and insert into vector store
    vectors = embedding_fn.encode_documents(all_chunks)

    data = [
        {"id": i, "vector": vectors[i], "text": all_chunks[i]} for i in range(len(vectors))
    ]

    print("Data has", len(data), "entities, each with fields:", data[0].keys())
    print("Vector dim:", len(data[0]["vector"]))

    res = client.insert(collection_name="demo_collection", data=data)
    print(res)


# csv_file_path = "./docs/sklearn_stackoverflow.csv"
# process_csv(csv_file_path)


folder_path = "./docs"
process_pdfs_in_folder(folder_path)

url_dict = {
    "https://ipgp.github.io/scientific_python_cheat_sheet/?utm_content=buffer7d821&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer#numpy-import-numpy-as-np": ["cheatsheet", "section", "main-content", "numpy"]
}
for url, (page_name, class_type, class_name, library_name) in url_dict.items():
    process_webpage(url, page_name, class_type, class_name, library_name)

Processed sklearn.pdf: 10 chunks
Data has 10 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'metadata'])
Vector dim: 768
{'insert_count': 10, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
Processed tensorflow.pdf: 14 chunks
Data has 14 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'metadata'])
Vector dim: 768
{'insert_count': 14, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]}
Processed pandas.pdf: 24 chunks
Data has 24 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'metadata'])
Vector dim: 768
{'insert_count': 24, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]}
Processed matplotlib.pdf: 14 chunks
Data has 14 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'metadata'])
Vector dim: 768
{'insert_count': 14, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]}
Processed pytorch.pdf: 19 chunks
Data has 19 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'metad

Token indices sequence length is longer than the specified maximum sequence length for this model (1818 > 1024). Running this sequence through the model will result in indexing errors


Processed cheatsheet: 29 chunks
Data has 29 entities, each with fields: dict_keys(['id', 'vector', 'text', 'metadata'])
Vector dim: 768
{'insert_count': 29, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]}


In [5]:
print(ip_search(["How to fetch specific columns from a dataframe?"], ["text"], "demo_collection"))

['Applying Functions\n>>> f = lambda x: x*2\n>>> df.apply(f)         Apply function\n>>> df.applymap(f)          Apply function element-wise\nRetrieving Series/DataFrame Information\n>>> df.shape            (rows,columns)        \n>>> df.index\t\n         Describe index\t\n \n>>> df.columns          Describe DataFrame columns\n>>> df.info()           Info on DataFrame\n>>> df.count()          Number of non-NA values\nGetting \nAlso see NumPy Arrays\nSelecting, Boolean Indexing & Setting\nBasic Information\nSummary\n>>> df.sum()               Sum of values']


In [6]:
!pip install torch transformers datasets tqdm 'accelerate>=0.26.0'



In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
import os
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model",
    type=str,
    default="Qwen/Qwen2.5-Coder-7B-Instruct",
    help="which results to run",
)
parser.add_argument(
    "--resume",
    action="store_true",
    default=False,
    help="where to resume inference",
)

# Adding this line to prevent SystemExit:2
parser.add_argument('-f')

args = parser.parse_args()
model_name = args.model

cached_cnt = 0
if os.path.exists(f"data/{args.model.replace('/', '-')}-answers.jsonl"):
    if args.resume:
        cached_cnt = len(open(f"data/{args.model.replace('/', '-')}-answers.jsonl", "r").readlines())
    else:
        exit(0)

model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", resume_download=True, trust_remote_code=True
)

for name, param in model.named_parameters():
    print(f"{name} is on {param.device}")

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Need to set the padding token to the eos token for generation
if tokenizer.eos_token:
    tokenizer.pad_token = tokenizer.eos_token
else:
    tokenizer.add_special_tokens({
        "pad_token": "<pad>"
    })





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



model.embed_tokens.weight is on cuda:0
model.layers.0.self_attn.q_proj.weight is on cuda:0
model.layers.0.self_attn.q_proj.bias is on cuda:0
model.layers.0.self_attn.k_proj.weight is on cuda:0
model.layers.0.self_attn.k_proj.bias is on cuda:0
model.layers.0.self_attn.v_proj.weight is on cuda:0
model.layers.0.self_attn.v_proj.bias is on cuda:0
model.layers.0.self_attn.o_proj.weight is on cuda:0
model.layers.0.mlp.gate_proj.weight is on cuda:0
model.layers.0.mlp.up_proj.weight is on cuda:0
model.layers.0.mlp.down_proj.weight is on cuda:0
model.layers.0.input_layernorm.weight is on cuda:0
model.layers.0.post_attention_layernorm.weight is on cuda:0
model.layers.1.self_attn.q_proj.weight is on cuda:0
model.layers.1.self_attn.q_proj.bias is on cuda:0
model.layers.1.self_attn.k_proj.weight is on cuda:0
model.layers.1.self_attn.k_proj.bias is on cuda:0
model.layers.1.self_attn.v_proj.weight is on cuda:0
model.layers.1.self_attn.v_proj.bias is on cuda:0
model.layers.1.self_attn.o_proj.weight is

In [9]:
import json

test_prompts = [
    {
        "payload": {
            "message_content": "How do I create a pandas DataFrame?",
            "error_info": "not available"
        }
    },
    {
        "payload": {
            "message_content": "How do I plot using matplotlib?",
            "error_info": "Error count: 1, Error description: ModuleNotFoundError: No module named 'matplotlib'"
        }
    }
]

test_file = "test_queries.json"
with open(test_file, "w", encoding="utf-8") as f:
    json.dump(test_prompts, f, indent=2)

# 3. Load the test data
with open(test_file, "r", encoding="utf-8") as f:
    test_data = json.load(f)

batch_size = 4
pad_to_multiple_of = 8
formatted_prompts = [test_data[i:i+batch_size] for i in range(0, len(test_data), batch_size)]


In [None]:
# Apply padding on the left since we are doing generation
padding_side_default = tokenizer.padding_side
tokenizer.padding_side = "left"

# Tokenize each batch
tokenized_prompts = []
for batch in formatted_prompts:
    messages = [
        [
            {"role": "system", "content": """You are Qwen, created by Alibaba Cloud. You are a helpful assistant and an expert Python data scientist. Before answering, take a deep breath and follow these guidelines:
              1. **Understand the question fully** before responding.
              2. **Check if retrieved context is relevant** to the question.
                  - If it contains useful functions, libraries, or examples, use them.
                  - If it is unrelated or incorrect, **rely on general knowledge** instead.
              3. **Provide a clear explanation first**, before giving the code.
              4. **Ensure correctness, completeness, and efficiency** in the code.
              5. **Double-check variable names and constraints** before providing the answer.
              """},
            {"role": "user", "content": query["payload"]["message_content"] +
                                       "\n<context>\n" +
                                       "\n\n".join(ip_search([query["payload"]["message_content"]], ["text"], "demo_collection")) +
                                       "\n</context>\n<error_info>" +
                                       query["payload"]["error_info"] + "</error_info>"}
        ]
        for query in batch
    ]
    for query in batch:
        print(query["payload"]["message_content"])
        print(query["payload"]["error_info"])
    # Generate template text
    text_inputs = [
        tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages
    ]

    # Tokenize the formatted prompts
    tokenized_batch = tokenizer(
        text_inputs, return_token_type_ids=False, padding=True, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt"
    )
    tokenized_prompts.append(tokenized_batch)

# Restore the original padding behavior
tokenizer.padding_side = padding_side_default
model_name = model_name.replace('/', '-')

generation_config = {
    "do_sample": False,
    "max_new_tokens": 512,
    "num_beams": 1
}

print("\n=== GENERATED RESPONSES ===\n")
for batch in tqdm(tokenized_prompts):
    # Move the batch to the device
    batch = batch.to("cuda")
    prompt_len = len(batch["input_ids"][0])
    with torch.no_grad():
        outputs = model.generate(
            **batch,
            **generation_config
        )
    generated_text = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)
    for i, code in enumerate(generated_text):
        stop_words = ["</code>", "# SOLUTION END"]
        for stop_word in stop_words:
            code = code.split(stop_word)[0]
        print(f"\n🔹 **Prompt {cached_cnt + i}**")
        print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
        print(f"📌 **Generated Code:**\n{code}")
        print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    cached_cnt += len(generated_text)  # Update count

In [None]:
# import os
# import torch
# import json
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from datasets import load_dataset
# from tqdm import tqdm

# # Set model name and resume behavior directly in the code
# model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
# resume = True  # Set to True to resume from the last cached count

# # Initialize the model and tokenizer
# model = AutoModelForCausalLM.from_pretrained(
#     model_name, device_map="auto", resume_download=True, trust_remote_code=True
# )
# for name, param in model.named_parameters():
#     print(f"{name} is on {param.device}")

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# if tokenizer.eos_token:
#     tokenizer.pad_token = tokenizer.eos_token
# else:
#     tokenizer.add_special_tokens({"pad_token": "<pad>"})

# # Load dataset
# ds1000 = list(load_dataset("xlangai/DS-1000")["test"])
# prompts = [p["prompt"] for p in ds1000]

# # Check for cached answers if resume is enabled
# cached_cnt = 0
# #output_path = f'{model_name.replace("/", "-")}-answers.jsonl'
# output_path=f"./data/{model_name.replace('/', '-')}-answers.jsonl"
# if resume and os.path.exists(output_path):
#     cached_cnt = len(open(output_path, "r").readlines())

# prompts = prompts[cached_cnt:]  # Resume from the last processed prompt if applicable

# # Set parameters
# batch_size = 2
# pad_to_multiple_of = 8

# # Split prompts into batches
# formatted_prompts = [prompts[i: i + batch_size] for i in range(0, len(prompts), batch_size)]
# print("cached_cnt", cached_cnt)


In [None]:
# # Tokenize and process each batch
# cached_cnt=0
# tokenizer.padding_side = "left"
# tokenized_prompts = []
# for formatted_prompt in formatted_prompts:
#     messages = [
#         [
#             {"role": "system", "content":  """"You are Qwen, created by Alibaba Cloud. You are a helpful assistant. Before answering, take a deep breath and answer based on the following guidelines:
#               1. Use the provided context ONLY if relevant
#               2. If the context doesn't help, rely on your general knowledge
#               3. Answer the question, EXACTLY ACCORDING TO HOW IT IS ASKED, that is, only provide the neccessary code needed to add to the user's code for it to work.
#               4, DO NOT give any explanation, only the code is required. For example, "Problem:\nHow do I get the dimensions of an array? For instance, this is (2, 2):\na = np.array([[1,2],[3,4]])\n\nA:\n<code>\nimport numpy as np\na = np.array([[1,2],[3,4]])\n</code>\nresult = ... ", your answer should be "result = a.shape".

#               """},
#             {"role": "user", "content": prompt + "\n<context>\n" + "\n\n".join(ip_search([formatted_prompt[0]], ["text"], "demo_collection"))+ "\n</context>\n"}
#         ]
#         for prompt in formatted_prompt
#     ]
#     text_inputs = [
#         tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages
#     ]
#     tokenized_batch = tokenizer(
#         text_inputs, return_token_type_ids=False, padding=True, pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt", max_length=2048
#     )
#     tokenized_prompts.append(tokenized_batch)
# tokenizer.padding_side = "right"  # Reset padding side

# # Generation configuration
# generation_config = {
#     "do_sample": False,
#     "max_new_tokens": 1024,
#     "num_beams": 1
# }

# # Run inference and save outputs

# with open(output_path, 'a+') as f:
#     for batch in tqdm(tokenized_prompts):
#         batch = batch.to("cuda")
#         prompt_len = len(batch["input_ids"][0])

#         with torch.no_grad():
#             outputs = model.generate(**batch, **generation_config)
#         generated_text = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)

#         for code in generated_text:
#             stop_words = ["</code>", "# SOLUTION END"]
#             for stop_word in stop_words:
#                 code = code.split(stop_word)[0]
#             result = {
#                 'id': cached_cnt,
#                 'code': code,
#                 'metadata': ds1000[cached_cnt]['metadata']
#             }
#             f.write(json.dumps(result) + '\n')
#             cached_cnt += 1

# print(f"Results saved in {output_path}")
