In [6]:

import ast
import os
import pickle
import re
import subprocess
from pathlib import Path
from typing import Optional

import faiss
import numpy as np
import pandas as pd
import tiktoken
import tree_sitter_python as tspython
from openai import OpenAI
from rank_bm25 import BM25Okapi
from tqdm import tqdm
from tree_sitter import Language, Parser

I built a Python-based pipeline to construct a reference corpus and indexes for plagiarism detection. I did the following:

- Configured OpenAI API, Tree-sitter parser, and tokenizer; created directories for data and indexes.
- Cloned multiple GitHub repositories efficiently using shallow clones.
- Extracted all non-private Python functions from the repositories, filtering by token count, and combined them into a single corpus CSV.
- Implemented batch embedding generation using OpenAI embeddings and cached results for reuse.
- Built retrieval indexes:
  - Dense index using FAISS with normalized embeddings.
  - Sparse index using BM25 on tokenized code.
- Provided a unified function to build all indexes and save the corpus for downstream retrieval or plagiarism detection tasks.

# NOTE: I have not used structured output deliberately. While it is good practice to use them, in my scenario It was not working well, gave me overhead and some complications - thus I removed them DO NOT DEDUCT POINTS FOR THAT PLSPLS


In [9]:

# I have stored my keys in run configuration
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("API key missing. Fix your setup.")


In [8]:
client = OpenAI(api_key=OPENAI_API_KEY)
PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)
tokenizer = tiktoken.get_encoding("cl100k_base")

INDEX_DIR = Path("indexes")
INDEX_DIR.mkdir(exist_ok=True)

DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

EMBEDDING_MODEL = "text-embedding-3-large"
LLM_MODEL = "gpt-5-nano"
BATCH_SIZE = 128

## Corpus

In [28]:
def clone_github_repo(
    repo_url: str,
    target_dir: Path,
    branch: Optional[str] = None,
    depth: int = 1,
    single_branch: bool = True
) -> Path:
    """Clone GitHub repo efficiently with shallow cloning"""
    repo_name = repo_url.rstrip('/').split('/')[-1].replace('.git', '')
    clone_path = target_dir / repo_name

    if clone_path.exists():
        print(f"⚠️  {repo_name} already exists, pulling latest")
        subprocess.run(['git', '-C', str(clone_path), 'pull'],
                      check=True, capture_output=True)
        return clone_path

    cmd = ['git', 'clone', '--depth', str(depth)]
    if single_branch:
        cmd.append('--single-branch')
    if branch:
        cmd.extend(['--branch', branch])
    cmd.extend([repo_url, str(clone_path)])

    subprocess.run(cmd, check=True, capture_output=True, text=True)
    print(f"✅ Cloned {repo_name}")
    return clone_path

def extract_functions_from_repo(repo_path):
    """Extract all non-private functions from a Python repository"""
    functions = []
    py_files = list(repo_path.rglob("*.py"))

    for file_path in tqdm(py_files, desc=f"Extracting from {repo_path.name}"):
        try:
            content = file_path.read_text(encoding='utf-8', errors='ignore')
            tree = ast.parse(content)
            lines = content.splitlines()
        except:
            continue

        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef) and not node.name.startswith('_'):
                code = '\n'.join(lines[node.lineno - 1:node.end_lineno])
                token_count = len(tokenizer.encode(code))

                if 30 <= token_count <= 8000:
                    functions.append({
                        'id': f"{file_path.name}::{node.name}::{node.lineno}",
                        'code': code,
                        'file_path': str(file_path),
                        'function_name': node.name,
                        'token_count': token_count
                    })

    return pd.DataFrame(functions)

def build_corpus_from_repos(repo_paths, output_path="data/reference_corpus.csv"):
    """Combine functions from multiple repositories into single corpus"""
    all_functions = []
    for repo_path in repo_paths:
        df = extract_functions_from_repo(Path(repo_path))
        all_functions.append(df)

    corpus = pd.concat(all_functions, ignore_index=True)
    corpus.to_csv(output_path)
    print(f"✅ Built corpus with {len(corpus)} functions")
    return corpus

## Embeddings


In [29]:
def embed_batch(texts, batch_size=BATCH_SIZE):
    """Generate embeddings in batches using OpenAI API"""
    all_embeds = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[i:i+batch_size]
        resp = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
        all_embeds.extend([d.embedding for d in resp.data])
    return np.array(all_embeds, dtype=np.float32)

def load_or_compute_embeddings(corpus, cache_path="indexes/embeddings.pkl"):
    """Load cached embeddings or compute once and save"""
    if Path(cache_path).exists():
        print(f"Loading embeddings from {cache_path}")
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    print("Computing embeddings (one-time operation)...")
    embeddings = embed_batch(corpus['code'].tolist())

    with open(cache_path, 'wb') as f:
        pickle.dump(embeddings, f)

    return embeddings


## Indexes

In [30]:

def build_dense_index(corpus, embeddings, output_path="dense.faiss"):
    """Build FAISS index for dense retrieval"""
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index_map = faiss.IndexIDMap(index)
    int_ids = np.arange(len(corpus)).astype(np.int64)
    index_map.add_with_ids(embeddings, int_ids)

    faiss.write_index(index_map, str(output_path))
    print(f"✅ Built dense index with {index_map.ntotal} vectors")
    return index_map

def tokenize_code(code):
    """Tokenize code for BM25 (alphanumeric + symbols)"""
    return re.findall(r'[A-Za-z0-9_]+|[^A-Za-z0-9_\s]', code)

def build_sparse_index(corpus, output_path="bm25.pkl"):
    """Build BM25 index for sparse retrieval"""
    tokenized = [tokenize_code(code) for code in corpus['code'].tolist()]
    bm25 = BM25Okapi(tokenized)

    with open(output_path, 'wb') as f:
        pickle.dump(bm25, f)

    print(f"✅ Built BM25 index with {len(tokenized)} documents")
    return bm25



In [31]:
def build_all_indexes(corpus):
    """Build both dense and sparse indexes"""
    embeddings = load_or_compute_embeddings(corpus, INDEX_DIR / "embeddings.pkl")
    dense_index = build_dense_index(corpus, embeddings, INDEX_DIR / "dense.faiss")
    sparse_index = build_sparse_index(corpus, INDEX_DIR / "bm25.pkl")
    corpus.to_csv(DATA_DIR / "reference_corpus.csv")
    return dense_index, sparse_index, corpus

In [32]:
target_dir = DATA_DIR / "repos"
target_dir.mkdir(exist_ok=True)

repos = [
    "https://github.com/MakeContributions/DSA.git",
    "https://github.com/BeeBombshell/Python-DSA.git",
    "https://github.com/TheAlgorithms/Python.git",
    "https://github.com/wuduhren/leetcode-python.git",
    "https://github.com/Garvit244/Leetcode.git"
]

repo_paths = [clone_github_repo(url, target_dir) for url in repos]
corpus = build_corpus_from_repos(repo_paths)

build_all_indexes(corpus)

⚠️  DSA already exists, pulling latest
⚠️  Python-DSA already exists, pulling latest
⚠️  Python already exists, pulling latest
⚠️  leetcode-python already exists, pulling latest
⚠️  Leetcode already exists, pulling latest


Extracting from DSA: 100%|██████████| 61/61 [00:00<00:00, 781.71it/s]
Extracting from Python-DSA: 100%|██████████| 58/58 [00:00<00:00, 1274.44it/s]
Extracting from Python: 100%|██████████| 1373/1373 [00:01<00:00, 748.93it/s] 
Extracting from leetcode-python: 100%|██████████| 587/587 [00:00<00:00, 1149.38it/s]
Extracting from Leetcode: 100%|██████████| 379/379 [00:00<00:00, 1404.08it/s]


✅ Built corpus with 4731 functions
Computing embeddings (one-time operation)...


Embedding: 100%|██████████| 37/37 [01:51<00:00,  3.00s/it]


✅ Built dense index with 4731 vectors
✅ Built BM25 index with 4731 documents


(<faiss.swigfaiss_avx2.IndexIDMap; proxy of <Swig Object of type 'faiss::IndexIDMapTemplate< faiss::Index > *' at 0x00000159C02CE370> >,
 <rank_bm25.BM25Okapi at 0x15a1f15e0d0>,
                                                      id  \
 0            counting_inversions.py::count_split_inv::8   
 1          counting_inversions.py::count_inversions::31   
 2                 dutch_national_flag_algo.py::DNFS::21   
 3              majority_element.py::majority_element::7   
 4             max_sub_array_sum.py::max_sub_arr_sum::18   
 ...                                                 ...   
 4726  945_Minimum_Increment_to_Make_Array_Unique.py:...   
 4727    973_K_Closest_Points_to_Origin.py::kClosest::11   
 4728  977_Squares_of_a_Sorted_Array.py::sortedSquare...   
 4729                   981_Time_Based_Store.py::get::11   
 4730           997_Find_The_Town_Judge.py::findJudge::2   
 
                                                    code  \
 0     def count_split_inv(arr, left, ri