In [None]:
import os
import pickle
import json

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from ollama import Client

# Pre-bind `client` so the name always exists after this cell runs: if Client()
# itself raises, later cells see None instead of an unbound name or a stale
# client left over from an earlier run of this cell.
client = None
try:
    client = Client()
    print("Ollama client initialized.")
    # Probe the local model; presumably this raises when the server is not
    # reachable or the model has not been downloaded -- the result is discarded.
    client.show('phi3:mini') 
    print("Successfully connected to local model 'phi3:mini'.")
except Exception as e:
    # Best-effort: report the problem and let the notebook continue.
    print(f"Error initializing Ollama client: {e}")
    print("Please ensure the Ollama application is running and you have downloaded the model (e.g., 'ollama run phi3:mini').")

# Sentence-embedding model used by the detectors to encode query code.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Embedding model loaded.")

# Locations of the artifacts expected to be produced by '01_indexing.ipynb'.
INDEX_DIR = 'indexes'
FAISS_INDEX_PATH = os.path.join(INDEX_DIR, 'faiss_index.bin')
BM25_INDEX_PATH = os.path.join(INDEX_DIR, 'bm25_index.pkl')
METADATA_PATH = os.path.join(INDEX_DIR, 'metadata.pkl')

try:
    # faiss.read_index reports a missing file as RuntimeError rather than
    # FileNotFoundError, so check for the files up front; that way the
    # friendly hint below is shown for all three artifacts.
    missing_paths = [
        path
        for path in (FAISS_INDEX_PATH, BM25_INDEX_PATH, METADATA_PATH)
        if not os.path.exists(path)
    ]
    if missing_paths:
        raise FileNotFoundError(f"Missing index file(s): {', '.join(missing_paths)}")

    faiss_index = faiss.read_index(FAISS_INDEX_PATH)

    # SECURITY NOTE: pickle.load can execute arbitrary code. Only load index
    # files that were generated locally by a trusted notebook.
    with open(BM25_INDEX_PATH, 'rb') as f:
        bm25_index = pickle.load(f)

    with open(METADATA_PATH, 'rb') as f:
        metadata = pickle.load(f)
    
    print("All indexes and metadata have been loaded successfully.")
    print(f"FAISS index contains {faiss_index.ntotal} vectors.")
    print(f"Metadata contains information for {len(metadata)} chunks.")

    # The detectors look up metadata by FAISS row id, so the two must line up.
    if faiss_index.ntotal != len(metadata):
        print("Warning: FAISS index size and metadata size differ; "
              "retrieved ids may not map to the right chunks. Consider re-running the indexing notebook.")

except FileNotFoundError as e:
    # Best-effort: explain how to create the artifacts instead of crashing the cell.
    print(f"Error loading files: {e}")
    print("Please make sure you have run the '01_indexing.ipynb' notebook first.")

Ollama client initialized.
Successfully connected to local model 'phi3:mini'.
Embedding model loaded.
All indexes and metadata have been loaded successfully.
FAISS index contains 18 vectors.
Metadata contains information for 18 chunks.


In [None]:
def detect_embedding(query_code, k=1, similarity_threshold=0.9):
    """Flag plagiarism purely from the nearest neighbour in the FAISS index.

    The query is embedded with ``embedding_model``, the ``k`` nearest vectors
    are retrieved from ``faiss_index``, and only the top hit is used: its
    cosine similarity is compared against ``similarity_threshold``.

    Args:
        query_code: Source code (string) to check.
        k: Number of neighbours requested from FAISS; only the first is used.
        similarity_threshold: Cosine similarity above which the query is
            reported as plagiarized (strictly greater than).

    Returns:
        dict with keys ``is_plagiarized`` (bool), ``confidence_score`` (float
        cosine similarity), ``explanation`` (str) and ``most_similar_function``
        (dict with ``file_path``, ``function_name``, ``code``; ``None`` when
        the index returned no match).
    """
    query_embedding = embedding_model.encode([query_code])

    scores, indices = faiss_index.search(query_embedding, k)

    top_match_index = int(indices[0][0])
    if top_match_index < 0:
        # FAISS uses -1 as the label for "no result" (e.g. an empty index).
        # Indexing metadata with -1 would silently return the last chunk.
        return {
            "is_plagiarized": False,
            "confidence_score": 0.0,
            "explanation": "No reference function could be retrieved from the index.",
            "most_similar_function": None,
        }

    raw_score = float(scores[0][0])

    # Convert the FAISS score to cosine similarity.
    # NOTE(review): both branches assume the indexed and query embeddings are
    # unit-length -- confirm against the indexing notebook.
    if getattr(faiss_index, "metric_type", None) == faiss.METRIC_INNER_PRODUCT:
        # Inner product of unit vectors is the cosine similarity itself.
        cosine_similarity = raw_score
    else:
        # L2 indexes report the *squared* Euclidean distance, and for unit
        # vectors ||a - b||^2 = 2 - 2*cos, so the score must not be squared again.
        cosine_similarity = 1.0 - raw_score / 2.0

    is_plagiarized = cosine_similarity > similarity_threshold

    match_info = metadata[top_match_index]

    return {
        "is_plagiarized": bool(is_plagiarized),
        "confidence_score": float(cosine_similarity),
        "explanation": f"The most similar function found has a similarity score of {float(cosine_similarity):.4f}. "
                       f"The threshold for plagiarism is {similarity_threshold}.",
        "most_similar_function": {
            "file_path": match_info['file_path'],
            "function_name": match_info['function_name'],
            "code": match_info['code']
        }
    }

In [None]:
def load_full_corpus_text(directory='data/reference_corpus'):
    """Concatenate every ``.py`` file under ``directory`` into one string.

    Each file is wrapped in ``--- START OF FILE: <path> ---`` and
    ``--- END OF FILE: <path> ---`` marker lines. Directories and files are
    visited in sorted order so the result is the same on every machine; this
    matters when a caller keeps only a prefix of the text.

    Args:
        directory: Root folder to walk recursively.

    Returns:
        The concatenated text, or an empty string when the directory does not
        exist or contains no ``.py`` files.

    Raises:
        UnicodeDecodeError: If a ``.py`` file is not valid UTF-8.
    """
    sections = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into sub-directories in a
        # deterministic order instead of the filesystem's listing order.
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.py'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    source = f.read()
                sections.append(
                    f"--- START OF FILE: {file_path} ---\n\n"
                    f"{source}"
                    f"\n\n--- END OF FILE: {file_path} ---\n\n"
                )
    # Join once instead of growing a string inside the loop.
    return "".join(sections)

# Read the reference corpus once, when this cell runs, so detect_llm can embed
# (a prefix of) it in its prompt without touching the disk on every call.
FULL_CORPUS_TEXT = load_full_corpus_text()

def detect_llm(query_code, max_corpus_chars=20000, model="phi3:mini"):
    """Ask the local LLM whether ``query_code`` is plagiarized from the corpus.

    The prompt contains the first ``max_corpus_chars`` characters of
    ``FULL_CORPUS_TEXT`` followed by the query code. Anything beyond that
    prefix is never shown to the model, so matches in later files cannot be
    detected by this method.

    Args:
        query_code: Source code (string) to check.
        max_corpus_chars: Number of leading corpus characters to include in
            the prompt; ``None`` includes the whole corpus.
        model: Name of the Ollama model to query.

    Returns:
        The JSON object produced by the model, parsed into a dict (the prompt
        asks for ``is_plagiarized``, ``confidence_score`` and ``explanation``,
        but the reply is returned as-is without validation). If the request or
        the JSON parsing fails, ``{"error": <message>}`` is returned instead.
    """
    # Slicing with None keeps the full text, so None disables truncation.
    corpus_excerpt = FULL_CORPUS_TEXT[:max_corpus_chars]

    prompt = f"""
    You are a Code Plagiarism Detection expert. Your task is to determine if the provided "Query Code" is plagiarized from the "Source Code Corpus".

    Analyze the "Query Code" and determine if its core logic, structure, or implementation is substantially derived from any part of the "Source Code Corpus".

    Respond ONLY with a valid JSON object with three keys:
    1. "is_plagiarized": a boolean (true or false).
    2. "confidence_score": a float between 0.0 and 1.0.
    3. "explanation": a brief justification for your decision.

    --- SOURCE CODE CORPUS ---
    {corpus_excerpt} 

    --- QUERY CODE ---
    {query_code}

    --- YOUR JSON RESPONSE ---
    """
    
    try:
        response = client.chat(
            model=model,
            format="json",
            messages=[{"role": "user", "content": prompt}]
        )
        return json.loads(response['message']['content'])
    except Exception as e:
        # Deliberately best-effort: report the failure as data so that one bad
        # call does not abort a batch of comparisons.
        return {"error": str(e)}

In [None]:
def detect_rag(query_code, k=5, model="phi3:mini"):
    """Retrieve the ``k`` nearest functions via FAISS and let the LLM judge.

    Args:
        query_code: Source code (string) to check.
        k: Number of neighbours to request from the FAISS index.
        model: Name of the Ollama model to query.

    Returns:
        The JSON object produced by the model, parsed into a dict (the prompt
        asks for ``is_plagiarized``, ``confidence_score`` and ``explanation``,
        but the reply is returned as-is without validation). If the request or
        the JSON parsing fails, ``{"error": <message>}`` is returned instead.
    """
    query_embedding = embedding_model.encode([query_code])
    _, indices = faiss_index.search(query_embedding, k)

    # FAISS pads the result with -1 when it finds fewer than k vectors; without
    # the filter, metadata[-1] would silently pull in the last chunk.
    retrieved_chunks = [metadata[int(i)] for i in indices[0] if i >= 0]

    context = "".join(
        f"--- Retrieved Function #{rank} from {chunk['file_path']} ---\n"
        f"Function Name: {chunk['function_name']}\n"
        f"Code:\n{chunk['code']}\n\n"
        for rank, chunk in enumerate(retrieved_chunks, start=1)
    )

    prompt = f"""
    You are a Code Plagiarism Detection expert. Your task is to determine if the "Query Code" is plagiarized from any of the "Retrieved Functions" provided as context.

    Analyze the "Query Code" and determine if its core logic is substantially derived from any of the "Retrieved Functions".

    Respond ONLY with a valid JSON object with three keys:
    1. "is_plagiarized": a boolean (true or false).
    2. "confidence_score": a float between 0.0 and 1.0.
    3. "explanation": a brief justification. If plagiarized, mention which function it is similar to.

    --- RETRIEVED FUNCTIONS (CONTEXT) ---
    {context}

    --- QUERY CODE ---
    {query_code}

    --- YOUR JSON RESPONSE ---
    """
    
    try:
        response = client.chat(
            model=model,
            format="json",
            messages=[{"role": "user", "content": prompt}]
        )
        return json.loads(response['message']['content'])
    except Exception as e:
        # Deliberately best-effort: report the failure as data so that one bad
        # call does not abort a batch of comparisons.
        return {"error": str(e)}

In [None]:
def detect_hybrid_rag(query_code, k=5, model="phi3:mini"):
    """Combine dense (FAISS) and lexical (BM25) retrieval, then let the LLM judge.

    Up to ``k`` hits are taken from each retriever. The two lists are merged
    with duplicates removed, keeping dense hits first and each list in its own
    rank order, so "Retrieved Function #1" is the best dense match.

    Args:
        query_code: Source code (string) to check.
        k: Number of hits to take from each retriever.
        model: Name of the Ollama model to query.

    Returns:
        The JSON object produced by the model, parsed into a dict (the prompt
        asks for ``is_plagiarized``, ``confidence_score`` and ``explanation``,
        but the reply is returned as-is without validation). If the request or
        the JSON parsing fails, ``{"error": <message>}`` is returned instead.
    """
    query_embedding = embedding_model.encode([query_code])
    _, dense_indices = faiss_index.search(query_embedding, k)

    # NOTE(review): whitespace tokenization presumably mirrors how the BM25
    # index was built -- confirm against the indexing notebook.
    tokenized_query = query_code.split()
    bm25_scores = bm25_index.get_scores(tokenized_query)
    lexical_indices = np.argsort(bm25_scores)[::-1][:k]

    # Rank-preserving, de-duplicated merge. np.union1d would sort by corpus
    # position (losing relevance order) and keep FAISS's -1 "no result"
    # padding, which would then index metadata[-1].
    combined_indices = list(dict.fromkeys(
        int(i) for i in (*dense_indices[0], *lexical_indices) if i >= 0
    ))

    retrieved_chunks = [metadata[i] for i in combined_indices]

    context = "".join(
        f"--- Retrieved Function #{rank} from {chunk['file_path']} ---\n"
        f"Function Name: {chunk['function_name']}\n"
        f"Code:\n{chunk['code']}\n\n"
        for rank, chunk in enumerate(retrieved_chunks, start=1)
    )

    prompt = f"""
    You are a Code Plagiarism Detection expert. Your task is to determine if the "Query Code" is plagiarized from any of the "Retrieved Functions" provided as context.

    Analyze the "Query Code" and determine if its core logic is substantially derived from any of the "Retrieved Functions".

    Respond ONLY with a valid JSON object with three keys:
    1. "is_plagiarized": a boolean (true or false).
    2. "confidence_score": a float between 0.0 and 1.0.
    3. "explanation": a brief justification. If plagiarized, mention which function it is similar to.

    --- RETRIEVED FUNCTIONS (CONTEXT) ---
    {context}

    --- QUERY CODE ---
    {query_code}

    --- YOUR JSON RESPONSE ---
    """
    
    try:
        response = client.chat(
            model=model,
            format="json",
            messages=[{"role": "user", "content": prompt}]
        )
        return json.loads(response['message']['content'])
    except Exception as e:
        # Deliberately best-effort: report the failure as data so that one bad
        # call does not abort a batch of comparisons.
        return {"error": str(e)}

In [None]:
# Query snippet for the demo: a bubble sort written with different identifiers
# than the reference implementations.
sample_code = """
def b_sort(numbers):
    list_len = len(numbers)
    for i in range(list_len):
        for j in range(0, list_len - i - 1):
            if numbers[j] > numbers[j + 1]:
                numbers[j], numbers[j + 1] = numbers[j + 1], numbers[j]
    return numbers
"""


def _run_and_show(heading, detector):
    """Print the heading, run the detector on sample_code, pretty-print and return its result."""
    print(heading)
    outcome = detector(sample_code)
    print(json.dumps(outcome, indent=2))
    return outcome


print("--- Running Plagiarism Checks on Sample Code ---")

# The heading numbers are kept as they were; the full-corpus LLM check
# (detect_llm) is not run in this cell.
result_embedding = _run_and_show("\n[1] Pure Embedding Search:", detect_embedding)
result_rag = _run_and_show("\n[3] Standard RAG:", detect_rag)
result_hybrid = _run_and_show("\n[4] Hybrid RAG:", detect_hybrid_rag)

--- Running Plagiarism Checks on Sample Code ---

[1] Pure Embedding Search:
{
  "is_plagiarized": false,
  "confidence_score": 0.5355204343795776,
  "explanation": "The most similar function found has a similarity score of 0.5355. The threshold for plagiarism is 0.9.",
  "most_similar_function": {
    "file_path": "data\\reference_corpus\\repo5\\14.py",
    "function_name": "bubblesort",
    "code": "def bubblesort(arr):\n    arr = arr.copy()\n    n = len(arr)\n    for i in range(n):\n        swapped = False\n        for j in range(0, n - i - 1):\n            if arr[j] > arr[j + 1]:\n                arr[j], arr[j + 1] = (arr[j + 1], arr[j])\n                swapped = True\n        if not swapped:\n            break\n    return arr"
  }
}

[3] Standard RAG:
{
  "is_plagiarized": true,
  "confidence_score": 0.95,
  "explanation": "The Query Code 'bubblesort' function has a very similar structure and logic to the first retrieved function (Retrieved Function #1), with only minor differenc