In [2]:
import json
import os
import pickle
import re
from pathlib import Path

import numpy as np
import pandas as pd
import tree_sitter_python as tspython
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from tqdm.notebook import tqdm
from tree_sitter import Language, Parser


### Dataset Construction Summary

I collected Python code from multiple GitHub repositories and used it as the base corpus for generating plagiarism test samples. For each selected function, I created two types of examples:

**1. Positive (plagiarized) examples**
I took original functions and applied realistic plagiarism-style transformations. These included renaming variables and functions, removing comments, changing formatting, and reordering independent statements while keeping the underlying logic identical. I validated each transformed sample using AST similarity and token-level overlap to ensure it remained structurally close to the original.

**2. Negative (non-plagiarized) examples**
I generated completely different Python solutions to unrelated algorithmic problems so that they shared no meaningful structure or logic with the original. I checked embedding similarity against both the original and the entire corpus to make sure these samples were genuinely dissimilar.

I embedded the full corpus once, cached the embeddings, and used them during validation. The final output is a labeled dataset of more than 30 examples, stored in JSON, with metadata such as AST similarity, token Jaccard scores, and links back to the original code.


# NOTE: I deliberately did not use structured outputs. Although they are generally good practice, in my scenario they did not work well and added overhead and complications, so I removed them.

In [4]:
# The API key is read from the environment (I have stored my keys in the run
# configuration), so it is never written into the notebook itself.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail fast rather than creating a client without a key.
    raise ValueError("API key missing. Fix your setup.")


# Module-level OpenAI client shared by the embedding and generation helpers.
client = OpenAI(api_key=OPENAI_API_KEY)

# Module-level tree-sitter parser built from the Python grammar.
PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)

def get_embedding(text):
    """Embed ``text`` with OpenAI's ``text-embedding-3-large`` model.

    Args:
        text: The input passed to the embeddings endpoint.

    Returns:
        A NumPy array built from the first embedding in the response.
    """
    reply = client.embeddings.create(
        input=text,
        model="text-embedding-3-large",
    )
    vector = reply.data[0].embedding
    return np.array(vector)

def get_ast_structure(code):
    """Extract AST node types as a structural fingerprint.

    Parses ``code`` with the module-level tree-sitter ``parser`` and collects
    the ``type`` of every node in pre-order (a node before its children,
    children left to right).

    The walk uses an explicit stack instead of recursion, so deeply nested
    syntax trees cannot raise ``RecursionError``.

    Args:
        code: Python source code as a string.

    Returns:
        The node type names joined by single spaces.
    """
    tree = parser.parse(bytes(code, "utf8"))
    node_types = []
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        node_types.append(node.type)
        # Push children reversed so the left-most child is popped next,
        # which reproduces the order of a recursive pre-order traversal.
        pending.extend(reversed(node.children))
    return ' '.join(node_types)

def compute_ast_similarity(code1, code2):
    """Structural similarity of two code snippets via their AST fingerprints.

    Each snippet is turned into a node-type string with ``get_ast_structure``,
    each string is embedded with ``get_embedding``, and the cosine similarity
    of the two embeddings is returned.

    Args:
        code1: First Python source string.
        code2: Second Python source string.

    Returns:
        The single cosine-similarity value between the two embeddings.
    """
    fingerprints = [get_ast_structure(code1), get_ast_structure(code2)]
    first_vec, second_vec = [get_embedding(fp) for fp in fingerprints]
    scores = cosine_similarity([first_vec], [second_vec])
    return scores[0][0]

def compute_token_jaccard(code1, code2):
    """Jaccard overlap of the whitespace-separated tokens of two snippets.

    Each snippet is parsed with the module-level ``parser``; the root node's
    text is decoded and split on whitespace to form a token set.

    Args:
        code1: First Python source string.
        code2: Second Python source string.

    Returns:
        ``len(intersection) / len(union)`` of the two token sets, or ``0`` when
        both sets are empty.
    """
    def token_set(source):
        # Whitespace-delimited chunks of the parsed root node's text.
        root = parser.parse(bytes(source, "utf8")).root_node
        return set(root.text.decode().split())

    left, right = token_set(code1), token_set(code2)
    combined = left | right
    if not combined:
        return 0
    return len(left & right) / len(combined)

def generate_plagiarized_variant(code):
    """Generate high-similarity plagiarized code.

    Sends ``code`` to ``gpt-4o`` with a prompt asking for a renamed,
    reformatted, comment-free rewrite that keeps the same algorithm, then
    cleans Markdown code fences out of the reply.

    Fence handling: if the reply contains one or more complete fenced blocks
    (with any language tag, e.g. ``python`` or ``py``), only the text inside
    the fences is kept; otherwise stray fence markers are removed.

    Args:
        code: Original Python source to transform.

    Returns:
        The transformed source as a stripped string.

    Raises:
        ValueError: If the completion stopped because it hit ``max_tokens``
            (a truncated function is not a usable sample), or if the model
            returned no text.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": f"""Transform this Python code to be plagiarized (80-95% similar):

Rules:
1. Keep exact same algorithm and control flow
2. Rename ALL variables/functions to different names
3. Change whitespace/formatting
4. Reorder independent statements ONLY
5. Remove comments

Original:
```python
{code}
```

Return ONLY the transformed code, no markdown, no explanation."""
        }],
        temperature=0.7,
        max_tokens=1024
    )

    choice = response.choices[0]
    # finish_reason == "length" means the output was cut off at max_tokens.
    if choice.finish_reason == "length":
        raise ValueError("Plagiarized variant was truncated at max_tokens")
    content = choice.message.content
    if not content:
        raise ValueError("Model returned an empty response")

    # Prefer the contents of complete fenced blocks, whatever the language tag.
    fenced_blocks = re.findall(r"```[^\n`]*\n(.*?)```", content, flags=re.DOTALL)
    if fenced_blocks:
        result = "\n".join(fenced_blocks)
    else:
        # No complete block: drop stray fence markers and a language tag that
        # sits alone at the end of the fence line.
        result = re.sub(r"```(?:[A-Za-z0-9_+-]*(?=\n|\Z))?", "", content)
    return result.strip()

def validate_plagiarized(original, generated, min_ast_similarity=0.75, min_token_jaccard=0.1):
    """Check if generated code is actually plagiarized.

    A sample is accepted when its AST similarity to the original (from
    ``compute_ast_similarity``) and its token Jaccard overlap (from
    ``compute_token_jaccard``) both reach their minimum thresholds.

    Args:
        original: Source code the variant was derived from.
        generated: Candidate plagiarized variant.
        min_ast_similarity: Inclusive lower bound on AST similarity.
        min_token_jaccard: Inclusive lower bound on token Jaccard overlap.

    Returns:
        A tuple ``(is_valid, ast_sim, token_jac)`` where ``is_valid`` is a
        plain ``bool``.
    """
    ast_sim = compute_ast_similarity(original, generated)
    token_jac = compute_token_jaccard(original, generated)

    # bool() gives callers a plain Python bool whichever comparison decides
    # the result (the AST score may be a NumPy scalar).
    is_valid = bool(ast_sim >= min_ast_similarity and token_jac >= min_token_jaccard)
    return is_valid, ast_sim, token_jac


def generate_non_plagiarized_dsa(original_code):
    """Generate a completely different DSA problem solution.

    Sends ``original_code`` to ``gpt-4o`` as the thing to avoid and asks for
    an unrelated algorithmic solution, then cleans Markdown code fences out of
    the reply.

    Fence handling: if the reply contains one or more complete fenced blocks
    (with any language tag, e.g. ``python`` or ``py``), only the text inside
    the fences is kept; otherwise stray fence markers are removed.

    Args:
        original_code: Source code the generated solution must not resemble.

    Returns:
        The generated source as a stripped string.

    Raises:
        ValueError: If the model returned no text.
    """

    prompt = f"""Generate a COMPLETELY DIFFERENT Python DSA problem solution.

RULES:
1. Must solve a DIFFERENT algorithmic problem
2. Use DIFFERENT data structures (if original uses array, use tree/graph/hash)
3. Use DIFFERENT algorithmic paradigm (if original is greedy, use DP/backtracking/etc)
4. Must be valid, executable Python code
5. 30-200 lines

AVOID any similarity to this:
```python
{original_code}
```

Return ONLY the Python code, no markdown, no explanation."""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )

    content = response.choices[0].message.content
    if not content:
        raise ValueError("Model returned an empty response")

    # Prefer the contents of complete fenced blocks, whatever the language tag.
    fenced_blocks = re.findall(r"```[^\n`]*\n(.*?)```", content, flags=re.DOTALL)
    if fenced_blocks:
        result = "\n".join(fenced_blocks)
    else:
        # No complete block: drop stray fence markers and a language tag that
        # sits alone at the end of the fence line.
        result = re.sub(r"```(?:[A-Za-z0-9_+-]*(?=\n|\Z))?", "", content)
    return result.strip()

def validate_non_plagiarized(original, generated, corpus_embeddings, threshold=0.5):
    """Check that ``generated`` is dissimilar from the original and the corpus.

    Both snippets are embedded in a single ``text-embedding-3-large`` request.
    The sample is rejected if its cosine similarity to the original exceeds
    ``threshold``, or if its highest cosine similarity to any row of
    ``corpus_embeddings`` exceeds ``threshold``.

    Args:
        original: Source code the sample is paired with.
        generated: Candidate non-plagiarized code.
        corpus_embeddings: Embeddings compared against the generated sample;
            passed as the second argument to ``cosine_similarity``.
        threshold: Similarity above which the sample is rejected.

    Returns:
        A tuple ``(is_valid, orig_sim, max_corpus_sim)``. When the sample is
        rejected for similarity to the original, the corpus comparison is
        skipped and ``1.0`` is returned in the third position.
    """
    reply = client.embeddings.create(
        input=[original, generated],
        model="text-embedding-3-large",
    )
    original_vec = reply.data[0].embedding
    generated_vec = reply.data[1].embedding

    orig_sim = cosine_similarity([original_vec], [generated_vec])[0][0]
    if orig_sim > threshold:
        # Too close to the original: no corpus check, 1.0 stands in for it.
        return False, orig_sim, 1.0

    # Compare the generated sample against every corpus embedding.
    query = np.array(generated_vec).reshape(1, -1)
    max_corpus_sim = cosine_similarity(query, corpus_embeddings)[0].max()

    passed = not (max_corpus_sim > threshold)
    return passed, orig_sim, max_corpus_sim


def embed_batch(texts, batch_size=128):
    """Embed ``texts`` in consecutive batches via the OpenAI embeddings API.

    Args:
        texts: Sequence of strings; sliced into chunks of ``batch_size``.
        batch_size: Maximum number of texts sent per request.

    Returns:
        A ``float32`` NumPy array of all returned embeddings, in the order the
        responses were received.
    """
    vectors = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Embedding"):
        reply = client.embeddings.create(
            model="text-embedding-3-large",
            input=texts[start:start + batch_size],
        )
        vectors += [item.embedding for item in reply.data]
    return np.array(vectors, dtype=np.float32)

def load_or_compute_embeddings(corpus, cache_path="indexes/embeddings.pkl"):
    """Load cached embeddings or compute once and save.

    If ``cache_path`` exists its pickled contents are returned. Otherwise the
    ``'code'`` column of ``corpus`` is embedded with ``embed_batch`` and the
    result is pickled to ``cache_path``. The cache directory is created up
    front, so a missing directory cannot make the save fail after the
    embeddings have already been computed.

    Args:
        corpus: DataFrame-like object with a ``'code'`` column.
        cache_path: Location of the pickle cache file.

    Returns:
        The cached or freshly computed embeddings.
    """
    cache_file = Path(cache_path)
    if cache_file.exists():
        print(f"Loading embeddings from {cache_path}")
        # NOTE(security): pickle.load can execute arbitrary code from the file;
        # only point cache_path at files this notebook wrote itself.
        with open(cache_file, 'rb') as f:
            cached = pickle.load(f)
        # A row-count mismatch suggests the cache was built from another corpus.
        if hasattr(cached, "__len__") and len(cached) != len(corpus):
            print(
                f"Warning: cache has {len(cached)} embeddings but corpus has "
                f"{len(corpus)} rows; delete {cache_path} to recompute"
            )
        return cached

    # Create the target directory before the expensive embedding call.
    cache_file.parent.mkdir(parents=True, exist_ok=True)

    print("Computing embeddings (one-time operation)...")
    embeddings = embed_batch(corpus['code'].tolist())

    with open(cache_file, 'wb') as f:
        pickle.dump(embeddings, f)

    return embeddings

def generate_dataset(corpus_df, n_samples=30):
    """Build the labelled plagiarism-detection dataset.

    Samples ``n_samples`` rows from ``corpus_df`` (fixed ``random_state=42``)
    and, for each sampled ``'code'`` value, tries to produce:

    * one positive record (label 1): a plagiarized variant that passes
      ``validate_plagiarized``, with up to 3 attempts;
    * one negative record (label 0): a generated, unrelated solution that
      passes ``validate_non_plagiarized`` at ``threshold=0.6``, with up to
      5 attempts.

    Exceptions raised during an attempt are printed and use up that attempt.
    A summary of successes and failures is printed at the end.

    Args:
        corpus_df: DataFrame with a ``'code'`` column.
        n_samples: Number of corpus rows to sample.

    Returns:
        A list of dict records, positives and negatives interleaved in
        processing order.
    """
    corpus_embeddings = load_or_compute_embeddings(corpus_df)
    sample_df = corpus_df.sample(n=n_samples, random_state=42)

    stats = {'pos_success': 0, 'pos_fail': 0, 'neg_success': 0, 'neg_fail': 0}
    dataset = []

    for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
        original = row['code']

        # POSITIVE: ask the model for a plagiarized variant of the original.
        positive_found = False
        for attempt in range(3):
            try:
                candidate = generate_plagiarized_variant(original)
                accepted, ast_sim, tok_jac = validate_plagiarized(original, candidate)

                if not accepted:
                    print(f"  Attempt {attempt+1} failed: ast={ast_sim:.2f}, jac={tok_jac:.2f}")
                    continue

                record = {
                    'code': candidate,
                    'label': 1,
                    'original_ref': original,
                    'ast_similarity': float(ast_sim),
                    'token_jaccard': float(tok_jac)
                }
                dataset.append(record)
                stats['pos_success'] += 1
                positive_found = True
                break
            except Exception as e:
                print(f"  Error: {e}")
        if not positive_found:
            stats['pos_fail'] += 1

        # NEGATIVE: ask the model for an unrelated solution and check that it
        # is dissimilar from both the original and the corpus.
        negative_found = False
        for attempt in range(5):
            try:
                candidate = generate_non_plagiarized_dsa(original)
                accepted, orig_sim, corpus_sim = validate_non_plagiarized(
                    original, candidate, corpus_embeddings, threshold=0.6
                )

                if not accepted:
                    print(f"  Negative attempt {attempt+1}: orig_sim={orig_sim:.2f}, corpus_sim={corpus_sim:.2f}")
                    continue

                record = {
                    'code': candidate,
                    'label': 0,
                    'original_ref': original,
                    'original_similarity': float(orig_sim),
                    'max_corpus_similarity': float(corpus_sim)
                }
                dataset.append(record)
                stats['neg_success'] += 1
                negative_found = True
                break
            except Exception as e:
                print(f"  Negative error: {e}")
        if not negative_found:
            stats['neg_fail'] += 1

    print("\nResults:")
    print(f"  Positive: {stats['pos_success']}/{n_samples} (failed: {stats['pos_fail']})")
    print(f"  Negative: {stats['neg_success']}/{n_samples} (failed: {stats['neg_fail']})")
    print(f"  Total: {len(dataset)} samples")

    return dataset



In [9]:
# Usage
# Read the reference corpus (the helpers above read its 'code' column),
# generate the labelled dataset from 31 sampled rows, and save it as JSON.
df_corpus = pd.read_csv("data/reference_corpus.csv")
dataset = generate_dataset(df_corpus, n_samples=31)
with open('data/test_dataset.json', 'w') as f:
    json.dump(dataset, f, indent=2)

Loading embeddings from indexes/embeddings.pkl


  0%|          | 0/31 [00:00<?, ?it/s]

  Attempt 1 failed: ast=0.96, jac=0.08
  Attempt 2 failed: ast=0.96, jac=0.08
  Attempt 3 failed: ast=0.96, jac=0.08
  Negative attempt 1: orig_sim=0.13, corpus_sim=0.72


  6%|▋         | 2/31 [00:23<05:25, 11.23s/it]

  Negative attempt 1: orig_sim=0.32, corpus_sim=0.62
  Negative attempt 2: orig_sim=0.33, corpus_sim=0.72


 10%|▉         | 3/31 [00:40<06:28, 13.89s/it]

  Negative attempt 1: orig_sim=0.23, corpus_sim=0.62


 13%|█▎        | 4/31 [00:48<05:12, 11.56s/it]

  Negative attempt 1: orig_sim=0.08, corpus_sim=0.61
  Negative attempt 2: orig_sim=0.11, corpus_sim=0.71


 16%|█▌        | 5/31 [01:06<05:53, 13.60s/it]

  Negative attempt 1: orig_sim=0.23, corpus_sim=0.62


 23%|██▎       | 7/31 [01:24<04:23, 10.97s/it]

  Negative attempt 1: orig_sim=0.25, corpus_sim=0.73
  Negative attempt 2: orig_sim=0.25, corpus_sim=0.63
  Negative attempt 3: orig_sim=0.23, corpus_sim=0.72


 35%|███▌      | 11/31 [02:02<02:58,  8.92s/it]

  Negative attempt 1: orig_sim=0.16, corpus_sim=0.67


 42%|████▏     | 13/31 [02:19<02:29,  8.33s/it]

  Negative attempt 1: orig_sim=0.07, corpus_sim=0.62
  Negative attempt 2: orig_sim=0.05, corpus_sim=0.70


 52%|█████▏    | 16/31 [02:46<01:58,  7.92s/it]

  Negative attempt 1: orig_sim=0.36, corpus_sim=0.80
  Negative attempt 2: orig_sim=0.33, corpus_sim=0.70
  Negative attempt 3: orig_sim=0.31, corpus_sim=0.67


 58%|█████▊    | 18/31 [03:10<02:05,  9.63s/it]

  Negative attempt 1: orig_sim=0.26, corpus_sim=0.83
  Negative attempt 2: orig_sim=0.25, corpus_sim=0.71


 61%|██████▏   | 19/31 [03:27<02:20, 11.69s/it]

  Negative attempt 1: orig_sim=0.12, corpus_sim=0.74
  Negative attempt 2: orig_sim=0.16, corpus_sim=0.75
  Negative attempt 3: orig_sim=0.15, corpus_sim=0.62


 65%|██████▍   | 20/31 [03:44<02:28, 13.50s/it]

  Negative attempt 1: orig_sim=0.31, corpus_sim=0.79


 77%|███████▋  | 24/31 [04:17<01:02,  8.97s/it]

  Negative attempt 1: orig_sim=0.12, corpus_sim=0.72
  Negative attempt 2: orig_sim=0.13, corpus_sim=0.68


 81%|████████  | 25/31 [04:34<01:08, 11.38s/it]

  Negative attempt 1: orig_sim=0.22, corpus_sim=0.65
  Negative attempt 2: orig_sim=0.20, corpus_sim=0.70
  Negative attempt 3: orig_sim=0.22, corpus_sim=0.73


 87%|████████▋ | 27/31 [04:59<00:45, 11.31s/it]

  Attempt 1 failed: ast=0.94, jac=0.09
  Negative attempt 1: orig_sim=0.16, corpus_sim=0.64


 97%|█████████▋| 30/31 [05:28<00:09,  9.86s/it]

  Negative attempt 1: orig_sim=0.21, corpus_sim=0.69
  Negative attempt 2: orig_sim=0.18, corpus_sim=0.63
  Negative attempt 3: orig_sim=0.24, corpus_sim=0.74
  Negative attempt 4: orig_sim=0.20, corpus_sim=0.64


100%|██████████| 31/31 [05:50<00:00, 11.31s/it]


Results:
  Positive: 30/31 (failed: 1)
  Negative: 31/31 (failed: 0)
  Total: 61 samples



