In [1]:
#
#  CELL 0: SETUP AND MOUNT DRIVE
#
print("Installing required libraries...")
!pip install datasets pandas tqdm -q

print("\nMounting Google Drive...")
from google.colab import drive
drive.mount('/content/drive')

# Import libraries for the rest of the notebook
import pandas as pd
from datasets import load_dataset
import random
import json
from tqdm.notebook import tqdm
import math

# Define the final save path for your dataset on Google Drive
# You can change 'My Drive' or the filename if you wish
DRIVE_SAVE_PATH = '/content/drive/My Drive/code_plagiarism_dataset_final.csv'

print(f"\n✅ Setup complete. The final dataset will be saved to: {DRIVE_SAVE_PATH}")

Installing required libraries...

Mounting Google Drive...
Mounted at /content/drive

✅ Setup complete. The final dataset will be saved to: /content/drive/My Drive/code_plagiarism_dataset_final.csv


In [2]:
#
# CELL 1: GENERATE JAVA PAIRS
#
def generate_java_pairs(num_pairs):
    JAVA_DATASET_ID = "google/code_x_glue_cc_clone_detection_big_clone_bench"
    print(f"\n--- Generating {num_pairs} Java pairs from {JAVA_DATASET_ID} ---")

    java_pairs = []
    positive_needed = math.ceil(num_pairs / 2)
    negative_needed = math.floor(num_pairs / 2)

    positive_count, negative_count = 0, 0

    dataset = load_dataset(JAVA_DATASET_ID, split="train", streaming=True)

    for row in tqdm(dataset, desc="Processing Java pairs", total=num_pairs):
        if positive_count >= positive_needed and negative_count >= negative_needed:
            break

        label = row['label']
        # Handle boolean (True/False) or integer (1/0) labels
        if label and positive_count < positive_needed:
            java_pairs.append({'code1': row['func1'], 'code2': row['func2'], 'label': 1})
            positive_count += 1
        elif not label and negative_count < negative_needed:
            java_pairs.append({'code1': row['func1'], 'code2': row['func2'], 'label': 0})
            negative_count += 1

    print(f"✅ Generated {len(java_pairs)} Java pairs ({positive_count} positive, {negative_count} negative).")
    return java_pairs

# --- Execute Java Pair Generation ---
java_pairs = generate_java_pairs(3500)


--- Generating 3500 Java pairs from google/code_x_glue_cc_clone_detection_big_clone_bench ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Processing Java pairs:   0%|          | 0/3500 [00:00<?, ?it/s]

✅ Generated 3500 Java pairs (1750 positive, 1750 negative).


In [10]:
#
# CELL 2: GENERATE C++ PAIRS (CORRECTED LOGIC)
#
# ------------------------------------------------------------------------------------
# IMPORTANT: Replace the placeholder below with the Hugging Face ID for your C++ dataset
CPP_DATASET_ID = "google/code_x_glue_cc_clone_detection_poj104"
# ------------------------------------------------------------------------------------

def generate_pairs_from_solutions(dataset_id, num_pairs, code_col, problem_id_col, language_name):
    print(f"\n--- Generating {num_pairs} {language_name} pairs from {dataset_id} using group-by-problem-id logic ---")

    print(f"Pre-processing {language_name} dataset...")
    dataset = load_dataset(dataset_id, split="train")
    solutions_by_problem = {}

    # Step 1: Group all code by the problem ID, as you instructed
    for row in tqdm(dataset, desc=f"Grouping {language_name} solutions by problem ID"):
        pid = row[problem_id_col]
        code = row[code_col]
        if pid not in solutions_by_problem:
            solutions_by_problem[pid] = []
        solutions_by_problem[pid].append(code)

    # Filter for problems that have multiple solutions, which we need for positive pairs
    problems_with_pairs = {pid: sols for pid, sols in solutions_by_problem.items() if len(sols) >= 2}
    problem_ids_with_pairs = list(problems_with_pairs.keys())
    all_problem_ids = list(solutions_by_problem.keys())

    generated_pairs = []
    positive_needed = math.ceil(num_pairs / 2)
    negative_needed = math.floor(num_pairs / 2)

    # Step 2: Generate Plagiarized (Positive) Pairs
    print(f"Generating {positive_needed} positive {language_name} pairs...")
    for _ in tqdm(range(positive_needed), desc=f"Positive {language_name} pairs"):
        # Pick a random problem that has multiple solutions
        pid = random.choice(problem_ids_with_pairs)
        # Pick two different random solutions from that same problem
        sol1, sol2 = random.sample(problems_with_pairs[pid], 2)
        generated_pairs.append({'code1': sol1, 'code2': sol2, 'label': 1})

    # Step 3: Generate Non-Plagiarized (Negative) Pairs
    print(f"Generating {negative_needed} negative {language_name} pairs...")
    for _ in tqdm(range(negative_needed), desc=f"Negative {language_name} pairs"):
        # Pick two different random problems
        pid1, pid2 = random.sample(all_problem_ids, 2)
        while pid1 == pid2:
            pid1, pid2 = random.sample(all_problem_ids, 2)
        # Pick one random solution from each problem
        sol1 = random.choice(solutions_by_problem[pid1])
        sol2 = random.choice(solutions_by_problem[pid2])
        generated_pairs.append({'code1': sol1, 'code2': sol2, 'label': 0})

    print(f"✅ Generated {len(generated_pairs)} {language_name} pairs.")
    return generated_pairs

# --- Execute C++ Pair Generation ---
if CPP_DATASET_ID == "YOUR_CPP_DATASET_ID_HERE":
    print("🛑 PLEASE REPLACE THE CPP_DATASET_ID PLACEHOLDER IN THIS CELL BEFORE RUNNING!")
    cpp_pairs = []
else:
    # Here, we tell the function that for your dataset:
    # - the code is in the 'code' column
    # - the problem ID is in the 'label' column
    cpp_pairs = generate_pairs_from_solutions(
        dataset_id=CPP_DATASET_ID,
        num_pairs=3500,
        code_col='code',
        problem_id_col='label',
        language_name="C++"
    )


--- Generating 3500 C++ pairs from google/code_x_glue_cc_clone_detection_poj104 using group-by-problem-id logic ---
Pre-processing C++ dataset...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/8.03M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.85M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32500 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12000 [00:00<?, ? examples/s]

Grouping C++ solutions by problem ID:   0%|          | 0/32500 [00:00<?, ?it/s]

Generating 1750 positive C++ pairs...


Positive C++ pairs:   0%|          | 0/1750 [00:00<?, ?it/s]

Generating 1750 negative C++ pairs...


Negative C++ pairs:   0%|          | 0/1750 [00:00<?, ?it/s]

✅ Generated 3500 C++ pairs.


In [18]:
#
# CELL 3: GENERATE PYTHON PAIRS (Using kye/all-google-ai-python-code)
#
import re

# --- Helper Functions for Code Transformation ---

def rename_variables_simple(code):
    """A very simple variable renamer for Python."""
    keywords = {'def', 'return', 'for', 'in', 'if', 'else', 'while', 'import', 'from', 'as', 'True', 'False', 'None'}
    # Find words that look like variables
    variables = sorted(list(set(re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', code))))
    # Filter out keywords and built-in function names
    variables = [v for v in variables if v not in keywords and not hasattr(__builtins__, v)]

    if not variables:
        return code

    new_names = ['var_a', 'var_b', 'item', 'temp', 'val', 'x', 'y', 'z']
    mapping = {var: new_names[i % len(new_names)] for i, var in enumerate(variables)}

    for old, new in mapping.items():
        code = re.sub(r'\b' + old + r'\b', new, code)
    return code

def add_dead_code_simple(code):
    """Adds a simple, useless line of code."""
    lines = code.split('\n')
    dead_code = "useless_var = 1 + 2 # Dead code"
    if lines:
        # Indent the dead code to match the first line's indentation
        leading_whitespace = len(lines[0]) - len(lines[0].lstrip())
        indented_dead_code = ' ' * leading_whitespace + dead_code
        lines.insert(random.randint(1, len(lines)), indented_dead_code)
    return '\n'.join(lines)

def create_plagiarized_version(original_code):
    """Applies one or more random transformations to create a plagiarized version."""
    code = original_code
    transformations = [rename_variables_simple, add_dead_code_simple]

    num_to_apply = random.randint(1, len(transformations))
    for t in random.sample(transformations, num_to_apply):
        code = t(code)
    return code

# --- Main Function to Generate Pairs ---
def generate_python_pairs(num_pairs):
    # Using the dataset and column name you specified
    PYTHON_DATASET_ID = "kye/all-google-ai-python-code"
    CODE_COLUMN = "python_code"

    print(f"\n--- Generating {num_pairs} Python pairs from {PYTHON_DATASET_ID} using synthetic method ---")

    print("Loading Python dataset...")
    dataset = load_dataset(PYTHON_DATASET_ID, split="train")

    # Get a clean list of all non-empty code snippets from the specified column
    all_code_snippets = [row[CODE_COLUMN] for row in dataset if row[CODE_COLUMN] and row[CODE_COLUMN].strip()]

    python_pairs = []
    positive_needed = math.ceil(num_pairs / 2)
    negative_needed = math.floor(num_pairs / 2)

    print(f"Generating {positive_needed} positive Python pairs...")
    for _ in tqdm(range(positive_needed), desc="Positive Python pairs"):
        original = random.choice(all_code_snippets)
        plagiarized = create_plagiarized_version(original)
        python_pairs.append({'code1': original, 'code2': plagiarized, 'label': 1})

    print(f"Generating {negative_needed} negative Python pairs...")
    for _ in tqdm(range(negative_needed), desc="Negative Python pairs"):
        code1, code2 = random.sample(all_code_snippets, 2)
        python_pairs.append({'code1': code1, 'code2': code2, 'label': 0})

    print(f"✅ Generated {len(python_pairs)} Python pairs.")
    return python_pairs

# --- Execute Python Pair Generation ---
python_pairs = generate_python_pairs(3000)


--- Generating 3000 Python pairs from kye/all-google-ai-python-code using synthetic method ---
Loading Python dataset...


README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

data/train-00000-of-00001-131ca0ee6afc66(…):   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1157 [00:00<?, ? examples/s]

Generating 1500 positive Python pairs...


Positive Python pairs:   0%|          | 0/1500 [00:00<?, ?it/s]

Generating 1500 negative Python pairs...


Negative Python pairs:   0%|          | 0/1500 [00:00<?, ?it/s]

✅ Generated 3000 Python pairs.


In [19]:
#
# CELL 4: COMBINE, SHUFFLE, AND SAVE
#
print("\n--- Finalizing Dataset ---")

if 'java_pairs' in locals() and 'cpp_pairs' in locals() and 'python_pairs' in locals():
    print(f"Combining {len(java_pairs)} Java, {len(cpp_pairs)} C++, and {len(python_pairs)} Python pairs...")
    final_dataset = java_pairs + cpp_pairs + python_pairs

    print(f"Shuffling final dataset of {len(final_dataset)} pairs...")
    random.shuffle(final_dataset)

    print(f"Converting to DataFrame and saving to: {DRIVE_SAVE_PATH}")
    df = pd.DataFrame(final_dataset)
    df.to_csv(DRIVE_SAVE_PATH, index=False)

    print("\n" + "="*50)
    print("      ✅✅✅  DATASET CREATION COMPLETE  ✅✅✅")
    print("="*50)

    print(f"\nTotal pairs created: {len(df)}")
    print(f"File saved successfully to your Google Drive!")

    print("\nHere is a preview of your final dataset:")
    print(df.head())
else:
    print("🛑 One or more language pair lists were not generated. Please run all previous cells successfully.")


--- Finalizing Dataset ---
Combining 3500 Java, 3500 C++, and 3000 Python pairs...
Shuffling final dataset of 10000 pairs...
Converting to DataFrame and saving to: /content/drive/My Drive/code_plagiarism_dataset_final.csv

      ✅✅✅  DATASET CREATION COMPLETE  ✅✅✅

Total pairs created: 10000
File saved successfully to your Google Drive!

Here is a preview of your final dataset:
                                               code1  \
0  # coding=utf-8\n# Copyright 2021 The Google Re...   
1  # Copyright 2022 Google LLC\n#\n# Licensed und...   
2  # Copyright 2022 Google LLC\n#\n# Licensed und...   
3  # coding=utf-8\n# Copyright 2018 The Google Re...   
4  \r\nint main() {\r\n\tint a,b,c,d,e,A,B,C,D,E;...   

                                               code2  label  
0  # item=item-8\n# var_b 2021 item item x y.\n#\...      1  
1  # Copyright 2022 The Authors\n# \n# Licensed u...      0  
2  # coding=utf-8\n# Copyright 2023 The Chirp Aut...      0  
3  # coding=utf-8\n# Copyright 20