In [None]:
import os
from dotenv import load_dotenv
import requests
import tqdm
import parso
import random
import re
from pathlib import Path

# Load environment variables (e.g. GITHUB_API_KEY) from a local .env file.
load_dotenv()

GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")  # None when the variable is not set
GITHUB_SEARCH_URL = "https://api.github.com/search/repositories"
NUMBER_OF_REPOS = 50
# Minimum length, in characters, of a function's source for it to be kept.
MIN_FUNCTION_LENGTH = 20
DATA_ROOT = Path("./python_if_dataset")
all_functions_path = DATA_ROOT / "all_functions.txt"

# Reuse a previously written dataset when one exists. On a fresh run the file
# has not been created yet, so fall back to an empty list instead of failing
# with FileNotFoundError; the extraction step below rebuilds `funcs` anyway.
if all_functions_path.exists():
    with open(all_functions_path, "r", encoding="utf-8") as f:
        funcs = f.readlines()
else:
    funcs = []

PRETRAIN_SIZE = 150000
FINETUNE_SIZE = 50000
MASK_TOKEN = "<extra_id_0>"
TOKENIZER_VOCAB_SIZE = 32000
REPOS_PER_PAGE = 30

def show_dir(path, label):
    """Print a `label: path` header followed by one ` - name` line per entry in `path`."""
    print(f"{label}: {path}")
    entries = os.listdir(path)
    for entry in entries:
        print(" -", entry)

# Create the dataset root and the directory that receives the cloned repos.
DATA_ROOT.mkdir(parents=True, exist_ok=True)
(DATA_ROOT / "raw").mkdir(parents=True, exist_ok=True)
show_dir(DATA_ROOT, "After directory creation")

# Headers sent with every GitHub API request. The Authorization header is only
# added when a token is configured; otherwise the literal string "Bearer None"
# would be sent as the credential.
HEADERS = {"Accept": "application/vnd.github.v3+json"}
if GITHUB_API_KEY:
    HEADERS["Authorization"] = f"Bearer {GITHUB_API_KEY}"

def github_search_python_repos(n=30):
    """Return the clone URLs of up to `n` Python repositories, most-starred first.

    Results are fetched page by page from the GitHub repository search API.
    The page size is kept constant for every request: the API computes the
    offset as (page - 1) * per_page, so shrinking `per_page` on the last page
    would return items already fetched and skip the ones actually wanted.

    Args:
        n: Maximum number of repositories to return. Values <= 0 yield [].

    Returns:
        A list of at most `n` clone URLs (fewer if the search runs dry).

    Raises:
        RuntimeError: If any API request returns a non-200 status code.
    """
    print("Searching Python repos on GitHub...")
    if n <= 0:
        # Nothing requested; also avoids a division by zero below.
        print("Returning 0 repo URLs")
        return []

    repos = []
    per_page = min(n, REPOS_PER_PAGE)
    pages = (n + per_page - 1) // per_page  # ceil(n / per_page)
    for page in range(1, pages + 1):
        params = {"q": "language:Python", "sort": "stars", "order": "desc",
                  "per_page": per_page, "page": page}
        resp = requests.get(GITHUB_SEARCH_URL, headers=HEADERS, params=params, timeout=20)
        print("API page", page, "status", resp.status_code)
        if resp.status_code != 200:
            print(resp.text)
            raise RuntimeError(f"GitHub API error {resp.status_code}")
        items = resp.json().get("items", [])
        if not items:
            # The search has no further results; stop paging.
            break
        repos.extend(items)
        if len(repos) >= n:
            break

    # The last page may overshoot because every page has the same size.
    repos = repos[:n]
    print(f"Returning {len(repos)} repo URLs")
    return [r["clone_url"] for r in repos]

def clone_repo(url):
    """Shallow-clone `url` into DATA_ROOT/raw/<repo name> unless it already exists.

    The clone is best-effort: a failure is reported on stdout and the target
    path is still returned, so the caller simply finds no files in it.

    Args:
        url: Git clone URL; the last path segment (minus a trailing ".git")
            is used as the local directory name.

    Returns:
        Path of the local directory for the repository.
    """
    # Local import keeps this cell self-contained; subprocess is standard library.
    import subprocess

    repo_name = url.rstrip('/').split('/')[-1]
    # Strip only a trailing ".git"; a plain replace() would also mangle names
    # that contain ".git" elsewhere (e.g. "my.github-tools").
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    local_dir = DATA_ROOT / "raw" / repo_name

    if not local_dir.exists():
        print(f"Cloning: {url}")
        try:
            # Argument list instead of a shell string: the URL is never
            # interpreted by a shell, and "--" stops option parsing before it.
            result = subprocess.run(
                ["git", "clone", "--depth=1", "--", url, str(local_dir)],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError as exc:
            # e.g. the git executable is not installed
            print(f"Clone failed for {url}: {exc}")
        else:
            if result.returncode != 0:
                print(f"Clone failed for {url} (git exit code {result.returncode})")
    return local_dir

def extract_functions_from_file(filepath):
    """Parse the file at `filepath` with parso and return the source of its functions.

    Only the nodes yielded by the module's `iter_funcdefs()` are considered.
    Any error while reading or parsing is printed and results in an empty list.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            text = handle.read()
        tree = parso.parse(text)
        collected = []
        for funcdef in tree.iter_funcdefs():
            if funcdef.type == 'funcdef':
                collected.append(funcdef.get_code())
        return collected
    except Exception as e:
        print(f"Error: {filepath}, {e}")
        return []

def extract_functions_from_repo(repo_dir):
    """Collect every function longer than MIN_FUNCTION_LENGTH characters from `repo_dir`.

    All `*.py` files below `repo_dir` are parsed; progress and totals are
    printed along the way. Returns the list of function source strings.
    """
    py_files = list(repo_dir.rglob("*.py"))
    print(f"{repo_dir}: found {len(py_files)} .py files")
    funcs = []
    progress = tqdm.tqdm(py_files, desc=f"Parsing {repo_dir.name}")
    for source_path in progress:
        for code in extract_functions_from_file(source_path):
            if len(code) > MIN_FUNCTION_LENGTH:
                funcs.append(code)
    print(f"{repo_dir}: extracted {len(funcs)} functions")
    return funcs

# --- Find the repositories to mine ---
print("Fetching repos...")
repo_urls = github_search_python_repos(NUMBER_OF_REPOS)
print("Repo URLs:", repo_urls[:3], "...")
print(f"Found {len(repo_urls)} repos.")

# --- Clone each repository and harvest its functions ---
funcs = []
for url in tqdm.tqdm(repo_urls, desc="Downloading & extracting functions"):
    repo_path = clone_repo(url)
    funcs += extract_functions_from_repo(repo_path)

print(f"Extracted {len(funcs)} functions.")
show_dir(DATA_ROOT, "After extraction")
assert len(funcs) > 0, "No functions found!"

# --- Persist the functions, each stripped and followed by a newline ---
all_functions_path = DATA_ROOT / "all_functions.txt"
with open(all_functions_path, "w", encoding="utf-8") as f:
    f.writelines(func.strip() + "\n" for func in funcs)
print("Wrote all_functions.txt:", all_functions_path.exists(), "size:",
      all_functions_path.stat().st_size)

# No SentencePiece code lives in this cell: sentencepiece is not used.
# Building the masked pairs and saving them is done in a later cell with a
# different tokenizer.

print("Completed pipeline.")


In [None]:
# Prints a fixed greeting; nothing else in the notebook uses this cell.
# NOTE(review): looks like a leftover kernel smoke test — consider removing.
print("Hello, World2!")


In [None]:
from tokenizers import ByteLevelBPETokenizer

# Step 1: Train a byte-level BPE tokenizer on all_functions.txt and save it.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=[str(all_functions_path)], vocab_size=32000, special_tokens=["<mask>", "<pad>", "<unk>"])
tokenizer_save_dir = DATA_ROOT / "tokenizer"
os.makedirs(tokenizer_save_dir, exist_ok=True)  # Create directory if missing

tokenizer.save_model(str(tokenizer_save_dir))

# Step 2-3: Reload the tokenizer from the saved files and build the
# masked-language-model pairs.
tokenizer = ByteLevelBPETokenizer(
    vocab=str(tokenizer_save_dir / "vocab.json"),
    merges=str(tokenizer_save_dir / "merges.txt"),
)

MASK_TOKEN = "<mask>"
mask_token_id = tokenizer.token_to_id(MASK_TOKEN)
if mask_token_id is None:
    # Without this check a None id would be spliced into the token lists
    # below and only fail later, inside decode().
    raise ValueError(f"{MASK_TOKEN!r} is missing from the tokenizer vocabulary")

MLM_MASK_RATIO = 0.15    # fraction of tokens replaced by the mask token
MLM_MAX_PAIRS = 150000   # stop after this many (input, target) pairs

mlm_pairs = []
for func in funcs:
    tokens = tokenizer.encode(func).ids
    if not tokens:
        # Nothing to mask; random.sample() would raise on an empty range.
        continue
    num_to_mask = max(1, int(MLM_MASK_RATIO * len(tokens)))
    # A set makes the membership test in the comprehension O(1) per token.
    mask_indices = set(random.sample(range(len(tokens)), num_to_mask))
    masked_tokens = [mask_token_id if i in mask_indices else t for i, t in enumerate(tokens)]
    # Keep special tokens explicitly so the mask markers can never be dropped
    # from the decoded input text.
    input_text = tokenizer.decode(masked_tokens, skip_special_tokens=False)
    output_text = func
    mlm_pairs.append((input_text, output_text))
    if len(mlm_pairs) >= MLM_MAX_PAIRS:
        break

# Step 4: Create fine-tuning data with one masked if-condition
# Accumulates the (masked function, condition) pairs built below.
finetune_pairs = []
# NOTE(review): `re` is already imported in the notebook's first cell; this
# repeat only matters when this cell is run on its own.
import re

def mask_if_condition(func_code, mask_token=None):
    """Mask the condition of one randomly chosen `if`/`elif` statement.

    Only statement-level `if`/`elif` keywords (first token on their line) are
    considered, so conditional expressions (`a if b else c`) and comprehension
    filters are no longer mistaken for if-statements. The chosen condition is
    replaced at its exact position rather than by text search, so a repeated
    condition cannot cause a different occurrence to be masked.

    Known limitation: the condition is taken to end at the first ":" after
    the keyword, so conditions that contain a colon themselves (slices, dict
    literals, lambdas, ":=") are cut short.

    Args:
        func_code: Source code of a function.
        mask_token: Text inserted in place of the condition. Defaults to the
            module-level MASK_TOKEN.

    Returns:
        A `(masked_code, condition)` tuple, or None if no if-statement is found.
    """
    if mask_token is None:
        mask_token = MASK_TOKEN
    pattern = re.compile(r'^([ \t]*(?:el)?if\s+)([^:]+)(:)', re.MULTILINE)
    matches = list(pattern.finditer(func_code))
    if not matches:
        return None
    chosen = random.choice(matches)
    # Group 2 is the condition; splice the mask in over exactly that span.
    masked_func = func_code[:chosen.start(2)] + mask_token + func_code[chosen.end(2):]
    return masked_func, chosen.group(2).strip()

# Mask one if-condition per function; stop once 50000 pairs are collected.
for func in funcs:
    pair = mask_if_condition(func)
    if pair:
        finetune_pairs.append(pair)
    if len(finetune_pairs) >= 50000:
        break

# Shuffle, then cut into 80% train / 10% validation / 10% test.
random.shuffle(finetune_pairs)
n = len(finetune_pairs)
train_end = int(0.8*n)
valid_end = int(0.9*n)
train = finetune_pairs[:train_end]
valid = finetune_pairs[train_end:valid_end]
test = finetune_pairs[valid_end:]

# Step 5: Save datasets
def _escape_field(text):
    """Escape backslash, newline, carriage return and tab so `text` fits in one TSV field."""
    return (
        text.replace("\\", "\\\\")  # must run first so later escapes stay unambiguous
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )


def save_dataset(pairs, path):
    """Write `(input, output)` pairs to `path`, one tab-separated record per line.

    The fields are source code and normally span several lines and may contain
    tabs. Written raw, they would break the one-record-per-line layout, so
    backslash, newline, carriage return and tab are stored as the two-character
    escapes \\\\, \\n, \\r and \\t. Readers must reverse that escaping.

    Args:
        pairs: Iterable of `(input_text, output_text)` string tuples.
        path: Destination file; it is overwritten.
    """
    with open(path, "w", encoding="utf-8") as f:
        for inp, outp in pairs:
            f.write(f"{_escape_field(inp)}\t{_escape_field(outp)}\n")

# Write each split to its own file under DATA_ROOT.
for _pairs, _filename in (
    (mlm_pairs, "pretrain_mlm.txt"),
    (train, "finetune_train.txt"),
    (valid, "finetune_valid.txt"),
    (test, "finetune_test.txt"),
):
    save_dataset(_pairs, DATA_ROOT / _filename)


In [None]:
import os
import requests
import pandas as pd
import parso
import random
import re
import torch
from pathlib import Path
from tqdm.auto import tqdm
from datasets import load_dataset
# Updated imports for the new tokenizer
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from transformers import (
    T5Config,
    T5ForConditionalGeneration,
    PreTrainedTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForLanguageModeling,
)

# --- 1. CONFIGURATION ---
# The GitHub Personal Access Token is read from the GITHUB_API_KEY environment
# variable; the value is None when the variable is not set.
GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")
# --- Dataset and Model Parameters ---
# NOTE: the two target sizes below are the full-run values; lower them for a
# faster trial run.
PRETRAIN_TARGET_SIZE = 150000   # number of pre-training instances to generate
FINETUNE_TARGET_SIZE = 50000   # upper bound on fine-tuning instances
NUM_REPOS_TO_FETCH = 25        # default repo count for get_popular_python_repos
MIN_FUNCTION_TOKENS = 20      # functions need more whitespace-separated words than this
TOKENIZER_VOCAB_SIZE = 20000   # vocab_size passed to the BPE trainer
MASK_TOKEN = "<extra_id_0>"   # placeholder written in place of a masked if-condition

# --- File and Directory Paths ---
# Everything the pipeline reads or writes lives under DATA_ROOT.
DATA_ROOT = Path("./if_statement_project")
RAW_CODE_DIR = DATA_ROOT / "raw_code"
# The tokenizer is stored as a single .json file.
TOKENIZER_PATH = DATA_ROOT / "python_tokenizer.json"
ALL_FUNCTIONS_FILE = DATA_ROOT / "all_functions.txt"
PRETRAIN_FILE = DATA_ROOT / "pretrain.txt"
FINETUNE_TRAIN_FILE = DATA_ROOT / "finetune_train.txt"
FINETUNE_VALID_FILE = DATA_ROOT / "finetune_valid.txt"
FINETUNE_TEST_FILE = DATA_ROOT / "finetune_test.txt"
PRETRAINED_MODEL_DIR = DATA_ROOT / "pretrained_t5"
FINETUNED_MODEL_DIR = DATA_ROOT / "finetuned_if_model"
PROVIDED_TESTSET_PATH = DATA_ROOT / "provided_testset_to_process.csv"

# --- Create Directories ---
# exist_ok=True makes re-running this cell harmless.
DATA_ROOT.mkdir(exist_ok=True)
RAW_CODE_DIR.mkdir(exist_ok=True)

# --- 2. DATA COLLECTION & PROCESSING ---

def get_popular_python_repos(n=NUM_REPOS_TO_FETCH):
    """Fetch the `owner/name` identifiers of the most starred Python repositories.

    A single search request is made, so the result is limited to one page of
    results (GitHub caps a page at 100 items).

    Args:
        n: Number of repositories to request.

    Returns:
        A list of at most `n` repository full names.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout, or
            an HTTP error status.
    """
    print(f"Fetching {n} popular Python repository names...")
    # Only authenticate when a token is configured; otherwise the header would
    # carry the literal text "Bearer None".
    headers = {"Authorization": f"Bearer {GITHUB_API_KEY}"} if GITHUB_API_KEY else {}
    params = {"q": "language:Python", "sort": "stars", "order": "desc", "per_page": n}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(
        "https://api.github.com/search/repositories",
        headers=headers,
        params=params,
        timeout=30,
    )
    resp.raise_for_status()
    return [repo['full_name'] for repo in resp.json().get('items', [])][:n]

def get_python_files_from_repo(repo_full_name):
    """Return the paths of all `.py` files in a repository's default branch.

    The tree is requested for `HEAD` rather than a hard-coded `main`, so
    repositories whose default branch has another name (e.g. `master`) are no
    longer silently skipped.

    Args:
        repo_full_name: Repository identifier in `owner/name` form.

    Returns:
        A list of file paths relative to the repository root; empty when the
        request fails or returns a non-200 status.
    """
    headers = {"Authorization": f"Bearer {GITHUB_API_KEY}"} if GITHUB_API_KEY else {}
    api_url = f"https://api.github.com/repos/{repo_full_name}/git/trees/HEAD?recursive=1"
    try:
        resp = requests.get(api_url, headers=headers, timeout=30)
    except requests.exceptions.RequestException:
        # Best-effort, like the non-200 case below: treat as "no files".
        return []
    if resp.status_code != 200:
        return []
    tree = resp.json().get('tree', [])
    # Keep files only ("blob" entries); a directory whose name ends in ".py"
    # cannot be downloaded as a file.
    return [
        item['path']
        for item in tree
        if item.get('type', 'blob') == 'blob' and item['path'].endswith('.py')
    ]

def download_and_save_code(repo_full_name, file_path):
    """Download one file from a repository's default branch into RAW_CODE_DIR.

    The local file name is `<owner>_<repo>_<path>` with every "/" replaced by
    "_", so the whole repository ends up flat in one directory.

    Args:
        repo_full_name: Repository identifier in `owner/name` form.
        file_path: Path of the file inside the repository.

    Returns:
        True if the file was downloaded and written, False otherwise.
    """
    # HEAD follows the default branch, whatever it is called.
    raw_url = f"https://raw.githubusercontent.com/{repo_full_name}/HEAD/{file_path}"
    try:
        resp = requests.get(raw_url, timeout=30)
        if resp.status_code == 200:
            # Sanitize filename to avoid issues with slashes in paths
            safe_filename = file_path.replace('/', '_')
            target = RAW_CODE_DIR / f"{repo_full_name.replace('/', '_')}_{safe_filename}"
            with open(target, "w", encoding="utf-8") as f:
                f.write(resp.text)
            return True
    except (requests.exceptions.RequestException, OSError):
        # OSError covers local write failures such as an over-long file name;
        # one bad file should not abort the whole download loop.
        return False
    return False

def extract_functions_from_code(code_str):
    """Return the source of each function in `code_str` that is long enough.

    The code is parsed with parso; a function is kept when its source has more
    than MIN_FUNCTION_TOKENS whitespace-separated words. Any exception yields
    an empty list.
    """
    try:
        kept = []
        for funcdef in parso.parse(code_str).iter_funcdefs():
            snippet = funcdef.get_code()
            if len(snippet.split()) > MIN_FUNCTION_TOKENS:
                kept.append(snippet)
        return kept
    except Exception:
        return []

# --- 3. TOKENIZER TRAINING ---

def train_tokenizer(file_path, num_sentinels=100):
    """Train a Hugging Face BPE tokenizer on `file_path` and save it to TOKENIZER_PATH.

    Besides the control tokens, the sentinel range `<extra_id_0>` ..
    `<extra_id_{num_sentinels - 1}>` is registered as special tokens. The
    pre-training data uses one numbered sentinel per masked token, so
    registering only `<extra_id_0>` would leave every other sentinel to be
    split into ordinary sub-tokens.

    Args:
        file_path: Text file with the training corpus.
        num_sentinels: How many `<extra_id_N>` sentinel tokens to register.
    """
    print("Training Hugging Face BPE tokenizer...")
    # Initialize a BPE tokenizer
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # The first five entries keep their original order, and therefore their ids.
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", MASK_TOKEN]
    for idx in range(num_sentinels):
        sentinel = f"<extra_id_{idx}>"
        if sentinel not in special_tokens:
            special_tokens.append(sentinel)

    # Configure the trainer
    trainer = trainers.BpeTrainer(
        vocab_size=TOKENIZER_VOCAB_SIZE,
        special_tokens=special_tokens,
    )

    # Train the tokenizer
    tokenizer.train(files=[str(file_path)], trainer=trainer)

    # Save the tokenizer to the specified JSON file
    tokenizer.save(str(TOKENIZER_PATH))
    print("Tokenizer training complete.")


# --- 4. DATASET CREATION ---

def create_pretrain_dataset(functions, tokenizer, min_tokens=10, mask_ratio=0.15):
    """Create the masked pre-training dataset and write it to PRETRAIN_FILE.

    Functions are sampled with replacement until PRETRAIN_TARGET_SIZE
    instances exist. In each instance a random `mask_ratio` share of the
    tokens is replaced by numbered sentinels (`<extra_id_0>`, `<extra_id_1>`,
    ...) in the input; the target lists every sentinel followed by the token
    it replaced and ends with one closing sentinel. Each instance is written
    as `input<TAB>target` on its own line.

    Args:
        functions: Function source strings to sample from.
        tokenizer: Object whose `encode(text).tokens` yields the token strings.
        min_tokens: Functions with fewer tokens are ignored.
        mask_ratio: Fraction of tokens masked per instance.

    Raises:
        ValueError: If no function is long enough to have a token masked.
            The previous rejection loop would spin forever in that case.
    """
    print(f"Creating pre-training dataset with {PRETRAIN_TARGET_SIZE} instances...")

    # Tokenize every function once and keep only the usable ones, so the
    # sampling loop below can never stall on rejected candidates.
    candidates = []
    for func in functions:
        tokens = tokenizer.encode(func).tokens
        if len(tokens) >= min_tokens and int(len(tokens) * mask_ratio) > 0:
            candidates.append(tokens)
    if not candidates:
        raise ValueError(
            f"No function has at least {min_tokens} tokens; "
            "cannot build the pre-training dataset."
        )

    dataset = []
    # The context manager closes the progress bar even if an error occurs.
    with tqdm(total=PRETRAIN_TARGET_SIZE) as pbar:
        while len(dataset) < PRETRAIN_TARGET_SIZE:
            tokens = random.choice(candidates)
            num_to_mask = int(len(tokens) * mask_ratio)
            masked_indices = sorted(random.sample(range(len(tokens)), k=num_to_mask))

            input_parts = []
            target_parts = []

            last_end = 0
            for mask_counter, i in enumerate(masked_indices):
                sentinel = f"<extra_id_{mask_counter}>"
                input_parts.extend(tokens[last_end:i])  # unmasked run before token i
                input_parts.append(sentinel)
                target_parts.append(sentinel)
                target_parts.append(tokens[i])
                last_end = i + 1

            input_parts.extend(tokens[last_end:])
            # Closing sentinel that terminates the target sequence.
            target_parts.append(f"<extra_id_{len(masked_indices)}>")

            input_str = " ".join(input_parts)
            target_str = " ".join(target_parts)

            dataset.append(f"{input_str}\t{target_str}")
            pbar.update(1)

    with open(PRETRAIN_FILE, "w", encoding="utf-8") as f:
        f.write("\n".join(dataset))

def create_finetune_dataset(functions):
    """Create the if-condition masking dataset and write the train/valid/test files.

    For each function (visited in random order) one `if`/`elif` condition is
    replaced by MASK_TOKEN, producing the record `masked_function<TAB>condition`.
    Collection stops at FINETUNE_TARGET_SIZE records, which are then split
    80/10/10 into FINETUNE_TRAIN_FILE, FINETUNE_VALID_FILE and
    FINETUNE_TEST_FILE.

    Whitespace runs (including newlines and tabs) in both fields are collapsed
    to single spaces. The files are consumed line by line and split on the
    first tab, so a multi-line function written verbatim would be torn into
    many bogus records. The tokenizer trained in this notebook pre-tokenizes
    on whitespace, so the collapsed text still tokenizes the same way.

    Args:
        functions: Function source strings. The list is not modified.
    """
    print(f"Creating fine-tuning dataset with {FINETUNE_TARGET_SIZE} instances...")

    # `if` or `elif` as a whole word, then the condition up to the first ":"
    # on the same line. The word boundary stops identifiers that merely end
    # in "if" from matching.
    if_regex = re.compile(r'\b(?:el)?if\s+(.+?):')

    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(functions)
    random.shuffle(shuffled)

    dataset = []
    with tqdm(total=FINETUNE_TARGET_SIZE) as pbar:
        for func in shuffled:
            matches = list(if_regex.finditer(func))
            if not matches:
                continue

            match = random.choice(matches)
            condition = " ".join(match.group(1).split())
            if not condition:
                # Whitespace-only capture; nothing to predict.
                continue

            # Splice the mask over exactly the span of the chosen condition.
            masked_func = func[:match.start(1)] + MASK_TOKEN + func[match.end(1):]
            masked_func = " ".join(masked_func.split())

            dataset.append(f"{masked_func}\t{condition}")
            pbar.update(1)
            if len(dataset) >= FINETUNE_TARGET_SIZE:
                break

    # Split and save
    train_end = int(len(dataset) * 0.8)
    valid_end = int(len(dataset) * 0.9)

    with open(FINETUNE_TRAIN_FILE, "w", encoding="utf-8") as f:
        f.write("\n".join(dataset[:train_end]))
    with open(FINETUNE_VALID_FILE, "w", encoding="utf-8") as f:
        f.write("\n".join(dataset[train_end:valid_end]))
    with open(FINETUNE_TEST_FILE, "w", encoding="utf-8") as f:
        f.write("\n".join(dataset[valid_end:]))

# --- 5. MODEL TRAINING ---

def run_training(is_pretraining):
    """Run either pre-training or fine-tuning of the T5 model.

    Pre-training builds a small T5 from scratch and trains it on PRETRAIN_FILE;
    fine-tuning loads the model from PRETRAINED_MODEL_DIR and trains it on the
    fine-tuning files. Each data line is `source<TAB>target`; lines without a
    tab are ignored. The trained model is saved to the matching output
    directory.

    Args:
        is_pretraining: True for pre-training, False for fine-tuning.
    """
    # Local import: the seq2seq collator is not in the notebook's import cell.
    from transformers import DataCollatorForSeq2Seq

    model_dir = PRETRAINED_MODEL_DIR if is_pretraining else FINETUNED_MODEL_DIR
    train_file = PRETRAIN_FILE if is_pretraining else FINETUNE_TRAIN_FILE
    valid_file = PRETRAIN_FILE if is_pretraining else FINETUNE_VALID_FILE # Use train as dummy valid for pretrain

    print(f"\n--- Starting {'Pre-training' if is_pretraining else 'Fine-tuning'} ---")

    # Load custom tokenizer. The raw tokenizer file does not say which of its
    # tokens play the pad/eos/unk roles, so assign them explicitly.
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(TOKENIZER_PATH))
    tokenizer.pad_token = "<pad>"
    tokenizer.eos_token = "</s>"
    tokenizer.unk_token = "<unk>"

    # Load or initialize model
    if is_pretraining:
        config = T5Config(
            # len() also counts added tokens, unlike tokenizer.vocab_size.
            vocab_size=len(tokenizer),
            d_model=256,
            d_ff=1024,
            num_layers=4,
            num_heads=4,
            # T5Config's default pad/eos ids do not match this tokenizer.
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            decoder_start_token_id=tokenizer.pad_token_id,
        )
        model = T5ForConditionalGeneration(config)
    else:
        # Load the model we just pre-trained
        model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL_DIR)

    # Load and process dataset
    dataset = load_dataset('text', data_files={'train': str(train_file), 'validation': str(valid_file)})

    def tokenize_function(examples):
        """Split each line into source/target and tokenize both, without padding."""
        source, target = [], []
        for line in examples['text']:
            if '\t' in line:
                s, t = line.split('\t', 1)
                source.append(s)
                # End every target with EOS so the model learns where to stop
                # generating. (Truncation below can still cut it off.)
                target.append(f"{t} {tokenizer.eos_token}")

        # The prefix is helpful for T5
        prefix = "complete the python code: "
        source = [prefix + s for s in source]

        # No padding here: the collator pads each batch to its own longest
        # sequence, which needs far less memory than padding to max_length.
        model_inputs = tokenizer(source, max_length=256, truncation=True,
                                 return_token_type_ids=False)
        labels = tokenizer(target, max_length=128, truncation=True,
                           return_token_type_ids=False)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

    # Training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=str(model_dir),
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,
        num_train_epochs=3 if is_pretraining else 5,
        eval_strategy="steps",
        eval_steps=2000,
        save_steps=2000,
        learning_rate=5e-4 if is_pretraining else 1e-4,
        weight_decay=0.01,
        save_total_limit=2,
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),
        load_best_model_at_end=True,
        logging_steps=500,
    )

    # Data collator. DataCollatorForLanguageModeling(mlm=False) would replace
    # the labels with a copy of input_ids, i.e. train the model to echo its
    # input. The seq2seq collator keeps the real targets and pads them with
    # -100 so that padding is ignored by the loss.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

    # Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train
    trainer.train()
    trainer.save_model()
    print(f"--- {'Pre-training' if is_pretraining else 'Fine-tuning'} Complete ---")


# --- 6. EVALUATION ---

def load_provided_test_set(file_path):
    """Read the CSV at `file_path` and return the values of its `code` column as a list."""
    return pd.read_csv(file_path)['code'].tolist()

def evaluate_and_create_csv(model, tokenizer, test_data, output_filename):
    """Run the model on each test function and save the results as a CSV file.

    For every code string the first `if`/`elif` condition is masked with
    MASK_TOKEN, the model predicts it with beam search, and the prediction is
    compared to the original condition by exact string match. Code without an
    if-statement is skipped.

    Args:
        model: Seq2seq model providing `to`, `eval` and `generate`.
        tokenizer: Tokenizer used to encode the input and decode the output.
        test_data: Iterable of unmasked source-code strings.
        output_filename: Path of the CSV file to write.
    """
    print(f"Evaluating and creating {output_filename}...")
    columns = [
        "Input provided to the model",
        "Whether the prediction is correct",
        "Expected if condition",
        "Predicted if condition",
        "Prediction score (0-100)",
    ]
    results = []
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()  # make sure dropout is off during prediction
    # `if`/`elif` as a whole word; the condition ends at the first ":" on the line.
    if_regex = re.compile(r'\b(?:el)?if\s+(.+?):')

    for code in tqdm(test_data, desc=f"Predicting for {output_filename}"):
        matches = list(if_regex.finditer(code))
        if not matches:
            continue

        # For simplicity, always use the first if statement for evaluation
        match = matches[0]
        expected_condition = match.group(1).strip()

        input_code = code[:match.start(1)] + MASK_TOKEN + code[match.end(1):]

        # Generate prediction
        prefix = "complete the python code: "
        # token_type_ids are switched off: generate() rejects keyword
        # arguments the model does not accept, and T5 takes none.
        inputs = tokenizer(
            prefix + input_code,
            return_tensors="pt",
            max_length=256,
            truncation=True,
            return_token_type_ids=False,
        ).to(device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_length=128,
                num_beams=5,
                early_stopping=True,
                output_scores=True,
                return_dict_in_generate=True
            )

        predicted_condition = tokenizer.decode(output.sequences[0], skip_special_tokens=True)

        # Confidence: the beam's sequence score is a log-probability, so
        # exponentiate it and scale to 0-100.
        seq_log_prob = output.sequences_scores[0].item()
        score = (torch.exp(torch.tensor(seq_log_prob)).item()) * 100

        results.append({
            "Input provided to the model": input_code,
            "Whether the prediction is correct": predicted_condition == expected_condition,
            "Expected if condition": expected_condition,
            "Predicted if condition": predicted_condition,
            "Prediction score (0-100)": score
        })

    # Explicit columns keep the header row even when nothing was evaluated.
    pd.DataFrame(results, columns=columns).to_csv(output_filename, index=False)
    print(f"Successfully saved results to {output_filename}")


# --- 7. MAIN EXECUTION ---
# Runs the whole pipeline. Every stage is skipped when its output already
# exists, so re-running the cell resumes where the previous run stopped.
if __name__ == '__main__':

    def _has_saved_model(model_dir):
        """Return True if `model_dir` contains saved weights in either file format."""
        # Weights may be stored as model.safetensors or pytorch_model.bin
        # depending on the transformers version; checking only the latter
        # retrains on every run when safetensors is used.
        return any(
            (model_dir / name).exists()
            for name in ("model.safetensors", "pytorch_model.bin")
        )

    def _load_test_functions(path):
        """Rebuild the unmasked functions from a fine-tuning file.

        A record is `masked_function<TAB>condition`. Lines are accumulated
        until one holds a tab and the accumulated text contains MASK_TOKEN;
        the mask is then replaced by the stored condition. This also copes
        with records whose function text spans several lines. Trailing lines
        that never complete a record are dropped.
        """
        with open(path, "r", encoding="utf-8") as fh:
            lines = fh.read().splitlines()
        restored, pending = [], []
        for line in lines:
            masked_part, sep, condition = line.rpartition('\t')
            candidate = "\n".join(pending + [masked_part])
            if sep and MASK_TOKEN in candidate:
                restored.append(candidate.replace(MASK_TOKEN, condition, 1))
                pending = []
            else:
                pending.append(line)
        return restored

    # --- Step 1: Collect and Process Data ---
    if not list(RAW_CODE_DIR.glob("*.py")):
        repo_names = get_popular_python_repos()
        for repo_name in tqdm(repo_names, desc="Downloading Repos"):
            py_files = get_python_files_from_repo(repo_name)
            for file_path in tqdm(py_files, desc=f"Files in {repo_name}", leave=False):
                download_and_save_code(repo_name, file_path)

    if not ALL_FUNCTIONS_FILE.exists():
        all_funcs = []
        for code_file in tqdm(list(RAW_CODE_DIR.glob("*.py")), desc="Extracting Functions"):
            with open(code_file, 'r', encoding='utf-8') as f:
                all_funcs.extend(extract_functions_from_code(f.read()))

        # Deduplicate before saving and keep the deduplicated list in memory,
        # so the printed count and later steps match what is on disk. Sorting
        # makes the file content reproducible.
        all_funcs = sorted(set(all_funcs))
        with open(ALL_FUNCTIONS_FILE, "w", encoding="utf-8") as f:
            f.write("\n".join(all_funcs))
        print(f"Extracted and saved {len(all_funcs)} functions.")
    else:
        # NOTE(review): functions are written joined by newlines and usually
        # span several lines themselves, so splitlines() returns single source
        # lines here, not whole functions.
        with open(ALL_FUNCTIONS_FILE, "r", encoding="utf-8") as f:
            all_funcs = f.read().splitlines()

    # --- Step 2: Train Tokenizer ---
    if not TOKENIZER_PATH.exists():
        train_tokenizer(ALL_FUNCTIONS_FILE)

    # --- Step 3: Create Datasets ---
    if not PRETRAIN_FILE.exists() or not FINETUNE_TRAIN_FILE.exists():
        # Load the trained tokenizer from its .json file
        tokenizer = Tokenizer.from_file(str(TOKENIZER_PATH))
        create_pretrain_dataset(all_funcs, tokenizer)
        create_finetune_dataset(all_funcs)

    # --- Step 4: Pre-train the Model ---
    if not _has_saved_model(PRETRAINED_MODEL_DIR):
        run_training(is_pretraining=True)

    # --- Step 5: Fine-tune the Model ---
    if not _has_saved_model(FINETUNED_MODEL_DIR):
        run_training(is_pretraining=False)

    # --- Step 6: Evaluate and Generate Submission Files ---
    print("\n--- Starting Final Evaluation ---")
    final_model = T5ForConditionalGeneration.from_pretrained(FINETUNED_MODEL_DIR)
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(TOKENIZER_PATH))
    tokenizer.pad_token = "<pad>"  # same pad token as used during training

    # Evaluate on the test set we generated. The file stores already-masked
    # functions; fed as-is, the evaluator would find MASK_TOKEN where it looks
    # for the expected condition. Restore the original code first.
    generated_test_data = _load_test_functions(FINETUNE_TEST_FILE)
    evaluate_and_create_csv(final_model, tokenizer, generated_test_data, "generated-testset.csv")

    # Evaluate on the test set provided by the user
    if PROVIDED_TESTSET_PATH.exists():
        provided_test_data = load_provided_test_set(PROVIDED_TESTSET_PATH)
        evaluate_and_create_csv(final_model, tokenizer, provided_test_data, "provided-testset.csv")
    else:
        print(f"\nWarning: Could not find provided test set at '{PROVIDED_TESTSET_PATH}'. Skipping its evaluation.")

    print("\n--- PIPELINE COMPLETE ---")


--- Starting Pre-training ---


  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'pad_token_id': 1}.


AcceleratorError: CUDA error: out of memory
Search for `cudaErrorMemoryAllocation' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
