# Import libs

In [1]:
from datasets import Dataset, load_dataset
import numpy as np
from tqdm import tqdm

# Load Datasets

In [29]:
# Names of the two record fields the preprocessing below reads: the function's
# documentation string and the function's source code.
required_field = ["func_documentation_string", "func_code_string"]

In [85]:
# Load the "python" configuration of the code_search_net dataset.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally -- confirm the source is trusted.
data = load_dataset("code_search_net", "python", trust_remote_code=True)

In [89]:
def filter_non_ascii(text):
    """
    Return ``text`` with every non-ASCII character dropped.

    Characters whose code point is at most 127 are kept in their original
    order; all other characters are discarded.
    """
    return "".join(filter(lambda symbol: ord(symbol) <= 0x7F, text))
    
def clean_docstring(doc_string, max_words=20):
    """
    Reduce a documentation string to a short, single-line ASCII summary.

    The summary is the first paragraph of ``doc_string``: blank lines at the
    very start are skipped, then lines are collected up to (not including)
    the next blank line. Each collected line is stripped and the lines are
    joined with ". ". Non-ASCII characters are removed and the result is cut
    to its first ``max_words`` whitespace-separated words.

    Args:
        doc_string: Raw documentation text, possibly spanning several lines.
        max_words: Maximum number of words to keep (non-negative). Pass
            ``None`` to disable the limit.

    Returns:
        The cleaned summary, or an empty string when ``doc_string`` contains
        no non-blank line.
    """
    summary_lines = []
    for line in doc_string.split("\n"):
        stripped_line = line.strip()
        if not stripped_line:
            if summary_lines:
                # A blank line after some text ends the first paragraph.
                break
            # Leading blank lines (text starting right after the opening
            # quotes of a docstring) must not produce an empty summary.
            continue
        summary_lines.append(stripped_line)

    summary = filter_non_ascii(". ".join(summary_lines))

    if max_words is not None:
        words = summary.split()
        # Only rebuild the string when it is actually too long, so short
        # summaries keep their original spacing.
        if len(words) > max_words:
            summary = " ".join(words[:max_words])
    return summary

def clean_code(code):
    """
    Re-indent source code so that every nesting level uses exactly 4 spaces.

    Nesting is derived from the amount of leading whitespace on each
    non-blank line, tracked with a stack of the widths of the currently open
    levels (the same idea the Python tokenizer uses for INDENT/DEDENT):

    - a line indented deeper than the innermost open level opens a new level;
    - a line indented less closes every level that is deeper than it, so a
      single line may drop several levels at once.

    Blank (whitespace-only) lines are emitted as empty lines and do not take
    part in the indentation tracking. Every leading whitespace character
    counts as one column. Non-ASCII characters are removed from the result.

    The text is processed line by line without tokenizing, so continuation
    lines and lines inside multi-line string literals are re-indented like
    any other line.

    Args:
        code: Source code as a single string with "\\n" line separators.

    Returns:
        The re-indented, ASCII-only source code.
    """
    cleaned_lines = []
    # Leading-whitespace widths of the open indentation levels; index == level.
    indent_widths = [0]

    for line in code.split("\n"):
        stripped_line = line.lstrip()
        if not stripped_line:
            cleaned_lines.append("")  # keep blank lines, ignore their indent
            continue

        leading_spaces = len(line) - len(stripped_line)

        # Close all levels deeper than this line. Popping in a loop is what
        # lets one line leave several nested blocks at the same time.
        while len(indent_widths) > 1 and indent_widths[-1] > leading_spaces:
            indent_widths.pop()

        # Deeper than the enclosing level (including a dedent that lands
        # between two known widths): open a new level for this width.
        if leading_spaces > indent_widths[-1]:
            indent_widths.append(leading_spaces)

        indent_level = len(indent_widths) - 1
        cleaned_lines.append(" " * (indent_level * 4) + stripped_line)

    return filter_non_ascii("\n".join(cleaned_lines))
    
def preprocess_dataset(dataset):
    """
    Turn raw dataset records into cleaned description/code pairs.

    Records whose documentation string or code string is empty (falsy) are
    dropped. For every remaining record the documentation is passed through
    ``clean_docstring`` and the code through ``clean_code``. Progress is
    reported with ``tqdm`` while iterating.

    Args:
        dataset: Iterable of records indexable by
            "func_documentation_string" and "func_code_string".

    Returns:
        A list of dicts with the keys "description" and "code".
    """
    doc_key, code_key = "func_documentation_string", "func_code_string"
    return [
        {
            "description": clean_docstring(record[doc_key]),
            "code": clean_code(record[code_key]),
        }
        for record in tqdm(dataset)
        if record[doc_key] and record[code_key]
    ]

In [90]:
# Clean the "train" split into a list of {"description", "code"} dicts.
train_data = preprocess_dataset(data["train"])

100%|███████████████████████████████████████████████████████████████| 412178/412178 [01:10<00:00, 5806.60it/s]


In [92]:
# Report how many cleaned records the training split produced.
print(f"Train size: {len(train_data)}")

Train size: 412178
Test Size: 23107


In [94]:
# Show the cleaned code of the second training record as a spot check.
print(train_data[1]["code"])

def setparents(self):
    """Correct all parent relations for elements within the scop. There is sually no need to call this directly, invoked implicitly by :meth:`copy`"""
    for c in self:
        if isinstance(c, AbstractElement):
            c.parent = self
            c.setparents()


# Prepare tokenizer

In [95]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.processors import TemplateProcessing
from tokenizers.normalizers import Sequence, NFKC, Strip, StripAccents

def train_tokenizer(data, vocab_size=32000, save_path="./tokenizer"):
    """
    Train a BPE tokenizer with normalization, pre-tokenization, and post-processing.

    Args:
        data: Iterable of training texts; handed to ``train_from_iterator``.
        vocab_size: Target vocabulary size for the BPE trainer.
        save_path: Directory that receives ``custom_tokenizer.json``. It is
            created when it does not exist yet.

    Returns:
        The trained tokenizer, with the ``<s> ... </s>`` template installed
        as its post-processor.
    """
    import os  # local import: only needed to create the output directory

    unk_token = "<unk>"

    # Initialize a tokenizer with a BPE model. The model must be told which
    # token stands for unknown input; listing "<unk>" among the trainer's
    # special tokens alone does not make the model use it.
    tokenizer = Tokenizer(models.BPE(unk_token=unk_token))

    # Step 1: Add a normalizer
    # NOTE(review): StripAccents is documented as meant to follow an NFD
    # normalizer, and NFKC is imported by this file but unused -- confirm
    # which Unicode normalization is intended.
    tokenizer.normalizer = Sequence([
        StripAccents(),  # Remove accent marks
        Strip(),  # Remove leading/trailing spaces
    ])

    # Step 2: Add a pre-tokenizer
    # NOTE(review): splitting on whitespace presumably discards newlines and
    # indentation, which carry meaning in Python code -- confirm this is
    # acceptable for the downstream model.
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),  # Split by whitespace
        pre_tokenizers.Punctuation(),  # Split punctuation
    ])

    # Step 3: Define a trainer
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<s>", "<pad>", "</s>", unk_token, "<mask>"]
    )

    # Train the tokenizer
    tokenizer.train_from_iterator(data, trainer)

    # Step 4: Add a post-processor. The ids are looked up after training
    # because the special tokens only receive ids once the vocabulary exists.
    tokenizer.post_processor = TemplateProcessing(
        single="<s> $A </s>",  # For single sequences
        pair="<s> $A </s> $B:1 </s>:1",  # For paired sequences
        special_tokens=[
            ("<s>", tokenizer.token_to_id("<s>")),
            ("</s>", tokenizer.token_to_id("</s>")),
        ],
    )

    # Save the tokenizer. Saving fails if the target directory is missing,
    # so create it first (an empty save_path is left alone).
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    tokenizer.save(f"{save_path}/custom_tokenizer.json")
    print(f"Tokenizer saved at {save_path}/custom_tokenizer.json")

    return tokenizer

# Prepare the data for training the tokenizer: one text per record, the
# description followed by the code. A dedicated name is used so that the
# dataset held in `data` is not overwritten and earlier cells that index
# data["train"] can still be re-run.
tokenizer_corpus = [f"{item['description']} {item['code']}" for item in train_data]

# Train and save the tokenizer
custom_tokenizer = train_tokenizer(tokenizer_corpus)





Tokenizer saved at ./tokenizer/custom_tokenizer.json
Encoded Tokens: ['<s>', 'Create', 'a', 'function', 'to', 'add', 'two', 'numbers', '</s>']


In [None]:
# Sanity-check the trained tokenizer on a short natural-language prompt
sample_input = "Create a function to add two numbers"
encoded = custom_tokenizer.encode(sample_input)
print(f"Encoded Tokens: {encoded.tokens}")