<a href="https://colab.research.google.com/github/msamwelmollel/ML-CLUB-2024-Eagle-Labs/blob/main/ML_CLUB_2024_Eagle_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
import re
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [87]:
# Example text with Python code
text = """
PEP 8: Style Guide for Python Code

This PEP describes a style guide for Python code. Its goal is to improve code readability and consistency across the Python community. Here are some key points:

1. Use 4 spaces per indentation level.

2. Limit all lines to a maximum of 79 characters.

3. Imports should usually be on separate lines:

import os
import sys

4. Surround top-level function and class definitions with two blank lines.

def example_function():
    '''This is a docstring.'''
    return None

class ExampleClass:
    '''This is a class docstring.'''

    def __init__(self, value):
        self.value = value

    def get_value(self):
        return self.value

5. Use inline comments sparingly:

x = x + 1  # Compensate for border

6. Use docstrings for all public modules, functions, classes, and methods.

Remember, code is read much more often than it is written. Clarity and readability are paramount!
"""

In [88]:
# print(text)

In [89]:
# Tokenizer initialization (using a BERT tokenizer as an example)
# Both objects are module-level globals used by the helper functions below:
# `tokenizer` splits text into tokens, `model` produces the hidden states that
# get_embedding() pools into a vector.
try:
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModel.from_pretrained("bert-base-uncased")
except Exception as e:
    print(f"Error loading tokenizer or model: {e}")
    # Re-raise instead of calling exit(1): inside a notebook kernel exit()
    # ends the session rather than reporting a failure, and it discards the
    # traceback. Re-raising keeps the original error visible and still stops
    # "Run All" at this cell.
    raise

In [90]:
# Helper function for tokenizing text
def tokenize(text):
    """Tokenize ``text`` with the module-level ``tokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for ``text`` (the token
        sequence consumed by the chunking helpers).
    """
    token_list = tokenizer.tokenize(text)
    return token_list

In [91]:
# Helper function to convert token list to string
def tokens_to_string(tokens):
    """Join a token sequence back into one readable string.

    Args:
        tokens: A sequence of tokens, e.g. one chunk produced by the
            token-based chunking helpers.

    Returns:
        The result of ``tokenizer.convert_tokens_to_string(tokens)``.
    """
    joined = tokenizer.convert_tokens_to_string(tokens)
    return joined

In [92]:
# Helper function to generate embeddings
def get_embedding(text):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    The input is encoded with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model`` with
    gradient tracking disabled.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array: the last hidden state averaged over ``dim=1`` and then
        squeezed. Presumably a 1-D vector for a single input string, assuming
        the hidden state is (batch, tokens, hidden) -- TODO confirm.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

In [93]:
# Fixed token without overlap
def fixed_token_without_overlap(text, chunk_size):
    """Split ``text`` into consecutive, non-overlapping chunks of tokens.

    Args:
        text: The text to tokenize and chunk.
        chunk_size: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of token lists. Every chunk holds ``chunk_size`` tokens except
        possibly the last, which holds the remainder. Empty when ``text``
        yields no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Validate up front: a zero step makes range() fail with an unrelated
    # message, and a negative one silently yields no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    tokens = tokenize(text)
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

In [94]:
# Fixed token with overlap
def fixed_token_with_overlap(text, chunk_size, overlap):
    """Split ``text`` into fixed-size token chunks that share ``overlap`` tokens.

    Consecutive chunks start ``chunk_size - overlap`` tokens apart, so the
    last ``overlap`` tokens of one chunk are repeated at the start of the next.

    Args:
        text: The text to tokenize and chunk.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of token lists. Empty when ``text`` yields no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    # overlap >= chunk_size would make the stride zero or negative (range()
    # error or silently no chunks); a negative overlap would skip tokens.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokens = tokenize(text)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(tokens[start:start + chunk_size])
        # Once a window reaches the end of the token list, any later window
        # would only repeat the tail of this one, so stop here.
        if start + chunk_size >= len(tokens):
            break
    return chunks

In [95]:
# Recursive with overlap
def recursive_with_overlap(text, chunk_size, overlap):
    """Chunk ``text`` paragraph by paragraph, sentence by sentence, with overlap.

    The text is split into paragraphs on blank lines (``'\\n\\n'``) and each
    paragraph into sentences after ``.``, ``!`` or ``?`` followed by spaces.
    Sentence tokens are accumulated in a buffer; whenever the buffer holds at
    least ``chunk_size`` tokens, its first ``chunk_size`` tokens are emitted
    and the buffer advances by ``chunk_size - overlap`` tokens. Chunks never
    span two paragraphs.

    Args:
        text: The text to chunk.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens repeated between consecutive chunks of the
            same paragraph; must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of token lists, none longer than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    chunks = []
    for paragraph in text.split('\n\n'):
        paragraph_tokens = []
        # Number of buffered tokens that have not appeared in any emitted chunk.
        unseen = 0
        for sentence in re.split(r'(?<=[.!?]) +', paragraph):
            sentence_tokens = tokenize(sentence)
            paragraph_tokens.extend(sentence_tokens)
            unseen += len(sentence_tokens)
            # `while`, not `if`: one long sentence can add several chunks'
            # worth of tokens, and each must be flushed so that no chunk
            # exceeds chunk_size.
            while len(paragraph_tokens) >= chunk_size:
                chunks.append(paragraph_tokens[:chunk_size])
                paragraph_tokens = paragraph_tokens[step:]
                # The first `overlap` tokens left in the buffer were part of
                # the chunk just emitted.
                unseen = len(paragraph_tokens) - overlap
        # Emit the paragraph's tail only if it holds something new; a tail
        # made purely of overlap tokens would duplicate the previous chunk.
        if unseen > 0:
            chunks.append(paragraph_tokens)
    return chunks

In [96]:
# Recursive Python splitter with overlap
def recursive_python_splitter_with_overlap(text, chunk_size, overlap):
    """Chunk ``text`` along Python-oriented boundaries, with token overlap.

    The text is split at ``'\\nclass'``, ``'\\ndef'`` and plain newlines. The
    separators are kept because the split pattern uses a capturing group, so
    they are tokenized along with everything else. Tokens from successive
    parts are accumulated in a buffer; whenever it holds at least
    ``chunk_size`` tokens, its first ``chunk_size`` tokens are emitted and the
    buffer advances by ``chunk_size - overlap`` tokens.

    Args:
        text: The text (prose and/or Python code) to chunk.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens repeated between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of token lists, none longer than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    code_splits = re.split(r'(\nclass|\ndef|\n)', text)
    chunks = []
    current_chunk = []
    # Number of buffered tokens that have not appeared in any emitted chunk.
    unseen = 0
    for part in code_splits:
        tokens = tokenize(part)
        current_chunk.extend(tokens)
        unseen += len(tokens)
        # `while`, not `if`: a single long part can add several chunks' worth
        # of tokens, and each must be flushed so no chunk exceeds chunk_size.
        while len(current_chunk) >= chunk_size:
            chunks.append(current_chunk[:chunk_size])
            current_chunk = current_chunk[step:]
            # The first `overlap` tokens left in the buffer were part of the
            # chunk just emitted.
            unseen = len(current_chunk) - overlap
    # Emit the tail only if it holds something new; a tail made purely of
    # overlap tokens would duplicate the end of the previous chunk.
    if unseen > 0:
        chunks.append(current_chunk)
    return chunks

In [97]:
# Semantic chunking
def semantic_chunking(text, similarity_threshold=0.8, group_size=3, context_window=3):
    """Group neighbouring sentences into chunks by embedding similarity.

    Sentences (split after ``.``, ``!`` or ``?`` followed by spaces) are
    joined into groups of ``group_size``. Each group is embedded with
    ``get_embedding`` and compared, by cosine similarity, with the mean
    embedding of up to ``context_window`` groups immediately before it. A
    group whose similarity is at least ``similarity_threshold`` extends the
    current chunk; otherwise the current chunk is closed and a new one starts.

    Note that the comparison window is positional: it may include groups that
    belong to an already-closed chunk.

    Args:
        text: The text to chunk.
        similarity_threshold: Minimum cosine similarity for a group to stay
            in the current chunk.
        group_size: Number of sentences joined into one group before
            embedding; must be positive. Defaults to 3.
        context_window: Maximum number of preceding groups averaged to form
            the reference embedding; must be positive. Defaults to 3.

    Returns:
        A list of strings, one per chunk (groups joined with single spaces).
        Empty when ``text`` is empty or whitespace-only.

    Raises:
        ValueError: If ``group_size`` or ``context_window`` is not positive.
    """
    if group_size <= 0:
        raise ValueError(f"group_size must be a positive integer, got {group_size}")
    if context_window <= 0:
        raise ValueError(f"context_window must be a positive integer, got {context_window}")
    # Nothing to chunk: skip the embedding model entirely.
    if not text.strip():
        return []

    sentences = re.split(r'(?<=[.!?]) +', text)
    sentence_groups = [
        ' '.join(sentences[i:i + group_size])
        for i in range(0, len(sentences), group_size)
    ]

    embeddings = [get_embedding(group) for group in sentence_groups]

    chunks = []
    # re.split always yields at least one piece, so the first group exists
    # and seeds the first chunk.
    current_chunk = [sentence_groups[0]]

    for i in range(1, len(sentence_groups)):
        group = sentence_groups[i]
        reference = np.mean(embeddings[max(0, i - context_window):i], axis=0)
        similarity = cosine_similarity([embeddings[i]], [reference])[0][0]
        if similarity >= similarity_threshold:
            current_chunk.append(group)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [group]

    chunks.append(' '.join(current_chunk))

    return chunks


In [98]:
# Example usage
# Parameters passed to the token-based chunking strategies in the demo cells.
# Both are counted in tokens; semantic_chunking() is called without them.
chunk_size = 50  # maximum number of tokens per chunk
overlap = 20  # tokens shared between consecutive chunks (must stay below chunk_size)

In [119]:
# Run every token-based strategy exactly once and reuse the result for both
# the chunk count and the chunk listing (the strategies tokenize the whole
# text, so calling each one twice doubles the work for identical output).
demo_results = [
    ("Fixed Token without Overlap:", fixed_token_without_overlap(text, chunk_size)),
    ("\nFixed Token with Overlap:", fixed_token_with_overlap(text, chunk_size, overlap)),
    ("\nRecursive with Overlap:", recursive_with_overlap(text, chunk_size, overlap)),
    (
        "\nRecursive Python Splitter with Overlap:",
        recursive_python_splitter_with_overlap(text, chunk_size, overlap),
    ),
]

for demo_title, demo_chunks in demo_results:
    print(demo_title)
    print("Number of chunks", len(demo_chunks))
    print(demo_chunks)

Fixed Token without Overlap:
Number of chunks 5
[['pep', '8', ':', 'style', 'guide', 'for', 'python', 'code', 'this', 'pep', 'describes', 'a', 'style', 'guide', 'for', 'python', 'code', '.', 'its', 'goal', 'is', 'to', 'improve', 'code', 'read', '##ability', 'and', 'consistency', 'across', 'the', 'python', 'community', '.', 'here', 'are', 'some', 'key', 'points', ':', '1', '.', 'use', '4', 'spaces', 'per', 'ind', '##entation', 'level', '.', '2'], ['.', 'limit', 'all', 'lines', 'to', 'a', 'maximum', 'of', '79', 'characters', '.', '3', '.', 'imports', 'should', 'usually', 'be', 'on', 'separate', 'lines', ':', 'import', 'os', 'import', 'sy', '##s', '4', '.', 'surround', 'top', '-', 'level', 'function', 'and', 'class', 'definitions', 'with', 'two', 'blank', 'lines', '.', 'def', 'example', '_', 'function', '(', ')', ':', "'", "'"], ["'", 'this', 'is', 'a', 'doc', '##st', '##ring', '.', "'", "'", "'", 'return', 'none', 'class', 'example', '##class', ':', "'", "'", "'", 'this', 'is', 'a', 'cla

In [99]:
# Registry of chunking strategies for the comparison loop.
# Each entry is (display name, chunking function, uses_overlap):
#   - uses_overlap=True  -> the function is called as f(text, chunk_size, overlap)
#   - uses_overlap=False -> the function is called as f(text, chunk_size)
# "Semantic Chunking" is the exception: the loop recognises it by its display
# name and calls it with the text only, so its flag is not consulted.
strategies = [
    ("Fixed Token without Overlap", fixed_token_without_overlap, False),
    ("Fixed Token with Overlap", fixed_token_with_overlap, True),
    ("Recursive with Overlap", recursive_with_overlap, True),
    ("Recursive Python Splitter with Overlap", recursive_python_splitter_with_overlap, True),
    ("Semantic Chunking", semantic_chunking, False)
]

In [121]:
# for strategy_name, strategy_func, uses_overlap in strategies:
#     print(f"\n{strategy_name}:")
#     if strategy_name == "Semantic Chunking":
#         chunks = strategy_func(text)
#     else:
#         chunks = strategy_func(text, chunk_size, overlap) if uses_overlap else strategy_func(text, chunk_size)

#     for i, chunk in enumerate(chunks):
#         if strategy_name == "Semantic Chunking":
#             print(f"Chunk {i + 1}: {chunk[:100]}...")  # Print first 100 characters of each chunk
#         else:
#             print(f"Chunk {i + 1}: {tokens_to_string(chunk)[:100]}...")  # Print first 100 characters of each chunk
#     print(f"Total chunks: {len(chunks)}")

In [120]:
# Run each registered strategy on the sample text and print its chunks.
for strategy_name, strategy_func, uses_overlap in strategies:
    # Semantic chunking takes only the text and already returns strings;
    # the other strategies return token lists that must be joined for display.
    is_semantic = strategy_name == "Semantic Chunking"

    print("\n" + "*" * 60)
    print(f"\n{strategy_name}:")

    if is_semantic:
        call_args = (text,)
    elif uses_overlap:
        call_args = (text, chunk_size, overlap)
    else:
        call_args = (text, chunk_size)
    chunks = strategy_func(*call_args)

    for i, chunk in enumerate(chunks):
        rendered = chunk if is_semantic else tokens_to_string(chunk)
        print(f"Chunk {i + 1}:\n{rendered}\n")
    print(f"Total chunks: {len(chunks)}")

PEP 8: Style Guide for Python Code

This PEP describes a style guide for Python code. Its goal is to improve code readability and consistency across the Python community. Here are some key points:


************************************************************

Fixed Token without Overlap:
Chunk 1:
pep 8 : style guide for python code this pep describes a style guide for python code. its goal is to improve code readability and consistency across the python community. here are some key points : 1. use 4 spaces per indentation level. 2

Chunk 2:
. limit all lines to a maximum of 79 characters. 3. imports should usually be on separate lines : import os import sys 4. surround top - level function and class definitions with two blank lines. def example _ function ( ) : ' '

Chunk 3:
' this is a docstring. ' ' ' return none class exampleclass : ' ' ' this is a class docstring. ' ' ' def _ _ init _ _ ( self, value ) : self. value = value def

Chunk 4:
get _ value ( self ) : return self. value 5. use inline comments sparingly : x = x + 1 # compensate for border 6. use docstrings for all public modules, functions, classes, and methods. remember,

Chunk 5:
code is read much more often than i

In [117]:
# Run semantic chunking on its own and print every resulting chunk in full.
semantic_chunking_strategy = ("Semantic Chunking", semantic_chunking)
semantic_label, semantic_func = semantic_chunking_strategy

print(f"\n{semantic_label}:")
chunks = semantic_func(text)

for i, chunk in enumerate(chunks):
    chunk_report = f"Chunk {i + 1}:\n{chunk}\n"
    print(chunk_report)
print(f"Total chunks: {len(chunks)}")


Semantic Chunking:
Chunk 1:

PEP 8: Style Guide for Python Code

This PEP describes a style guide for Python code. Its goal is to improve code readability and consistency across the Python community. Here are some key points:

1.

Chunk 2:
Use 4 spaces per indentation level.

2. Limit all lines to a maximum of 79 characters.

3. Imports should usually be on separate lines:

import os
import sys

4. Surround top-level function and class definitions with two blank lines.

def example_function():
    '''This is a docstring.'''
    return None

class ExampleClass:
    '''This is a class docstring.'''
    
    def __init__(self, value):
        self.value = value
    
    def get_value(self):
        return self.value

5. Use inline comments sparingly:

x = x + 1  # Compensate for border

6. Use docstrings for all public modules, functions, classes, and methods.

Remember, code is read much more often than it is written.

Chunk 3:
Clarity and readability are paramount!


Total chunks: 3
