# Python Tokenization

## Tokenization using Python standard tokenizer + nltk
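This section requires the `nltk` package (`pip install nltk`) together with its Punkt tokenizer data (e.g. `nltk.download('punkt')`), which `word_tokenize` relies on.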

In [1]:
from nltk.tokenize import word_tokenize

def tokenize_nlt(code):
    """Split text into word tokens with NLTK's ``word_tokenize``.

    Backticks and single quotes are removed from the input before it is
    tokenized.

    Args:
        code: The text to tokenize.

    Returns:
        The list of tokens. If tokenization fails for any reason, the error
        is printed and an empty list is returned.
    """
    try:
        return word_tokenize(code.replace('`', '').replace("'", ''))
    except Exception as e:
        # Best-effort helper: report the problem, but hand back an empty list
        # (not None) so callers can iterate over or extend with the result.
        print(e)
        return []

from tokenize import tokenize
from io import BytesIO

def tokenize_python(code):
    """Tokenize Python source code into a flat list of token strings.

    The standard-library tokenizer is tried first. If it fails, the whole
    input is passed to ``tokenize_nlt`` instead. Any resulting token that
    contains a space (for example a string literal or a comment) has its
    quote characters removed and is split further with ``tokenize_nlt``.

    Args:
        code: Python source code as a string.

    Returns:
        A list of token strings. Whitespace-only tokens (newlines,
        indentation) and the tokenizer's leading encoding token are omitted.
    """
    try:
        # Everything that can raise lives inside the try: encoding the text,
        # creating the tokenizer, and consuming its lazily produced tokens.
        stream = tokenize(BytesIO(code.encode('utf-8')).readline)
        # strip() discards '', newline and indentation tokens (spaces or
        # tabs); [1:] skips the first remaining token, which is the encoding
        # marker emitted by tokenize().
        tokens = [tok[1] for tok in stream if tok[1].strip()][1:]
    except Exception:
        # Source that the Python tokenizer rejects: fall back to NLTK.
        # `or []` guards against a helper that returns None on failure.
        tokens = tokenize_nlt(code) or []

    clean_tokens = []

    for t in tokens:
        if ' ' in t:
            # Multi-word token: drop the quotes and split it into words.
            pieces = tokenize_nlt(t.replace('"', '').replace("'", ''))
            clean_tokens.extend(pieces or [])
        else:
            clean_tokens.append(t)

    return clean_tokens

In [2]:
# Sample Python snippet used as tokenizer input. It is only tokenized, never
# executed, so the name `a` does not need to exist. It includes a float
# literal (`5.`), string literals containing spaces, and indented blocks.
code = """
x = 5.
if x > 0:
    print('x is positive')
else:
    print('x is negative')
a.append(x)
"""

In [3]:
# Tokenize the sample snippet; the notebook displays the returned token list.
tokenize_python(code)

['x',
 '=',
 '5.',
 'if',
 'x',
 '>',
 '0',
 ':',
 'print',
 '(',
 'x',
 'is',
 'positive',
 ')',
 'else',
 ':',
 'print',
 '(',
 'x',
 'is',
 'negative',
 ')',
 'a',
 '.',
 'append',
 '(',
 'x',
 ')']

## Tokenization using ByteLevelBPETokenizer

Our pretrained tokenizer (its vocabulary and merges files) is available in the folder `shared_resources/pretrained_tokenizer`.

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Load the pretrained byte-level BPE tokenizer from its vocabulary and merges
# files. The paths are relative, so this must be run from a directory where
# ../shared_resources resolves.
tokenizer = ByteLevelBPETokenizer(
    "../shared_resources/pretrained_tokenizer/py_tokenizer-vocab.json",
    "../shared_resources/pretrained_tokenizer/py_tokenizer-merges.txt",
)
# Attach a BERT-style post-processor built from the "</s>" and "<s>" special
# tokens; the sample output below shows encodings wrapped as <s> ... </s>.
# NOTE(review): this assigns through the private `_tokenizer` attribute —
# confirm it is still supported by the installed `tokenizers` version.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Enable truncation with max_length=512.
tokenizer.enable_truncation(max_length=512)

# Quick check: print the tokens produced for a short Python snippet.
print(
    tokenizer.encode("for x in range(9)").tokens
)

['<s>', 'for', 'Ġx', 'Ġin', 'Ġrange', '(', '9', ')', '</s>']
