In [1]:
from dataset import CodeClassificationDataset
from pathlib import Path
import sentencepiece as spm

datadir = Path("/data/guesslang_data/Dataset/Data/train")
files = [x.suffix for x in datadir.glob("*.*")]
files = sorted(set(files))
label_mapper = {k: i for i, k in enumerate(files)}
label_mapper_i = {i: k for k, i in label_mapper.items()}

tokenizer = spm.SentencePieceProcessor(model_file='../tokenizers/tok.model', num_threads=1)

# dataset = CodeClassificationDataset(
#     "/data/guesslang_data/Dataset/Data",
#     "train",
#     label_mapper,
#     tokenizer, 
#     random_crop=True,
# )

In [7]:
def utf8_byte_tokenizer(text):
    bytes_array = []
    for char in text:
        if 0 <= ord(char) <= 0x7F:
            bytes_array.append(ord(char))
        else:
            continue
    return bytes_array


In [9]:
dataset = CodeClassificationDataset(
    "/data/guesslang_data/Dataset/Data",
    "test",
    label_mapper,
    tokenizer,
    random_crop=True,
)

In [10]:
from datamodule import CodeClassificationDatamodule


dm = CodeClassificationDatamodule(
    "/data/guesslang_data/Dataset/Data",
    label_mapper,
    tokenizer,
    random_crop=True,
)
dm.setup()

In [11]:
tokenizer.GetPieceSize()

4541

In [37]:
def count_parameters(model: nn.Module) -> str:
    """
    Calculate the total number of parameters in a PyTorch model and format it in a human-readable way.

    Parameters:
    model (nn.Module): The PyTorch model.

    Returns:
    str: A formatted string showing the total number of parameters.
    """
    total_params = sum(p.numel() for p in model.parameters())
    
    if total_params >= 1e6:
        formatted_params = f"{total_params / 1e6:.1f} M"
    elif total_params >= 1e3:
        formatted_params = f"{total_params / 1e3:.1f} K"
    else:
        formatted_params = str(total_params)

    return f"{total_params:,} parameters -> {formatted_params}"

In [35]:
from pl_model import CodeClassificationModel
from torch import nn


model = nn.Sequential(
    nn.Embedding(tokenizer.GetPieceSize(), 128),
    # nn.Flatten(),
    nn.Linear(128, 512),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(512*4096, 54),
)

batch = next(iter(dm.train_dataloader()))

In [40]:
string = """
from torch import nn

model = nn.Sequential(
    nn.Embedding(tokenizer.GetPieceSize(), 128),
    # nn.Flatten(),
    nn.Linear(128, 512),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(512*4096, 54),
)
"""
encoded = tokenizer.encode(string)
len(encoded)

205

In [38]:
count_parameters(model)

'113,893,558 parameters -> 113.9 M'

In [36]:
model(batch[0]).shape

torch.Size([128, 54])

In [None]:
model = nn.Sequential(
    nn.Embedding(tokenizer.GetPieceSize(), 128),
    nn.Flatten(),
    nn.Linear(128*4096, 512),
    nn.ReLU(),
    nn.Linear(512, 54),
)

In [27]:
import torch.nn.functional as F
from torch import nn

class CNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(CNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(32, 128, (embed_dim)
        self.pool = nn.MaxPool1d((2, 1))
        self.fc1 = nn.Linear(100, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.conv1(x)
        x = F.relu(x)
        print(x.shape)
        
        # x = F.relu(self.conv1(x))
        # x = self.pool(x).squeeze(3)
        # x = F.avg_pool2d(x, (x.size(2), 1)).squeeze(2)
        # x = self.fc1(x)
        return x

In [28]:
cnn = CNNClassifier(4096, embed_dim=32, num_classes=54)

In [29]:
cnn(batch[0])

RuntimeError: Given groups=1, weight of size [128, 32, 32], expected input[128, 4096, 32] to have 32 channels, but got 4096 channels instead

In [28]:
model(batch[0]).shape

torch.Size([128, 54])

In [4]:
batch = next(iter(dm.train_dataloader()))

In [6]:
batch[0].shape

torch.Size([128, 4096])

In [4]:
from tqdm import tqdm


for i in tqdm(range(len(dataset))):
    x, y = dataset[i]
    

  0%|          | 0/115238 [00:00<?, ?it/s]

  8%|▊         | 9634/115238 [00:22<04:01, 437.00it/s]


KeyboardInterrupt: 

In [4]:
def utf8_byte_tokenizer(text):
    bytes_array = []
    for char in text:
        if 0 <= ord(char) <= 0x7F:
            bytes_array.append(ord(char))
        else:
            continue
    return bytes_array


In [10]:
c_code = """
#include <stdio.h>

// akld

int main() {
    int a = 5, b = 10;
    printf("Sum: %d\\n", a + b);
    return 0;
}
"""

# Example C++ code
cpp_code = """
#include <iostream>
using namespace std;

int main() {
    int a = 5, b = 10;
    cout << "Sum: " << (a + b) << endl;
    return 0;
}
"""

In [13]:
utf8_byte_tokenizer(cpp_code)

[10,
 35,
 105,
 110,
 99,
 108,
 117,
 100,
 101,
 32,
 60,
 105,
 111,
 115,
 116,
 114,
 101,
 97,
 109,
 62,
 10,
 117,
 115,
 105,
 110,
 103,
 32,
 110,
 97,
 109,
 101,
 115,
 112,
 97,
 99,
 101,
 32,
 115,
 116,
 100,
 59,
 10,
 10,
 105,
 110,
 116,
 32,
 109,
 97,
 105,
 110,
 40,
 41,
 32,
 123,
 10,
 32,
 32,
 32,
 32,
 105,
 110,
 116,
 32,
 97,
 32,
 61,
 32,
 53,
 44,
 32,
 98,
 32,
 61,
 32,
 49,
 48,
 59,
 10,
 32,
 32,
 32,
 32,
 99,
 111,
 117,
 116,
 32,
 60,
 60,
 32,
 34,
 83,
 117,
 109,
 58,
 32,
 34,
 32,
 60,
 60,
 32,
 40,
 97,
 32,
 43,
 32,
 98,
 41,
 32,
 60,
 60,
 32,
 101,
 110,
 100,
 108,
 59,
 10,
 32,
 32,
 32,
 32,
 114,
 101,
 116,
 117,
 114,
 110,
 32,
 48,
 59,
 10,
 125,
 10]

In [7]:
# Example string that includes ASCII and non-ASCII characters
test_string = "Hello, world! Привет мир! 大"

# Tokenize the string
tokenized_bytes = utf8_byte_tokenizer(test_string)

# Print the result
print("Tokenized bytes:", tokenized_bytes)


Tokenized bytes: [72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33, 32, 32, 33, 32]


In [3]:
from tqdm import tqdm


for i in tqdm(range(len(dataset))):
    x, y = dataset[i]
    

  0%|          | 0/115238 [00:00<?, ?it/s]

  6%|▋         | 7215/115238 [03:22<50:30, 35.65it/s]   


KeyboardInterrupt: 

In [13]:
dataset[10000]

(tensor([258, 299, 305,  ...,   0,   0,   0]), tensor(13))

In [7]:
from pathlib import Path

datadir = Path("/data/guesslang_data/Dataset/Data/train")
files = [x.suffix for x in datadir.glob("*.*")]
files = sorted(set(files))
label_mapper = {k: i for i, k in enumerate(files)}
label_mapper_i = {i: k for k, i in label_mapper.items()}

In [10]:
label_mapper = {k: i for i, k in enumerate(files)}
label_mapper_i = {i: k for k, i in label_mapper.items()}
label_mapper_i

{0: '.asm',
 1: '.bat',
 2: '.c',
 3: '.cbl',
 4: '.clj',
 5: '.cmake',
 6: '.coffee',
 7: '.cpp',
 8: '.cs',
 9: '.css',
 10: '.csv',
 11: '.dart',
 12: '.dm',
 13: '.dockerfile',
 14: '.erl',
 15: '.ex',
 16: '.f90',
 17: '.go',
 18: '.groovy',
 19: '.hs',
 20: '.html',
 21: '.ini',
 22: '.java',
 23: '.jl',
 24: '.js',
 25: '.json',
 26: '.kt',
 27: '.lisp',
 28: '.lua',
 29: '.makefile',
 30: '.matlab',
 31: '.md',
 32: '.ml',
 33: '.mm',
 34: '.pas',
 35: '.php',
 36: '.pm',
 37: '.prolog',
 38: '.ps1',
 39: '.py',
 40: '.r',
 41: '.rb',
 42: '.rs',
 43: '.scala',
 44: '.sh',
 45: '.sql',
 46: '.swift',
 47: '.tex',
 48: '.toml',
 49: '.ts',
 50: '.v',
 51: '.vba',
 52: '.xml',
 53: '.yaml'}