In [None]:
from pathlib import Path
import pickle
import joblib

# Directory of one specific training run; both artifacts below live under it.
exp_dir = Path("experiments/mlp_tfidf/runs/2024-05-19-17-25-58")

# NOTE(review): unpickling executes arbitrary code, so only load artifacts
# that come from a trusted run.
with (exp_dir / "models/pipeline.pkl").open("rb") as f:
    pipeline = pickle.load(f)
with (exp_dir / "models/label_encoder.pkl").open("rb") as f:
    le = pickle.load(f)

In [1]:
import pickle

# NOTE(review): absolute, user-specific path; this cell only runs on the
# machine where that directory exists. Consider a path relative to the project.
pipeline_path = "/home/dmitry.zarubin/ODS_NLP_Project/classic/experiments/logreg_tfidf/runs/2024-05-19-06-20-20/models/pipeline.pkl"

# NOTE(review): unpickling executes arbitrary code; load trusted files only.
# Element 0 of the unpickled object is kept as the vectorizer (presumably the
# first step of a fitted pipeline; confirm against the training code).
with open(pipeline_path, "rb") as file:
    vectorizer = pickle.load(file)[0]

In [2]:
# Display the repr of the object loaded above to inspect its configuration.
vectorizer

In [11]:
import re

# Run the vectorizer's own token regex over a sample snippet to see which
# raw tokens it extracts.
re.compile(vectorizer.token_pattern).findall("def foo(): return 0")

['def', 'foo', '(', ')', ':', 'return']

In [13]:
# Reverse lookup table: vocabulary index -> term.
inv = dict((index, term) for term, index in vectorizer.vocabulary_.items())

In [19]:
# Look up the term stored at vocabulary index 601.
# NOTE(review): the saved output is a two-word term, so the vocabulary appears
# to contain n-grams as well as single tokens; confirm.
inv[601]

'the same'

In [3]:
from torch import Tensor, nn
import torch.nn.functional as F
import re
from sklearn.feature_extraction.text import TfidfVectorizer


def tokenize_from_vectorizer(
    string: str,
    vectorizer: TfidfVectorizer,
    pad_value: int,
    max_len: int = 128,
):
    """Encode ``string`` as a fixed-length list of vocabulary indices.

    The text is split with ``vectorizer.token_pattern``; every match found in
    ``vectorizer.vocabulary_`` is replaced by its index and every other match
    is dropped. The result is then right-padded with ``pad_value`` or cut to
    ``max_len`` entries.

    Args:
        string: Text to encode.
        vectorizer: Object exposing ``token_pattern`` and ``vocabulary_``.
        pad_value: Index appended when fewer than ``max_len`` tokens are known.
        max_len: Length of the returned list.

    Returns:
        A list of ``max_len`` integer indices.
    """
    vocabulary = vectorizer.vocabulary_
    ids = []
    for piece in re.findall(vectorizer.token_pattern, string):
        # Out-of-vocabulary matches are skipped, not mapped to a special id.
        if piece in vocabulary:
            ids.append(vocabulary[piece])

    missing = max_len - len(ids)
    if missing <= 0:
        # Already long enough: keep only the leading ``max_len`` ids.
        return ids[:max_len]
    ids.extend([pad_value] * missing)
    return ids
    
    


class CodeClassificationMLP(nn.Sequential):
    """Token-id classifier: embedding -> per-token MLP -> mean over tokens -> linear head.

    Sub-modules are registered under the names ``embedding``, ``layer_{i}``,
    ``act_fn_{i}``, ``pool``, ``flatten`` and ``classifier``.

    NOTE: a class with the same name is defined again later in this notebook;
    once that cell runs, it shadows this definition.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        padding_idx: Index passed to ``nn.Embedding`` as its padding index.
        num_classes: Number of output logits.
        hidden_dims: Output sizes of the successive ``Linear`` layers. An empty
            list connects the embedding directly to the classifier.
        act_fn: ``"relu"`` selects ``nn.ReLU``; any other value selects
            ``nn.Identity``.
    """

    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        padding_idx: int,
        num_classes: int,
        hidden_dims: list[int] = [128],
        act_fn: str = "relu",
    ) -> None:
        super().__init__()
        self.add_module("embedding", nn.Embedding(vocab_size, embedding_dim, padding_idx))
        prev_dim = embedding_dim
        for i, hidden_dim in enumerate(hidden_dims):
            self.add_module(f"layer_{i}", nn.Linear(prev_dim, hidden_dim))
            # Build an activation *instance* per layer; registering the bare
            # nn.Identity class would be rejected by add_module.
            self.add_module(f"act_fn_{i}", nn.ReLU() if act_fn == "relu" else nn.Identity())
            prev_dim = hidden_dim
        self.add_module("pool", nn.AdaptiveAvgPool1d(1))
        self.add_module("flatten", nn.Flatten())
        # prev_dim (not the loop variable) so an empty hidden_dims still works.
        self.add_module("classifier", nn.Linear(prev_dim, num_classes))

    def forward(self, input):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, num_classes)`` logits.

        The mean runs over all sequence positions, including padded ones.
        """
        for name, module in self._modules.items():
            if name == "pool":
                # AdaptiveAvgPool1d reduces the last axis, so move the token
                # axis there: (batch, seq, hidden) -> (batch, hidden, seq).
                input = input.movedim(1, -1)
            input = module(input)
        return input

In [90]:
# Encode a sample snippet; the padding id is one past the largest vocabulary index.
tokens = tokenize_from_vectorizer(
    "def foo(): return 0",
    vectorizer,
    pad_value=max(vectorizer.vocabulary_.values()) + 1,
)

In [93]:
# Size the embedding table from the loaded vectorizer instead of hard-coding
# it: the padding id is max index + 1 (as used when tokenizing), so the table
# needs max index + 2 rows. Hard-coded sizes break as soon as a vectorizer
# with a different vocabulary is loaded.
model = CodeClassificationMLP(
    vocab_size=max(vectorizer.vocabulary_.values()) + 2,
    embedding_dim=32,
    padding_idx=max(vectorizer.vocabulary_.values()) + 1,
    num_classes=54,
    hidden_dims=[128],
)
model

CodeClassificationMLP(
  (embedding): Embedding(711, 32, padding_idx=710)
  (layer_0): Linear(in_features=32, out_features=128, bias=True)
  (act_fn_0): ReLU()
  (pool): AdaptiveAvgPool1d(output_size=1)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (classifier): Linear(in_features=128, out_features=54, bias=True)
)

In [92]:
import torch

# Smoke test: add a leading batch axis to the token ids and check the output shape.
model(torch.tensor(tokens).unsqueeze(0)).shape

torch.Size([1, 54])

In [27]:
# Largest index in the vectorizer's vocabulary; the tokenizing cells in this
# notebook use this value + 1 as the padding id.
max(vectorizer.vocabulary_.values())

709

In [96]:
from torch import nn


class LinearNormAct(nn.Sequential):
    """Linear -> LayerNorm -> ReLU, registered as ``linear``, ``norm`` and ``act``.

    Args:
        in_features: Size of the last input dimension.
        out_features: Size of the last output dimension; also the shape
            normalized by the LayerNorm.
    """

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        stages = (
            ("linear", nn.Linear(in_features, out_features)),
            ("norm", nn.LayerNorm(out_features)),
            ("act", nn.ReLU()),
        )
        # Registration order defines the execution order of the Sequential.
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)


class ConvNormAct(nn.Sequential):
    """Conv1d -> BatchNorm1d -> ReLU, registered as ``conv``, ``norm`` and ``act``.

    The convolution uses stride 1 and ``kernel_size // 2`` padding, so an odd
    ``kernel_size`` keeps the sequence length unchanged.

    Args:
        in_features: Number of input channels.
        out_features: Number of output channels.
        kernel_size: Width of the convolution kernel.
        groups: Number of channel groups of the convolution; must divide both
            ``in_features`` and ``out_features``.
    """

    def __init__(self, in_features: int, out_features: int, kernel_size: int, groups: int = 1):
        super().__init__()
        # groups must go by keyword: Conv1d's fourth positional parameter is
        # stride, so passing it positionally silently changed the stride.
        self.add_module(
            "conv",
            nn.Conv1d(in_features, out_features, kernel_size, padding=kernel_size // 2, groups=groups),
        )
        self.add_module("norm", nn.BatchNorm1d(out_features))
        self.add_module("act", nn.ReLU())


class CodeClassificationMLP(nn.Module):
    """Token-id classifier: embedding -> LinearNormAct blocks -> mean over tokens -> linear head.

    NOTE: this redefines a class of the same name from an earlier cell of the
    notebook and shadows it once this cell runs.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        padding_idx: Index passed to ``nn.Embedding`` as its padding index.
        num_classes: Number of output logits.
        hidden_dims: Output sizes of the successive ``LinearNormAct`` blocks.
            An empty list connects the embedding directly to the classifier.
    """

    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        padding_idx: int,
        num_classes: int,
        hidden_dims: list[int] = [256],
    ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx)
        self.layers = nn.Sequential()
        prev_dim = embedding_dim
        for i, hidden_dim in enumerate(hidden_dims):
            self.layers.add_module(f"block_{i}", LinearNormAct(prev_dim, hidden_dim))
            prev_dim = hidden_dim
        self.pool = nn.AdaptiveAvgPool1d(1)
        # Not used by forward(); kept so the module's attributes stay as before.
        self.flatten = nn.Flatten()
        # prev_dim (not the loop variable) so an empty hidden_dims still works.
        self.classifier = nn.Linear(prev_dim, num_classes)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, num_classes)`` logits.

        The mean runs over all sequence positions, including padded ones.
        """
        x = self.layers(self.embedding(x))
        # AdaptiveAvgPool1d reduces the last axis, so move the token axis
        # there: (batch, seq, hidden) -> (batch, hidden, seq) -> (batch, hidden).
        x = self.pool(x.movedim(1, -1)).squeeze(-1)
        return self.classifier(x)
    

class CodeClassificationCNN(nn.Module):
    """Token-id classifier: embedding -> ConvNormAct blocks -> mean over tokens -> linear head.

    Every convolution block is built with kernel size 5.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding; also the channel count
            fed to the first convolution block.
        padding_idx: Index passed to ``nn.Embedding`` as its padding index.
        num_classes: Number of output logits.
        hidden_dims: Output channel counts of the successive ``ConvNormAct``
            blocks. An empty list connects the embedding directly to the
            classifier.
    """

    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        padding_idx: int,
        num_classes: int,
        hidden_dims: list[int] = [256],
    ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx)
        self.layers = nn.Sequential()
        prev_dim = embedding_dim
        for i, hidden_dim in enumerate(hidden_dims):
            self.layers.add_module(f"block_{i}", ConvNormAct(prev_dim, hidden_dim, 5))
            prev_dim = hidden_dim
        self.pool = nn.AdaptiveAvgPool1d(1)
        # Not used by forward(); kept so the module's attributes stay as before.
        self.flatten = nn.Flatten()
        # prev_dim (not the loop variable) so an empty hidden_dims still works.
        self.classifier = nn.Linear(prev_dim, num_classes)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, num_classes)`` logits.

        The mean runs over all sequence positions, including padded ones.
        """
        # Channels-first layout for the convolutions:
        # (batch, seq, embed) -> (batch, embed, seq).
        x = self.embedding(x).movedim(1, -1)
        x = self.layers(x)
        # Average over the sequence axis, then drop it: (batch, channels).
        x = self.pool(x).squeeze(-1)
        return self.classifier(x)

In [97]:
import torch
# Encode the sample snippet as a 1-D tensor of token ids; the padding id is
# one past the largest vocabulary index.
tokens = torch.tensor(
    tokenize_from_vectorizer(
        "def foo(): return 0",
        vectorizer,
        pad_value=max(vectorizer.vocabulary_.values()) + 1,
    )
)
tokens

tensor([334,   0,  40, 148, 627, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811, 811,
        811, 811])

In [98]:
# Size the embedding table from the loaded vectorizer instead of hard-coding
# it: the padding id is max index + 1 (as used when tokenizing), so the table
# needs max index + 2 rows. Hard-coded sizes break as soon as a vectorizer
# with a different vocabulary is loaded.
model = CodeClassificationCNN(
    vocab_size=max(vectorizer.vocabulary_.values()) + 2,
    embedding_dim=32,
    padding_idx=max(vectorizer.vocabulary_.values()) + 1,
    num_classes=54,
    hidden_dims=[128, 779],
)
# Smoke test: add a leading batch axis and check the output shape.
model(tokens[None]).shape

torch.Size([1, 54])

In [28]:
# Inspect the hidden blocks and the classification head of the current model.
# NOTE(review): the saved output shows a ModuleList of LinearNormAct blocks,
# while the classes defined in this notebook store `layers` as nn.Sequential;
# the output looks stale from an earlier execution. Re-run to refresh.
model.layers, model.classifier

(ModuleList(
   (0): LinearNormAct(
     (linear): Linear(in_features=32, out_features=256, bias=True)
     (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
     (act): ReLU()
   )
 ),
 Linear(in_features=256, out_features=54, bias=True))