In [None]:
import dataLoading
import wolframPreprocessing
import myWMEncoder
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
from unixcoder import UniXcoder

In [2]:
loader = dataLoading.DataLoaderExporter()
preprocessor = wolframPreprocessing.WolframPreprocessor()

In [5]:
loader.make_json_data()

In [None]:
data = loader.load_json_data()
data

In [None]:
data["transformed_code"] = data["code"].apply(lambda x: preprocessor.preprocessing_composition_naive(x, tokenize=True))
data.drop(data[data["transformed_code"].apply(lambda x: len(x) == 0)].index, inplace=True)
data["tr_code_non_tokenized"] = data["code"].apply(lambda x: preprocessor.preprocessing_composition_naive(x, tokenize=False))
data.reset_index(drop=True, inplace=True)

In [8]:
data = data[data["task"] == data["task"].unique()[0]]

In [9]:
encoder = myWMEncoder.CustomEncoder(50)
encoder.fit(data["transformed_code"].values)
encoder_size = encoder.vec_size

In [10]:
class WindowDataset(Dataset):
    def __init__(self, data, encoder, window_size=5):
        self.data = data
        self.encoder = encoder
        self.encoder.fit(data)
        self.window_size = window_size
        self.windows = None
        self.labels = np.array([])
        self.start = True

        for string in self.data:
            tmp = sliding_window_view(string, window_size + 1)
            for win in tmp:
                encoded_win = self.encoder.transform(win)
                x = np.array([encoded_win[:-1]])
                label = np.array([np.argmax(encoded_win[-1][:-2])])
                if self.start:
                    self.windows = x
                    self.start = False
                else:
                    self.windows = np.concatenate((self.windows, x))
                self.labels = np.concatenate((self.labels, label))

        self.labels = torch.LongTensor(self.labels)
        self.windows = torch.FloatTensor(self.windows)

    def __len__(self):
        return len(self.windows)
    
    def __getitem__(self, idx):
        return self.windows[idx], self.labels[idx]

In [11]:
class WindowDatasetOpt(Dataset):
    def __init__(self, data, encoder, window_size=5):
        self.data = data
        self.encoder = encoder
        self.encoder.fit(data)
        self.window_size = window_size
        self.windows = None
        self.labels = np.array([])
        self.start = True

        for code in self.data:
            transformed_code = self.encoder.transform(code)
            sliding_windows = sliding_window_view(transformed_code, (self.window_size + 1, self.encoder.vec_size)).reshape(-1, self.window_size + 1, self.encoder.vec_size)
            x = sliding_windows[:, :-1]
            labels = np.argmax(sliding_windows[:, -1,: -2], axis=1)
            if self.start:
                self.windows = x
                self.labels = labels
                self.start = False
            else:
                self.windows = np.concatenate((self.windows, x))
                self.labels = np.concatenate((self.labels, labels))
        self.labels = torch.LongTensor(self.labels)
        self.windows = torch.FloatTensor(self.windows)

    def __len__(self):
        return len(self.windows)
    
    def __getitem__(self, idx):
        return self.windows[idx], self.labels[idx]

In [12]:
enc = myWMEncoder.CustomEncoder(50)
new_dataset = WindowDatasetOpt(data["transformed_code"].values, enc, 10)

In [14]:
trainloader = DataLoader(new_dataset, batch_size=64, shuffle=True)

In [15]:
class LSTMWindow(nn.Module):
    def __init__(self, hidden_size, vec_size):
        super(LSTMWindow, self).__init__()
        
        self.hidden_size = hidden_size
        self.vec_size = vec_size
        self.n_classes = vec_size - 2
        self.lstm = nn.LSTM(self.vec_size,
                        self.hidden_size,
                        num_layers=1,
                        batch_first=True,
                        bidirectional=False
                        )
        self.lin_layer = nn.Linear(self.hidden_size, self.n_classes)
        
    def forward(self, inputs):
        h_0 = torch.zeros(1, inputs.size(0), self.hidden_size)
        c_0 = torch.zeros(1, inputs.size(0), self.hidden_size)
        output_features, (h_out, _) = self.lstm(inputs, (h_0, c_0))  
        self.h_out = h_out.view(-1, self.hidden_size)  
        return self.lin_layer(self.h_out)
    
    def emb(self, inputs):
        self.forward(inputs)
        return self.h_out

In [16]:
torch.manual_seed(0)
model = LSTMWindow(300, new_dataset.encoder.vec_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [17]:
from tqdm import tqdm
from tqdm import trange

In [18]:
for epoch in trange(50, desc="Learning:", unit="carrots"):
    i = 1
    mean_sum = 0
    for id, (texts, targets) in enumerate(trainloader):
    
        optimizer.zero_grad()
        pred = model(texts)
        loss = criterion(pred, targets)
        mean_sum += loss.item()
        i += 1
        loss.backward()
        optimizer.step()
    print(f"Epoch[{epoch + 1}/100], mean_epoch_loss: {mean_sum/i}")

Learning::   2%|▏         | 1/50 [00:04<03:55,  4.80s/carrots]

Epoch[1/100], mean_epoch_loss: 2.1130449447265036


Learning::   4%|▍         | 2/50 [00:09<03:49,  4.78s/carrots]

Epoch[2/100], mean_epoch_loss: 1.2032436517568734


Learning::   6%|▌         | 3/50 [00:14<03:43,  4.75s/carrots]

Epoch[3/100], mean_epoch_loss: 0.9379906880855561


Learning::   8%|▊         | 4/50 [00:18<03:35,  4.69s/carrots]

Epoch[4/100], mean_epoch_loss: 0.7723847158138568


Learning::  10%|█         | 5/50 [00:23<03:31,  4.70s/carrots]

Epoch[5/100], mean_epoch_loss: 0.6522561644590817


Learning::  12%|█▏        | 6/50 [00:28<03:26,  4.69s/carrots]

Epoch[6/100], mean_epoch_loss: 0.5658410020516469


Learning::  14%|█▍        | 7/50 [00:32<03:21,  4.69s/carrots]

Epoch[7/100], mean_epoch_loss: 0.49874658465385435


Learning::  16%|█▌        | 8/50 [00:37<03:15,  4.66s/carrots]

Epoch[8/100], mean_epoch_loss: 0.45757623631220595


Learning::  18%|█▊        | 9/50 [00:42<03:11,  4.66s/carrots]

Epoch[9/100], mean_epoch_loss: 0.4083310693273178


Learning::  20%|██        | 10/50 [00:47<03:08,  4.71s/carrots]

Epoch[10/100], mean_epoch_loss: 0.36795809887922726


Learning::  22%|██▏       | 11/50 [00:51<03:03,  4.70s/carrots]

Epoch[11/100], mean_epoch_loss: 0.33442729757382317


Learning::  24%|██▍       | 12/50 [00:56<02:58,  4.69s/carrots]

Epoch[12/100], mean_epoch_loss: 0.3265821451177964


Learning::  26%|██▌       | 13/50 [01:00<02:52,  4.66s/carrots]

Epoch[13/100], mean_epoch_loss: 0.3164060534651463


Learning::  28%|██▊       | 14/50 [01:05<02:47,  4.64s/carrots]

Epoch[14/100], mean_epoch_loss: 0.29723392738745763


Learning::  30%|███       | 15/50 [01:10<02:42,  4.64s/carrots]

Epoch[15/100], mean_epoch_loss: 0.2899903727036256


Learning::  32%|███▏      | 16/50 [01:14<02:37,  4.63s/carrots]

Epoch[16/100], mean_epoch_loss: 0.28301978491819824


Learning::  34%|███▍      | 17/50 [01:19<02:32,  4.64s/carrots]

Epoch[17/100], mean_epoch_loss: 0.2858679527044296


Learning::  36%|███▌      | 18/50 [01:24<02:28,  4.63s/carrots]

Epoch[18/100], mean_epoch_loss: 0.25313861570679225


Learning::  38%|███▊      | 19/50 [01:28<02:23,  4.63s/carrots]

Epoch[19/100], mean_epoch_loss: 0.2494994190793771


Learning::  40%|████      | 20/50 [01:33<02:18,  4.62s/carrots]

Epoch[20/100], mean_epoch_loss: 0.23997730649434604


Learning::  42%|████▏     | 21/50 [01:38<02:14,  4.65s/carrots]

Epoch[21/100], mean_epoch_loss: 0.24770494942481702


Learning::  44%|████▍     | 22/50 [01:42<02:11,  4.71s/carrots]

Epoch[22/100], mean_epoch_loss: 0.25291938554782134


Learning::  46%|████▌     | 23/50 [01:48<02:13,  4.93s/carrots]

Epoch[23/100], mean_epoch_loss: 0.243939335082586


Learning::  48%|████▊     | 24/50 [01:53<02:11,  5.07s/carrots]

Epoch[24/100], mean_epoch_loss: 0.21624074496901952


Learning::  50%|█████     | 25/50 [01:58<02:03,  4.95s/carrots]

Epoch[25/100], mean_epoch_loss: 0.21777233343857985


Learning::  52%|█████▏    | 26/50 [02:03<01:56,  4.87s/carrots]

Epoch[26/100], mean_epoch_loss: 0.22610657403102288


Learning::  54%|█████▍    | 27/50 [02:07<01:50,  4.80s/carrots]

Epoch[27/100], mean_epoch_loss: 0.2242743625319921


Learning::  56%|█████▌    | 28/50 [02:12<01:44,  4.77s/carrots]

Epoch[28/100], mean_epoch_loss: 0.25714075525219626


Learning::  58%|█████▊    | 29/50 [02:17<01:39,  4.76s/carrots]

Epoch[29/100], mean_epoch_loss: 0.24892475948883938


Learning::  60%|██████    | 30/50 [02:21<01:34,  4.73s/carrots]

Epoch[30/100], mean_epoch_loss: 0.21459933342841955


Learning::  62%|██████▏   | 31/50 [02:26<01:29,  4.72s/carrots]

Epoch[31/100], mean_epoch_loss: 0.19103860532435088


Learning::  64%|██████▍   | 32/50 [02:31<01:24,  4.70s/carrots]

Epoch[32/100], mean_epoch_loss: 0.18347709686137162


Learning::  66%|██████▌   | 33/50 [02:35<01:19,  4.68s/carrots]

Epoch[33/100], mean_epoch_loss: 0.17535006540325973


Learning::  68%|██████▊   | 34/50 [02:40<01:14,  4.67s/carrots]

Epoch[34/100], mean_epoch_loss: 0.1762904783624869


Learning::  70%|███████   | 35/50 [02:45<01:10,  4.68s/carrots]

Epoch[35/100], mean_epoch_loss: 0.19471746511757373


Learning::  72%|███████▏  | 36/50 [02:50<01:06,  4.74s/carrots]

Epoch[36/100], mean_epoch_loss: 0.23060439807864336


Learning::  74%|███████▍  | 37/50 [02:55<01:02,  4.83s/carrots]

Epoch[37/100], mean_epoch_loss: 0.23678834818876707


Learning::  76%|███████▌  | 38/50 [02:59<00:57,  4.77s/carrots]

Epoch[38/100], mean_epoch_loss: 0.23850450911200963


Learning::  78%|███████▊  | 39/50 [03:04<00:52,  4.73s/carrots]

Epoch[39/100], mean_epoch_loss: 0.2283591104241518


Learning::  80%|████████  | 40/50 [03:08<00:46,  4.68s/carrots]

Epoch[40/100], mean_epoch_loss: 0.19762776418374134


Learning::  82%|████████▏ | 41/50 [03:13<00:41,  4.66s/carrots]

Epoch[41/100], mean_epoch_loss: 0.16722210069115345


Learning::  84%|████████▍ | 42/50 [03:18<00:37,  4.64s/carrots]

Epoch[42/100], mean_epoch_loss: 0.1515310064645914


Learning::  86%|████████▌ | 43/50 [03:22<00:32,  4.63s/carrots]

Epoch[43/100], mean_epoch_loss: 0.1441304641503554


Learning::  88%|████████▊ | 44/50 [03:27<00:27,  4.65s/carrots]

Epoch[44/100], mean_epoch_loss: 0.14867161154173889


Learning::  90%|█████████ | 45/50 [03:32<00:23,  4.67s/carrots]

Epoch[45/100], mean_epoch_loss: 0.15442519210278988


Learning::  92%|█████████▏| 46/50 [03:36<00:18,  4.64s/carrots]

Epoch[46/100], mean_epoch_loss: 0.184585843785451


Learning::  94%|█████████▍| 47/50 [03:41<00:13,  4.65s/carrots]

Epoch[47/100], mean_epoch_loss: 0.23394611691053097


Learning::  96%|█████████▌| 48/50 [03:45<00:09,  4.65s/carrots]

Epoch[48/100], mean_epoch_loss: 0.24834877635423955


Learning::  98%|█████████▊| 49/50 [03:50<00:04,  4.62s/carrots]

Epoch[49/100], mean_epoch_loss: 0.21012466879991384


Learning:: 100%|██████████| 50/50 [03:55<00:00,  4.71s/carrots]

Epoch[50/100], mean_epoch_loss: 0.16424053297019922





In [None]:
enc_need = new_dataset.encoder

In [20]:
works = [model.emb(torch.FloatTensor(np.array([enc_need.transform(tokens)])))[0].tolist() for tokens in data["transformed_code"].values]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import ratio

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_unix = UniXcoder("microsoft/unixcoder-base")
model_unix.to(device)

In [None]:
lstm_embs, unixcoder_embs = {}, {}
for name, code in enumerate(data[['name', 'task']].values):
    tokens_ids = model.tokenize([code],max_length=512,mode="<encoder-only>")
    source_ids = torch.tensor(tokens_ids).to(device)
    _, unix = model(source_ids)

    lstm = model.emb([code])
    lstm_embs[name] = lstm
    unixcoder_embs[name] = unix


In [None]:
def compute_unix_embs(ind_pairs):
    unix_values = []
    for pair in ind_pairs:
        sim_unix = torch.einsum("ac,bc->ab",
                                unixcoder_embs[pair[0]],
                                unixcoder_embs[pair[1]])[0][0].item()
        unix_values.append(sim_unix)
    return unix_values

def compute_lstm_embs(ind_pairs):
    lstm_values = []
    for pair in ind_pairs:
        sim_lstm = cosine_similarity(lstm_embs[pair[0]], lstm_embs[pair[1]])[0][0]
        lstm_values.append(sim_lstm)
    return lstm_values


In [None]:
def compute_plags_alg1(clust, vertecies, models_weights, threshold):
    total_info = []
    for v0_ind in range(len(vertecies)):
        for v1_ind in range(len(vertecies)):
            if v0_ind < v1_ind:
                v0, v1 = vertecies[[v0_ind, v1_ind]]
                works_v0 = clust[clust["name"] == v0][["file", "index"]].values
                works_v1 = clust[clust["name"] == v1][["file", "index"]].values
                plags = []
                for v0_file, v0_edge in works_v0:
                    s0 = clust[clust["index"] == v0_edge]["tr_code_non_tokenized"].values[0]
                    for v1_file, v1_edge in works_v1:
                        s1 = clust[clust["index"] == v1_edge]["tr_code_non_tokenized"].values[0]
                        sim_leven = Levenshtein.ratio(s0, s1)
                        sim_unix = torch.einsum("ac,bc->ab",unixcoder_embs[v0_edge],unixcoder_embs[v1_edge])[0][0].item()
                        sim_lstm = cosine_similarity(lstm_embs[v0_edge], lstm_embs[v1_edge])[0][0]
                        total_sim = models_weights @ [sim_leven, sim_lstm, sim_unix]
                        if total_sim > threshold:
                            plags.append([v0_file, v1_file, (sim_leven, sim_lstm, sim_unix, total_sim)])
                if len(plags) > 0:
                    total_info.append([v0, v1, plags, len(plags),"label", "label"])
    return total_info    


In [None]:
def compute_plags_alg2(clust, vertecies, models_weights, threshold_lvn_upper, threshold_lvn_lower, threshold_nn):
    total_info = []
    for v0_ind in range(len(vertecies)):
        for v1_ind in range(len(vertecies)):
            if v0_ind < v1_ind:
                v0, v1 = vertecies[[v0_ind, v1_ind]]
                works_v0 = clust[clust["name"] == v0][["file", "index"]].values
                works_v1 = clust[clust["name"] == v1][["file", "index"]].values
                plags = []
                for v0_file, v0_edge in works_v0:
                    s0 = clust[clust["index"] == v0_edge]["tr_code_non_tokenized"].values[0]
                    for v1_file, v1_edge in works_v1:
                        s1 = clust[clust["index"] == v1_edge]["tr_code_non_tokenized"].values[0]
                        sim_leven = Levenshtein.ratio(s0, s1)
                        sim_unix = torch.einsum("ac,bc->ab",unixcoder_embs[v0_edge],unixcoder_embs[v1_edge])[0][0].item()
                        sim_lstm = cosine_similarity(lstm_embs[v0_edge], lstm_embs[v1_edge])[0][0]

                        if sim_leven >= threshold_lvn_upper:
                            plags.append([v0_file, v1_file, (sim_leven, 0, 0, sim_leven)])
                        elif threshold_lvn_lower <= sim_leven < threshold_lvn_upper:
                            nn_sim = models_weights @ [sim_lstm, sim_unix]
                            if nn_sim >= threshold_nn:
                                plags.append([v0_file, v1_file, (sim_leven, 0, sim_unix, 0)])

                if len(plags) > 0:
                    total_info.append([v0, v1, plags, len(plags),"label", "label"])
