# Pipeline for creating embeddings

## Imports

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import helper
import debug
import json

import pandas as pd
import numpy as np
import seaborn as sns
import math

import sklearn.metrics as metrics
from torch.nn.utils.rnn import pad_sequence

  from .autonotebook import tqdm as notebook_tqdm


## Model

In [2]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
CHECKPOINT = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=CHECKPOINT)
model = AutoModel.from_pretrained(pretrained_model_name_or_path=CHECKPOINT)

## Data

In [3]:
# Read the CLR10 results (JSON Lines, one record per line) and keep an
# independent copy to filter without touching the loaded frame.
df = pd.read_json(path_or_buf="results/CLR10.jsonl", lines=True)
big_boys = df.copy(deep=True)

In [4]:
# Keep only the records whose completion text is longer than the threshold,
# with the `completion` column replaced by the text length.
#
# The result is derived from `df` rather than from `big_boys` itself so the
# cell is idempotent: the previous version overwrote `big_boys["completion"]`
# with integers, so running the cell a second time would fail on `.str.len()`.
MIN_COMPLETION_CHARS = 1000
completion_len = df["completion"].str.len()
big_boys = df.assign(completion=completion_len)[completion_len > MIN_COMPLETION_CHARS]

In [5]:
# Show the (rows, columns) shape of the loaded results frame.
df.shape

(1640, 2)

In [3]:
# Read the two sample files through the project helper.
# `samples_10` feeds the block split below; `samples_1` feeds helper.get_pass.
# NOTE(review): the structure returned by helper.read_samples is not visible here.
samples_10 = helper.read_samples('results/CLR10.jsonl')
samples_1 = helper.read_samples('results/CLR1.jsonl_results.jsonl')

## Get Embeddings

In [5]:
# Group the CLR10 samples into blocks.
# NOTE(review): the second argument (10) is presumably the block size — confirm in helper.split_into_blocks.
blocks = helper.split_into_blocks(samples_10, 10)

In [6]:
# Compute embeddings for all blocks using the "pad" mode of helper.form_emb.
# NOTE(review): the recorded run of this cell ended with
# `TypeError: expected np.ndarray (got Tensor)`, so `emb_pad` was not produced;
# the failure is raised inside the helper and needs to be fixed there.
emb_pad = helper.form_emb(blocks, model, tokenizer, "pad")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TypeError: expected np.ndarray (got Tensor)

In [4]:
# Read embeddings from "bert_emb.jsonl" via the project helper
# (presumably a cache written by an earlier run — confirm).
emb_mean = helper.read_embeddings("bert_emb.jsonl")

## Pipeline

In [12]:
# Compute RBF-kernel entropies from the loaded embeddings.
# The previous version passed `emb`, a name that is never defined in this
# notebook (it only worked through leftover kernel state). `emb_mean` is the
# only embedding variable the cells above successfully create, so use it.
# NOTE(review): confirm `emb_mean` is the intended input.
kernel_entropy_values = helper.get_entropies_rbf(emb_mean)

In [13]:
# Inspect the raw entropy values.
kernel_entropy_values

[-1.9950682450138693,
 -2.0144155449129757,
 -2.0077029536845803,
 -2.074465923161821,
 -1.915913419605499,
 -2.0076395911559426,
 -1.812191007422799,
 -1.9846798583990086,
 -2.026638581599241,
 -2.0295814852834506,
 -1.9933336417901064,
 -1.870360911420635,
 -1.9789821102689256,
 -2.058458895474496,
 -2.0077087595730077,
 -1.9959996767255848,
 -1.9867403430738737,
 -1.9109460521071406,
 -1.9995485174856646,
 -1.9098062493708063,
 -1.9345736694541595,
 -1.8855601262964534,
 -1.9923696528366635,
 -2.010116594773857,
 -1.9632306345773123,
 -2.05459597879119,
 -2.0361128173727,
 -1.97271416134572,
 -1.9723297192749663,
 -1.9877621257497846,
 -2.0005912904816205,
 -2.0951032614470226,
 -1.9804816109159957,
 -2.0026303005478083,
 -2.039662196924724,
 -2.0720435221750413,
 -2.09201835295013,
 -2.0422678862119668,
 -1.9853996497318456,
 -1.9568654684002607,
 -2.0127755473819025,
 -2.0008534893627723,
 -2.0101718810318894,
 -1.9533153272244297,
 -2.031628662050735,
 -2.118094151671981,
 -1.921

In [14]:
# Turn each entropy into its absolute value so the scores are non-negative
# before they are passed to the AUROC computation.
kernel_entropy_values_flipped = [abs(value) for value in kernel_entropy_values]
kernel_entropy_values_flipped

[1.9950682450138693,
 2.0144155449129757,
 2.0077029536845803,
 2.074465923161821,
 1.915913419605499,
 2.0076395911559426,
 1.812191007422799,
 1.9846798583990086,
 2.026638581599241,
 2.0295814852834506,
 1.9933336417901064,
 1.870360911420635,
 1.9789821102689256,
 2.058458895474496,
 2.0077087595730077,
 1.9959996767255848,
 1.9867403430738737,
 1.9109460521071406,
 1.9995485174856646,
 1.9098062493708063,
 1.9345736694541595,
 1.8855601262964534,
 1.9923696528366635,
 2.010116594773857,
 1.9632306345773123,
 2.05459597879119,
 2.0361128173727,
 1.97271416134572,
 1.9723297192749663,
 1.9877621257497846,
 2.0005912904816205,
 2.0951032614470226,
 1.9804816109159957,
 2.0026303005478083,
 2.039662196924724,
 2.0720435221750413,
 2.09201835295013,
 2.0422678862119668,
 1.9853996497318456,
 1.9568654684002607,
 2.0127755473819025,
 2.0008534893627723,
 2.0101718810318894,
 1.9533153272244297,
 2.031628662050735,
 2.118094151671981,
 1.9218401403329162,
 2.0109836049129606,
 2.01776957

In [15]:
# Labels derived from the single-sample results; used as the ground truth
# (first argument) of roc_auc_score below.
is_prediction_correct = helper.get_pass(samples_1)

In [16]:
# AUROC of the flipped kernel-entropy scores against the pass/fail labels.
kent_auroc = metrics.roc_auc_score(y_true=is_prediction_correct, y_score=kernel_entropy_values_flipped)

In [17]:
# Display the AUROC (0.5 corresponds to chance level).
kent_auroc

0.5124332570556827

In [13]:
# Pair each score with its correctness label via the project helper.
# NOTE(review): the recorded output of the following cells shows scores of
# roughly 0.87-0.99, while `kernel_entropy_values_flipped` displayed above is
# roughly 1.8-2.1 — the saved output looks like it comes from a different run.
res = helper.compare_results(kernel_entropy_values_flipped, is_prediction_correct)

In [14]:
# Number of entries returned by compare_results.
print(len(res))

164


In [15]:
# Print every entry of `res`, one per line.
for t in res:
    print(t)

(0.9906867133246527, False)
(0.9682647705078125, False)
(0.9707090589735243, True)
(0.9903168572319878, True)
(0.9212972852918837, True)
(0.9754938761393229, True)
(0.869362301296658, False)
(0.9773200141059027, True)
(0.9749673631456163, True)
(0.9839932759602864, True)
(0.9735575358072917, False)
(0.9128739251030816, False)
(0.9762363857693143, True)
(0.9879859076605902, True)
(0.9941034952799479, True)
(0.9768653869628906, True)
(0.9672409057617187, True)
(0.9234320746527778, False)
(0.9815115186903212, True)
(0.9100140889485677, False)
(0.9689888848198784, False)
(0.9166670905219184, True)
(0.9953699747721354, True)
(0.9807843526204427, True)
(0.9145840115017361, True)
(0.9767080518934462, True)
(0.9756429884168837, False)
(0.9758702596028646, True)
(0.978177981906467, True)
(0.9874637179904514, True)
(0.9849557664659289, True)
(0.9841891818576389, True)
(0.9680437723795573, False)
(0.9768463982476129, False)
(0.9884674919976129, True)
(0.9680828518337674, True)
(0.9915179782443576

## Debug

In [5]:
# Parse the JSON Lines file into a list with one decoded object per line.
data = []
with open('CLR10.jsonl') as f:
    data.extend(map(json.loads, f))

In [7]:
# Spot-check the task id of the record at index 10.
data[10]["task_id"]

'HumanEval/1'

In [80]:
# NOTE(review): `block_0` is not defined in any cell of this notebook as saved;
# this line only works with leftover kernel state.
print(len(block_0))

10


In [109]:
def form_emb(data):
    """Embed the ``completion`` field of every record in ``data``.

    Uses the notebook-level ``model`` and ``tokenizer`` together with
    ``functions.get_embedding``.

    NOTE(review): ``functions`` is not imported in the notebook's import cell
    (only ``helper`` and ``debug`` are). With the old bare ``except`` the
    resulting ``NameError`` was swallowed and an empty list came back without
    any hint; it is now reported for every skipped record.

    Args:
        data: Iterable of mappings that provide ``"task_id"`` and
            ``"completion"`` keys.

    Returns:
        list: ``embedding.detach()`` for each record that could be embedded,
        in input order. Records whose embedding raises an ``Exception`` are
        skipped (best effort) and reported on stdout.
    """
    embeddings = []
    for line in data:
        task_id = line["task_id"]
        print(task_id)
        try:
            embedding = functions.get_embedding(model, tokenizer, line["completion"])
            fix = embedding.detach()
        except Exception as exc:
            # Catch `Exception`, not everything: a bare `except` also traps
            # KeyboardInterrupt, making a long run impossible to interrupt.
            print(f"  skipped {task_id}: {exc!r}")
            continue
        embeddings.append(fix)

    return embeddings

In [110]:
# Embed one block with the local debug version of form_emb.
# NOTE(review): `block_0` is not defined in any cell of this notebook as saved.
block_emb = form_emb(block_0)

HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2


In [None]:
# Debug scratch: embed ten individual samples and build several views of the
# results for inspection.
# NOTE(review): `functions` and `sample0` ... `sample9` are not defined in any
# cell of this notebook as saved; this cell was never executed (`In [None]`).
_debug_samples = [
    sample0, sample1, sample2, sample3, sample4,
    sample5, sample6, sample7, sample8, sample9,
]

# One embedding call per sample (replaces ten copy-pasted lines).
embedding_room_1 = [
    functions.get_embedding(model, tokenizer, sample) for sample in _debug_samples
]

# Keep the individual names bound in case other cells refer to them.
(
    sample0_emb, sample1_emb, sample2_emb, sample3_emb, sample4_emb,
    sample5_emb, sample6_emb, sample7_emb, sample8_emb, sample9_emb,
) = embedding_room_1

# First element of every embedding, with and without autograd history.
embedding_room_0 = [emb[0] for emb in embedding_room_1]
embedding_room_detach = [first.detach() for first in embedding_room_0]

# Average every embedding over axis 0 and stack the results into one array.
embeddings = np.array([emb.mean(0).detach().numpy() for emb in embedding_room_1])

# Previously this array was built from `embeddings` by mistake, discarding the
# list computed for it; it is now built from its own list.
# NOTE(review): despite the name, the original expression also applies
# `.mean(0)`, so the values equal `embeddings`. Confirm whether an un-averaged
# variant was intended before changing it.
embeddings_nomean = np.array(
    [emb.mean(0).detach().numpy() for emb in embedding_room_1]
)