# Pipe for create embeddings

## Imports

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import helper
import debug
import json

import pandas as pd
import numpy as np
import seaborn as sns
import math

import sklearn.metrics as metrics
from torch.nn.utils.rnn import pad_sequence
import evaluate

## Model

In [2]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

## Data

In [3]:
df = pd.read_json("results/CLR10.jsonl", lines=True)
big_boys = df.copy()

In [4]:
big_boys["completion"] = big_boys["completion"].str.len()
big_boys = big_boys[big_boys.completion.astype(int) > 1000]

In [5]:
df.shape

(1640, 2)

In [6]:
samples_10 = helper.read_samples('results/CLR10.jsonl')
samples_1 = helper.read_samples('results/CLR1.jsonl_results.jsonl')

## Get Embeddings

In [7]:
emb = helper.read_embeddings("bert_emb.jsonl")

## Kernel entropie

In [69]:
kernel_entropy_values = helper.get_entropies_rbf(emb)

In [70]:
kernel_entropy_values

[-0.9906866839043654,
 -0.9682647596326586,
 -0.9707090033675423,
 -0.9903168230827406,
 -0.9212972728820279,
 -0.9754937684746404,
 -0.8693623332230956,
 -0.9773200281372018,
 -0.9749673609618376,
 -0.9839933281619947,
 -0.9735575282920603,
 -0.9128740040103315,
 -0.9762364771357336,
 -0.9879859406668807,
 -0.9941037023334877,
 -0.976865346196268,
 -0.9672409687729532,
 -0.9234320513677676,
 -0.9815115570905041,
 -0.9100140229182206,
 -0.9689889110721438,
 -0.9166670650951859,
 -0.9953700546404302,
 -0.9807842870293858,
 -0.9145839346461689,
 -0.9767080232988707,
 -0.9756429657931432,
 -0.9758702291536924,
 -0.9781779397859965,
 -0.9874637404619095,
 -0.984955783880895,
 -0.9841891582771264,
 -0.9680437043067428,
 -0.9768463799061754,
 -0.9884675435394297,
 -0.9680828035342145,
 -0.9915180166859443,
 -0.979698892156884,
 -0.9898779595274375,
 -0.9239467127943595,
 -0.9937407435793499,
 -0.9645234930081781,
 -0.9710443378497532,
 -0.9281242784045861,
 -0.9771309140496339,
 -0.993750014

In [71]:
kernel_entropy_values_flipped = []
for i in kernel_entropy_values:
    kernel_entropy_values_flipped.append(abs(i))
kernel_entropy_values_flipped

[0.9906866839043654,
 0.9682647596326586,
 0.9707090033675423,
 0.9903168230827406,
 0.9212972728820279,
 0.9754937684746404,
 0.8693623332230956,
 0.9773200281372018,
 0.9749673609618376,
 0.9839933281619947,
 0.9735575282920603,
 0.9128740040103315,
 0.9762364771357336,
 0.9879859406668807,
 0.9941037023334877,
 0.976865346196268,
 0.9672409687729532,
 0.9234320513677676,
 0.9815115570905041,
 0.9100140229182206,
 0.9689889110721438,
 0.9166670650951859,
 0.9953700546404302,
 0.9807842870293858,
 0.9145839346461689,
 0.9767080232988707,
 0.9756429657931432,
 0.9758702291536924,
 0.9781779397859965,
 0.9874637404619095,
 0.984955783880895,
 0.9841891582771264,
 0.9680437043067428,
 0.9768463799061754,
 0.9884675435394297,
 0.9680828035342145,
 0.9915180166859443,
 0.979698892156884,
 0.9898779595274375,
 0.9239467127943595,
 0.9937407435793499,
 0.9645234930081781,
 0.9710443378497532,
 0.9281242784045861,
 0.9771309140496339,
 0.9937500143639106,
 0.9179362649244074,
 0.9755149026845

## Lexical sim

In [37]:
def lexical_sim(samples_1, samples_10):

    rouge = evaluate.load("rouge")
    sims = []
    completions = []

    for samp in samples_10:
        completions.append(samp["completion"])
    
    blocks = helper.split_into_blocks(completions, 10)

    for i in range(0, 164):
        print(i)
        predictions = [samples_1[i]["completion"]]
        references = [blocks[i]]
        result = rouge.compute(predictions=predictions, references=references)
        sims.append(result["rougeLsum"])
        i = i+1




    return sims

In [38]:
res = lexical_sim(samples_1, samples_10)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163


In [41]:
res

[0.9333333333333333,
 0.4292237442922375,
 0.8421052631578948,
 1.0,
 0.5647058823529413,
 0.8571428571428571,
 0.4761904761904763,
 1.0,
 0.8727272727272727,
 0.5901639344262295,
 0.7213114754098361,
 0.8823529411764706,
 1.0,
 0.9666666666666666,
 1.0,
 1.0,
 1.0,
 0.5714285714285713,
 1.0,
 0.9500000000000001,
 0.7014925373134328,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.8928571428571429,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9411764705882353,
 0.7272727272727274,
 0.9666666666666666,
 1.0,
 0.9333333333333333,
 1.0,
 1.0,
 1.0,
 0.7173913043478259,
 0.9078014184397162,
 0.7483870967741935,
 1.0,
 1.0,
 1.0,
 1.0,
 0.9866666666666666,
 0.9090909090909091,
 0.9473684210526316,
 1.0,
 1.0,
 0.8846153846153846,
 1.0,
 1.0,
 1.0,
 1.0,
 0.4819277108433735,
 1.0,
 0.9600000000000001,
 0.9122807017543859,
 1.0,
 0.8518518518518519,
 0.617283950617284,
 1.0,
 1.0,
 1.0,
 1.0,
 0.6000000000000001,
 0.822695035460993,
 1.0,
 0.8235294117647058,
 0.8831168831168831,
 0.6938775510204083,
 0.6417582417582417,
 0.

## Evaluate

In [42]:
is_prediction_correct = helper.get_pass(samples_1)

In [43]:
def convert_pred(pred):
    arr = []
    for i in pred:
        if i == False:
            arr.append(0)
        else:
            arr.append(1)
    return arr

In [44]:
test = convert_pred(is_prediction_correct)

In [45]:
kent_auroc = metrics.roc_auc_score(
    test,
    res
)

In [46]:
kent_auroc

0.6411136536994662

In [47]:
result = helper.compare_results(res, is_prediction_correct)

In [48]:
print(len(res))

164


In [49]:
for t in result:
    print(t)

(0.9333333333333333, False)
(0.4292237442922375, False)
(0.8421052631578948, True)
(1.0, True)
(0.5647058823529413, True)
(0.8571428571428571, True)
(0.4761904761904763, False)
(1.0, True)
(0.8727272727272727, True)
(0.5901639344262295, True)
(0.7213114754098361, False)
(0.8823529411764706, False)
(1.0, True)
(0.9666666666666666, True)
(1.0, True)
(1.0, True)
(1.0, True)
(0.5714285714285713, False)
(1.0, True)
(0.9500000000000001, False)
(0.7014925373134328, False)
(1.0, True)
(1.0, True)
(1.0, True)
(1.0, True)
(1.0, True)
(0.8928571428571429, False)
(1.0, True)
(1.0, True)
(1.0, True)
(1.0, True)
(0.9411764705882353, True)
(0.7272727272727274, False)
(0.9666666666666666, False)
(1.0, True)
(0.9333333333333333, True)
(1.0, False)
(1.0, False)
(1.0, False)
(0.7173913043478259, False)
(0.9078014184397162, True)
(0.7483870967741935, False)
(1.0, True)
(1.0, True)
(1.0, False)
(1.0, True)
(0.9866666666666666, True)
(0.9090909090909091, True)
(0.9473684210526316, False)
(1.0, True)
(1.0, F

## Debug

In [5]:
data = []
with open('CLR10.jsonl') as f:
    for line in f:
        data.append(json.loads(line))

In [7]:
data[10]["task_id"]

'HumanEval/1'

In [80]:
print(len(block_0))

10


In [109]:
def form_emb(data):
    embeddings = []
    for line in data:
        print(line["task_id"])
        try:
            embedding = functions.get_embedding(model, tokenizer, line["completion"])
            fix = embedding.detach()
            embeddings.append(fix)
        except:
            continue
            
    return embeddings

In [110]:
block_emb = form_emb(block_0)

HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2
HumanEval/2


In [None]:
sample0_emb = functions.get_embedding(model, tokenizer, sample0)
sample1_emb = functions.get_embedding(model, tokenizer, sample1)
sample2_emb = functions.get_embedding(model, tokenizer, sample2)
sample3_emb = functions.get_embedding(model, tokenizer, sample3)
sample4_emb = functions.get_embedding(model, tokenizer, sample4)
sample5_emb = functions.get_embedding(model, tokenizer, sample5)
sample6_emb = functions.get_embedding(model, tokenizer, sample6)
sample7_emb = functions.get_embedding(model, tokenizer, sample7)
sample8_emb = functions.get_embedding(model, tokenizer, sample8)
sample9_emb = functions.get_embedding(model, tokenizer, sample9)


embedding_room_detach = [sample0_emb[0].detach(), sample1_emb[0].detach(), sample2_emb[0].detach(), sample3_emb[0].detach(), sample4_emb[0].detach(), sample5_emb[0].detach(), sample6_emb[0].detach(), sample7_emb[0].detach(), sample8_emb[0].detach(), sample9_emb[0].detach()]
embedding_room_0 = [sample0_emb[0], sample1_emb[0], sample2_emb[0], sample3_emb[0], sample4_emb[0], sample5_emb[0], sample6_emb[0], sample7_emb[0], sample8_emb[0], sample9_emb[0]]
embedding_room_1 = [sample0_emb, sample1_emb, sample2_emb, sample3_emb, sample4_emb, sample5_emb, sample6_emb, sample7_emb, sample8_emb, sample9_emb]



embeddings = [i.mean(0).detach().numpy() for i in embedding_room_1]
embeddings = np.array(embeddings)

embeddings_nomean = [i.mean(0).detach().numpy() for i in embedding_room_1]
embeddings_nomean = np.array(embeddings)