In [223]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch
from torch import nn, optim
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

tqdm.pandas()

### Load Dataset

In [224]:
df = pd.read_pickle('data/processed.pickle')
df.head()

Unnamed: 0,docstring,code,relevance,repo,func_url,split
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,train
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...,train
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...,train


### Get Embeddings

In [225]:
EMBEDDING_DIMENSION = 400

def get_tokens_and_embeddings(natural_language, code, tokenizer, model):

    nl_tokens = tokenizer.tokenize(natural_language)
    code_tokens = tokenizer.tokenize(code)

    tokens = ['CLS'] + nl_tokens + ['SEP'] + code_tokens + ['EOS']

    if len(tokens) > EMBEDDING_DIMENSION:
        tokens = tokens[:EMBEDDING_DIMENSION]

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    embedding = model(torch.tensor(token_ids)[None, :])[0][0]

    pad_value_before = (EMBEDDING_DIMENSION - len(tokens)) // 2
    pad_value_after = pad_value_before + ((EMBEDDING_DIMENSION - len(tokens)) % 2)

    padded = nn.functional.pad(embedding, (0, 0, pad_value_before, pad_value_after), "constant", 0)

    return tokens, len(tokens), padded.detach().numpy()

In [226]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
bert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
bert_model.to(device)

print()




In [227]:
df = df.iloc[:100]
df.head(10)

Unnamed: 0,docstring,code,relevance,repo,func_url,split
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,train
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...,train
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...,train
5,Randomize server list using local IPv4 address...,public static <T extends EurekaEndpoint> List<...,0,Netflix/eureka,https://github.com/Netflix/eureka/blob/48446d9...,train
6,"Decompress an input stream from a file, based ...",public static InputStream decompress(final Inp...,3,apache/incubator-druid,https://github.com/apache/incubator-druid/blob...,train
7,Save distribution statistics to the file syste...,"public void save(@NonNull File meanFile, @NonN...",0,deeplearning4j/deeplearning4j,https://github.com/deeplearning4j/deeplearning...,train
8,Score the given multi layer network\n@param mo...,public static double score(MultiLayerNetwork m...,0,deeplearning4j/deeplearning4j,https://github.com/deeplearning4j/deeplearning...,train
9,This method takes a string as input reverses i...,private static String reverseString(String in)...,3,zaproxy/zaproxy,https://github.com/zaproxy/zaproxy/blob/0cbe51...,train


In [228]:
df[['tokens', 'token_count', 'embeddings']] = df.progress_apply(lambda x: get_tokens_and_embeddings(x['docstring'], x['code'], tokenizer=bert_tokenizer, model=bert_model), axis=1, result_type='expand')
df.head()

100%|██████████| 100/100 [00:10<00:00,  9.62it/s]


Unnamed: 0,docstring,code,relevance,repo,func_url,split,tokens,token_count,embeddings
0,Attempt to convert the specified value to epoc...,private static String coerceToEpoch(String s) ...,2,spring-projects/spring-boot,https://github.com/spring-projects/spring-boot...,train,"[CLS, Attempt, Ġto, Ġconvert, Ġthe, Ġspecified...",202,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,Generate a server side cookie given the cookie...,private Cookie createCookie(String str) throws...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train,"[CLS, Gener, ate, Ġa, Ġserver, Ġside, Ġcookie,...",223,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,Generate httponly cookie from HS2 cookie\n@par...,private static String getHttpOnlyCookieHeader(...,2,apache/spark,https://github.com/apache/spark/blob/25ee0474f...,train,"[CLS, Gener, ate, Ġhttp, only, Ġcookie, Ġfrom,...",129,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,Copies all files from source to target and set...,"public static void copy(Path sourcePath, Path ...",3,apache/flink,https://github.com/apache/flink/blob/b62db93bf...,train,"[CLS, Cop, ies, Ġall, Ġfiles, Ġfrom, Ġsource, ...",253,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,Saves the configuration info to the disk.,public synchronized void save() {\n if ...,3,jenkinsci/jenkins,https://github.com/jenkinsci/jenkins/blob/44c4...,train,"[CLS, S, aves, Ġthe, Ġconfiguration, Ġinfo, Ġt...",321,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


### Train Classification Model
Classes: 0, 1, 2, 3 - relevance score provided by dataset, with 3 being the highest and 0 being the lowest

#### Create dataset to use for training
1. Start with X = cosine similarity and y = relvance score
2. Later add more features to X (code and docstring embeddings, results from static analysis?)

In [229]:
model_df = df[['split', 'embeddings', 'relevance']].copy().dropna()
model_df.head()

Unnamed: 0,split,embeddings,relevance
0,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
1,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
2,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
3,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3
4,train,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3


In [259]:
train_df = model_df[model_df.split == 'train'].drop('split', axis = 1)
test_df = model_df[model_df.split == 'test'].drop('split', axis = 1)

train_df.head()

Unnamed: 0,embeddings,relevance
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3


#### Train model

In [231]:
embedding_dims = 768
kernel_size = 3
hidden_dims = 132
number_of_class = 4
dropout = 0.5
learning_rate = 1e-4
epochs = 1

In [232]:
model = nn.Sequential(
    nn.Conv1d(embedding_dims, hidden_dims, kernel_size),
    nn.ReLU(),
    nn.MaxPool1d(kernel_size),
    nn.Linear(hidden_dims, number_of_class),
    nn.Sigmoid()
)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [287]:
loss = []

for i in tqdm(range(epochs)):
    model.train()

    for index, row in train_df.iterrows():
        x = torch.tensor(row.embeddings).T
        y = torch.tensor(row.relevance)

        y_hat = model(x)

        loss = criterion(y_hat, y)

        break
        # loss.append([loss.item])
        # loss.backward()
        # optimizer.step()

  0%|          | 0/1 [00:00<?, ?it/s]


ValueError: Expected input batch_size (132) to match target batch_size (0).

In [None]:
sns.lineplot(loss)
plt.xlabel('epoch')
plt.ylabel('loss')

### Evaluate

In [None]:
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred, zero_division=True))