## Imports

In [None]:
import os
import time
import pyparsing
import jsonpickle
from enum import Enum
import matplotlib.pyplot as plt
from ast import literal_eval as make_tuple
#
#import base64
import pybase64 as base64
#
import torch
import einops
from einops import rearrange
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from transformers import AutoModelForSequenceClassification

In [None]:
import numpy as np
import pandas as pd
#
from datasets import Dataset
#
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
#
from torch.utils.data import DataLoader

## Constants

In [None]:
# Directory that every JSON file read or written by this notebook is joined onto.
DATA_FOLDER = "./data/"

In [None]:
# Name of the input file (inside DATA_FOLDER) opened by the "Load data" cell.
# The commented-out line is the alternative input file name.
#FILE = "files_of_interest.json"
FILE = "files_of_interest_8.json"

In [None]:
# NOTE(review): neither constant is referenced by the visible cells (the
# encoding cell hard-codes its own window size of 512) — confirm they are still needed.
BERT_EMBEDDING_SIZE = 768  # presumably the encoder's hidden size — TODO confirm
BERT_MAX_TOKENS = 512  # presumably the encoder's maximum input length — TODO confirm

## Utils

In [None]:
class ProgrammingLanguage(Enum):
    """Languages handled by this notebook.

    Besides the members themselves, the class offers two static helpers:
    one that selects the pyparsing comment suppressors for a language and
    one that guesses the language from a file name's suffix.
    """

    PYTHON = "Python"
    JAVA = "Java"
    JAVASCRIPT = "JavaScript"
    GO = "Go"
    PHP = "PHP"
    RUBY = "Ruby"

    @staticmethod
    def get_comment_filters(programming_language):
        """Return the list of comment suppressors to apply for a language.

        Python and Ruby get only the ``#`` style filter; Java, JavaScript
        and Go get only the C++ style filter; every other value (PHP
        included) gets both, ``#`` style first.
        """
        hash_only = (ProgrammingLanguage.PYTHON, ProgrammingLanguage.RUBY)
        slash_only = (
            ProgrammingLanguage.JAVA,
            ProgrammingLanguage.JAVASCRIPT,
            ProgrammingLanguage.GO,
        )
        selected = []
        # A language is denied a filter only when it belongs to the *other*
        # exclusive group; anything in neither group receives both filters.
        if programming_language not in slash_only:
            selected.append(pyparsing.pythonStyleComment.suppress())
        if programming_language not in hash_only:
            selected.append(pyparsing.cppStyleComment.suppress())
        return selected

    @staticmethod
    def get_lang_from_file_name(file_name):
        """Guess the language from the file name's suffix.

        Any name that matches none of the known suffixes is reported as
        Ruby (there is no explicit ``.rb`` check).
        """
        known_suffixes = (
            (".py", ProgrammingLanguage.PYTHON),
            (".php", ProgrammingLanguage.PHP),
            (".js", ProgrammingLanguage.JAVASCRIPT),
            (".java", ProgrammingLanguage.JAVA),
            (".go", ProgrammingLanguage.GO),
        )
        for suffix, language in known_suffixes:
            if file_name.endswith(suffix):
                return language
        return ProgrammingLanguage.RUBY

## Load data

In [None]:
# Read the commit data for the files of interest.
# Every line of the file is decoded and the last decoded line is what stays
# in `data`; presumably the file holds one JSON document on a single line.
data_path = os.path.join(DATA_FOLDER, FILE)
with open(data_path) as f_in:
    for json_line in f_in:
        data = jsonpickle.decode(json_line)

## Decode files

In [None]:
# Lookup filled by the decoding loop: (repo, file sha, commit sha) -> {"source": cleaned text}.
file_states = {}

In [None]:
# Decode every version of every tracked file, strip comments and blank lines,
# and store the cleaned text in file_states under (repo, file sha, commit sha).
# Versions that are not base64-encoded, or that fail to decode, are skipped.
# NOTE(review): the comment filters run over the raw text, so comment markers
# inside string literals are presumably removed as well — confirm this is acceptable.
for repo in data:
    print(repo)
    repo_files = data[repo]
    for position, file_name in enumerate(repo_files, start=1):
        # Progress line every 10 files.
        if position % 10 == 0:
            print(f"\t{position}/{len(repo_files)}")
        for commit in repo_files[file_name]:
            # First file of the commit whose name matches the tracked file.
            matched = next((f for f in commit["files"] if file_name == f["name"]), None)
            if matched is None:
                continue
            file_sha = matched["sha"]
            file_content = matched["content"]
            file_encoding = matched["content_encoding"] if "content_encoding" in matched else None
            if not file_content or file_encoding != "base64":
                continue
            try:
                decoded_content = base64.b64decode(file_content).decode("utf-8")
                programming_lang = ProgrammingLanguage.get_lang_from_file_name(file_name)
                clean_decoded_content = decoded_content
                for comment_filter in ProgrammingLanguage.get_comment_filters(programming_lang):
                    clean_decoded_content = comment_filter.transformString(clean_decoded_content)
                # Drop lines that are empty or whitespace-only.
                kept_lines = [ln for ln in clean_decoded_content.split("\n") if ln.strip()]
                clean_decoded_content = "\n".join(kept_lines)
                file_states[(repo, file_sha, commit["sha"])] = {"source": clean_decoded_content}
            except Exception as e:
                # Best effort: report the failure and move on to the next version.
                print("ERROR: ", e)

In [None]:
# Repository names, in the iteration order of `data`.
repos = list(data)

In [None]:
# Persist the cleaned sources so later sessions can skip the decoding step.
source_lookup_path = os.path.join(DATA_FOLDER, "files_of_interest_source_lookup.json")
encoded_file_data = jsonpickle.encode(file_states)
with open(source_lookup_path, "w") as f_out:
    f_out.write(encoded_file_data)

In [None]:
# Load the repository -> id mapping. Each line is decoded and the last one
# wins; presumably the file is a single JSON line.
repo_to_id_path = os.path.join(DATA_FOLDER, "repo_to_id.json")
with open(repo_to_id_path, "r") as f_in:
    for json_line in f_in:
        REPO_TO_ID = jsonpickle.decode(json_line)

In [None]:
# Display the mapping as loaded (notebook cell output).
REPO_TO_ID

In [None]:
# Register the first entry of `repos` under the id '8'.
# NOTE(review): the id is hard-coded (presumably mirroring the 8 in FILE) and
# repos[0] is whichever repository happens to come first — confirm both
# before running this cell against a different input.
REPO_TO_ID[repos[0]] = '8'

In [None]:
# Display the mapping again to check the entry added in the previous cell.
REPO_TO_ID

## Alternative - load previous save

In [None]:
# Alternative entry point: restore file_states from a previous run.
# NOTE(review): the encoding cell of this notebook writes nlp_dataset
# (repo -> list of instances) under this same file name; keys of that shape
# would not parse as tuples below — confirm which format the file holds.
embedding_lookup_path = os.path.join(DATA_FOLDER, "file_of_interest_embedding_lookup.json")
with open(embedding_lookup_path, "r") as f_in:
    for json_line in f_in:
        file_states = jsonpickle.decode(json_line)
# The decoded keys are strings; parse each back into the Python literal it
# spells out (later cells unpack the keys as 3-tuples).
adjusted_file_states = {
    make_tuple(raw_key): state for raw_key, state in file_states.items()
}
file_states = adjusted_file_states

In [None]:
# Distinct repository names found in the (repo, file sha, commit sha) keys.
# The order comes from a set, so it is arbitrary.
repos = list({repo for repo, _, _ in file_states})

In [None]:
# Display the repository list (notebook cell output).
repos

## Alternative - Load existing file source lookup

In [None]:
# Alternative entry point: reload the cleaned sources saved by an earlier run.
# Each line is decoded and the last one wins; presumably a single JSON line.
source_lookup_path = os.path.join(DATA_FOLDER, "files_of_interest_source_lookup.json")
with open(source_lookup_path, "r") as f_in:
    for json_line in f_in:
        file_states = jsonpickle.decode(json_line)

In [None]:
# The reloaded keys are strings; parse each back into the Python literal it
# spells out (later cells unpack the keys as 3-tuples).
adjusted_file_states = {
    make_tuple(raw_key): state for raw_key, state in file_states.items()
}
file_states = adjusted_file_states

In [None]:
# Distinct repository names found in the (repo, file sha, commit sha) keys.
# The order comes from a set, so it is arbitrary.
repos = list({repo for repo, _, _ in file_states})

In [None]:
# Print how many file states belong to each repository.
for repo in repos:
    cnt = sum(1 for entry_repo, _, _ in file_states if repo == entry_repo)
    print(f"{repo} \t => {cnt}")

## Encode files

In [None]:
# Group the cleaned sources per repository: nlp_dataset[repo] is a list of
# {"text": source} dicts, in the iteration order of file_states.
nlp_dataset = {}
for repo in repos:
    nlp_dataset[repo] = [
        {
            "text": state["source"],
        }
        for (file_repo, _, _), state in file_states.items()
        if repo == file_repo
    ]

In [None]:
# Print the number of instances collected for each repository.
for repo, instances in nlp_dataset.items():
    print(f"{repo} \t => {len(instances)}")

In [None]:
# Tokenizer matching the GraphCodeBERT checkpoint used for the embeddings.
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

In [None]:
# Load GraphCodeBERT with a 2-label sequence-classification head.
# NOTE(review): the visible cells only read hidden states from this model,
# never its logits, so the classification head looks unused — confirm.
model = AutoModelForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", num_labels=2)

In [None]:
# Put the model in inference mode and move it to the GPU when one is available.
use_gpu = torch.cuda.is_available()
device = "cuda" if use_gpu else "cpu"
model.eval()
if use_gpu:
    model.cuda()

In [None]:
# Embed every source file with the loaded model and persist the results.
#
# Each file is tokenized and cut into overlapping windows of `size` positions
# (a leading CLS token plus up to size-1 code tokens) that start every `step`
# tokens. The last-layer hidden state at the CLS position of every window is
# averaged into one vector per file, stored under instance["embedding"]
# (None for empty sources).
#
# Fixes relative to the previous version of this cell:
#   * the final window used to be sliced one token short, so the last token
#     of every file was never fed to the model;
#   * padding positions were attended to because no attention mask was passed;
#   * embeddings were only stored in nlp_dataset, while the "Order encodings"
#     cell reads them from file_states[key]["embedding"].
# NOTE(review): the first two fixes change the numeric embeddings slightly;
# regenerate lookups produced by the old code if they must stay comparable.
step = 256        # distance between the starts of consecutive windows
size = 512        # window length fed to the model, CLS token included
batch_size = 64   # number of windows per forward pass
#
for repo in REPO_TO_ID:
    if repo not in nlp_dataset:
        continue
    print(repo)
    for i, instance in enumerate(nlp_dataset[repo]):
        if (i+1)%10==0:
            print(f"\t => {i+1}/{len(nlp_dataset[repo])}")
        source = instance["text"]
        # Re-running the cell keeps embeddings that were already computed.
        if instance.get("embedding") is not None:
            continue
        if source is None or not source.strip():
            instance["embedding"] = None
            continue
        tokens = tokenizer.tokenize(source)
        if not tokens:
            instance["embedding"] = None
            continue
        # Build the windows together with their attention masks.
        chunks = []
        masks = []
        for j in range(0, len(tokens), step):
            # Slicing past the end is safe, so the last window keeps the
            # final token instead of dropping it.
            window = tokens[j:j + size - 1]
            n_pad = size - 1 - len(window)
            chunk = [tokenizer.cls_token] + window + [tokenizer.pad_token] * n_pad
            chunks.append(tokenizer.convert_tokens_to_ids(chunk))
            # 1 for CLS and real tokens, 0 for padding.
            masks.append([1] * (len(window) + 1) + [0] * n_pad)
            if j + size > len(tokens):
                # This window already reached the end of the file.
                break
        # Run the windows through the model in batches.
        all_hidden = []
        for j in range(0, len(chunks), batch_size):
            input_ids = torch.tensor(chunks[j:j + batch_size]).to(device)
            attention_mask = torch.tensor(masks[j:j + batch_size]).to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
            # Last layer, first (CLS) position of every window in the batch.
            all_hidden.append(outputs.hidden_states[-1][:, 0, :].detach().cpu())
        # One vector per file: the mean over all of its windows.
        instance["embedding"] = torch.mean(torch.cat(all_hidden, 0), dim=0).numpy()
        # Release the batch tensors before the next file.
        del input_ids, attention_mask, outputs
        torch.cuda.empty_cache()
    #==========================================
    # Mirror the embeddings into file_states, matching on the source text the
    # instance was built from, so that cells which look embeddings up by
    # (repo, file sha, commit sha) find them.
    embedding_by_text = {inst["text"]: inst["embedding"] for inst in nlp_dataset[repo]}
    for key, state in file_states.items():
        if key[0] == repo and state.get("source") in embedding_by_text:
            state["embedding"] = embedding_by_text[state["source"]]
    #==========================================
    # Save this repository's instances to its own file.
    save_data = {}
    save_data[repo] = nlp_dataset[repo]
    encoded = jsonpickle.encode(save_data)
    with open(os.path.join(DATA_FOLDER, f"file_of_interest_embedding_lookup_{REPO_TO_ID[repo]}.json"), "w") as f_out:
        f_out.write(encoded)
    #==========================================
# Save all repositories together.
encoded = jsonpickle.encode(nlp_dataset)
with open(os.path.join(DATA_FOLDER, "file_of_interest_embedding_lookup.json"), "w") as f_out:
    f_out.write(encoded)

In [None]:
# Prints a fixed marker; presumably a leftover check that the kernel is responsive.
print("test")

## Order encodings

In [None]:
# Build, for every tracked file, the list of its embeddings ordered by commit date:
#   sorted_file_embeddings[repo][file_name] -> [{"commit": ..., "embedding": ...}, ...]
#
# File versions without an entry in file_states (the decoding cell skips
# versions that are not base64 or fail to decode) are skipped and counted
# instead of aborting the whole cell with a KeyError.
# state["embedding"] is still read strictly, so a file_states that carries no
# embeddings at all fails loudly rather than yielding an empty result.
sorted_file_embeddings = {}
missing_states = 0
for repo in data:
    sorted_file_embeddings[repo] = {}
    for file_name in data[repo]:
        sorted_embedding = []
        sorted_commits = sorted(data[repo][file_name], key=lambda c: c["date"])
        for commit in sorted_commits:
            # First file of the commit whose name matches the tracked file.
            this_file = next((f for f in commit["files"] if file_name == f["name"]), None)
            if this_file is None:
                continue
            state = file_states.get((repo, this_file["sha"], commit["sha"]))
            if state is None:
                missing_states += 1
                continue
            sorted_embedding.append({"commit": commit, "embedding": state["embedding"]})
        sorted_file_embeddings[repo][file_name] = sorted_embedding
if missing_states:
    print(f"Skipped {missing_states} file versions without a decoded state")

In [None]:
# Persist the date-ordered embeddings.
sorted_embeddings_path = os.path.join(DATA_FOLDER, "p2_file_of_interest_sorted_embeddings.json")
encoded_sorted_file_embeddings = jsonpickle.encode(sorted_file_embeddings)
with open(sorted_embeddings_path, "w") as f_out:
    f_out.write(encoded_sorted_file_embeddings)