In [None]:
! pip install nucliadb-sdk==1.2.5
! pip install nucliadb-dataset==1.2.3
! pip install nucliadb-models==2.0.4
! pip install transformers
! pip install sentence-transformers

In [4]:

import requests
from nucliadb_sdk.knowledgebox import KnowledgeBox
from nucliadb_sdk.utils import create_knowledge_box, get_or_create

from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer


## Setup

Once we've started **NucliaDB's container**

``` 
docker run -it \
       -e LOG=INFO \
       -p 8080:8080 \
       -p 8060:8060 \
       -p 8040:8040 \
       -v nucliadb-standalone:/data \
       nuclia/nucliadb:latest
```
we'll check the connection:

In [5]:
# Ping the NucliaDB HTTP port published by the container above.
# The timeout keeps this cell from hanging forever when the container is not running.
response = requests.get("http://0.0.0.0:8080", timeout=10)
response

<Response [200]>

## Setup - creating a KB

In nucliadb our data containers are called knowledge boxes.

To start working, we need to create one:

In [6]:
# Obtain the knowledge box used in the rest of the notebook.
# NOTE(review): presumably `get_or_create` reuses an existing KB with this name
# and only creates it when missing — confirm against the nucliadb_sdk docs.
my_kb = get_or_create("my_code_search_kb_3")
my_kb

<nucliadb_sdk.knowledgebox.KnowledgeBox at 0x1786870a0>

## Data preparation  - Collection

Then we gather the data. 

In this case we use the inspect library to gather all the functions from our nucliadb_sdk module


In [None]:
import nucliadb_sdk
from inspect import getmembers, isfunction, ismodule,isclass,getsource

def get_all_code(target_module, skip_modules=("logging",)):
    """Collect the source code of the functions and methods found in a package's submodules.

    Every module that is an attribute of ``target_module`` is inspected, except
    the ones whose attribute name appears in ``skip_modules``. For each inspected
    module the following are kept:

    * module-level functions defined in that module (functions merely imported
      from elsewhere are ignored), and
    * functions found on the classes defined in that module, leaving out any
      function whose name contains a double underscore.

    Args:
        target_module: Module whose submodules are scanned.
        skip_modules: Attribute names of submodules to ignore.
            Defaults to ``("logging",)``.

    Returns:
        list[str]: The source of each collected function as returned by
        ``inspect.getsource``. Functions whose source cannot be retrieved
        are skipped.
    """
    functions = []
    for module_name, module in getmembers(target_module, ismodule):
        if module_name in skip_modules:
            continue
        # Module-level functions that really belong to this module.
        functions.extend(
            fn
            for _, fn in getmembers(module, isfunction)
            if fn.__module__ == module.__name__
        )
        # Functions of the classes that are defined in this module.
        for _, my_class in getmembers(module, isclass):
            if my_class.__module__ != module.__name__:
                continue
            functions.extend(
                fn
                for _, fn in getmembers(my_class, isfunction)
                if fn.__module__ == module.__name__ and "__" not in fn.__name__
            )

    functions_code = []
    for function in functions:
        try:
            functions_code.append(getsource(function))
        except (OSError, TypeError):
            # getsource raises when the source file is not available;
            # one such function should not abort the whole collection.
            continue
    return functions_code
# Gather the source of every nucliadb_sdk function, trimming surrounding whitespace.
my_functions = list(map(str.strip, get_all_code(nucliadb_sdk)))


Just a quick check to see how many functions we gathered

In [None]:
# Number of source snippets collected in `my_functions`.
len(my_functions)

## Data preparation  - Create vectors

Once we have all the code, we need to calculate the vectors.
In this case we are using four models:

- Microsoft's unixcoder-base, loaded with `AutoTokenizer` / `AutoModel` from `microsoft/unixcoder-base`

- `model_t5 = SentenceTransformer("krlvi/sentence-t5-base-nlpl-code_search_net")`

- `model_bert = SentenceTransformer("krlvi/sentence-msmarco-bert-base-dot-v5-nlpl-code_search_net")`

- `model_distilroberta = SentenceTransformer("flax-sentence-embeddings/st-codesearch-distilroberta-base")`


In [None]:
# Tokenizer and encoder are loaded from the same Hugging Face checkpoint.
UNIXCODER_CHECKPOINT = "microsoft/unixcoder-base"

tokenizer = AutoTokenizer.from_pretrained(UNIXCODER_CHECKPOINT)
model = AutoModel.from_pretrained(UNIXCODER_CHECKPOINT)

def get_vectors_roberta_pool(tokenizer, model, code_list):
    """Encode ``code_list`` with a Hugging Face encoder and return a single vector.

    The inputs are tokenized together (padded, truncated to 1024 tokens) and run
    through ``model`` with gradient tracking disabled, since this is inference only.

    Args:
        tokenizer: Callable tokenizer returning keyword arguments for ``model``.
        model: Callable encoder; its first output is indexed as ``[0][0]``.
        code_list: Iterable of strings to encode.

    Returns:
        ``outputs[0][0][0]``: element 0 of the first model output, for the first
        item of ``code_list`` — presumably the first-token embedding of the last
        hidden state (TODO confirm for the model in use).

    NOTE(review): despite the "pool" in the name (and the "unixcoder-meanpooling"
    vectorset these vectors are stored under), no pooling over tokens happens
    here, and only the first entry of ``code_list`` contributes to the result.
    Behaviour is kept as-is so that stored and query vectors stay comparable.
    """
    # Local import keeps this cell self-contained; torch is already required by transformers.
    import torch

    encoded_input = tokenizer(
        list(code_list),
        padding=True,
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    )
    # No gradients are needed for embedding extraction; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(**encoded_input)
    return outputs[0][0][0]

In [None]:
# Sentence-transformers checkpoints compared against unixcoder.
model_t5 = SentenceTransformer("krlvi/sentence-t5-base-nlpl-code_search_net")
model_bert = SentenceTransformer("krlvi/sentence-msmarco-bert-base-dot-v5-nlpl-code_search_net")
model_distilroberta = SentenceTransformer("flax-sentence-embeddings/st-codesearch-distilroberta-base")

# `tokenizer` and `model` (microsoft/unixcoder-base) are already loaded in the
# previous cell, so they are not loaded a second time here.



## Upload our Data

Now we have the data and we have created the KB (knowledgebox), so we just need to upload our resources

In [None]:
# Every snippet comes from the same package, so they all share one label.
label = "nucliadb_sdk"

for function_source in my_functions:
    # One embedding per model, stored under its own vectorset name.
    vectors = {
        "unixcoder-meanpooling": get_vectors_roberta_pool(tokenizer, model, [function_source]).tolist(),
        "t5": model_t5.encode([function_source])[0].tolist(),
        "bert": model_bert.encode([function_source])[0].tolist(),
        "distilroberta": model_distilroberta.encode([function_source])[0].tolist(),
    }
    my_kb.upload(
        text=function_source,
        labels=[f"code/{label}"],
        vectors=vectors,
    )

## Checks I 

We uploaded only data with one label. 

But we could have added more if we had code from other modules, or if we wanted to label some other code features

Let's check if the numbers agree!

In [None]:
# Inspect the labels stored in the knowledge box; the upload cell only used
# the "code/nucliadb_sdk" label, so that is the one expected here.
my_kb.get_uploaded_labels()

## Checks II

We can also list all the different sets of vectors we've uploaded and their dimensions

In [None]:
# Vectorsets known to the knowledge box; the upload cell wrote four of them:
# "unixcoder-meanpooling", "t5", "bert" and "distilroberta".
my_kb.list_vectorset().vectorsets

## Searches

Now let's start with the most interesting part, the searches!

We are going to use a small function to iterate over our search results.

For legibility reasons I used a simple regex to print only the name of the function,
but feel free to modify it if you want the whole code!



In [None]:
import re

def print_results(model_name, results):
    """Print one line per search result: the function name and its similarity score.

    Args:
        model_name: Label for the header; it is upper-cased when printed.
        results: Iterable of search results exposing ``.text`` (the stored
            source code) and ``.score``.

    The function name is whatever follows the first ``def `` up to the opening
    parenthesis. If a result contains no ``def``, the first line of its text is
    shown instead (or ``<unknown>`` for empty text) rather than raising.
    """
    print(f"\t***{model_name.upper()} RESULTS***")
    for result in results:
        # Raw string: '\(' in a normal string literal is an invalid escape sequence.
        match = re.search(r"def ([^(]+)", result.text)
        if match:
            function_name = match.group(1)
        else:
            stripped_text = result.text.strip()
            function_name = stripped_text.splitlines()[0] if stripped_text else "<unknown>"
        print("Function name:", function_name, end=" -- ")
        # Uncomment to also show the beginning of the code and the labels:
        #print("Function code:",'%.300s' %result.text,"\n\t...")
        #print("Function labels:"," ".join(result.labels))
        print("Similarity score:", result.score)
    print("-----------")
    

## Text search

First we search only in the text fields

We will look for `create_resource` and `create a new knowledge box`

In [None]:
# Text search using an exact function name as the query.
results = my_kb.search(text="create_resource")
print_results("Full text search", results)

In [None]:
# Text search again, this time with a natural-language description as the query.
results = my_kb.search(text="create a new knowledge box")
print_results("Full text search", results)

## Vector search

Full text search has its limitations, so let's try our semantic search and compare the results from different models

To perform these searches we need to encode our query and pass it to the search function with the `vector` argument.
The results will be retrieved in order from most to least similar (based on cosine similarity).
Note that you can define a threshold (`min_score`) so that the search will only return results with a similarity higher than a certain value.


In [None]:
query = ["create a new knowledge box"]
print("QUERY: ", query[0])

# Each search encodes the query with one model and looks it up in the vectorset
# that was filled with that same model at upload time.
results_unixcoder = my_kb.search(
    vector=get_vectors_roberta_pool(tokenizer, model, query).tolist(),
    vectorset="unixcoder-meanpooling",
    min_score=0.3)
print_results("Unixcoder", results_unixcoder)

# encode() returns one vector per query string; take the only one and send it as a plain list.
results_t5 = my_kb.search(
    vector=model_t5.encode(query)[0].tolist(),
    vectorset="t5",
    min_score=0.3)
print_results("T5", results_t5)


results_roberta = my_kb.search(
    vector=model_distilroberta.encode(query)[0].tolist(),
    vectorset="distilroberta",
    min_score=0.3)
print_results("DISTILROBERTA", results_roberta)


results_bert = my_kb.search(
    vector=model_bert.encode(query)[0].tolist(),
    vectorset="bert",
    min_score=0.3)

# Print the BERT results themselves (previously the DistilRoBERTa results were printed here).
print_results("BERT", results_bert)

  

In [None]:
query = ["Upload vectors"]
print("QUERY: ", query[0])

# Each search encodes the query with one model and looks it up in the vectorset
# that was filled with that same model at upload time.
results_unixcoder = my_kb.search(
    vector=get_vectors_roberta_pool(tokenizer, model, query).tolist(),
    vectorset="unixcoder-meanpooling",
    min_score=0.3)
print_results("Unixcoder", results_unixcoder)

# encode() returns one vector per query string; take the only one and send it as a plain list.
results_t5 = my_kb.search(
    vector=model_t5.encode(query)[0].tolist(),
    vectorset="t5",
    min_score=0.4)
print_results("T5", results_t5)


results_roberta = my_kb.search(
    vector=model_distilroberta.encode(query)[0].tolist(),
    vectorset="distilroberta",
    min_score=0.4)
print_results("DISTILROBERTA", results_roberta)


results_bert = my_kb.search(
    vector=model_bert.encode(query)[0].tolist(),
    vectorset="bert",
    min_score=0.4)

# Print the BERT results themselves (previously the DistilRoBERTa results were printed here).
print_results("BERT", results_bert)

  

In [None]:
query = ["create labels"]

print("QUERY: ", query[0])

# Each search encodes the query with one model and looks it up in the vectorset
# that was filled with that same model at upload time.
results_unixcoder = my_kb.search(
    vector=get_vectors_roberta_pool(tokenizer, model, query).tolist(),
    vectorset="unixcoder-meanpooling",
    min_score=0.3)
print_results("Unixcoder", results_unixcoder)

# encode() returns one vector per query string; take the only one and send it as a plain list.
results_t5 = my_kb.search(
    vector=model_t5.encode(query)[0].tolist(),
    vectorset="t5",
    min_score=0.4)
print_results("T5", results_t5)


results_roberta = my_kb.search(
    vector=model_distilroberta.encode(query)[0].tolist(),
    vectorset="distilroberta",
    min_score=0.4)
print_results("DISTILROBERTA", results_roberta)


results_bert = my_kb.search(
    vector=model_bert.encode(query)[0].tolist(),
    vectorset="bert",
    min_score=0.4)

# Print the BERT results themselves (previously the DistilRoBERTa results were printed here).
print_results("BERT", results_bert)

  
    

## Results

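As we can see, the models with the best overall results are **T5**, **BERT**, and **DISTILROBERTA**.
As a curiosity, even though **BERT** and **DISTILROBERTA** are different models, their printed results were exactly the same; if you see this too, make sure the BERT section prints `results_bert` and not `results_roberta` before drawing conclusions.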
