In [None]:
! pip install "nucliadb-sdk<=2.42.1"
! pip install nucliadb-dataset
! pip install transformers
! pip install sentence-transformers

In [None]:
import requests
from nucliadb_sdk import KnowledgeBox, create_knowledge_box, get_or_create

from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer


## Setup NucliaDB

- Run **NucliaDB** image:
```bash
docker run -it \
       -e LOG=INFO \
       -p 8080:8080 \
       -p 8060:8060 \
       -p 8040:8040 \
       -v nucliadb-standalone:/data \
       nuclia/nucliadb:latest
```
- Or install with pip and run:

```bash
pip install nucliadb
nucliadb
```

## Check everything's up and running

In [3]:
import requests
response = requests.get(f"http://0.0.0.0:8080")

assert response.status_code == 200, "Ups, it seems something is not properly installed"

## Setup - creating a KB

In nucliadb our data containers are called knowledge boxes.

To start working, we need to create one:

In [4]:
my_kb = get_or_create("my_code_search_kb")
my_kb

<nucliadb_sdk.knowledgebox.KnowledgeBox at 0x175d900d0>

## Data preparation  - Collection

Then we gather the data. 

In this case we use the inspect library to gather all the functions from our nucliadb_sdk module


In [5]:
import nucliadb_sdk
from inspect import getmembers, isfunction, ismodule,isclass,getsource

def get_all_code(target_module):
    functions=[]
    functions_code=[]
    for  module_name, module in getmembers(target_module,ismodule):
        if module_name != "logging":
            functions.extend([(name,fn) for name, fn in getmembers(module, isfunction) if fn.__module__ == module.__name__])
            for my_class_name,my_class in [(name,fn) for name, fn in getmembers(module, isclass) if fn.__module__ == module.__name__]:
                functions.extend([(name,fn) for name, fn in getmembers(my_class, isfunction) if fn.__module__ == module.__name__ and ("__" not in fn.__name__)])
    functions_code=[getsource(function) for function_name,function in functions ]
    return functions_code
my_functions = [i.strip() for i in get_all_code(nucliadb_sdk)]


Just a quick check to see how many functions we gathered

In [7]:
len(my_functions)

56

## Data preparation  - Create vectors

Once we have all the code, we need to calculate the vectors.
In this case we are using:

Microsoft's unixcoder-base

model_t5 = SentenceTransformer("krlvi/sentence-t5-base-nlpl-code_search_net")

model_bert = SentenceTransformer("krlvi/sentence-msmarco-bert-base-dot-v5-nlpl-code_search_net")

model_distilroberta = SentenceTransformer("flax-sentence-embeddings/st-codesearch-distilroberta-base")


In [10]:
def get_vectors_roberta_pool(tokenizer, model, code_list):
    encoded_input = tokenizer(list(code_list),padding=True, truncation=True,max_length =1024, return_tensors="pt")
    outputs = model(**encoded_input)
    return outputs[0][0][0]

In [11]:
model_t5 = SentenceTransformer("krlvi/sentence-t5-base-nlpl-code_search_net")
model_bert = SentenceTransformer("krlvi/sentence-msmarco-bert-base-dot-v5-nlpl-code_search_net")
model_distilroberta = SentenceTransformer("flax-sentence-embeddings/st-codesearch-distilroberta-base")

tokenizer = AutoTokenizer.from_pretrained("microsoft/unixcoder-base")

model = AutoModel.from_pretrained("microsoft/unixcoder-base")


## Upload our Data

Now we have the data and we have created the KB (knowledgebox), so we just need to upload our resources

In [12]:
for i in range(len(my_functions)):
    label = "nucliadb_sdk"
    my_kb.upload(
        text=my_functions[i],
        labels=[f"code/{label}"],
        vectors={"unixcoder-meanpooling": get_vectors_roberta_pool(tokenizer, model,[my_functions[i]]).tolist(),
                 "t5": model_t5.encode([my_functions[i]])[0].tolist(),
                 "bert":  model_bert.encode([my_functions[i]])[0].tolist(),
                 "distilroberta":  model_distilroberta.encode([my_functions[i]])[0].tolist(),
                 },
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Vectorset is not created, we will create it for you
Vectorset is not created, we will create it for you
Vectorset is not created, we will create it for you
Vectorset is not created, we will create it for you


## Checks I 

We uploaded only data with one label. 

But we could have added more if we had code from other modules, or if we wanted to label some other code features

Let's check if the numbers agree!

In [13]:
my_kb.get_uploaded_labels()

{'code': LabelSet(count=56, labels={'nucliadb_sdk': 56})}

## Checks II

We can also list all the different sets of vectors we've uploaded and their dimensions

In [14]:
my_kb.list_vectorset().vectorsets

{'distilroberta': VectorSet(dimension=768),
 'bert': VectorSet(dimension=768),
 't5': VectorSet(dimension=768),
 'unixcoder-meanpooling': VectorSet(dimension=768)}

## Searches

Now let's start with the most interesting part, the searches!

We are going to use a small function to iterate over our search results.

For legibility reasons I used a simple regex to print only the name of the function,
but feel free to modify it if you want the whole code!



In [15]:
import re

def print_results(model_name, results):
    print(f"\t***{model_name.upper()} RESULTS***")
    for result in results:
        print("Function name:",re.findall('def ([^\(]+)', result.text)[0], end=" -- ")
        #print("Function code:",'%.300s' %result.text,"\n\t...")
        #print("Function labels:"," ".join(result.labels))
        print(f"Similarity score: {result.score}") 
    print("-----------")
    

## Text search

First we search only in the text fields

We will look for `create_resource` and `create a new knowledge box`

In [16]:
results = my_kb.search(text="create_resource")
print_results("Full text search", results)

	***FULL TEXT SEARCH RESULTS***
Function name: create_resource -- Similarity score: 3.6681838035583496
Function name: async_create_resource -- Similarity score: 3.6050631999969482
Function name: list_resources -- Similarity score: 2.788604259490967
Function name: async_list_resources -- Similarity score: 2.716294050216675
Function name: async_upload -- Similarity score: 2.551388740539551
Function name: upload -- Similarity score: 2.551388740539551
Function name: create_resource -- Similarity score: 0.769873857498169
-----------


In [17]:
results = my_kb.search(text="create a new knowledge box")
print_results("Full text search", results)

	***FULL TEXT SEARCH RESULTS***
-----------


## Vector search

Full text search has its limitations, so let's try our semantic search and compare the results from different models

To perform these searches we need to encode our query and pass it to the search function with the `vector` argument.
The results will be retrieved in order from more to less similar (based on cosine similarity).
Note that you can define a threshold (`min_score`) so that the search will only return results with similarity higher than a certain value.


In [18]:
query =["create a new knowledge box"]
print("QUERY: ",query[0])
results_unixcoder = my_kb.search(vector=get_vectors_roberta_pool(tokenizer, model, query).tolist(), vectorset="unixcoder-meanpooling", min_score=0.3)
print_results("Unicoxder", results_unixcoder)

results_t5 = my_kb.search(
    vector=model_t5.encode(query)[0], 
    vectorset="t5", 
    min_score=0.3)
print_results("T5", results_t5)


results_roberta = my_kb.search(
    vector=model_distilroberta.encode(query)[0], 
    vectorset="distilroberta", 
    min_score=0.3)
print_results("DISTILROBERTA", results_roberta)


results_bert = my_kb.search(
    vector=model_bert.encode(query)[0], 
    vectorset="bert", 
    min_score=0.3)

print_results("BERT", results_roberta)

  

QUERY:  create a new knowledge box
	***UNICOXDER RESULTS***
Function name: get_labels -- Similarity score: 0.41939425468444824
Function name: get_or_create -- Similarity score: 0.4122781753540039
Function name: create_knowledge_box -- Similarity score: 0.38341182470321655
Function name: get_kb -- Similarity score: 0.3585122525691986
Function name: list_kbs -- Similarity score: 0.334780216217041
Function name: search -- Similarity score: 0.3201015889644623
Function name: get_entities -- Similarity score: 0.3016570508480072
-----------
	***T5 RESULTS***
Function name: create_knowledge_box -- Similarity score: 0.6352006196975708
Function name: get_labels -- Similarity score: 0.47330352663993835
Function name: get_labels -- Similarity score: 0.4565504193305969
Function name: get_kb -- Similarity score: 0.43946319818496704
Function name: get_entities -- Similarity score: 0.4362731873989105
Function name: async_length -- Similarity score: 0.4227059781551361
Function name: length -- Similarit

In [19]:
query =["Upload vectors"]
print("QUERY: ",query[0])
results_unixcoder = my_kb.search(vector=get_vectors_roberta_pool(tokenizer, model, query).tolist(), vectorset="unixcoder-meanpooling", min_score=0.3)
print_results("Unicoxder", results_unixcoder)

results_t5 = my_kb.search(
    vector=model_t5.encode(query)[0], 
    vectorset="t5", 
    min_score=0.4)
print_results("T5", results_t5)


results_roberta = my_kb.search(
    vector=model_distilroberta.encode(query)[0].tolist(), 
    vectorset="distilroberta", 
    min_score=0.4)
print_results("DISTILROBERTA", results_roberta)


results_bert = my_kb.search(
    vector=model_bert.encode(query)[0].tolist(), 
    vectorset="bert", 
    min_score=0.4)

print_results("BERT", results_roberta)

  

QUERY:  Upload vectors
	***UNICOXDER RESULTS***
Function name: list_vectorset -- Similarity score: 0.33021214604377747
-----------
	***T5 RESULTS***
Function name: async_upload -- Similarity score: 0.5734764337539673
Function name: start_tus_upload -- Similarity score: 0.4033115804195404
-----------
	***DISTILROBERTA RESULTS***
Function name: async_upload -- Similarity score: 0.5523496866226196
Function name: upload -- Similarity score: 0.5517340302467346
Function name: patch_tus_upload -- Similarity score: 0.43836045265197754
Function name: start_tus_upload -- Similarity score: 0.4324209690093994
-----------
	***BERT RESULTS***
Function name: async_upload -- Similarity score: 0.5523496866226196
Function name: upload -- Similarity score: 0.5517340302467346
Function name: patch_tus_upload -- Similarity score: 0.43836045265197754
Function name: start_tus_upload -- Similarity score: 0.4324209690093994
-----------


In [20]:
query =["create labels"]

print("QUERY: ",query[0])
results_unixcoder = my_kb.search(vector=get_vectors_roberta_pool(tokenizer, model, query).tolist(), vectorset="unixcoder-meanpooling", min_score=0.3)
print_results("Unicoxder", results_unixcoder)

results_t5 = my_kb.search(
    vector=model_t5.encode(query)[0], 
    vectorset="t5", 
    min_score=0.4)
print_results("T5", results_t5)


results_roberta = my_kb.search(
    vector=model_distilroberta.encode(query)[0].tolist(), 
    vectorset="distilroberta", 
    min_score=0.4)
print_results("DISTILROBERTA", results_roberta)


results_bert = my_kb.search(
    vector=model_bert.encode(query)[0].tolist(), 
    vectorset="bert", 
    min_score=0.4)

print_results("BERT", results_roberta)
    

QUERY:  create labels
	***UNICOXDER RESULTS***
Function name: list_vectorset -- Similarity score: 0.33020952343940735
-----------
	***T5 RESULTS***
Function name: set_labels -- Similarity score: 0.5127634406089783
Function name: get_labels -- Similarity score: 0.41345250606536865
-----------
	***DISTILROBERTA RESULTS***
Function name: set_labels -- Similarity score: 0.6097429394721985
Function name: get_labels -- Similarity score: 0.45817020535469055
-----------
	***BERT RESULTS***
Function name: set_labels -- Similarity score: 0.6097429394721985
Function name: get_labels -- Similarity score: 0.45817020535469055
-----------


## Results

As we can see the models with better overall results are **T5**,**BERT**, and **DISTILROBERTA**.
And as a curiosity, even though the **BERT** and **DISTILROBERTA** were supposed to be different, their results are exactly the same
