-
Notifications
You must be signed in to change notification settings - Fork 0
/
5_BuildSearchIndex.py
261 lines (147 loc) · 7.11 KB
/
5_BuildSearchIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/usr/bin/env python
# coding: utf-8
# ### Prerequisites
#
# You should have completed steps 1-4 of this tutorial before beginning this exercise. The files required for this notebook are generated by those previous steps.
#
# Creating the search engine for this example is extremely CPU and memory intensive. We used an AWS `x1.32xlarge` instance (128 cores) in order to achieve the maximum speed when building the search index.
# In[8]:
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import nmslib
from lang_model_utils import load_lm_vocab, Query2Emb
from general_utils import create_nmslib_search_index
# Locations of the artifacts produced by steps 1-4 of this tutorial.
input_path = Path('./data/processed_data/')    # metadata from step 1
code2emb_path = Path('./data/code2emb/')       # code vectors from step 4
output_path = Path('./data/search')            # where the search index is written
# parents=True so this also works when ./data itself does not exist yet;
# exist_ok=True keeps re-runs of the notebook idempotent.
output_path.mkdir(parents=True, exist_ok=True)
# ## Read in Metadata
#
# We will want to organize the data that we will want to display for the search results, which will be:
#
# 1. The original code
# 2. A link to the original code
#
# For convenience, we will collect this data into a pandas dataframe.
# In[2]:
# Read the file of source URLs; the .lineage file has no header row, one URL
# per line, so we name the single column explicitly.
url_df = pd.read_csv(input_path/'without_docstrings.lineage', header=None, names=['url'])
# Read the original (unparsed) function bodies that pair row-for-row with the URLs.
code_df = pd.read_json(input_path/'without_docstrings_original_function.json.gz')
code_df.columns = ['code']
# Make sure these files have the same number of rows — row i of each file
# must describe the same function for the concat below to be valid.
assert code_df.shape[0] == url_df.shape[0]
# Collect the two columns side by side into a single metadata dataframe
# with columns 'url' and 'code'.
ref_df = pd.concat([url_df, code_df], axis = 1).reset_index(drop=True)
ref_df.head()
# For reference, the above files are also available for download in case you skipped step 1:
#
# `without_docstrings.lineage`: https://storage.googleapis.com/kubeflow-examples/code_search/data/without_docstrings.lineage
#
# `without_docstrings_original_function.json.gz`: https://storage.googleapis.com/kubeflow-examples/code_search/data/without_docstrings_original_function.json.gz
# ## Create Search Index For Vectorized Code
# First read in the vectorized code
# In[3]:
# Load the vectorized (embedded) code for functions without docstrings,
# produced in step 4 (code2emb).
nodoc_vecs = np.load(code2emb_path/'nodoc_vecs.npy')
# Each code vector must line up row-for-row with its metadata in ref_df,
# since search results are looked up by positional index.
assert nodoc_vecs.shape[0] == ref_df.shape[0]
# Now build the search index. **Warning:** this step takes ~ 18 minutes on an `x1.32xlarge` instance.
# In[6]:
# Build the nmslib nearest-neighbor index over the code vectors and persist
# it to disk. Wrapped in the %%time cell magic, so this line only runs
# inside IPython/Jupyter. (Warning above: ~18 min on an x1.32xlarge.)
get_ipython().run_cell_magic('time', '', "search_index = create_nmslib_search_index(nodoc_vecs)\nsearch_index.saveIndex('./data/search/search_index.nmslib')")
# A cached version of this index can be downloaded via the `search_index.nmslib` link listed below.
# # Create A Minimal Search Engine
# You can find the cached version of the required files on google cloud:
#
# `lang_model_cpu_v2.torch`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/lang_model_cpu_v2.torch
#
# `vocab_v2.cls`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/vocab_v2.cls
#
# `search_index.nmslib`: https://storage.googleapis.com/kubeflow-examples/code_search/data/search/search_index.nmslib
#
# In[9]:
# Load the language model onto the CPU regardless of the device it was
# trained on (map_location discards the original device tag).
# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# checkpoint files from a trusted source.
lang_model = torch.load('./data/lang_model/lang_model_cpu_v2.torch',
                        map_location=lambda storage, loc: storage)
# Vocabulary that maps query tokens to the ids the language model expects.
vocab = load_lm_vocab('./data/lang_model/vocab_v2.cls')
# Helper that turns a query string into a vector in the code-embedding space.
q2emb = Query2Emb(lang_model = lang_model.cpu(),
                  vocab = vocab)
# Re-create an empty index with the same method/space it was built with,
# then load the pre-computed index from disk.
search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.loadIndex('./data/search/search_index.nmslib')
# `Query2Emb` is a helper class that will vectorize sentences using the language model trained in Part 3.
#
# In this case, we call the method `emb_mean` because we are taking the mean over the time steps of the hidden states in order to construct a sentence embedding for the query supplied by the user.
# In[177]:
# Sanity-check the query embedder: emb_mean averages the language model's
# hidden states over time steps to produce one sentence vector per query.
test = q2emb.emb_mean('Hello World! This is a test.')
# Displayed in the notebook; should be a single embedding vector.
test.shape
# ### Create an object to make the process of showing search results easier
#
# The below object organizes all the pieces together for searching the index and displaying the results with a method call.
# In[185]:
class search_engine:
    """Bundle a pre-built nmslib index, result metadata, and a query
    embedding function into one searchable object."""

    def __init__(self, nmslib_index, ref_df, query2emb_func):
        """
        Parameters
        ==========
        nmslib_index : nmslib object
            Pre-computed nearest-neighbor search index.
        ref_df : pandas.DataFrame
            Metadata for search results; must contain the columns
            'code' and 'url'.
        query2emb_func : callable
            Maps a query string to a vector in the same vector space
            as the vectors loaded into the search index.
        """
        # Fail fast if the metadata is missing a required column.
        for required in ('url', 'code'):
            assert required in ref_df.columns
        self.search_index = nmslib_index
        self.ref_df = ref_df
        self.query2emb_func = query2emb_func

    def search(self, str_search, k=2):
        """Print the code nearest (by cosine distance) to a search query.

        Parameters
        ==========
        str_search : str
            a search query. Ex: "read data into pandas dataframe"
        k : int
            the number of nearest neighbors to return. Defaults to 2.
        """
        emb = self.query2emb_func(str_search)
        neighbor_ids, distances = self.search_index.knnQuery(emb, k=k)
        for neighbor, distance in zip(neighbor_ids, distances):
            # Look the neighbor's metadata up by positional index.
            row = self.ref_df.iloc[neighbor]
            print(f'cosine dist:{distance:.4f} url: {row.url}\n---------------\n')
            print(row.code)
# In[186]:
# Wire the loaded index, the metadata dataframe, and the mean-pooled query
# embedder together into a single search engine instance.
se = search_engine(nmslib_index=search_index,
                   ref_df=ref_df,
                   query2emb_func=q2emb.emb_mean)
# # Run Some Queries Against The Index!!
#
# Now that we have instantiated the search engine, we can use the `search` method to display the results.
#
# **Warning:** some of the displayed links may not work since this is historical data retrieved from a [historical open dataset Google has hosted on BigQuery](https://cloud.google.com/bigquery/public-data/github)
# In[187]:
# Example query: prints the k=2 (default) nearest code snippets and their URLs.
se.search('read data into pandas dataframe')
# # Use Custom Ipython Magic Function To Create A Fake Search Box
#
# You don't know how to build a website? No problem! You can still impress your friends by using a [custom magic function](https://ipython.org/ipython-doc/3/config/custommagics.html) to allow you to do a live demonstration in a Jupyter notebook. This is what I did when I first created this prototype!
# In[127]:
from IPython.core.magic import (register_line_magic, register_cell_magic,
                                register_line_cell_magic)

@register_cell_magic
def search(line, cell):
    """Cell magic ``%%search``: use the cell body as a semantic search query.

    ``line`` (any text after ``%%search``) is ignored; the cell contents are
    passed to the module-level search engine ``se``, which prints the results.
    """
    return se.search(cell)
# ### Live Semantic Search of Code (Searching Holdout Set Only)
# In[176]:
# Demo cell for the %%search magic — in a live notebook the query text goes
# in the cell body (empty here). Only runs inside IPython/Jupyter.
get_ipython().run_cell_magic('search', '', '')
# In[ ]:
# In[ ]:
# In[ ]:
# In[ ]: