-
Notifications
You must be signed in to change notification settings - Fork 0
/
5_BuildSearchIndex.py
261 lines (147 loc) · 7.11 KB
/
5_BuildSearchIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/usr/bin/env python
# coding: utf-8
# ### Prerequisites
#
# You should have completed steps 1-4 of this tutorial before beginning this exercise. The files required for this notebook are generated by those previous steps.
#
# Creating the search engine for this example is extremely CPU and memory intensive. We used an AWS `x1.32xlarge` instance (128 cores) in order to achieve the maximum speed when building the search index.
# In[8]:
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import nmslib
from lang_model_utils import load_lm_vocab, Query2Emb
from general_utils import create_nmslib_search_index
# Locations of the artifacts produced by steps 1-4 of this tutorial.
input_path = Path('./data/processed_data/')    # metadata from step 1
code2emb_path = Path('./data/code2emb/')       # code vectors from step 4
output_path = Path('./data/search')            # where the search index is written
# parents=True so this also works when ./data itself does not exist yet;
# exist_ok=True keeps re-runs of the notebook idempotent.
output_path.mkdir(parents=True, exist_ok=True)
# ## Read in Metadata
#
# We will want to organize the data that we will want to display for the search results, which will be:
#
# 1. The original code
# 2. A link to the original code
#
# For convenience, we will collect this data into a pandas dataframe.
# In[2]:
# Read the file of source URLs; the .lineage file has no header row, one URL
# per line, so we name the single column explicitly.
url_df = pd.read_csv(input_path/'without_docstrings.lineage', header=None, names=['url'])
# Read the original (unparsed) function bodies that pair row-for-row with the URLs.
code_df = pd.read_json(input_path/'without_docstrings_original_function.json.gz')
code_df.columns = ['code']
# Make sure these files have the same number of rows — row i of each file
# must describe the same function for the concat below to be valid.
assert code_df.shape[0] == url_df.shape[0]
# Collect the two columns side by side into a single metadata dataframe
# with columns 'url' and 'code'.
ref_df = pd.concat([url_df, code_df], axis = 1).reset_index(drop=True)
ref_df.head()
# For reference, the above files are also available for download in case you skipped step 1:
#
# `without_docstrings.lineage`: https://storage.googleapis.com/kubeflow-examples/code_search/data/without_docstrings.lineage
#
# `without_docstrings_original_function.json.gz`: https://storage.googleapis.com/kubeflow-examples/code_search/data/without_docstrings_original_function.json.gz
# ## Create Search Index For Vectorized Code
# First read in the vectorized code
# In[3]:
# Load the vectorized (embedded) code for functions without docstrings,
# produced in step 4 (code2emb).
nodoc_vecs = np.load(code2emb_path/'nodoc_vecs.npy')
# Each code vector must line up row-for-row with its metadata in ref_df,
# since search results are looked up by positional index.
assert nodoc_vecs.shape[0] == ref_df.shape[0]
# Now build the search index. **Warning:** this step takes ~ 18 minutes on an `x1.32xlarge` instance.
# In[6]:
# Build the nmslib nearest-neighbor index over the code vectors and persist
# it to disk. Wrapped in the %%time cell magic, so this line only runs
# inside IPython/Jupyter. (Warning above: ~18 min on an x1.32xlarge.)
get_ipython().run_cell_magic('time', '', "search_index = create_nmslib_search_index(nodoc_vecs)\nsearch_index.saveIndex('./data/search/search_index.nmslib')")
# A cached version of this index can be downloaded via the `search_index.nmslib` link listed below.
# # Create A Minimal Search Engine
# You can find the cached version of the required files on google cloud:
#
# `lang_model_cpu_v2.torch`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/lang_model_cpu_v2.torch
#
# `vocab_v2.cls`: https://storage.googleapis.com/kubeflow-examples/code_search/data/lang_model/vocab_v2.cls
#
# `search_index.nmslib`: https://storage.googleapis.com/kubeflow-examples/code_search/data/search/search_index.nmslib
#
# In[9]:
# Load the language model onto the CPU regardless of the device it was
# trained on (map_location discards the original device tag).
# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# checkpoint files from a trusted source.
lang_model = torch.load('./data/lang_model/lang_model_cpu_v2.torch',
                        map_location=lambda storage, loc: storage)
# Vocabulary that maps query tokens to the ids the language model expects.
vocab = load_lm_vocab('./data/lang_model/vocab_v2.cls')
# Helper that turns a query string into a vector in the code-embedding space.
q2emb = Query2Emb(lang_model = lang_model.cpu(),
                  vocab = vocab)
# Re-create an empty index with the same method/space it was built with,
# then load the pre-computed index from disk.
search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.loadIndex('./data/search/search_index.nmslib')
# `Query2Emb` is a helper class that will vectorize sentences using the language model trained in Part 3.
#
# In this case, we call the method `emb_mean` because we are taking the mean over the time steps of the hidden states in order to construct a sentence embedding for the query supplied by the user.
# In[177]:
# Sanity-check the query embedder: emb_mean averages the language model's
# hidden states over time steps to produce one sentence vector per query.
test = q2emb.emb_mean('Hello World! This is a test.')
# Displayed in the notebook; should be a single embedding vector.
test.shape
# ### Create an object to make the process of showing search results easier
#
# The below object organizes all the pieces together for searching the index and displaying the results with a method call.
# In[185]:
class search_engine:
    """Bundle a pre-built nmslib index, result metadata, and a query
    embedding function into one searchable object."""

    def __init__(self, nmslib_index, ref_df, query2emb_func):
        """
        Parameters
        ==========
        nmslib_index : nmslib object
            Pre-computed nearest-neighbor search index.
        ref_df : pandas.DataFrame
            Metadata for search results; must contain the columns
            'code' and 'url'.
        query2emb_func : callable
            Maps a query string to a vector in the same vector space
            as the vectors loaded into the search index.
        """
        # Fail fast if the metadata is missing a required column.
        for required in ('url', 'code'):
            assert required in ref_df.columns
        self.search_index = nmslib_index
        self.ref_df = ref_df
        self.query2emb_func = query2emb_func

    def search(self, str_search, k=2):
        """Print the code nearest (by cosine distance) to a search query.

        Parameters
        ==========
        str_search : str
            a search query. Ex: "read data into pandas dataframe"
        k : int
            the number of nearest neighbors to return. Defaults to 2.
        """
        emb = self.query2emb_func(str_search)
        neighbor_ids, distances = self.search_index.knnQuery(emb, k=k)
        for neighbor, distance in zip(neighbor_ids, distances):
            # Look the neighbor's metadata up by positional index.
            row = self.ref_df.iloc[neighbor]
            print(f'cosine dist:{distance:.4f} url: {row.url}\n---------------\n')
            print(row.code)
# In[186]:
# Wire the loaded index, the metadata dataframe, and the mean-pooled query
# embedder together into a single search engine instance.
se = search_engine(nmslib_index=search_index,
                   ref_df=ref_df,
                   query2emb_func=q2emb.emb_mean)
# # Run Some Queries Against The Index!!
#
# Now that we have instantiated the search engine, we can use the `search` method to display the results.
#
# **Warning:** some of the displayed links may not work since this is historical data retrieved from a [historical open dataset Google has hosted on BigQuery](https://cloud.google.com/bigquery/public-data/github)
# In[187]:
# Example query: prints the k=2 (default) nearest code snippets and their URLs.
se.search('read data into pandas dataframe')
# # Use Custom Ipython Magic Function To Create A Fake Search Box
#
# You don't know how to build a website? No problem! You can still impress your friends by using a [custom magic function](https://ipython.org/ipython-doc/3/config/custommagics.html) to allow you to do a live demonstration in a Jupyter notebook. This is what I did when I first created this prototype!
# In[127]:
from IPython.core.magic import (register_line_magic, register_cell_magic,
                                register_line_cell_magic)

@register_cell_magic
def search(line, cell):
    """Cell magic ``%%search``: use the cell body as a semantic search query.

    ``line`` (any text after ``%%search``) is ignored; the cell contents are
    passed to the module-level search engine ``se``, which prints the results.
    """
    return se.search(cell)
# ### Live Semantic Search of Code (Searching Holdout Set Only)
# In[176]:
# Demo cell for the %%search magic — in a live notebook the query text goes
# in the cell body (empty here). Only runs inside IPython/Jupyter.
get_ipython().run_cell_magic('search', '', '')
# In[ ]:
# In[ ]:
# In[ ]:
# In[ ]: