## Semantic text search using embeddings

We can search through all our reviews semantically, efficiently and at very low cost, by embedding the search query and then finding the most similar reviews. The dataset is created in the [Obtain_dataset notebook](Obtain_dataset.ipynb).

In [3]:
!pip install docutils

Collecting docutils
  Downloading docutils-0.19-py3-none-any.whl (570 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 570.5/570.5 kB 8.5 MB/s eta 0:00:00
Installing collected packages: docutils
Successfully installed docutils-0.19


In [2]:
import markdown2
import docutils.core
import os
from html.parser import HTMLParser
from io import StringIO

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content."""

    def __init__(self):
        # Character references such as ``&amp;`` are decoded before the text
        # is handed to handle_data.
        super().__init__(convert_charrefs=True)
        # Kept as an instance attribute; no code shown here reads it.
        self.strict = False
        # In-memory buffer that accumulates every text fragment seen so far.
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text found outside of tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return everything collected so far as one string."""
        return self.text.getvalue()

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Args:
        html: HTML source as a string.

    Returns:
        The text found outside of tags, concatenated in document order.
    """
    s = MLStripper()
    s.feed(html)
    # Flush the parser. Without close(), trailing text that could be the start
    # of an unfinished character reference (e.g. "AT&T") stays buffered inside
    # the parser and never reaches handle_data, so it would be lost.
    s.close()
    return s.get_data()

def load_docs_from_dir(directory):
    """Parse every ``.rst`` and ``.md`` file located directly in ``directory``.

    Entries are visited in sorted name order so that repeated runs yield the
    same document order no matter how the filesystem lists them. Nothing is
    searched recursively.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A list with one item per matching file: whatever ``parse_doc`` returns
        for that file's contents.
    """
    docs = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith((".rst", ".md")):
            continue
        # The extension without its dot tells parse_doc which format to expect.
        extension = filename.split(".")[-1]
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(os.path.join(directory, filename), encoding="utf-8") as f:
            docs.append(parse_doc(f.read(), extension))
    return docs

def parse_doc(doc, type):
    """Convert the raw contents of a document into plain text.

    Args:
        doc: Full contents of the file as a string.
        type: File extension without the dot, either "rst" or "md".

    Returns:
        For "md": the Markdown rendered to HTML and then stripped of tags.
        For "rst": None, because reStructuredText conversion is disabled.

    Raises:
        Exception: If ``type`` is neither "rst" nor "md".
    """
    if type == "md":
        rendered = markdown2.markdown(doc)
        return strip_tags(rendered)
    if type == "rst":
        # docutils rendering is switched off, so these documents yield nothing.
        return None
    raise Exception("Unknown file type: " + type)

# Documentation sources to load. Each one is a sub-directory of "data", which is
# resolved relative to the notebook's current working directory.
dirs = ["eth", "uniswap", "solidity_hierarchical"]

# Flat list holding the parsed result of every document across all sources.
# A comprehension is used so no loop variable (previously `dir`, which shadowed
# the builtin) is left behind in the notebook's global namespace.
docs = [
    doc
    for source in dirs
    for doc in load_docs_from_dir(os.path.join("data", source))
]

In [8]:
# Drop documents that could not be converted (parse_doc returns None for .rst).
docs = [doc for doc in docs if doc is not None]

# Split every document on "." and collect the resulting fragments in document
# order. These fragments are the units later paired into prompt/completion
# examples. No fixed-size or overlapping token windows are produced here.
subsequences = [fragment for doc in docs for fragment in doc.split(".")]

In [32]:
import json

# Size limits for one example, counted in space-separated words (not model
# tokens): the cap for prompt + completion together, and the share of that cap
# given to the prompt when an over-long pair has to be cut.
MAX_PAIR_WORDS = 2048
MAX_PROMPT_WORDS = 1024

training_data = []

# Pair every subsequence with the one that follows it: the earlier fragment is
# the prompt and the later one, prefixed with a space, is the completion.
for prompt, following in zip(subsequences, subsequences[1:]):
    completion = " " + following
    words = (prompt + completion).split(" ")
    if len(words) > MAX_PAIR_WORDS:
        # Over-long pair: keep the first MAX_PAIR_WORDS words and re-split them
        # at a fixed position, regardless of where the original boundary was.
        words = words[:MAX_PAIR_WORDS]
        prompt = " ".join(words[:MAX_PROMPT_WORDS])
        completion = " " + " ".join(words[MAX_PROMPT_WORDS:])
    training_data.append({"prompt": prompt, "completion": completion})

# Save the examples as JSON Lines: one JSON object per line.
with open("data/training_data.jsonl", "w", encoding="utf-8") as f:
    for data in training_data:
        f.write(json.dumps(data) + "\n")

In [34]:
# Keep every 100th training example and write that sample as JSON Lines.
with open("data/test_training_data.jsonl", "w") as sample_file:
    sample_file.writelines(
        json.dumps(example) + "\n" for example in training_data[::100]
    )

In [35]:
# Total number of space-separated words over all prompts and completions.
sum(
    len(entry["prompt"].split(" ")) + len(entry["completion"].split(" "))
    for entry in training_data
)

1470916

In [38]:
# Average number of space-separated words per training pair: the word total
# divided by the pair count, both copied by hand from other cells' outputs.
1_470_916 / 27_916

52.69078664565124

In [37]:
# NOTE(review): looks like a cost estimate: a count divided by 1000, times a
# rate of 0.03 per thousand. The 1443000 figure differs from the 1470916 word
# total shown in this notebook; confirm where it comes from.
1_443_000 / 1000 * 0.0300

43.29

In [39]:
# Number of prompt/completion pairs that were generated.
len(training_data)

27916