## Semantic text search using embeddings

We can search through all our reviews semantically in a very efficient manner and at very low cost, by simply embedding our search query, and then finding the most similar reviews. The dataset is created in the [Obtain_dataset Notebook](Obtain_dataset.ipynb).

In [3]:
!pip install docutils

Collecting docutils
  Downloading docutils-0.19-py3-none-any.whl (570 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m570.5/570.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: docutils
Successfully installed docutils-0.19


In [1]:
import markdown2
import docutils.core
import os
from html.parser import HTMLParser
from io import StringIO

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()

    def handle_data(self, d):
        self.text.write(d)

    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def load_docs_from_dir(directory):
    docs = []
    for filename in os.listdir(directory):
        if filename.endswith(".rst") or filename.endswith(".md"):
            with open(os.path.join(directory, filename)) as f:
                docs.append(parse_doc(f.read(), filename.split(".")[-1]))
    return docs

def parse_doc(doc, type):
    if type == "rst":
        pass#return docutils.core.publish_string(doc, writer_name='html')
    elif type == "md":
        return strip_tags(markdown2.markdown(doc))
    else:
        raise Exception("Unknown file type: " + type)

#os.chdir(os.path.dirname(os.path.abspath(__file__)))
dirs = ["eth", "uniswap", "solidity_hierarchical"]
#print(os.getcwd())

docs = []
for dir in dirs:
    docs.extend(load_docs_from_dir("data/" + dir))

#os.chdir("data/solidity_hierarchical")
#docs.extend(load_docs_from_dir("."))

In [25]:
docs = [doc for doc in docs if doc is not None]

subsequences = []

excess_length = 0
#produce subsequences from each document
for doc in docs:
    doc = doc.split(".")
    for sentence in doc:
        tokens = sentence.split(" ")
        if len(tokens) > 500:
            subsequences += sentence.split("\n")
        else:
            subsequences.append(sentence)

In [49]:
subsequences = [subsequence for subsequence in subsequences if len(subsequence.split(" ")) > 2 and len(subsequence.split(" ")) < 3000]

In [50]:
print(min([len(s.split(" ")) for s in subsequences]))

3


In [51]:
for subsequence in subsequences:
    if len(subsequence.split(" ")) > 3000:
        print(subsequence)

In [52]:
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embeddings, cosine_similarity

embeddings = []
#split the subsequences into batches of 2048 and iterate through them
for i in range(0, len(subsequences), 2048):
    #get the embeddings for the current batch
    split_subsequences = subsequences[i:i+2048]
    #add them to embeddings
    try:
        embeddings += get_embeddings(
        split_subsequences,
        engine="text-embedding-ada-002"
    )
    except Exception as e:
        print(split_subsequences)
        raise e

In [63]:
df = pd.DataFrame({"text": subsequences, "embeddings": embeddings})


In [64]:

df[0:5]


Unnamed: 0,text,embeddings
0,\n\ntitle: Ethash\ndescription: A detailed loo...,"[-0.0025863207411020994, 0.034494005143642426,..."
1,\n\nlang: en\n\n\n Ethash was Ethereum's pro...,"[-0.0010527808917686343, 0.0034272498451173306..."
2,Proof-of-work has now been switched off entir...,"[0.006531394086778164, -0.026499561965465546, ..."
3,"Read more on The Merge, proof-of-stake and st...","[0.011598732322454453, -0.010688896290957928, ..."
4,This page is for historical interest! \n\n\nE...,"[-0.010270166210830212, 0.010270166210830212, ..."


In [65]:
len(df)

29431

In [66]:
#save the df to a csv file
df.to_csv("data/embeddings.csv")

In [56]:

len(embeddings)

30190

In [57]:
len(subsequences)

29431

In [58]:
len(subsequences)//2048*2048

28672

In [59]:
29431-28672

759

In [60]:
759+29431

30190

In [61]:
embeddings = embeddings[:29431]

In [62]:
len(embeddings)

29431