In [43]:
import requests
from bs4 import BeautifulSoup
import re
import os

## Extract the words from Gutenberg

In [44]:
r = requests.get("https://www.gutenberg.org/files/10681/old/20040627-10681-h-body-pos.htm")
html = r.text
soup = BeautifulSoup(html, 'html.parser')

### Let's explain how the words along with their classes and divisions and sections are organized in the page

The words are organized in the page in the following manner:

1) They belong to a class
2) They belong to a division (if it exists) within the class
3) They belong to a section within the division (if it exists) or within the class


1) We can notice that all classes on the page are represented by a `<dt>` tag with an `<a>` tag inside it. 
The `<a>` tag has a name attribute that starts with **`CLASS`**. The text of the `<a>` tag is the name of the class.


2) We can notice that all divisions on the page are represented by a `<dt>` tag with an `<a>` tag inside it.
The `<a>` tag has a name attribute that starts with **`DIVISION`**. The text of the `<a>` tag is the name of the division.


3) We can notice that all sections on the page are represented by a `<dt>` tag with an `<a>` tag inside it.
The `<a>` tag has a name attribute that starts with **`SECTION`**. The text of the `<a>` tag is the name of the section.

Lastly, the words are represented by a `<dt>` tag with an `<a>` tag inside it. The `<a>` tag has a name attribute that is a number.
We notice that the words come after the `<a>` tag where the name attribute is a number and also that tag is inside a `<b>` tag.
So in order to get the words we must get the text of the `<b>` tag that comes after the `<a>` tag where the name attribute is a number.

With the above in mind, we can extract the hierarchy of classes, divisions, sections, and words from the page.
We can then create a directory structure that mirrors the structure of the page and save the words under each section to a file in the directory structure 
using a dictionary to hold the entire hierarchy.


In [45]:
# Initialize a dictionary to hold the entire hierarchy
hierarchy = {}
current_class = None
current_division = None
current_section = None

# Find all <dt> tags
dt_tags = soup.find_all('dt')

Having the dictionary initialized and the `<dt>` tags found, we can now iterate through each `<dt>` tag and extract the hierarchy of classes, divisions, sections, and words from the page.

In [79]:
# Adjusted regular expression to capture both cases: numbers only and numbers followed by a letter
word_regex = re.compile("^\d+[a-zA-Z]?$")
# Iterate through each <dt> tag
for dt in dt_tags:
    # Check for class
    class_a_tag = dt.find('a', attrs={'name': re.compile("^CLASS")})
    if class_a_tag:
        current_class = re.sub(r'\s+', ' ', class_a_tag.text).strip()
        hierarchy[current_class] = {'divisions': {}, 'sections': {}}
        current_division = None
        current_section = None
        # Now that we got the class we can continue to the next <dt> tag to check for division/section
        continue

    # Check for division
    division_a_tag = dt.find('a', attrs={'name': re.compile("^DIVISION")})
    if division_a_tag and current_class:
        current_division = re.sub(r'\s+', ' ', division_a_tag.text).strip()
        hierarchy[current_class]['divisions'][current_division] = {'sections': {}}
        current_section = None
        # Now that we got the division we can continue to the next <dt> tag to check for section
        continue

    # Check for section
    section_a_tag = dt.find('a', attrs={'name': re.compile("^SECTION")})
    if section_a_tag:
        current_section = re.sub(r'\s+', ' ', section_a_tag.text).strip()
        if current_division:
            hierarchy[current_class]['divisions'][current_division]['sections'][current_section] = []
        else:
            hierarchy[current_class]['sections'][current_section] = []
        # Now that we got the section we can continue to the next <dt> tag to check for words
        continue

    # Check for words (the words are before an a tag with a name attribute that is a number (integer or float))
    word_a_tags = dt.find_all('a', attrs={'name': re.compile("^\d+(\.\d+)?$")})
    for word_a_tag in word_a_tags:
        word = word_a_tag.find_next('b').get_text() if word_a_tag.find_next('b') else ''
        print(word)
        word = re.sub(r'\s+', ' ', word).strip()
        if current_section:
            if current_division:
                hierarchy[current_class]['divisions'][current_division]['sections'][current_section].append(word)
            else:
                hierarchy[current_class]['sections'][current_section].append(word)

Existence
Inexistence
Substantiality
Unsubstantiality
Intrinsicality
Extrinsicality
State
Circumstance
Relation
Irrelation
Consanguinity
Correlation
Identity
Contrariety
Difference
Uniformity
Nonuniformity
Similarity
Dissimilarity
Imitation
Nonimitation
Variation
Copy
Prototype
Agreement
Disagreement
Quantity
Degree
Equality
Inequality
Mean
Compensation
Greatness
Smallness
Superiority
Inferiority
Increase
Nonincrease, Decrease
Addition
Nonaddition. Subtraction
Adjunct
Remainder
Decrement
Mixture
Simpleness
Junction
Disjunction
Connection
Coherence
Incoherence
Combination
Decomposition
Whole
Part
Completeness
Incompleteness
Composition
Exclusion
Component
Extraneousness
Order
Disorder
Complexity
Arrangement
Derangement
Precedence
Sequence
Precursor
Sequel
Beginning
End
Middle
Continuity
Discontinuity
Term
Assemblage
Nonassemblage. Dispersion
Focus
Class
Inclusion
Exclusion
Generality
Speciality
Normality
Multiformity
Conformity
Unconformity
Number
Numeration
List
Unity
Accompaniment
Dua

In [80]:
hierarchy

{'WORDS EXPRESSING ABSTRACT RELATIONS': {'divisions': {},
  'sections': {'EXISTENCE': ['Existence',
    'Inexistence',
    'Substantiality',
    'Unsubstantiality',
    'Intrinsicality',
    'Extrinsicality',
    'State',
    'Circumstance'],
   'RELATION': ['Relation',
    'Irrelation',
    'Consanguinity',
    'Correlation',
    'Identity',
    'Contrariety',
    'Difference',
    'Uniformity',
    'Nonuniformity',
    'Similarity',
    'Dissimilarity',
    'Imitation',
    'Nonimitation',
    'Variation',
    'Copy',
    'Prototype',
    'Agreement',
    'Disagreement'],
   'QUANTITY': ['Quantity',
    'Degree',
    'Equality',
    'Inequality',
    'Mean',
    'Compensation',
    'Greatness',
    'Smallness',
    'Superiority',
    'Inferiority',
    'Increase',
    'Nonincrease, Decrease',
    'Addition',
    'Nonaddition. Subtraction',
    'Adjunct',
    'Remainder',
    'Decrement',
    'Mixture',
    'Simpleness',
    'Junction',
    'Disjunction',
    'Connection',
    'Cohere

In [81]:
# Count the words (the words are inside the sections in the hierarchy)
word_count = sum([len(words) for class_content in hierarchy.values() for division_content in
                  class_content.get('divisions', {}).values() for words in
                  division_content.get('sections', {}).values()])
word_count

389

### Create the directory structure and save the words to files

Now that we have the hierarchy, we can create a directory structure that mirrors the structure of the page and save the words under each section to a file in the directory structure.

Let's start by creating a function that writes the words to a file.

In [82]:
def write_words_to_file(words, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(words))

We start by creating a directory structure that mirrors the structure of the page.

In [83]:
def create_directory_structure(base_path, hierarchy):
    for class_name, class_content in hierarchy.items():
        class_path = os.path.join(base_path, class_name)
        os.makedirs(class_path, exist_ok=True)

        for division_name, division_content in class_content.get('divisions', {}).items():
            division_path = os.path.join(class_path, division_name)
            os.makedirs(division_path, exist_ok=True)

            for section_name, words in division_content.get('sections', {}).items():
                section_path = os.path.join(division_path, section_name)
                os.makedirs(section_path, exist_ok=True)
                write_words_to_file(words, os.path.join(section_path, 'words.txt'))

        for section_name, words in class_content.get('sections', {}).items():
            section_path = os.path.join(class_path, section_name)
            os.makedirs(section_path, exist_ok=True)
            write_words_to_file(words, os.path.join(section_path, 'words.txt'))


# Create the directory structure
base_path = 'roget_thesaurus'
create_directory_structure(base_path, hierarchy)

In [84]:
# Count the lines in the files
line_count = sum([sum([len(open(os.path
                                .join(root, file)).readlines()) for file in files]) for root, dirs, files in
                  os.walk(base_path)])
line_count

1044

## Get Word Embeddings

There are two ways which we can get the embeddings for the words.

1) Get the OpenAI text-embedding-3-small
2) Use the Ollama Embeddings , more specifically from the latest Mistral-7B model

### 1) Get the OpenAI text-embedding-3-small

First we need to set up the OpenAI API from the environment variable which contains the API key.

In [ ]:
import openai

# Set the API
openai.api_key = os.environ.get("OPENAI_API_KEY")

Now we can use the OpenAI API to get the embeddings for the words

In [ ]:
from openai import OpenAI

client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding


# Apply the function to the words files and save the embeddings to a file
def get_embeddings_from_files(base_path, model="text-embedding-3-small"):
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file == 'words.txt':
                words_file_path = os.path.join(root, file)
                embeddings_file_path = os.path.join(root, 'embeddings.txt')
                with open(words_file_path, 'r', encoding='utf-8') as file:
                    words = file.read().splitlines()
                embeddings = [get_embedding(word, model) for word in words]
                with open(embeddings_file_path, 'w', encoding='utf-8') as file:
                    file.write('\n'.join([str(embedding) for embedding in embeddings]))


# Get the embeddings
get_embeddings_from_files(base_path)

OR


In [ ]:
import os
import pandas as pd
import numpy as np
from openai import OpenAI

client = OpenAI()


def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding


def read_text(file_path):
    with open(file_path, 'r') as file:
        return file.read()


def process_directory(root_dir):
    embeddings = []
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file == 'word.txt':
                file_path = os.path.join(root, file)
                text = read_text(file_path)
                embedding = get_embedding(text)
                embeddings.append({'file_path': file_path, 'embedding': embedding})
    return embeddings


# Example usage
root_dir = 'your/directory/path'  # Replace with your directory path
embeddings = process_directory(root_dir)

# Convert embeddings to DataFrame and save
df = pd.DataFrame(embeddings)
df.to_csv('output/embeddings.csv', index=False)


### 2) Use the Ollama Embeddings

In order to use the mistral embeddings we first need download Ollama in a linux environment from the command line using the following command:

```bash
curl -fsSL https://ollama.com/install.sh | sh
```

Then we can get the mistral model and embeddings using the following command:

```bash
ollama run mistral
```

After we have downloaded the mistral model we can use the OllamaEmbeddings class to get the embeddings for the words.
We can start by executing the following command in our linux terminal:

```bash
ollama serve
```

Then we can use the OllamaEmbeddings class to get the embeddings for the words.

> Note: In windows we can use the WSL to run the Ollama commands and get the embeddings.

Having now the embeddings we must load the words from the files and get the embeddings for the words.
To make this process easier we will use the `langchain` library which provides a class called `OllamaEmbeddings` that we can use to get the embeddings for the words.

> Note: Langchain also provides many other models that we can use to get the embeddings for the words along with Ollama.

In [15]:
from langchain_community.embeddings import OllamaEmbeddings

mistral_embeddings = OllamaEmbeddings(model="mistral")

Langchain also provides many useful tools so we can use the `OllamaEmbeddings` class to get the embeddings for the words.

We can start by loading the words from the files using the `DirectoryLoader` and `TextLoader` classes.

In [8]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

path = "data2"
text_loader_kwargs = {'autodetect_encoding': True}
loader = DirectoryLoader(path, glob="**/*.txt", loader_cls=TextLoader,
                         loader_kwargs=text_loader_kwargs, show_progress=True,
                         use_multithreading=True)
docs = loader.load()

100%|██████████| 39/39 [00:00<00:00, 7791.65it/s]


Let's have a look at the documents

In [9]:
docs

[Document(page_content='Cause\nEffect\nAttribution\nChance\nPower\nImpotence\nStrength\nWeakness\nProduction\nDestruction\nReproduction\nProducer\nDestroyer\nPaternity\nPosterity\nProductiveness\nUnproductiveness\nAgency\nPhysical Energy\nPhysical Inertness\nViolence\nModeration\nInfluence\nTendency\nLiability\nConcurrence\nCounteraction', metadata={'source': 'data2\\WORDS EXPRESSING ABSTRACT RELATIONS\\CAUSATION\\words.txt'}),
 Document(page_content='Change\nPermanence\nCessation\nContinuance in action\nConversion\nReversion\nRevolution\nSubstitution\nInterchange\nChangeableness\nStability\nEventuality\nDestiny', metadata={'source': 'data2\\WORDS EXPRESSING ABSTRACT RELATIONS\\CHANGE\\words.txt'}),
 Document(page_content='Existence\nInexistence\nSubstantiality\nUnsubstantiality\nIntrinsicality\nExtrinsicality\nState\nCircumstance', metadata={'source': 'data2\\WORDS EXPRESSING ABSTRACT RELATIONS\\EXISTENCE\\words.txt'}),
 Document(page_content='Number\nNumeration\nList\nUnity\nAccompan

We can see that the docs contain the `\n` characters so we must remove them and then get the embeddings for the words.
To do that effectively and apply later the mistral embeddings, we will use the `SentenceTransformersTokenTextSplitter` text splitter.


In [10]:
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

text_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)



  return self.fget.__get__(instance, owner)()


Let's see how that works with an example file

In [11]:
# Read a single txt file from the directory
text = open("data2/WORDS EXPRESSING ABSTRACT RELATIONS/CAUSATION/words.txt", "r").read()
texts = text_splitter.create_documents([text])
print(texts)

[Document(page_content='cause effect attribution chance power impotence strength weakness production destruction reproduction producer destroyer paternity posterity productiveness unproductiveness agency physical energy physical inertness violence moderation influence tendency liability concurrence counteraction')]


Let's now apply this text splitter to the documents

In [12]:
# Split the documents
splits = text_splitter.split_documents(docs)

Now that we have the splits we can get the embeddings for the words using the `OllamaEmbeddings` class and store the embeddings in a vector
database, in this case we will store them in a Chroma database. We will make the vector database using the `Chroma` class from the `langchain` library 
and make it persistent by writing it to a directory named `chroma_db` with a collection named `roget`.


In [ ]:
from langchain_community.vectorstores.chroma import Chroma

vectorstore = Chroma.from_documents(documents=splits, embedding=mistral_embeddings,
                                    persist_directory="./chroma_db", collection_name="roget")

Having saved the embeddings in the vector database we can now use the `Chroma` class to get the embeddings for the words from the vector database.

In [17]:
# load from disk
from langchain_community.vectorstores.chroma import Chroma

vector_db = Chroma(persist_directory="./chroma_db",
                   embedding_function=mistral_embeddings,
                   collection_name="roget")

In [22]:
import chromadb

client = chromadb.PersistentClient(path="./chroma_db")

In [23]:
collection = client.get_collection("roget")

In [25]:
collection.get()

{'ids': ['22b400c6-c816-11ee-98a8-bcf4d4820bd6',
  '22b400c7-c816-11ee-8d01-bcf4d4820bd6',
  '22b400c8-c816-11ee-8248-bcf4d4820bd6',
  '22b400c9-c816-11ee-a7ce-bcf4d4820bd6',
  '22b400ca-c816-11ee-b44f-bcf4d4820bd6',
  '22b400cb-c816-11ee-b65f-bcf4d4820bd6',
  '22b400cc-c816-11ee-834e-bcf4d4820bd6',
  '22b400cd-c816-11ee-b479-bcf4d4820bd6',
  '22b400ce-c816-11ee-9438-bcf4d4820bd6',
  '22b400cf-c816-11ee-91c7-bcf4d4820bd6',
  '22b400d0-c816-11ee-a6a0-bcf4d4820bd6',
  '22b400d1-c816-11ee-a688-bcf4d4820bd6',
  '22b400d2-c816-11ee-b748-bcf4d4820bd6',
  '22b400d3-c816-11ee-8fd8-bcf4d4820bd6',
  '22b400d4-c816-11ee-acec-bcf4d4820bd6',
  '22b400d5-c816-11ee-9d2f-bcf4d4820bd6',
  '22b400d6-c816-11ee-9c7c-bcf4d4820bd6',
  '22b400d7-c816-11ee-ad1f-bcf4d4820bd6',
  '22b400d8-c816-11ee-88ba-bcf4d4820bd6',
  '22b400d9-c816-11ee-95af-bcf4d4820bd6',
  '22b400da-c816-11ee-b2fb-bcf4d4820bd6',
  '22b400db-c816-11ee-a846-bcf4d4820bd6',
  '22b400dc-c816-11ee-9ce0-bcf4d4820bd6',
  '22b400dd-c816-11ee-ba3d-