In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wiktionary appendix that lays out Roget's classification.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page as if it
# were the appendix.
r = requests.get("https://en.wiktionary.org/wiki/Appendix:Roget's_thesaurus_classification", timeout=30)
r.raise_for_status()
html = r.text
soup = BeautifulSoup(html, 'html.parser')

# Extracting the Classes, Divisions and Sections and the words under each section

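Write a script that extracts the classes, divisions, and sections from the page, along with the words under each section. The script should create a directory structure that mirrors the structure of the page and save the words under each section to a file. The directory structure should look like this:

```
data/
└── <Class>/
    ├── <Division>/            (only for classes that contain divisions)
    │   └── <Section>/
    │       └── <Section>.txt
    └── <Section>/
        └── <Section>.txt
```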


In [3]:
import os


def create_directory(path):
    """Create ``path`` (and any missing parents) unless it already exists.

    A "Creating directory: ..." line is printed only when the directory is
    actually created; an existing path is left untouched and nothing is printed.
    """
    if os.path.exists(path):
        return
    print(f"Creating directory: {path}")
    os.makedirs(path)


def save_words_to_file(words, file_path):
    """Write every word on its own line to ``file_path``, replacing any existing file.

    Args:
        words: Iterable of strings; they are written in iteration order, each
            followed by a newline.
        file_path: Destination file. Its parent directory must already exist.

    The file is always written as UTF-8 so that non-ASCII headwords do not
    depend on (or fail under) the platform's default encoding.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(word + '\n' for word in words)


base_dir = "data"
# Create the base directory if it doesn't exist
create_directory(base_dir)

# Attribute filter for the <span> that carries a heading's title and id.
HEADLINE_ATTRS = {'class': 'mw-headline'}


def _headline_of(heading):
    """Return the ``mw-headline`` span inside ``heading``, or None when there is none."""
    return heading.find('span', HEADLINE_ATTRS)


def _collect_words(heading, stop_tags):
    """Collect the words listed between ``heading`` and the next sibling in ``stop_tags``.

    Only sibling <dl> elements are inspected. For every <dd> that itself holds a
    nested <dl>, the text of the first <a> of each <dd> inside that nested list
    is taken as a word. Each hit is printed (duplicates included) and the
    distinct words are returned as a set.
    """
    words = set()
    node = heading.find_next_sibling()
    while node is not None and node.name not in stop_tags:
        if node.name == 'dl':
            for dd in node.find_all('dd', recursive=True):
                nested_list = dd.find('dl')
                if nested_list is None:
                    continue
                for nested_dd in nested_list.find_all('dd'):
                    first_a = nested_dd.find('a')
                    if first_a:
                        word = first_a.get_text()
                        print(f"        {word}")
                        words.add(word)
        node = node.find_next_sibling()
    return words


def _process_section(heading, headline, parent_dir, indent, stop_tags):
    """Create the section's directory under ``parent_dir`` and save its words there."""
    section_name = headline.get_text()
    print(f"{indent}{section_name}")
    section_dir = os.path.join(parent_dir, section_name)
    create_directory(section_dir)

    words = _collect_words(heading, stop_tags)
    file_path = os.path.join(section_dir, section_name + '.txt')
    # Sets iterate in an arbitrary order; sort so the file is identical on every run.
    save_words_to_file(sorted(words), file_path)


# Extract Classes
for class_title in soup.find_all("h2"):
    class_headline = _headline_of(class_title)
    if class_headline is None:
        # <h2> elements without a headline span are not classes.
        continue

    class_name = class_headline.get_text()
    print(class_name)
    class_dir = os.path.join(base_dir, class_name)
    create_directory(class_dir)

    next_element = class_title.find_next_sibling()
    while next_element is not None and next_element.name != "h2":
        if next_element.name == "h3":
            headline = _headline_of(next_element)
            if headline is None:
                # An <h3> without a headline span can be neither a division nor a
                # section; skip it instead of failing on a None dereference.
                pass
            elif 'Division' in headline.get('id', ''):
                # A division: its sections are the <h4> headings that follow,
                # up to the next <h2>/<h3>.
                division_name = headline.get_text()
                print(f"  {division_name}")
                division_dir = os.path.join(class_dir, division_name)
                create_directory(division_dir)

                next_section = next_element.find_next_sibling()
                while next_section is not None and next_section.name not in ("h2", "h3"):
                    if next_section.name == "h4":
                        section_headline = _headline_of(next_section)
                        if section_headline is not None:
                            _process_section(next_section, section_headline, division_dir,
                                             "    ", ("h2", "h3", "h4"))
                    next_section = next_section.find_next_sibling()
            else:
                # A section that hangs directly off the class (no division level).
                _process_section(next_element, headline, class_dir, "  ", ("h2", "h3"))

        next_element = next_element.find_next_sibling()

Class I. Words Expressing Abstract Relations
  Section I. Existence
        existence
        inexistence
        substantiality
        unsubstantiality
        intrinsicality
        extrinsicality
        state
        circumstance
  Section II. Relation
        relation
        irrelation
        consanguinity
        correlation
        identity
        contrariety
        difference
        uniformity
        nonuniformity
        similarity
        dissimilarity
        imitation
        nonimitation
        variation
        copy
        prototype
        agreement
        disagreement
  Section III. Quantity
        quantity
        degree
        equality
        inequality
        mean
        compensation
        greatness
        smallness
        superiority
        inferiority
        increase
        nonincrease
        addition
        nonaddition
        adjunct
        remainder
        decrement
        mixture
        simpleness
        junction
        disjunction

## Extract the words from Gutenberg

In [ ]:
# Download the Project Gutenberg HTML edition of Roget's Thesaurus.
# The timeout prevents an indefinite hang, and raise_for_status() makes an HTTP
# error fail here rather than produce an empty hierarchy further down.
r = requests.get("https://www.gutenberg.org/files/10681/old/20040627-10681-h-body-pos.htm", timeout=30)
r.raise_for_status()
html = r.text
soup = BeautifulSoup(html, 'html.parser')

In [ ]:
# Revised approach to extract the hierarchy of classes, divisions, sections, and words
import re

# Initialize a dictionary to hold the entire hierarchy:
#   {class: {'divisions': {division: {'sections': {section: [words]}}},
#            'sections': {section: [words]}}}
hierarchy = {}
current_class = None
current_division = None
current_section = None

# Anchor-name patterns, compiled once instead of on every <dt>.
# Raw strings keep "\d" from being treated as an (invalid) string escape.
CLASS_ANCHOR = re.compile(r"^CLASS")
DIVISION_ANCHOR = re.compile(r"^DIVISION")
SECTION_ANCHOR = re.compile(r"^SECTION")
WORD_ANCHOR = re.compile(r"^\d+$")


def _squash_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


dt_tags = soup.find_all('dt')

# Iterate through each <dt> tag
for dt in dt_tags:
    # Check for class
    class_a_tag = dt.find('a', attrs={'name': CLASS_ANCHOR})
    if class_a_tag:
        current_class = _squash_whitespace(class_a_tag.text)
        hierarchy[current_class] = {'divisions': {}, 'sections': {}}
        current_division = None
        current_section = None
        continue

    # Check for division
    division_a_tag = dt.find('a', attrs={'name': DIVISION_ANCHOR})
    if division_a_tag and current_class:
        current_division = _squash_whitespace(division_a_tag.text)
        hierarchy[current_class]['divisions'][current_division] = {'sections': {}}
        current_section = None
        continue

    # Check for section. A section seen before any class has nowhere to live;
    # without this guard it would raise KeyError on hierarchy[None].
    section_a_tag = dt.find('a', attrs={'name': SECTION_ANCHOR})
    if section_a_tag and current_class is not None:
        current_section = _squash_whitespace(section_a_tag.text)
        if current_division:
            hierarchy[current_class]['divisions'][current_division]['sections'][current_section] = []
        else:
            hierarchy[current_class]['sections'][current_section] = []
        continue

    # Check for words; they can only be filed once a section is open.
    if not current_section:
        continue
    if current_division:
        section_words = hierarchy[current_class]['divisions'][current_division]['sections'][current_section]
    else:
        section_words = hierarchy[current_class]['sections'][current_section]

    for word_a_tag in dt.find_all('a', attrs={'name': WORD_ANCHOR}):
        # The headword is taken from the first <b> that follows the numbered anchor.
        bold_tag = word_a_tag.find_next('b')
        word = _squash_whitespace(bold_tag.get_text()) if bold_tag else ''
        if word:
            # Skip anchors with no usable headword so no blank entries are stored.
            section_words.append(word)

## Create the directory structure and save the words to files

In [ ]:
import os


def _write_section_files(parent_path, sections):
    """Create one sub-directory per section and write its words to ``words.txt``."""
    for section_name, words in sections.items():
        section_path = os.path.join(parent_path, section_name)
        os.makedirs(section_path, exist_ok=True)
        words_file = os.path.join(section_path, 'words.txt')
        with open(words_file, 'w', encoding='utf-8') as file:
            file.write('\n'.join(words))


def create_directory_structure(base_path, hierarchy):
    """Mirror ``hierarchy`` on disk below ``base_path``.

    Args:
        base_path: Root directory under which one folder per class is created.
        hierarchy: Mapping of class name to a dict with optional 'divisions'
            (division name -> {'sections': {...}}) and 'sections'
            (section name -> list of words) entries.

    Each section becomes a directory holding a UTF-8 ``words.txt`` with the
    words joined by newlines (no trailing newline). Existing directories are
    reused and existing files are overwritten.
    """
    for class_name, class_content in hierarchy.items():
        class_path = os.path.join(base_path, class_name)
        os.makedirs(class_path, exist_ok=True)

        # Sections nested inside a division.
        divisions = class_content.get('divisions', {})
        for division_name, division_content in divisions.items():
            division_path = os.path.join(class_path, division_name)
            os.makedirs(division_path, exist_ok=True)
            _write_section_files(division_path, division_content.get('sections', {}))

        # Sections attached directly to the class.
        _write_section_files(class_path, class_content.get('sections', {}))


# Write the parsed Gutenberg hierarchy to disk. 'data2' is a path relative to the
# working directory; the dataset and document-loader cells further down read from
# the same folder, so change it there too if you change it here.
base_path = 'data2'  # Replace with the actual path
create_directory_structure(base_path, hierarchy)

## Get Word Embeddings

There are two ways in which we can get the embeddings for the words:

1) Get OpenAI embeddings through the OpenAI API (the `text-embedding-3-large` model is used below)
2) Use Ollama embeddings, more specifically from the latest Mistral-7B model

Getting the embeddings from the OpenAI API is straightforward: we send the words to the API and it returns their embeddings.


To get the Mistral embeddings, we first need to pull the Mistral model from Ollama onto our local machine, which must be a Linux machine. We can then use the model to get the embeddings for the words.


### Get the OpenAI embeddings using the API

In [ ]:
import openai
import os

# Read the API key from the OPENAI_API_KEY environment variable so that the secret
# never appears in the notebook.
# NOTE(review): os.environ.get returns None when the variable is unset, so a missing
# key is not reported here — presumably it only surfaces on the first API call.
api_key = os.environ.get("OPENAI_API_KEY")

# Hand the key to the openai module
openai.api_key = api_key




In [1]:
from langchain_community.embeddings import OllamaEmbeddings

# Embedding client for the Ollama "mistral" model.
# NOTE(review): this object is not referenced again in the visible cells — the
# Chroma cells construct their own OllamaEmbeddings instances.
embeddings = OllamaEmbeddings(model="mistral")

In [2]:
import tensorflow as tf



In [3]:
# Build a labelled, batched text dataset from the files under data2
# (the recorded output reports 39 files belonging to 6 classes).
# NOTE(review): validation_split=0 presumably means "no validation subset" —
# confirm against the Keras documentation.
dataset = tf.keras.preprocessing.text_dataset_from_directory(
    "data2",
    batch_size=32,
    seed=1234,
    validation_split=0
)

Found 39 files belonging to 6 classes.


In [ ]:
# Get the embeddings
# The embeddings endpoint expects the texts as a list of strings in `input`
# (not `inputs`) and the model name in `model`. A tf.data.Dataset cannot be sent
# in a request, so decode every document in the dataset to plain text first.
dataset_texts = [
    raw_text.decode('utf-8')
    for text_batch, _label_batch in dataset
    for raw_text in text_batch.numpy()
]

# NOTE(review): `openai.Embedding` is the pre-1.0 SDK interface; with openai>=1.0
# the equivalent call is `openai.embeddings.create(...)` — confirm the installed version.
response = openai.Embedding.create(
    model="text-embedding-3-large",
    input=dataset_texts
)


In [6]:
# Peek at the first three (text, label) pairs of a single batch.
for text_batch, label_batch in dataset.take(1):
    batch_texts = text_batch.numpy()
    batch_labels = label_batch.numpy()
    for sample_idx in (0, 1, 2):
        # Decode the raw bytes and flatten Windows line endings into spaces
        # so that each document prints on one line.
        sample = batch_texts[sample_idx].decode('utf-8').replace('\r\n', ' ')
        print(sample)
        print(batch_labels[sample_idx])

Cause Effect Attribution Chance Power Impotence Strength Weakness Production Destruction Reproduction Producer Destroyer Paternity Posterity Productiveness Unproductiveness Agency Physical Energy Physical Inertness Violence Moderation Influence Tendency Liability Concurrence Counteraction
0
Supposition Imagination
3
Relation Irrelation Consanguinity Correlation Identity Contrariety Difference Uniformity Similarity Dissimilarity Imitation Nonimitation Copy Prototype Agreement Disagreement
0


In [7]:
# Keep the dataset's class names; a later cell uses label indices to index into this list.
class_names = dataset.class_names

In [8]:
# Display the class names (a bare last expression is rendered as the cell output).
class_names

['WORDS EXPRESSING ABSTRACT RELATIONS',
 'WORDS RELATING TO MATTER',
 'WORDS RELATING TO SPACE',
 'WORDS RELATING TO THE INTELLECTUAL FACULTIES',
 'WORDS RELATING TO THE SENTIENT AND MORAL POWERS',
 'WORDS RELATING TO THE VOLUNTARY POWERS']

In [9]:
# Show three samples from one batch together with their human-readable class name.
for text_batch, label_batch in dataset.take(1):
    raw_texts, label_ids = text_batch.numpy(), label_batch.numpy()
    for position in range(3):  # Adjust the range as needed
        text = raw_texts[position].decode('utf-8').replace('\r\n', ' ')
        label_index = label_ids[position]
        print(f"Text: {text}")
        print(f"Label Index: {label_index}, Label Name: {class_names[label_index]}")


Text: Deity Angel Satan Jupiter Demon Heaven Hell Theology Heterodoxy Judeo-Christian Revelation Pseudo-Revelation Piety Impiety Irreligion Worship Idolatry Sorcery Spell Sorcerer Churchdom Clergy Laity Rite Canonicals Temple
Label Index: 4, Label Name: WORDS RELATING TO THE SENTIENT AND MORAL POWERS
Text: Change Permanence Cessation Continuance in action Conversion Reversion Revolution Substitution Interchange Changeableness Stability Eventuality Destiny
Label Index: 0, Label Name: WORDS EXPRESSING ABSTRACT RELATIONS
Text: Curiosity Incuriosity Attention Inattention Care Neglect Inquiry Answer Experiment Comparison Discrimination Measurement
Label Index: 3, Label Name: WORDS RELATING TO THE INTELLECTUAL FACULTIES


In [17]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every .txt file below data2 as a document, letting TextLoader
# auto-detect each file's encoding.
path = "data2"
text_loader_kwargs = dict(autodetect_encoding=True)
loader = DirectoryLoader(
    path,
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
    show_progress=True,
    use_multithreading=True,
)
docs = loader.load()

100%|██████████| 39/39 [00:00<00:00, 7797.96it/s]


In [23]:
# Display the loaded documents.
# NOTE(review): this renders every document; consider docs[:3] to keep the output short.
docs

[Document(page_content='Cause\nEffect\nAttribution\nChance\nPower\nImpotence\nStrength\nWeakness\nProduction\nDestruction\nReproduction\nProducer\nDestroyer\nPaternity\nPosterity\nProductiveness\nUnproductiveness\nAgency\nPhysical Energy\nPhysical Inertness\nViolence\nModeration\nInfluence\nTendency\nLiability\nConcurrence\nCounteraction', metadata={'source': 'data2\\WORDS EXPRESSING ABSTRACT RELATIONS\\CAUSATION\\words.txt'}),
 Document(page_content='Change\nPermanence\nCessation\nContinuance in action\nConversion\nReversion\nRevolution\nSubstitution\nInterchange\nChangeableness\nStability\nEventuality\nDestiny', metadata={'source': 'data2\\WORDS EXPRESSING ABSTRACT RELATIONS\\CHANGE\\words.txt'}),
 Document(page_content='Existence\nInexistence\nSubstantiality\nUnsubstantiality\nIntrinsicality\nExtrinsicality\nState\nCircumstance', metadata={'source': 'data2\\WORDS EXPRESSING ABSTRACT RELATIONS\\EXISTENCE\\words.txt'}),
 Document(page_content='Number\nNumeration\nList\nUnity\nAccompan

In [44]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used before embedding: chunks of at most 1000 units with no overlap,
# where length is measured with len() (i.e. in characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,  # separators are treated as plain strings, not regexes
    keep_separator=False
)

In [45]:
# Read a single txt file from the directory.
# The context manager closes the file handle, and the explicit encoding matches
# how create_directory_structure wrote the file (UTF-8).
with open("data2/WORDS EXPRESSING ABSTRACT RELATIONS/CAUSATION/words.txt", "r", encoding="utf-8") as words_file:
    text = words_file.read()


texts = text_splitter.create_documents([text])
print(texts[0])

page_content='Cause\nEffect\nAttribution\nChance\nPower\nImpotence\nStrength\nWeakness\nProduction\nDestruction\nReproduction\nProducer\nDestroyer\nPaternity\nPosterity\nProductiveness\nUnproductiveness\nAgency\nPhysical Energy\nPhysical Inertness\nViolence\nModeration\nInfluence\nTendency\nLiability\nConcurrence\nCounteraction'


In [51]:
from langchain_community.vectorstores.chroma import Chroma

# Split the loaded documents, embed the chunks with the Ollama "mistral" model,
# and store them in the "roget" Chroma collection persisted under ./chroma_db.

splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OllamaEmbeddings(model="mistral"),
    persist_directory="./chroma_db",
    collection_name="roget",
)

In [ ]:
# Re-open the persisted "roget" collection from ./chroma_db, using the same
# embedding model that was used when the collection was built.
vector_db = Chroma(
    persist_directory="./chroma_db",
    embedding_function=OllamaEmbeddings(model="mistral"),
    collection_name="roget",
)