In [7]:
import vector_database
import embedding
import wikipedia

Titles of the Wikipedia articles that we want to store in our database:

In [8]:
# Wikipedia page titles to ingest, one per line. Each title is passed verbatim
# to the wikipedia helper module, so the spelling (including any parenthesised
# disambiguation suffix) must match the article title.
page_titles = [
    "United States",
    "Python (programming language)",
    "Blueberry",
    "Donald Tusk",
    "Vector space",
    "Necktie",
    "Sushi",
    "Bicycle",
    "Computer",
    "Horse",
    "Jupiter",
    "Ordovician",
    "Piano",
    "2006 FIFA World Cup final",
    "Tennis",
    "Albus Dumbledore",
    "The Beatles",
    "World War I",
    "World War II",
    "Durum wheat",
]

The database name should end in "db" so that it is ignored by git:

In [9]:
# Name passed to db.save() below; keep the "db" suffix so the saved database
# stays ignored by git.
database_name = "wiki_db"

This cell may take some time to run, because it fetches and embeds every section of every article:

In [None]:
# Accumulators filled in lock-step: every processed section contributes one
# metadata entry and whatever embedding.embedding() returns for its text
# (presumably a single vector per input text - TODO confirm).
embeddings = []
metadata = []

# Start from a brand-new wrapper; nothing has been saved to disk at this point.
db = vector_database.VectorDatabaseWraper()

for page_title in page_titles:
    for section_title in wikipedia.get_section_titles(page_title):
        # Identifies one section of one page; used both for the duplicate
        # lookup and as the stored metadata entry.
        record = {"page_title": page_title, "section_title": section_title}

        if db.has_record(record):
            # The database already knows this section, so do not fetch or
            # embed it again.
            print(f"Skipping {page_title} - {section_title}")
        else:
            section_text = wikipedia.get_section_text(page_title, section_title)
            section_vectors = embedding.embedding([section_text])

            embeddings.extend(section_vectors)
            metadata.append(record)

    # Reported once per page, after all of its sections were visited.
    print(f"Loaded embeddings for {page_title}")
    

Loaded embeddings for United States
Loaded embeddings for Python (programming language)
Loaded embeddings for Blueberry
Loaded embeddings for Donald Tusk
Loaded embeddings for Vector space
Loaded embeddings for Necktie
Loaded embeddings for Sushi
Loaded embeddings for Bicycle
Loaded embeddings for Computer
Loaded embeddings for Horse
Loaded embeddings for Jupiter
Loaded embeddings for Ordovician
Loaded embeddings for Piano
Loaded embeddings for 2006 FIFA World Cup final
Loaded embeddings for Tennis
Loaded embeddings for Albus Dumbledore
Loaded embeddings for The Beatles
Loaded embeddings for World War I
Loaded embeddings for World War II
Loaded embeddings for Durum wheat


In [29]:
# Show the metadata entries stored for the "United States" page.
# NOTE(review): this cell is placed before the cell that calls db.add(), but
# its execution count is later; on a fresh top-to-bottom run it presumably
# displays an empty list - consider moving it below the add/save cell.
[
    record
    for record in db.metadata
    if record['page_title'] == 'United States'
]

[{'page_title': 'United States', 'section_title': 'Etymology'},
 {'page_title': 'United States', 'section_title': 'History'},
 {'page_title': 'United States', 'section_title': 'Geography'},
 {'page_title': 'United States', 'section_title': 'Government and politics'},
 {'page_title': 'United States', 'section_title': 'Economy'},
 {'page_title': 'United States', 'section_title': 'Demographics'},
 {'page_title': 'United States', 'section_title': 'Culture and society'}]

In [25]:
# Hand the collected embeddings and their metadata entries to the database
# wrapper in a single call.
# NOTE(review): re-running this cell without re-running the loop cell above
# presumably adds the same records a second time - confirm whether add()
# deduplicates.
db.add(embeddings, metadata)
# Save the database under the configured name (presumably written to a folder
# with that name - verify against vector_database).
db.save(database_name)