In [31]:
import vector_database
import embedding
import wikipedia
import wikipediaapi

Titles of wikipedia articles that we want to have in our database:

In [45]:
import wikipediaapi

def get_articles_from_list_page(list_page, wiki):
    """
    Fetch all full articles linked from a list page, excluding redirects and section links.
    
    Args:
        list_page (wikipediaapi.WikipediaPage): The list page to process.
        wiki (wikipediaapi.Wikipedia): The Wikipedia API object.
    
    Returns:
        list: A list of article titles linked from the list page.
    """
    linked_articles = []
    for link_title, link_page in list_page.links.items():
        if link_page.ns == 0 and "#" not in link_title:
            linked_articles.append(link_title)
    return linked_articles

def get_category_members(category_name, wiki, visited_categories=None):
    """
    Recursively fetch articles and subcategories from a Wikipedia category, 
    including list pages, excluding redirects and section links.
    
    Args:
        category_name (str): The name of the Wikipedia category.
        wiki (wikipediaapi.Wikipedia): The Wikipedia API object.
        visited_categories (set): Tracks visited categories to avoid duplicates and loops.
    
    Returns:
        list: A list of article titles in the category and its subcategories.
    """
    if visited_categories is None:
        visited_categories = set()

    # Avoid re-visiting categories
    if category_name in visited_categories:
        return []
    visited_categories.add(category_name)

    category_page = wiki.page(f"Category:{category_name}")
    if not category_page.exists():
        print(f"Category '{category_name}' does not exist.")
        return []

    articles = []
    subcategories = []
    list_pages = []

    # Iterate over category members
    for member in category_page.categorymembers.values():
        if member.ns == 0:  # Namespace 0 indicates an article
            if member.title.startswith("List of"):
                list_pages.append(member)  # Treat "List of..." pages separately
            elif "#" not in member.title:  # Exclude redirects and section links
                articles.append(member.title)
        elif member.ns == 14:  # Namespace 14 indicates a subcategory
            subcategories.append(member.title.replace("Category:", ""))

    # Process list pages to extract linked articles
    for list_page in list_pages:
        articles += get_articles_from_list_page(list_page, wiki)

    # Recursively process subcategories
    for subcategory in subcategories:
        articles += get_category_members(subcategory, wiki, visited_categories)

    return articles

In [46]:
# Initialize Wikipedia API
wiki = wikipediaapi.Wikipedia("NLP WUT 2024")  # English Wikipedia

# Starting category
start_category = "Star Wars"

print(f"Fetching all articles in the '{start_category}' category...")
all_articles = get_category_members(start_category, wiki)
print(f"Found {len(all_articles)} articles.")

Fetching all articles in the 'Star Wars' category...
Found 36956 articles.


In [50]:
all_articles = list(set(all_articles))
len(all_articles)

9394

In [None]:
# sometimes we get duds and/or duplicates, let's clean that
[article for article in all_articles if 'Joruus' in article]

["Joruus c'baoth",
 'Joruus cbaoth',
 "Joruus C'Baoth",
 'Joruus',
 "Joruus C'baoth"]

In [None]:
# an interesting hack I found -- there must be a better way to do this! (JS)
page = wiki.page("Joruus c'baoth")
print(page.title)
page.summary
print(page.title)

Joruus c'baoth
List of Star Wars Legends characters


In [83]:
all_articles_true_titles = []
for i in range(len(all_articles)):
    if i % 1000 == 0:
        print(f"{i}/{len(all_articles)} articles processed...")
    page = wiki.page(all_articles[i])
    page.summary # calling summary makes it forget its a redirect
    all_articles_true_titles.append(page.title)

all_articles_true_titles = list(set(all_articles_true_titles))

0/9394 articles processed...
1000/9394 articles processed...
2000/9394 articles processed...
3000/9394 articles processed...
4000/9394 articles processed...
5000/9394 articles processed...
6000/9394 articles processed...
7000/9394 articles processed...
8000/9394 articles processed...
9000/9394 articles processed...


In [84]:
len(all_articles_true_titles)

4944

In [35]:
all_articles

['Star Wars',
 'Anakin (given name)',
 'Battle of Yavin',
 'Clone Wars',
 'Death Star (business)',
 'Han (trilobite)',
 'Journey to Star Wars',
 'Kylo (given name)',
 'Charley Lippincott',
 'Lucasfilm Ltd v Ainsworth',
 'Meco',
 'Ricinus vaderi',
 'Skywalker hoolock gibbon',
 'Star Wars sources and analogues',
 'Star Wars opening crawl',
 'Star Wars religion',
 'Star Wars: Force for Change',
 'Star Wars: Duel of the Fates',
 'Trigonopterus yoda',
 'Xenokeryx',
 'Yoda purpurata',
 'Zweibrücken Observatory',
 'List of Star Wars air, aquatic, and ground vehicles',
 'List of Star Wars artists',
 'List of Star Wars books',
 'List of Star Wars reference books',
 'List of box office records set by Star Wars: The Force Awakens',
 'Lists of Star Wars actors',
 'Changes in Star Wars re-releases',
 'List of Star Wars comic books',
 'List of Star Wars creatures',
 'List of Star Wars film actors',
 'List of Star Wars filming locations',
 'List of Star Wars films',
 'List of Star Wars video game act

In [8]:
page_titles = [
    "United States",
    "Python (programming language)",
    "Blueberry",
    "Donald Tusk",
    "Vector space",
    "Necktie",
    "Sushi",
    "Bicycle",
    "Computer",
    "Horse",
    "Jupiter",
    "Ordovician",
    "Piano",
    "2006 FIFA World Cup final",
    "Tennis",
    "Albus Dumbledore",
    "The Beatles",
    "World War I",
    "World War II",
    "Durum wheat"
]

Database name should end in "db", so then it is ignored by git:

In [9]:
database_name = "wiki_db"

This cell may take some time:

In [None]:
embeddings = []
metadata = []

# initialize empty - no folder exists yet
db = vector_database.VectorDatabaseWraper()

for page_title in page_titles:
    section_titles = wikipedia.get_section_titles(page_title)
    for section_title in section_titles:

        # check if the section is already in the databases
        if db.has_record({"page_title": page_title, "section_title": section_title}):
            print(f"Skipping {page_title} - {section_title}")
            continue
        section_text = wikipedia.get_section_text(page_title, section_title)

        embeddings.extend(embedding.embedding([section_text]))

        metadata.append({
            "page_title": page_title,
            "section_title": section_title
        })
    print(f"Loaded embeddings for {page_title}")
    

Loaded embeddings for United States
Loaded embeddings for Python (programming language)
Loaded embeddings for Blueberry
Loaded embeddings for Donald Tusk
Loaded embeddings for Vector space
Loaded embeddings for Necktie
Loaded embeddings for Sushi
Loaded embeddings for Bicycle
Loaded embeddings for Computer
Loaded embeddings for Horse
Loaded embeddings for Jupiter
Loaded embeddings for Ordovician
Loaded embeddings for Piano
Loaded embeddings for 2006 FIFA World Cup final
Loaded embeddings for Tennis
Loaded embeddings for Albus Dumbledore
Loaded embeddings for The Beatles
Loaded embeddings for World War I
Loaded embeddings for World War II
Loaded embeddings for Durum wheat


In [29]:
[dic for dic in db.metadata if dic['page_title'] == 'United States']

[{'page_title': 'United States', 'section_title': 'Etymology'},
 {'page_title': 'United States', 'section_title': 'History'},
 {'page_title': 'United States', 'section_title': 'Geography'},
 {'page_title': 'United States', 'section_title': 'Government and politics'},
 {'page_title': 'United States', 'section_title': 'Economy'},
 {'page_title': 'United States', 'section_title': 'Demographics'},
 {'page_title': 'United States', 'section_title': 'Culture and society'}]

In [25]:
db.add(embeddings, metadata)
db.save(database_name)