In [None]:
# THE WIKI GAME
# https://www.thewikigame.com
#
# A player starts on a given wikipedia page, and has to navigate to another wikipedia page only be clicking the hyperlinks within the article.
# The code below can help a player navigate quickly from one page to another by using various transformer models

# (2022)

In [12]:
# Packages needed

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# articles to start and end on
# Feel free to change them :D

start_article = 'Amazon River'
end_article = 'Emotion'

In [13]:
def getLinksFromTextBS(start_article):
    page_titles = []
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'page': start_article,
        'format': 'json',
        'prop': 'text',
        'redirects': ''
    }

    filter_sections = ['See also',
                       'References',
                       'External links',
                       'Further reading',
                       'Notes']

    response = requests.get(url, params=params)
    data = response.json()

    raw_html = data['parse']['text']['*']
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Get all the section names
    allSections = soup.find_all(class_='mw-headline')

    sectionNames = []

    for section in allSections:
        sectionNames.append(section.get_text())

    sectionNames = [x for x in sectionNames if x not in filter_sections]

    # Get links from summary section up to the Table of Contents
    target = soup.find(class_='mw-parser-output')
    if target:
        for sib in target.find_all_next():
            # Only look in current section, end if hitting next section
            # print(sib)
            if sib.name == "h2":
                break
            elif 'title' in sib.attrs and 'Edit this' in sib.attrs['title']:
                # Don't include the hrefs to edit the pages
                continue
            else:
                # Check if href contains internal /wiki/ path
                check1 = 'href' in sib.attrs and \
                         '/wiki/' in sib.attrs['href']
                # Check if tag contains class mw-redirect
                check2 = 'class' in sib.attrs and \
                         'mw-redirect' in sib.attrs['class']
                if (check1 or check2) and 'title' in sib.attrs and 'wiktionary' not in sib.attrs['title']:
                    page_titles.append(sib.attrs['title'])

    # Get all the links from each of the relevant sections
    for thisSection in sectionNames:
        # print('==--------' + thisSection + '--------==')
        target = soup.find(class_='mw-headline', id=thisSection.replace(' ', '_'))
        if target:
            for sib in target.find_all_next():
                # Only look in current section, end if hitting next section
                # print(sib)
                if sib.name == "h2":
                    break
                else:
                    # Check if href contains internal /wiki/ path
                    check1 = 'href' in sib.attrs and \
                    '/wiki/' in sib.attrs['href']
                    # Check if tag contains class mw-redirect
                    check2 = 'class' in sib.attrs and \
                        'mw-redirect' in sib.attrs['class']
                    if (check1 or check2) and 'title' in sib.attrs:
                        page_titles.append(sib.attrs['title'])
    # Unique pages only
    page_titles = list(set(page_titles))
    return page_titles

In [14]:
# for the following
# start_article = 'Amazon River'
# end_article = 'Emotion'

links = getLinksFromTextBS('Amazon River')
print(links)

['Kawahíb people', 'Iquitos, Peru', 'Indo-Iranian languages', 'Ecology', 'Solimões', 'Casiquiare canal', 'Muisca economy', 'Leticia, Amazonas', 'Óbidos, Brazil', 'Inca Empire', 'Amazons', 'Pororoca', 'Pinzón Island', 'Tonantins River', 'Freshwater fish', 'Bay', 'Ecosystem', 'Agriculture', 'National Geographic', 'Estuary', 'Nile', 'British English', 'Marañón River', 'Ecosystem collapse', 'Amazônia National Park', 'Quito', 'Orinoco', 'Ticuna', 'Lake Titicaca', 'Huallaga River', 'Pardo', 'Resin', 'Breves, Pará', 'Colonization', 'White Brazilian', 'List of rivers by discharge', 'Pracuúba', 'Ethnonym', 'Catfish', 'Meeting of Waters', 'Cretaceous', 'Tambaqui', 'Anabranch', 'Sea level', 'Piranha', 'Juruá River', 'Pre-Columbian era', 'Geology', 'Bridge', 'Juruá', 'Category:Birds of the Amazon rainforest', 'Caquetá River', 'Carl Friedrich Philipp von Martius', 'International Union for Conservation of Nature', 'Midden', 'Category:Trees of the Amazon rainforest', 'Negro River (Amazon)', 'Vaupés R

In [15]:
# Packages for Word Similarity Check
from sentence_transformers import SentenceTransformer, util
from nltk import sent_tokenize
import numpy as np

# wordScore takes in the list of words from a wikipedia link and the name of the goal page.
def wordScore(word1, word2, model, logs=0):
    if model == 'bert':
        model = SentenceTransformer('bert-base-nli-mean-tokens')
    elif model == 'roberta':
        model = SentenceTransformer('all-distilroberta-v1')
    elif model == 'microsoftNet':
        model = SentenceTransformer('all-mpnet-base-v2')
    elif model == 'L12':
        model = SentenceTransformer('all-MiniLM-L12-v2')
    elif model == 'L6':
        model = SentenceTransformer('all-MiniLM-L6-v2')

    cosine_scores = []

    # Make word 1 to be an equal length to word 2
    word1_arr = [word1] * len(word2)

    embeddings1 = model.encode(word1_arr)
    embeddings2 = model.encode(word2)

    # compute cosine_scores
    cosine_matrix = util.cos_sim(embeddings1, embeddings2)
    cosine_scores = cosine_matrix.diagonal()

    scores = cosine_scores.tolist()

    if logs == 1:
        # create dataframe
        df_wiki = pd.DataFrame(list(zip(word2, scores)),
               columns =['target', 'weight'])

        # sort dataframe and reorganize index
        df_wiki_org = df_wiki.sort_values(by=['weight'], ascending=False)
        df_wiki_org = df_wiki_org.reset_index(drop=True)

        print(df_wiki_org)

    # convert the tensor array to a list and return
    return scores

In [16]:
# for the following
# start_article = 'Amazon River'
# end_article = 'Emotion'

word1 = 'Emotion'
word2 = ['Karma', 'Pará', 'Fur', 'Feather', 'Bridge']
model = "L6"
scores = wordScore(word1, word2, model, 1)

    target    weight
0    Karma  0.421835
1     Pará  0.335476
2      Fur  0.328533
3  Feather  0.328417
4   Bridge  0.317793


In [17]:
# check page views
# If a wiki page didnt have a lot of page veiws, ignore the game.

def getPageViews(start_article):
    start_article_formatted = start_article.replace(" ","_")
    start_article_formatted
    links = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/{}/monthly/20221101/20221201".format(start_article_formatted)

    header={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }

    response = requests.get(links, headers=header).json()
    page_views = response['items'][0]['views']
    return page_views

In [18]:
print(getPageViews('Karma'))

60968


In [19]:
def playWikiGame(head, final, model):
    steps = 0
    pages = [head]
    pages_visited = []

    tic = time.perf_counter()

    # forces the words to be the same case
    while not str.lower(head) == str.lower(final):
        result_links = getLinksFromTextBS(head)
        len_target = len(final)

        scores = wordScore(final, result_links, model)

        # create dataframe
        df_wiki = pd.DataFrame(list(zip(result_links, scores)),
               columns =['target', 'weight'])

        # sort dataframe and reorganize index
        df_wiki_org = df_wiki.sort_values(by=['weight'], ascending=False)
        df_wiki_org = df_wiki_org.reset_index(drop=True)

        # if the page has already been searched, drop it from the current links
        while df_wiki_org['target'][0] in pages:
            df_wiki_org = df_wiki_org.drop(labels=0, axis=0)
            df_wiki_org = df_wiki_org.reset_index(drop=True)

        head = df_wiki_org['target'][0]
        pages.append(head)
        print(head)
        steps = steps + 1
        pages_visited.append(head)

    toc = time.perf_counter()

    print(model + " took " + str(toc - tic) + " seconds to complete")
    return steps, pages_visited, str(toc - tic)

In [20]:
# Identify Starting Page
#start_article = 'Amazon River'
#end_article = 'Emotion'

current_article = start_article

print("\nRunning MiniLM-L6")
steps_l6, pages_visited_l6, time_l6 = playWikiGame(start_article, end_article, 'L6')
print("L6: It took " + str(steps_l6) + " links to get from " + start_article + " to " + end_article)

print("\nRunning MiniLM-L12")
steps_l12, pages_visited_l12, time_l12 = playWikiGame(start_article, end_article, 'L12')
print("L12: It took " + str(steps_l12) + " links to get from " + start_article + " to " + end_article)

print("\nRunning MPNet")
steps_micro, pages_visited_micro, time_micro = playWikiGame(start_article, end_article, 'microsoftNet')
print("Microsoft: It took " + str(steps_micro) + " links to get from " + start_article + " to " + end_article)

print("\nRunning Bert")
steps_bert, pages_visited_bert, time_bert = playWikiGame(start_article, end_article, 'bert')
print("Bert: It took " + str(steps_bert) + " links to get from " + start_article + " to " + end_article)

print("\nRunning Roberta")
steps_roberta, pages_visited, time_roberta = playWikiGame(start_article, end_article, 'roberta')
print("Roberta: It took " + str(steps_roberta) + " links to get from " + start_article + " to " + end_article)

print("\nL6:        It took " + str(steps_l6) + " links to get from " + start_article + " to " + end_article +  " in " + time_l6)
print("L12:       It took " + str(steps_l12) + " links to get from " + start_article + " to " + end_article +  " in " + time_l12)
print("Microsoft: It took " + str(steps_micro) + " links to get from " + start_article + " to " + end_article +  " in " + time_micro)
print("Bert:      It took " + str(steps_bert) + " links to get from " + start_article + " to " + end_article +  " in " + time_bert)
print("Roberta:   It took " + str(steps_roberta) + " links to get from " + start_article + " to " + end_article +  " in " + time_roberta)


Running MiniLM-L6
Karma
Psychoanalysis
Regressive emotionality
Emotion
L6 took 8.705769099999998 seconds to complete
L6: It took 4 links to get from Amazon River to Emotion

Running MiniLM-L12
Karma
Oceanic feeling
Positive psychology
Emotions
Negative emotion
Emotion
L12 took 15.757753100000059 seconds to complete
L12: It took 6 links to get from Amazon River to Emotion

Running MPNet
Karma
Spirituality
Happiness
Emotion
microsoftNet took 30.05556189999993 seconds to complete
Microsoft: It took 4 links to get from Amazon River to Emotion

Running Bert
Sediment
Concretion
Lithification
Pressure
Force
Motion
Vibration
Friction
Energy
Atmosphere
Terrain
Fluvial
Flow velocity
Velocity
Area
Shapes
Material
Forces
Dynamic pressure
Irrotational flow
Compressible flow
Compressibility
Fluid
Matter
Mass
Resonance
Attenuation
Phenomenon
Phenomenalism
Phenomena
Observation
Perception
Emotions
Emotional expression
Emotional intelligence
Emotion
bert took 138.31370530000004 seconds to complete
Ber