In [22]:
#Function to get the wikipedia article
from wikipydia import dict_storage, wikidb, parse, wikisyn
from wiki_sections import get_article_obj
# Shared WikiDb handle used by get_article() to look up an article by href
# and then call save() on the database.
wiki_db = wikidb.WikiDb()

def get_article(href):
    """Look up the article for ``href`` and return it as an article object.

    The article is obtained from ``wiki_db.get_article_by_href``; the second
    element of the pair it returns is ignored. ``wiki_db.save()`` is called
    right after the lookup, then the article's title and HTML are handed to
    ``get_article_obj``, whose result is returned.
    """
    article, _ignored = wiki_db.get_article_by_href(href)
    wiki_db.save()

    title = article.title()
    html = article.html()
    return get_article_obj(title, html)

In [32]:
#Function to get the keywords from a piece of text

from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# Tokens that get_text_keywords() discards: the NLTK English stop words, every
# single character in string.punctuation, and the two quote tokens "''" and "``"
# (presumably what word_tokenize emits for double quotes -- confirm).
stop_words = set(stopwords.words('english')).union(set(punctuation)).union(set(["''", "``"]))

def get_text_keywords(text, n=10):
    """Return the ``n`` most frequent meaningful tokens of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens of
    length one or less and tokens present in ``stop_words`` are dropped.
    The result is a list of tokens only (no counts), most frequent first.
    """
    frequencies = Counter(
        token
        for token in word_tokenize(text.lower())
        if len(token) > 1 and token not in stop_words
    )

    return [keyword for keyword, _ in frequencies.most_common(n)]

#get_text_keywords("Linear algebra is the branch of mathematics concerning vector spaces and linear mappings between such spaces. It includes the study of lines, planes, and subspaces, but is also concerned with properties common to all vector spaces.")

In [21]:
#Function to search wikipedia articles

import requests

def search_wikipedia(term, lang="en", timeout=60):
    """Search Wikipedia articles and return the titles of the matches.

    Issues a ``list=search`` query (``srwhat=text``, ``srlimit=500``) against
    the MediaWiki API of the ``lang`` Wikipedia.
    API reference: https://www.mediawiki.org/wiki/API:Search

    Args:
        term: Search expression. It may contain any characters; it is
            URL-encoded before being sent.
        lang: Wikipedia language subdomain (e.g. ``"en"``).
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        List of article titles, in the order the API returned them.

    Raises:
        KeyError: If the response has no ``query``/``search`` entry.
    """
    wikipedia_api_url = "https://" + lang + ".wikipedia.org/w/api.php"

    # Hand the parameters to requests instead of concatenating them into the
    # URL: a raw term containing "&", "#", "+" or "=" would otherwise be
    # interpreted as query-string syntax and corrupt the request.
    # Flag-style parameters ("utf8", "srprop") are sent with an empty value.
    req_params = {
        'action': 'query',
        'utf8': '',
        'list': 'search',
        'format': 'json',
        'srwhat': 'text',
        'srprop': '',
        'srlimit': '500',
        'srsearch': term,
    }

    page_data = requests.get(wikipedia_api_url, params=req_params, timeout=timeout).json()

    return [result['title'] for result in page_data['query']['search']]

#search_wikipedia("neural")

In [118]:
# Short alias for the DictStorage class of the dict_storage module.
DictStorage = dict_storage.DictStorage
# Store named "wiki_links": get_wikipedia_links() keeps the link set of each
# page here, keyed by lower-cased title, and calls save() after adding entries
# (presumably persisted between sessions -- confirm against DictStorage).
wiki_links = DictStorage("wiki_links")
from collections import defaultdict

def get_wikipedia_links(pages):
    """Return the outgoing links of each page, using ``wiki_links`` as a cache.

    Pages that are not in ``wiki_links`` yet are fetched with
    ``query_wikipedia_links``, stored, and ``wiki_links.save()`` is called
    once. Pages already present are served from the store without a request.

    Args:
        pages: Iterable of page titles. Underscore and space forms
            ("Deep_learning" / "Deep learning") and different letter case
            refer to the same cache entry.

    Returns:
        Dict mapping every requested page (spelled exactly as given) to the
        links stored for it.

    Raises:
        KeyError: If a requested page is still absent from the store after
            the download (the API answered under an unrelated title or did
            not answer for that page).
    """
    def _cache_key(title):
        # Titles come back from the API with spaces, while callers may pass
        # the href form with underscores; fold both into one key so the
        # lookup below finds what the download stored.
        return title.replace("_", " ").lower()

    pages_to_download = list()
    pending_keys = set()

    for page in pages:
        key = _cache_key(page)
        # Skip cached pages and pages already queued under another spelling.
        if key not in wiki_links and key not in pending_keys:
            pending_keys.add(key)
            pages_to_download.append(page)

    # If there are pages not locally present, download and save their data.
    if pages_to_download:
        downloaded_links = query_wikipedia_links(pages_to_download)
        for title, page_links in downloaded_links.items():
            wiki_links[_cache_key(title)] = page_links
        wiki_links.save()

    return {page: wiki_links[_cache_key(page)] for page in pages}
    
def query_wikipedia_links(pages, lang="en", timeout=60):
    """Collect all links of ``pages``, following API continuation.

    Calls ``partial_query_wikipedia_links`` repeatedly, feeding back the
    ``plcontinue`` token it returns, until no token is returned. The links
    of every batch are merged per page title. A progress line is printed
    before each request.

    Args:
        pages: Page titles to query in a single (continued) request.
        lang: Wikipedia language subdomain, forwarded to each request.
        timeout: Timeout in seconds, forwarded to each request.

    Returns:
        ``defaultdict(set)`` mapping each returned page title to the set of
        linked titles.

    NOTE(review): all titles go into one request; the MediaWiki API is
    believed to cap the number of titles per request, so very long ``pages``
    lists may need batching -- confirm.
    """
    links = defaultdict(set)
    plcontinue = None

    while True:
        print("Querying plcontinue: " + str(plcontinue))

        # lang/timeout must be passed on explicitly; otherwise the callee
        # silently falls back to its own defaults.
        partial_links, plcontinue = partial_query_wikipedia_links(
            pages, plcontinue, lang=lang, timeout=timeout)

        for title, link_set in partial_links.items():
            links[title].update(link_set)

        if not plcontinue:
            break

    return links
        

def partial_query_wikipedia_links(pages, plcontinue=None, lang="en", timeout=60):
    """Fetch one batch of main-namespace links for ``pages``.

    Performs a single ``prop=links`` query (``plnamespace=0``,
    ``pllimit=500``) against the MediaWiki API of the ``lang`` Wikipedia.
    Example of the underlying request:
    https://en.wikipedia.org/w/api.php?action=query&titles=MQTT&prop=links&pllimit=500

    Args:
        pages: Page titles; they are joined with "|" into the ``titles``
            parameter.
        plcontinue: Continuation token from a previous call, or ``None`` to
            start from the beginning.
        lang: Wikipedia language subdomain.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        Tuple ``(links, plcontinue_param)``: ``links`` maps each page title
        in the response to the set of link titles in this batch (an empty set
        when the page entry has no ``links``); ``plcontinue_param`` is the
        token for the next batch, or ``None`` when there is none.

    Raises:
        KeyError: If the response has no ``query``/``pages`` entry.
    """
    wikipedia_api_url = "https://" + lang + ".wikipedia.org/w/api.php"

    # Let requests encode the values: titles and continuation tokens can
    # contain "&", "+", "#" etc. (e.g. "AT&T"), which would corrupt a
    # hand-concatenated query string.
    req_params = {
        'action': 'query',
        'utf8': '',
        'prop': 'links',
        'plnamespace': '0',
        'format': 'json',
        'pllimit': '500',
        'titles': "|".join(pages),
    }

    if plcontinue:
        req_params['plcontinue'] = plcontinue

    page_data = requests.get(wikipedia_api_url, params=req_params, timeout=timeout).json()

    # Token for the next batch, if the response announces one.
    plcontinue_param = page_data.get('continue', {}).get('plcontinue')

    links = dict()
    for data in page_data['query']['pages'].values():
        # Entries without "links" (e.g. nonexistent pages) yield an empty set.
        links[data["title"]] = {link['title'] for link in data.get('links', [])}

    return links, plcontinue_param
        
    
def test_query_wikilinks1():
    
    test_query_links = get_wikipedia_links(['Deep learning',
      'Feature learning',
      'Artificial neural network',
      'Convolutional neural network',
      'Recurrent neural network',
      'Machine learning',
      'Autoencoder',
      'Hierarchical temporal memory',
      'Pattern recognition',
      'Sepp Hochreiter',
      'sofuasduasjd',
      "MQTT",
      "Linear algebra"])

    print("")
    for page, links in test_query_links.items():
        print(page, len(links))
        
def test_query_wikilinks2():
    print(wiki_links.items())
    
#test_query_wikilinks1()

Querying plcontinue: None

Artificial neural network 344
Pattern recognition 201
Recurrent neural network 187
Machine learning 314
Linear algebra 240
Autoencoder 94
MQTT 76
Hierarchical temporal memory 54
sofuasduasjd 0
Deep learning 397
Feature learning 115
Sepp Hochreiter 125
Convolutional neural network 175


In [28]:
def get_sections_text(href):
    """Return the text of each section of the article at ``href``.

    Iterates over the object returned by ``get_article(href)``, converts each
    item to ``str`` and replaces newlines with single spaces, so every section
    becomes one line of text. The result is a list in iteration order.
    """
    return [str(section).replace("\n", " ") for section in get_article(href)]

#len(get_sections_text("MQTT"))

In [37]:
def get_wiki_flow(href):
    """Suggest related Wikipedia articles for each section of an article.

    Steps:
      1. Get the text of every section of the article at ``href``.
      2. Extract the keywords of each section.
      3. Search Wikipedia with each section's keywords (joined by spaces)
         and keep at most the first 10 titles.

    Args:
        href: Article href, e.g. ``"Deep_learning"``.

    Returns:
        List with one entry per section, in section order; each entry is a
        list of up to 10 candidate article titles. A section that yields no
        keywords gets an empty list.
    """
    # 1. Get article sections text
    sections_text = get_sections_text(href)

    # 2. Get sections keywords
    sections_keywords = [get_text_keywords(sec_text) for sec_text in sections_text]

    # 3. Get article suggestions for each section
    sections_articles_candidates = list()
    for sec_keywords in sections_keywords:
        if not sec_keywords:
            # Nothing to search for: an empty search term is not a meaningful
            # query, so skip the request and record no candidates.
            sections_articles_candidates.append([])
            continue
        search_term = " ".join(sec_keywords)
        sections_articles_candidates.append(search_wikipedia(search_term)[:10])

    # TODO: get the links from each candidate article, then try to find an
    # optimal flow following the links, starting from the last article and
    # going back to the first.
    return sections_articles_candidates
        
get_wiki_flow("Deep_learning")

[['Deep learning',
  'Feature learning',
  'Artificial neural network',
  'Convolutional neural network',
  'Recurrent neural network',
  'Machine learning',
  'Autoencoder',
  'Hierarchical temporal memory',
  'Pattern recognition',
  'Sepp Hochreiter'],
 ['Feature learning',
  'Deep learning',
  'Machine learning',
  'Autoencoder',
  'Hierarchical temporal memory',
  'Convolutional neural network',
  'Pattern recognition',
  'M-Theory (learning framework)',
  'Sepp Hochreiter',
  'Artificial intelligence'],
 ['Deep learning', 'Bayesian network'],
 ['Deep learning',
  'Artificial neural network',
  'Convolutional neural network',
  'Recurrent neural network',
  'Speech recognition',
  'Vanishing gradient problem',
  'Feature learning',
  'Pattern recognition',
  'Machine learning',
  'Multilayer perceptron'],
 ['Artificial neural network',
  'Deep learning',
  'Convolutional neural network',
  'Recurrent neural network',
  'Convolutional Deep Belief Networks',
  'Multilayer perceptron