In [23]:
#Function to get the wikipedia article
from wikipydia import dict_storage, wikidb, parse, wikisyn
from wiki_sections import get_article_obj
# Module-level WikiDb handle; get_article() below reads articles through it
# and calls its save() method when an article had to be downloaded.
wiki_db = wikidb.WikiDb()

def get_article(href):
    """Fetch the article for ``href`` through ``wiki_db`` and wrap it with ``get_article_obj``.

    ``wiki_db.get_article_by_href`` returns the article together with a flag.
    When that flag is truthy the database is saved and a notice is printed.

    Returns whatever ``get_article_obj(title, html)`` produces for the article.
    """
    lookup = wiki_db.get_article_by_href(href)
    page, was_downloaded = lookup

    if was_downloaded:
        # Persist the freshly fetched article so the next lookup is local.
        wiki_db.save()
        print("Db updated.")

    page_title = page.title()
    page_html = page.html()
    return get_article_obj(page_title, page_html)

In [24]:
#Function to get the keywords from a piece of text

from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# Tokens that get_text_keywords() ignores: NLTK's English stopword list, every
# single character in string.punctuation, and the two-character strings '' and ``.
stop_words = set(stopwords.words('english')).union(set(punctuation)).union(set(["''", "``"]))

def get_text_keywords(text, n=10):
    """Return up to ``n`` of the most frequent keywords in ``text``, most common first.

    The text is lower-cased and split with ``word_tokenize``. Tokens of length
    one or less and tokens contained in ``stop_words`` are not counted.
    """
    frequencies = Counter(
        token
        for token in word_tokenize(text.lower())
        if len(token) > 1 and token not in stop_words
    )
    # most_common yields (token, count) pairs; only the tokens are returned.
    return [keyword for keyword, _ in frequencies.most_common(n)]

#get_text_keywords("Linear algebra is the branch of mathematics concerning vector spaces and linear mappings between such spaces. It includes the study of lines, planes, and subspaces, but is also concerned with properties common to all vector spaces.")

In [25]:
#Function to search wikipedia articles

import requests

def search_wikipedia(term, lang="en", timeout=60):
    """Run a full-text search on Wikipedia and return the matching page titles.

    Uses the MediaWiki ``list=search`` module
    (https://www.mediawiki.org/wiki/API:Search), e.g.
    https://en.wikipedia.org/w/api.php?action=query&utf8&list=search&srsearch=neural

    Args:
        term: Search expression, sent as ``srsearch``.
        lang: Wikipedia language subdomain used to build the host name.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        List of the ``title`` values of the search hits, in response order.
    """
    # The query string is built by requests so that ``term`` is percent-encoded.
    # Concatenating it by hand let characters such as '&', '#' or '+' in a
    # keyword change the meaning of the request.
    req_params = {
        'action': 'query',
        'utf8': '',        # flag parameter: sent without a value
        'list': 'search',
        'format': 'json',
        'srwhat': 'text',
        'srprop': '',      # sent empty, as in the hand-built URL
        'srlimit': '500',
        'srsearch': term,
    }

    wikipedia_api_url = "https://" + lang + ".wikipedia.org/w/api.php"

    response = requests.get(wikipedia_api_url, params=req_params, timeout=timeout)
    page_data = response.json()

    return [result['title'] for result in page_data['query']['search']]

#search_wikipedia("neural")

In [26]:
def test_indexes():
    """Print the integers 0..66, a blank separator, then the same list in slices of four.

    Scratch check of the ``range(0, len, batch_size)`` slicing used for batching.
    """
    chunk = 4
    numbers = list(range(67))

    print(numbers)
    print("\n")

    start = 0
    while start < len(numbers):
        print(numbers[start:start + chunk])
        start += chunk

#test_indexes()

In [28]:
# Short alias for the storage class from wikipydia.dict_storage.
DictStorage = dict_storage.DictStorage
# Storage named "wiki_links"; get_wikipedia_links() uses it as a local cache
# mapping a page title to the links downloaded for that page.
wiki_links = DictStorage("wiki_links")
from collections import defaultdict
from urllib.parse import quote

from math import ceil

def get_wikipedia_links(pages, batch_size=20):
    """Return the links of each requested page, downloading the ones not cached.

    Titles missing from ``wiki_links`` are fetched with
    ``query_wikipedia_links`` in slices of ``batch_size`` and stored in
    ``wiki_links``, which is then saved.

    Args:
        pages: Iterable of page titles. It is materialized once, so
            single-pass iterables such as generators are supported.
        batch_size: Maximum number of titles per ``query_wikipedia_links`` call.

    Returns:
        dict mapping each requested title that has an entry in ``wiki_links``
        to that entry. Titles still without an entry are reported with
        ``print`` and left out.
    """
    # ``pages`` is walked twice below; a generator would be empty the second time.
    pages = list(pages)

    # dict.fromkeys removes repeated titles while keeping first-seen order, so
    # the same title is never requested twice.
    pages_to_download = [page for page in dict.fromkeys(pages) if page not in wiki_links]

    # If some pages are not present locally, download and store their links.
    if pages_to_download:
        try:
            for start in range(0, len(pages_to_download), batch_size):
                batch = pages_to_download[start:start + batch_size]
                for title, links in query_wikipedia_links(batch).items():
                    wiki_links[title] = links
        finally:
            # Save even when a later batch raises, so finished batches are kept.
            wiki_links.save()

    links_to_return = dict()
    for page in pages:
        try:
            links_to_return[page] = wiki_links[page]
        except KeyError:
            print("Not found links for: " + page)

    return links_to_return
    
def query_wikipedia_links(pages, lang="en", timeout=60):
    """Collect all links of ``pages`` by following ``plcontinue`` tokens.

    Calls ``partial_query_wikipedia_links`` repeatedly, merging each partial
    result, until it returns a falsy continuation token.

    Args:
        pages: Page titles to query in one request series.
        lang: Wikipedia language subdomain, forwarded to every partial query.
        timeout: Request timeout in seconds, forwarded to every partial query.

    Returns:
        ``defaultdict(set)`` mapping a page title to the union of the link
        sets from all partial responses.
    """
    links = defaultdict(set)
    plcontinue = None

    while True:
        print("Querying plcontinue: " + str(plcontinue))

        # lang/timeout must be passed on explicitly; otherwise the partial
        # query silently falls back to its own defaults.
        partial_links, plcontinue = partial_query_wikipedia_links(
            pages, plcontinue, lang=lang, timeout=timeout
        )

        for title, link_set in partial_links.items():
            links[title].update(link_set)

        if not plcontinue:
            break

    return links
        

def partial_query_wikipedia_links(pages, plcontinue=None, lang="en", timeout=60):
    """Fetch one response of the MediaWiki ``prop=links`` query for ``pages``.

    Example request:
    https://en.wikipedia.org/w/api.php?action=query&titles=MQTT&prop=links&pllimit=500

    Args:
        pages: Page titles; joined with ``|`` into the ``titles`` parameter.
        plcontinue: Continuation token from a previous response, or a falsy
            value to start from the beginning.
        lang: Wikipedia language subdomain used to build the host name.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        Tuple ``(links, plcontinue_param)``. ``links`` is a
        ``defaultdict(set)`` mapping each page title reported in the response
        to the titles it links to; pages without a ``links`` entry are
        omitted. ``plcontinue_param`` is the next continuation token, or
        ``None`` when the response carries none.
    """
    # requests builds and percent-encodes the query string. The continuation
    # token embeds a page title, so appending it raw breaks on titles that
    # contain characters such as '&'.
    req_params = {
        'action': 'query',
        'utf8': '',          # flag parameter: sent without a value
        'prop': 'links',
        'plnamespace': '0',
        'format': 'json',
        'pllimit': '500',
        'titles': "|".join(pages),
    }

    if plcontinue:
        req_params['plcontinue'] = plcontinue

    wikipedia_api_url = "https://" + lang + ".wikipedia.org/w/api.php"

    page_data = requests.get(wikipedia_api_url, params=req_params, timeout=timeout).json()

    # A continuation token is present only when more results are pending.
    plcontinue_param = page_data.get('continue', {}).get('plcontinue')

    links = defaultdict(set)

    for data in page_data['query']['pages'].values():
        page_title = data["title"]
        if "links" not in data:
            continue
        for link in data['links']:
            links[page_title].add(link['title'])

    return links, plcontinue_param
        
    
def test_query_wikilinks1():
    
    test_query_links = get_wikipedia_links(['Deep learning',
      'Feature learning',
      'Artificial neural network',
      'Convolutional neural network',
      'Recurrent neural network',
      'Machine learning',
      'Autoencoder',
      'Hierarchical temporal memory',
      'Pattern recognition',
      'Sepp Hochreiter',
      'sofuasduasjd',
      "MQTT",
      "Linear algebra"])

    print("")
    for page, links in test_query_links.items():
        print(page, len(links))
        
def test_query_wikilinks2():
    print(wiki_links.items())
    
#test_query_wikilinks1()

Failed to open wiki_links.pickle. Created empty dict.


In [29]:
def get_sections_text(href):
    """Return the text of every non-blank section of the article at ``href``.

    The article comes from ``get_article``. A section is kept when its
    ``content`` is truthy and is not made up of whitespace only.
    """
    article = get_article(href)
    return [
        section.content
        for section in article.flatten_sections()
        if section.content and not section.content.isspace()
    ]

#len(get_sections_text("MQTT"))

In [30]:
def get_sections_articles_candidates(href):
    """Return, for each section of the article at ``href``, up to ten candidate article titles.

    Steps:
      1. get the text of every section (``get_sections_text``);
      2. extract keywords from each section (``get_text_keywords``);
      3. search Wikipedia with the space-joined keywords of each section and
         keep the first ten results.
    """
    # Keywords are computed for all sections before any search is issued.
    keywords_per_section = [
        get_text_keywords(section_text)
        for section_text in get_sections_text(href)
    ]

    return [
        search_wikipedia(" ".join(keywords))[:10]
        for keywords in keywords_per_section
    ]


In [41]:
from networkx import DiGraph

def get_graph_from_candidates(sec_candidates):
    """Build a directed graph whose nodes are all candidate titles and whose edges are links between them.

    Args:
        sec_candidates: Iterable of title lists, one per section.

    Returns:
        ``DiGraph`` with one node per distinct title and an edge
        ``page -> linked page`` for every link reported by
        ``get_wikipedia_links`` whose target is also a node. Self-links are
        skipped. The node count is printed before and after adding edges.
    """
    graph = DiGraph()

    # Every title from every section becomes a node.
    for title in (t for section_titles in sec_candidates for t in section_titles):
        graph.add_node(title)

    print(len(graph.nodes()))

    links_by_node = get_wikipedia_links(graph.nodes())

    for source, targets in links_by_node.items():
        for target in targets:
            # Skip self-links and targets that are not part of the graph.
            if source == target or not graph.has_node(target):
                continue
            graph.add_edge(source, target)

    print(len(graph.nodes()))

    return graph

In [32]:
# Candidate article titles for each section of "Artificial neural network".
# This runs one Wikipedia search per section and may fetch the article itself.
sec_candidates = get_sections_articles_candidates("Artificial neural network")

In [42]:
# Link graph over all candidate titles. The two numbers printed below are the
# node count before and after the edges are added.
graph = get_graph_from_candidates(sec_candidates)

123
123


In [45]:
def get_edges_from_lists(sources, targets, graph):
    """List the ``(source, target)`` pairs for which ``graph.has_edge(source, target)`` is true.

    Pairs are produced in nested order: every target is checked for the first
    source, then for the second, and so on.
    """
    return [
        (src, dst)
        for src in sources
        for dst in targets
        if graph.has_edge(src, dst)
    ]

In [53]:
# Edges that go from a candidate of section 1 to a candidate of section 0.
get_edges_from_lists(sec_candidates[1], sec_candidates[0], graph)

[('Artificial neural network', 'Biological neural network'),
 ('Artificial neural network', 'Types of artificial neural networks'),
 ('Artificial neural network', 'Convolutional neural network'),
 ('Artificial neural network', 'Connectionism'),
 ('Artificial neural network', 'Neuron'),
 ('Artificial neural network', 'Blue Brain Project')]

In [47]:
# Inspect the per-section candidate titles (the captured output below is cut off).
sec_candidates

[['Artificial neural network',
  'Biological neural network',
  'Artificial brain',
  'Types of artificial neural networks',
  'Nervous system network models',
  'Convolutional neural network',
  'Cultured neuronal network',
  'Connectionism',
  'Neuron',
  'Blue Brain Project'],
 ['Artificial neural network',
  'Optics',
  'Origin of language',
  'List of University of California, Berkeley alumni'],
 ['Artificial neural network',
  'Quantum neural network',
  'Recurrent neural network',
  'Deep learning',
  'Convolutional neural network',
  'Types of artificial neural networks',
  'Cellular neural network',
  'Computational neuroscience',
  'Nervous system network models',
  'List of datasets for machine learning research'],
 ['Deep learning',
  'Artificial neural network',
  'Artificial intelligence',
  'Neuropsychology',
  'Computational creativity',
  'History of artificial intelligence',
  'Psychology'],
 ['Artificial neural network',
  'Convolutional neural network',
  'Deep lear

In [55]:
# Nodes that 'Recurrent neural network' points to; edges were added from a
# page to each page it links to.
graph['Recurrent neural network']

{'Artificial neural network': {},
 'Backpropagation': {},
 'Bayesian network': {},
 'Biological neural network': {},
 'Boosting (machine learning)': {},
 'Convolutional neural network': {},
 'Deep learning': {},
 'Hidden Markov model': {},
 'Logistic regression': {},
 'Machine learning': {},
 'Markov chain': {},
 'Self-organizing map': {},
 'Speech recognition': {},
 'Spiking neural network': {},
 'Support vector machine': {},
 'Turing machine': {}}

In [43]:
# Inspect the per-section candidate titles again (same expression as cell In [47]).
sec_candidates

[['Artificial neural network',
  'Biological neural network',
  'Artificial brain',
  'Types of artificial neural networks',
  'Nervous system network models',
  'Convolutional neural network',
  'Cultured neuronal network',
  'Connectionism',
  'Neuron',
  'Blue Brain Project'],
 ['Artificial neural network',
  'Optics',
  'Origin of language',
  'List of University of California, Berkeley alumni'],
 ['Artificial neural network',
  'Quantum neural network',
  'Recurrent neural network',
  'Deep learning',
  'Convolutional neural network',
  'Types of artificial neural networks',
  'Cellular neural network',
  'Computational neuroscience',
  'Nervous system network models',
  'List of datasets for machine learning research'],
 ['Deep learning',
  'Artificial neural network',
  'Artificial intelligence',
  'Neuropsychology',
  'Computational creativity',
  'History of artificial intelligence',
  'Psychology'],
 ['Artificial neural network',
  'Convolutional neural network',
  'Deep lear

In [36]:
# Rank titles by how many other candidates link to them, highest first.
# dict(...) accepts both the plain dict returned by networkx 1.x and the
# degree view returned by networkx 2.x+, which has no .items() method.
sorted(dict(graph.in_degree()).items(), key=lambda a: a[1], reverse=True)

[('Artificial neural network', 53),
 ('Machine learning', 41),
 ('Artificial intelligence', 39),
 ('Speech recognition', 25),
 ('Neuron', 23),
 ('Deep learning', 21),
 ('Recurrent neural network', 21),
 ('Self-organizing map', 20),
 ('Logistic regression', 20),
 ('Cognitive science', 20),
 ('Algorithm', 20),
 ('Brain–computer interface', 19),
 ('Bayesian network', 19),
 ('Hidden Markov model', 18),
 ('Biological neural network', 18),
 ('Support vector machine', 17),
 ('Convolutional neural network', 16),
 ('Backpropagation', 16),
 ('Computational neuroscience', 16),
 ('Neuroprosthetics', 16),
 ('Blue Brain Project', 14),
 ('Artificial brain', 13),
 ('Functional magnetic resonance imaging', 13),
 ('Mind uploading', 13),
 ("Moore's law", 13),
 ('Connectionism', 13),
 ('Graphene', 13),
 ('Psychology', 13),
 ('Boosting (machine learning)', 13),
 ('Personal rapid transit', 12),
 ('Cognitive neuroscience', 12),
 ('Neuropsychology', 11),
 ('Cognitive architecture', 11),
 ('Pattern recognition

In [39]:
# NOTE(review): `cand` is not defined in any cell shown in this notebook;
# presumably it is left over from an earlier kernel session. Confirm, or define
# it before this cell so the notebook survives Restart & Run All.
cand_links = get_wikipedia_links(cand[1])

In [41]:
class FlowNode:
    """Placeholder node type: the constructor accepts ``content`` but does not store it."""

    def __init__(self, content):
        # Intentionally empty; ``content`` is currently ignored.
        return None

In [42]:
# Print every link whose target is itself one of the titles in cand[1].
# NOTE(review): `cand` is not defined in any cell shown in this notebook — confirm its origin.
for title, links in cand_links.items():
    for link in links:
        if link in cand[1]:
            print(title, "->", link)

Artificial intelligence -> Convolutional neural network
Artificial intelligence -> Deep learning
Artificial intelligence -> Sepp Hochreiter
Convolutional neural network -> Feature learning
Convolutional neural network -> Deep learning
Sepp Hochreiter -> Deep learning
Feature learning -> Convolutional neural network
Feature learning -> Deep learning
Deep learning -> Feature learning
Deep learning -> Convolutional neural network
Deep learning -> Artificial intelligence
Deep learning -> Sepp Hochreiter


In [43]:
# Inspect the first candidate list.
# NOTE(review): `cand` is not defined in any cell shown in this notebook — confirm its origin.
cand[0]

['Deep learning',
 'Feature learning',
 'Artificial neural network',
 'Convolutional neural network',
 'Recurrent neural network',
 'Machine learning',
 'Pattern recognition',
 'Autoencoder',
 'Hierarchical temporal memory',
 'End-to-end reinforcement learning']