# Experiment #36 - Represent Text By Wiki Articles (Greedy)
We aim here to represent a piece of text as a set of wikipedia articles. We may use articles links as priors to help create these representations. This approach is greedy, that is, it will take the articles that use the most number of words of ngrams.

## Results:
There is no need for permutations of disamb pages, because they will not link anything to anything. We can choose them based on individuals, like the one that got more links etc.

## List of heuristics used
*Removed puntuactions such "!.-" from the text since wikipedia don't often use these in the titles

In [1]:
from wikipydia import wikidb, url

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.util import ngrams as gen_ngrams

from string import punctuation

from learndata import LearnContents

import networkx as nx

from ThreadPool import ThreadPool

import urllib

#Load learn contents
ld = LearnContents()

#Load wikipedia local db
wdb = wikidb.WikiDb()

stopwords = list(punctuation)

### Draft area

In [2]:
def clear_bad_chars(text):
    """Ensure bad characters (such as '–' that is not a common '-') be removed."""
    return text.encode("ascii", "ignore").decode("ascii")

In [3]:
def get_bilateral_ngrams(text):
    tokens = [word for word in word_tokenize(clear_bad_chars(text.strip())) if word not in stopwords]
    for ngram_tuple in gen_ngrams(tokens, len(tokens)-1):
        yield " ".join(ngram_tuple)

In [4]:
def get_wikipedia_page_url(title):
    return urllib.parse.quote(title.replace(" ", "_"))

def get_graph(text, graph=None):
    if graph == None:
        graph = nx.DiGraph()
    
    #Add node to the graph
    #graph.add_node(text)
    
    #Try download article for this text
    try:
        art, _ = wdb.get_article_by_href(url.UnquotedURL(text))
        #If suceeed, add article to the node property
        graph.add_node(get_wikipedia_page_url(art.title()), {
            "article": art
        }) 
    except:
        #If no article has been found, we split the text into ngrams,
        #add it as edges and recurse the function the new nodes passing the current graph instance
        for ngram in get_bilateral_ngrams(text):
            print(ngram)
            graph.add_edge(text, ngram)
            get_graph(ngram, graph)
        
    return graph
    
graph = get_graph(ld[0].title)    
graph.nodes()

Data Visualization with
Data Visualization
Visualization with
Visualization
with
Visualization with d3.js
Visualization with
Visualization
with
with d3.js
with
d3.js


['with',
 'Data Visualization with d3.js',
 'Visualization with',
 'd3.js',
 'Data_visualization',
 'Visualization with d3.js',
 'with d3.js',
 'With',
 'Data Visualization with',
 'D3.js',
 'Data Visualization',
 'Visualization']

In [5]:
def get_graph_art_pagerank(graph):
    pr = nx.pagerank(graph)
    
    sum_value = 0
    for node in nx.get_node_attributes(graph, "article").keys():
        print(node)
        sum_value += pr[node]

    return sum_value

get_graph_art_pagerank(graph)

Data_visualization
With
Visualization
D3.js


0.2667733465397895

In [6]:
sum_value = 0
for node in nx.get_node_attributes(graph, "article").keys():
    sum_value += pr[node]

NameError: name 'pr' is not defined

### 1. Generate hierarquical ngrams for the text

In [7]:
def get_ngrams(text, ngram_range=()):
    """Tokenize text and them return its ngrams."""
    
    tokens = [word for word in word_tokenize(clear_bad_chars(text.strip())) if word not in stopwords]
    
    if len(ngram_range) == 0:
        ngram_range = (1, len(tokens))
    
    ngrams_list = list()
    for ngram_size in range(ngram_range[0], ngram_range[1]+1):
        for ngram_tuple in gen_ngrams(tokens, ngram_size):
            ngram_text = " ".join(ngram_tuple)
            ngrams_list.append(ngram_text)

    return ngrams_list

In [8]:
def get_hierarquical_ngrams(text):
    n_tokens = len(word_tokenize(text))
    if n_tokens == 1:
        return {text: {}}
    
    ngrams = dict() 
        
    for ngram in get_ngrams(text, (n_tokens-1, n_tokens-1)):
        ngrams.update(get_hierarquical_ngrams(ngram))

    return {text: ngrams}

In [9]:
text_ngrams = get_hierarquical_ngrams(ld[0].title)

In [10]:
text_ngrams

{'Data Visualization with d3.js': {'Data Visualization with': {'Data Visualization': {'Data': {},
    'Visualization': {}},
   'Visualization with': {'Visualization': {}, 'with': {}}},
  'Visualization with d3.js': {'Visualization with': {'Visualization': {},
    'with': {}},
   'with d3.js': {'d3.js': {}, 'with': {}}}}}

### 2. Get ngrams set

In [11]:
def get_ngrams_list(h_ngrams):
    ngrams = list()
    for ngram in h_ngrams.keys():
        ngrams.append(ngram)
        ngrams += get_ngrams_list(h_ngrams[ngram])
        
    return ngrams

def get_grams_set(h_ngrams):
    return set(get_ngrams_list(h_ngrams))

ngrams_set = get_grams_set(text_ngrams)

### 3. Try to get a wikipedia article for each ngram

In [12]:
def get_wikipedia_page_url(title):
    return urllib.parse.quote(title.replace(" ", "_"))
#get_wikipedia_page_url("lucas vieira c++")

In [13]:
from collections import defaultdict

#Populate wikiarticles before with existing ngrams
wiki_articles = dict([(ngram, ngram) for ngram in ngrams_set])


def get_wiki_article(href):
    try:
        art, _ = wdb.get_article_by_href(url.UnquotedURL(href))
        art._title = get_wikipedia_page_url(art.title())
        wiki_articles[href] = art
    except Exception as e:
        print(e)
    

pool = ThreadPool(10)
%time pool.map(get_wiki_article, ngrams_set)
%time pool.wait_completion()
del pool
wdb.save()
wiki_articles

Wall time: 0 ns
Requested page 'Data Visualization with d3.js (en)' does not exists.
Requested page 'with d3.js (en)' does not exists.
Requested page 'Visualization with d3.js (en)' does not exists.
Requested page 'Visualization with (en)' does not exists.
Requested page 'Data Visualization with (en)' does not exists.
Wall time: 820 ms


{'Data': Data,
 'Data Visualization': Data_visualization,
 'Data Visualization with': 'Data Visualization with',
 'Data Visualization with d3.js': 'Data Visualization with d3.js',
 'Visualization': Visualization,
 'Visualization with': 'Visualization with',
 'Visualization with d3.js': 'Visualization with d3.js',
 'd3.js': D3.js,
 'with': With,
 'with d3.js': 'with d3.js'}

### 5. Create Graph

In [15]:
graph = nx.DiGraph()

#Add all nodes
for node in wiki_articles.values():
    is_article = not isinstance(node, str)
    if is_article:
        graph.add_node(str(node), {"article": node})
    else:
        graph.add_node(node)

sorted(graph.nodes())

['D3.js',
 'Data',
 'Data Visualization with',
 'Data Visualization with d3.js',
 'Data_visualization',
 'Visualization',
 'Visualization with',
 'Visualization with d3.js',
 'With',
 'with d3.js']

In [16]:
#Add ngram edges
#The greedy component is added here since we do not verify 
#deep levels of the ngrams if some upper level match an article

def add_ngram_edges(hngrams):
    for node_name in hngrams.keys():
        not_article = isinstance(wiki_articles[node_name], str)
        
        if not_article:
            #We acctually do not need the str function, just use for readability
            n1_name = str(wiki_articles[node_name])  
            for child_name in hngrams[node_name].keys():
                n2_name = str(wiki_articles[child_name])
                graph.add_edge(n1_name, n2_name)
            add_ngram_edges(hngrams[node_name])
            
add_ngram_edges(text_ngrams)

In [22]:
def get_largest_list(lists):
    largest_list = list()
    for l in lists:
        if len(l) > len(largest_list):
            largest_list = l
    return largest_list

#Replace graph with the one with most connected components
most_conn_comps = get_largest_list(list(nx.weakly_connected_components(graph)))
graph = graph.subgraph(most_conn_comps)

In [42]:
#Get the target links from the articles in the graph
from collections import Counter

links_counter = Counter()

for art in nx.get_node_attributes(graph, "article").values():
    if not art._is_disamb_page:
        for link_href, _ in art.links():
            links_counter[link_href] += 1

In [43]:
#Get all disamb links
from collections import defaultdict

disamb_links = defaultdict(list)
for node_name, node_article in nx.get_node_attributes(graph, "article").items():
    if node_article._is_disamb_page:
        for link_href, _ in node_article.links():
            disamb_links[node_name].append(link_href) 

disamb_links

defaultdict(list,
            {'Visualization': ['Mental_image',
              'Creative_visualization',
              'Motor_imagery',
              'Flow_visualization',
              'Geovisualization',
              'Illustration',
              'Information_graphics',
              'Data_visualization',
              'Information_visualization',
              'Interactive_visualization',
              'Music_visualization',
              'Scientific_visualization',
              'Security_visualisation',
              'Software_visualization',
              'Visualization_(computer_graphics)',
              'Visulation',
              'Guided_imagery',
              'List_of_graphical_methods',
              'Image',
              'Mental_image',
              'Previsualization',
              'Spatial_visualization_ability',
              'Visual_communication',
              'Visual_perception',
              'Visual_rhetoric',
              'Visual_system',
              'Visua

In [47]:
#Resolve disambiguation links
import random

resolved_disamb = dict()

for node_label, disamb_candidates in disamb_links.items():
    #Get the sorted list of links to members of the disamb candidates
    candidates_links = sorted(list(map(lambda a: (a, links_counter[a]), disamb_candidates)), key=lambda a: a[1], reverse=True)
    
    high_value = candidates_links[0][1]
    
    threshold_candidates = list()
    for cand_label, cand_links in candidates_links:
        if cand_links == high_value:
            threshold_candidates.append(cand_label)
            
    #Choose randomly between the high value candidates
    resolved_disamb[node_label] = random.choice(threshold_candidates)
    

In [52]:
get_wiki_article('Information_graphics')

In [54]:
wiki_articles['Information_graphics'].links()

[['Washington_Metro', 'Washington Metro'],
 ['Clipped_compound', 'clipped compound'],
 ['Information', 'information'],
 ['Graphics', 'graphics'],
 ['Data', 'data'],
 ['Knowledge', 'knowledge'],
 ['Information_visualization', 'information visualization'],
 ['Data_visualization', 'data visualization'],
 ['Statistical_graphics', 'statistical graphics'],
 ['Information_design', 'information design'],
 ['Information_architecture', 'information architecture'],
 ['Mass_communication', 'mass communication'],
 ['Isotype_(picture_language)', 'Isotypes'],
 ['Facebook', 'Facebook'],
 ['Twitter', 'Twitter'],
 ['Attention_span', 'attention span'],
 ['David_Macaulay', 'David Macaulay'],
 ['The_Way_Things_Work', 'The Way Things Work'],
 ['USA_Today', 'USA Today'],
 ['Washington_Metro', 'Washington Metro'],
 ['London_Underground', 'London Underground'],
 ['Edward_Tufte', 'Edward Tufte'],
 ['Christoph_Scheiner', 'Christoph Scheiner'],
 ['Illustration', 'illustrations'],
 ['William_Playfair', 'William Pl

In [51]:
resolved_disamb

{'Visualization': 'Information_graphics', 'With': 'WITH_(FM)'}

In [49]:
graph.nodes()

['With',
 'Data Visualization with d3.js',
 'Visualization with',
 'Data Visualization with',
 'D3.js',
 'Data_visualization',
 'Visualization with d3.js',
 'with d3.js',
 'Visualization']

In [None]:
def get_graph(text, graph=None):
    if graph == None:
        graph = nx.DiGraph()
    
    #Add node to the graph
    #graph.add_node(text)
    
    #Try download article for this text
    try:
        art, _ = wdb.get_article_by_href(url.UnquotedURL(text))
        #If suceeed, add article to the node property
        graph.add_node(get_wikipedia_page_url(art.title()), {
            "article": art
        }) 
    except:
        #If no article has been found, we split the text into ngrams,
        #add it as edges and recurse the function the new nodes passing the current graph instance
        for ngram in get_bilateral_ngrams(text):
            print(ngram)
            graph.add_edge(text, ngram)
            get_graph(ngram, graph)
        
    return graph

### 4. Get edges from the hierarquical ngrams

In [9]:
def hierarq_ngrams_to_edges(hierarq_ngrams):
    edges = list()
    for n1, n1_child in hierarq_ngrams.items():
        for n2 in n1_child:
            edges.append((n1, n2))

        edges += hierarq_ngrams_to_edges(n1_child)

    return edges

In [10]:
#convert to set to ensure only one edge of it kind is present
ngrams_edges = list(set(hierarq_ngrams_to_edges(text_ngrams))) 

In [11]:
ngrams_edges

[('Data Visualization with', 'Data Visualization'),
 ('Data Visualization', 'Data'),
 ('Data Visualization', 'Visualization'),
 ('Visualization with d3.js', 'with d3.js'),
 ('Visualization with d3.js', 'Visualization with'),
 ('Visualization with', 'Visualization'),
 ('Data Visualization with d3.js', 'Visualization with d3.js'),
 ('with d3.js', 'd3.js'),
 ('Data Visualization with d3.js', 'Data Visualization with'),
 ('Data Visualization with', 'Visualization with'),
 ('Visualization with', 'with'),
 ('with d3.js', 'with')]

### 5. Expand edges to wikiarticles candidates for each ngram

In [12]:
expanded_edges = list()

for n1, n2 in ngrams_edges:
    for cand1 in wiki_articles[n1]:
        for cand2 in wiki_articles[n2]:
            if str(cand1) != str(cand2): #Avoid circular ref
                expanded_edges.append((str(cand1),str(cand2)))

In [13]:
expanded_edges

[('Data Visualization with', 'Data_visualization'),
 ('Data_visualization', 'Data'),
 ('Data_visualization', 'Mental_image'),
 ('Data_visualization', 'Creative_visualization'),
 ('Data_visualization', 'Motor_imagery'),
 ('Data_visualization', 'Flow_visualization'),
 ('Data_visualization', 'Geovisualization'),
 ('Data_visualization', 'Illustration'),
 ('Data_visualization', 'Information_graphics'),
 ('Data_visualization', 'Information_visualization'),
 ('Data_visualization', 'Interactive_visualization'),
 ('Data_visualization', 'Music_visualization'),
 ('Data_visualization', 'Scientific_visualization'),
 ('Data_visualization', 'Security_visualisation'),
 ('Data_visualization', 'Software_visualization'),
 ('Data_visualization', 'Visualization_(computer_graphics)'),
 ('Data_visualization', 'Visulation'),
 ('Data_visualization', 'Guided_imagery'),
 ('Data_visualization', 'List_of_graphical_methods'),
 ('Data_visualization', 'Image'),
 ('Data_visualization', 'Mental_image'),
 ('Data_visuali

### 6. Get list of nodes allowed to be at the graph
Those are the articles candidates and the disambiguation links

In [14]:
allowed_nodes = set()

for n1, n2 in expanded_edges:
    allowed_nodes.add(n1)
    allowed_nodes.add(n2)

allowed_nodes

{'Carl_Johannes_With',
 'Creative_visualization',
 'D3.js',
 'Data',
 'Data Visualization with',
 'Data Visualization with d3.js',
 'Data_visualization',
 'Flow_visualization',
 'Geovisualization',
 'Guided_imagery',
 'Illustration',
 'Image',
 'Information_graphics',
 'Information_visualization',
 'Interactive_visualization',
 'List_of_graphical_methods',
 'Mental_image',
 'Motor_imagery',
 'Music_visualization',
 'Previsualization',
 'Scientific_visualization',
 'Security_visualisation',
 'Software_visualization',
 'Spatial_visualization_ability',
 'Visual_communication',
 'Visual_perception',
 'Visual_rhetoric',
 'Visual_system',
 'Visual_thinking',
 'Visualization with',
 'Visualization with d3.js',
 'Visualization_(computer_graphics)',
 'Visulation',
 'WITH_(FM)',
 'WRBS_(AM)',
 'WZFT',
 'With_(album)',
 'With_(character)',
 'With_(novel)',
 'with d3.js'}

### 7. Complement expanded edges with the articles links

In [15]:
art_links = list()
for art_candidates in wiki_articles.values():
    for art in art_candidates:
        try: #Use try block because if the art is not wikiarticle it will throw exception
            for link_href, _ in art.links():
                if link_href in allowed_nodes:
                   art_links.append((str(art), str(link_href)))
        except:
            pass
        
art_links

[('Data_visualization', 'Visual_communication'),
 ('Data_visualization', 'Visual_system'),
 ('Data_visualization', 'Data'),
 ('Data_visualization', 'Information_graphics'),
 ('Data_visualization', 'Information_visualization'),
 ('Data_visualization', 'Scientific_visualization'),
 ('Data_visualization', 'D3.js'),
 ('Data_visualization', 'Information_graphics'),
 ('Data_visualization', 'Data'),
 ('Data_visualization', 'Information_visualization'),
 ('D3.js', 'Data_visualization'),
 ('Data', 'Data_visualization')]

In [16]:
full_edges = list(set(art_links + expanded_edges))

full_edges

[('Visualization with', 'Data_visualization'),
 ('Visualization with', 'Software_visualization'),
 ('Data_visualization', 'D3.js'),
 ('D3.js', 'Data_visualization'),
 ('Data_visualization', 'Flow_visualization'),
 ('Data_visualization', 'Interactive_visualization'),
 ('Data_visualization', 'Spatial_visualization_ability'),
 ('Visualization with', 'Visual_system'),
 ('Visualization with', 'WZFT'),
 ('Visualization with d3.js', 'with d3.js'),
 ('Visualization with', 'Previsualization'),
 ('Visualization with', 'Music_visualization'),
 ('Visualization with', 'Visualization_(computer_graphics)'),
 ('Visualization with', 'With_(album)'),
 ('Data_visualization', 'Visulation'),
 ('Data_visualization', 'Visual_communication'),
 ('Visualization with', 'With_(character)'),
 ('with d3.js', 'With_(novel)'),
 ('Visualization with', 'Creative_visualization'),
 ('Data_visualization', 'Security_visualisation'),
 ('Data_visualization', 'Guided_imagery'),
 ('Visualization with', 'Mental_image'),
 ('Data

### 8. Create graph with the edges

In [17]:
graph = nx.DiGraph()
for edge in full_edges:
   graph.add_edge(*edge) 

graph.edges()

[('Data Visualization with', 'Data_visualization'),
 ('Data Visualization with', 'Visualization with'),
 ('Visualization with', 'With_(novel)'),
 ('Visualization with', 'Geovisualization'),
 ('Visualization with', 'Spatial_visualization_ability'),
 ('Visualization with', 'Motor_imagery'),
 ('Visualization with', 'Software_visualization'),
 ('Visualization with', 'WITH_(FM)'),
 ('Visualization with', 'Scientific_visualization'),
 ('Visualization with', 'Music_visualization'),
 ('Visualization with', 'Visual_perception'),
 ('Visualization with', 'Creative_visualization'),
 ('Visualization with', 'Visual_rhetoric'),
 ('Visualization with', 'With_(character)'),
 ('Visualization with', 'Previsualization'),
 ('Visualization with', 'Information_graphics'),
 ('Visualization with', 'Information_visualization'),
 ('Visualization with', 'Security_visualisation'),
 ('Visualization with', 'WZFT'),
 ('Visualization with', 'Flow_visualization'),
 ('Visualization with', 'Interactive_visualization'),
 

### 9. Generate combinations of articles presences

In [None]:
keep working here

we need to generate combinations of existing articles taking into account the hierarquical diagram

and everything that comes before the articles must remain for discount on pagerank

for debug we must print every pagerank for every combination to ensure the algorithm is working properly

In [29]:
#nx.bfs_predecessors(graph, source='Data Visualization with d3.js')

In [None]:
past_nodes = set()
nodes_combinations = list()

def iterate_thru_hngrams(hngrams):
    for ngram, ngram_childs in hngrams.items():
        ngram_candidate = wiki_articles[ngram]
        #Check if the ngram is a valid article
        if len(ngram_candidate)


In [None]:
def hierarq_ngrams_to_edges22(hierarq_ngrams):
    edges = list()
    for n1, n1_child in hierarq_ngrams.items():
        for n2 in n1_child:
            edges.append((n1, n2))

        edges += hierarq_ngrams_to_edges(n1_child)

    return edges

In [38]:
succrs = nx.bfs_successors(graph, 'Data Visualization with d3.js')

In [54]:
text_ngrams

{'Data Visualization with d3.js': {'Data Visualization with': {'Data Visualization': {'Data': {},
    'Visualization': {}},
   'Visualization with': {'Visualization': {}, 'with': {}}},
  'Visualization with d3.js': {'Visualization with': {'Visualization': {},
    'with': {}},
   'with d3.js': {'d3.js': {}, 'with': {}}}}}

In [55]:
wiki_articles['Data Visualization with d3.js']

['Data Visualization with d3.js']

In [58]:
for ngram in text_ngrams['Data Visualization with d3.js'].keys():
    
    

['Data Visualization with', 'Visualization with d3.js']

In [36]:
text_ngrams

{'Data Visualization with d3.js': {'Data Visualization with': {'Data Visualization': {'Data': {},
    'Visualization': {}},
   'Visualization with': {'Visualization': {}, 'with': {}}},
  'Visualization with d3.js': {'Visualization with': {'Visualization': {},
    'with': {}},
   'with d3.js': {'d3.js': {}, 'with': {}}}}}

In [35]:
true_articles = list()
for candidates in wiki_articles.values():
    if len(candidates) > 1:
        true_articles += candidates
    elif type(candidates[0]) != "str":
        true_articles.append(candidates[0])

true_articles

['Carl_Johannes_With',
 'With_(character)',
 'With_(novel)',
 'With_(album)',
 'WITH_(FM)',
 'WRBS_(AM)',
 'WZFT',
 'Visualization with d3.js',
 Data_visualization,
 'Data Visualization with',
 'Visualization with',
 D3.js,
 'with d3.js',
 'Mental_image',
 'Creative_visualization',
 'Motor_imagery',
 'Flow_visualization',
 'Geovisualization',
 'Illustration',
 'Information_graphics',
 'Data_visualization',
 'Information_visualization',
 'Interactive_visualization',
 'Music_visualization',
 'Scientific_visualization',
 'Security_visualisation',
 'Software_visualization',
 'Visualization_(computer_graphics)',
 'Visulation',
 'Guided_imagery',
 'List_of_graphical_methods',
 'Image',
 'Mental_image',
 'Previsualization',
 'Spatial_visualization_ability',
 'Visual_communication',
 'Visual_perception',
 'Visual_rhetoric',
 'Visual_system',
 'Visual_thinking',
 'Data Visualization with d3.js',
 Data]

### 6. Construct graph of candidate articles and links

In [52]:
#Create graph
digraph = nx.DiGraph()

for candidate in wiki_articles.values():
    if candidate._is_disamb_page:
        continue
    
    cand_url = get_wikipedia_page_url(candidate.title())
    for link_href, link_text in candidate.links():
        if link_href in allowed_nodes:
            digraph.add_edge(cand_url, link_href)

### 7. Choose eligible nodes based on a set of weakly connected nodes

In [83]:
list(nx.weakly_connected_components(digraph))

[{'Consumer', 'Ecology', 'Environmentalism', 'Good', 'Ontology', 'Utility'},
 {'1',
  '2',
  '3',
  '4',
  '5',
  '6',
  'Andromeda_(constellation)',
  'Animation',
  'Black',
  'Black_magic',
  'Book_of_Genesis',
  'Built_environment',
  'Cascading_Style_Sheets',
  'Causality',
  'D3.js',
  'Data',
  'Data_visualization',
  'Dice',
  'Document_Object_Model',
  'Environmental_determinism',
  'Epidemiology',
  'Geography',
  'Histogram',
  'Information_graphics',
  'Information_visualization',
  'Interaction',
  'Kabbalah',
  'Magic_(paranormal)',
  'Natural_environment',
  'Old_Norse',
  'Page_layout',
  'Professional_wrestling',
  'Relative_direction',
  'Scalable_Vector_Graphics',
  'Scientific_visualization',
  'Social_media',
  'Time',
  'Up_(2009_film)',
  'Visual_communication',
  'Visual_system',
  'Web_browser_engine'},
 {'A',
  'English_articles',
  'Icelandic_language',
  'The',
  'Upper_Peninsula_of_Michigan',
  'You'},
 {'Design', 'Environment_(systems)', 'Thought'},
 {'Cop

In [28]:
eligible_nodes = set()
for weak_con_set in nx.weakly_connected_components(graph):
    if len(weak_con_set) > len(eligible_nodes):
        eligible_nodes = weak_con_set

In [29]:
eligible_nodes

{'1',
 '2',
 '3',
 '4',
 '5',
 '6',
 'Andromeda_(constellation)',
 'Animation',
 'Black',
 'Black_magic',
 'Book_of_Genesis',
 'Built_environment',
 'Cascading_Style_Sheets',
 'Causality',
 'D3.js',
 'Data',
 'Data_visualization',
 'Dice',
 'Document_Object_Model',
 'Environmental_determinism',
 'Epidemiology',
 'Geography',
 'Histogram',
 'Information_graphics',
 'Information_visualization',
 'Interaction',
 'Kabbalah',
 'Magic_(paranormal)',
 'Natural_environment',
 'Old_Norse',
 'Page_layout',
 'Professional_wrestling',
 'Relative_direction',
 'Scalable_Vector_Graphics',
 'Scientific_visualization',
 'Social_media',
 'Time',
 'Up_(2009_film)',
 'Visual_communication',
 'Visual_system',
 'Web_browser_engine'}

### 8. Resolve disambiguation pages

In [31]:
from collections import defaultdict
disamb_resolve_dict = defaultdict(set)

for article in wiki_articles.values():
    if not article._is_disamb_page:
        continue
    
    for link_href, _ in article.links():
        if link_href in eligible_nodes:
            disamb_resolve_dict[article].add(link_href)
    
disamb_resolve_dict    

defaultdict(set,
            {Up: {'Relative_direction', 'Up_(2009_film)'},
             Some: {'Social_media'},
             Chapter Four: {'Book_of_Genesis'},
             Why: {'Causality'},
             What: {'Professional_wrestling'},
             Magic: {'Magic_(paranormal)'},
             And: {'Andromeda_(constellation)'},
             Environment: {'Built_environment',
              'Environmental_determinism',
              'Epidemiology',
              'Natural_environment'},
             Receiving: {'Kabbalah'},
             Dom: {'Document_Object_Model'},
             On: {'Old_Norse'},
             D3: {'D3.js', 'Dice'},
             Layout: {'Page_layout', 'Web_browser_engine'},
             Visualization: {'Data_visualization',
              'Information_graphics',
              'Information_visualization',
              'Scientific_visualization',
              'Visual_communication',
              'Visual_system'}})

In [45]:
for cand_text, cand_disamb in disamb_resolve_dict.items():
    if len(cand_disamb) > 1:
        print(cand_text)
        for c in cand_disamb:
            print(c, 
                  nx.in_degree_centrality(graph)[c], " - ",
                  nx.out_degree_centrality(graph)[c], " - ",
                  nx.pagerank(graph)[c])
        print(" ")

Up
Up_(2009_film) 0.017543859649122806  -  0.0  -  0.025613303680563203
Relative_direction 0.017543859649122806  -  0.0  -  0.01585067761213782
 
Environment
Natural_environment 0.017543859649122806  -  0.0  -  0.01336297155823371
Built_environment 0.017543859649122806  -  0.0  -  0.01336297155823371
Epidemiology 0.017543859649122806  -  0.0  -  0.01336297155823371
Environmental_determinism 0.017543859649122806  -  0.0  -  0.01336297155823371
 
D3
Dice 0.017543859649122806  -  0.0  -  0.015612011751413492
D3.js 0.017543859649122806  -  0.07017543859649122  -  0.014165257629840325
 
Layout
Page_layout 0.017543859649122806  -  0.0  -  0.015608935551758681
Web_browser_engine 0.017543859649122806  -  0.0  -  0.015608935551758681
 
Visualization
Visual_communication 0.017543859649122806  -  0.0  -  0.014165257629840325
Information_visualization 0.017543859649122806  -  0.0  -  0.014165257629840325
Information_graphics 0.017543859649122806  -  0.0  -  0.014165257629840325
Scientific_visualiz

In [92]:
digraph.edges('Dice')

[]

In [84]:
digraph.edges()

[('Utility', 'Consumer'),
 ('Black', 'Interaction'),
 ('Black', 'Black'),
 ('Black', 'Black_magic'),
 ('6', '5'),
 ('6', 'Dice'),
 ('6', 'Book_of_Genesis'),
 ('Scalable_Vector_Graphics', 'Animation'),
 ('Scalable_Vector_Graphics', 'Document_Object_Model'),
 ('Scalable_Vector_Graphics', 'Cascading_Style_Sheets'),
 ('Geography', 'Natural_environment'),
 ('Geography', 'Interaction'),
 ('Geography', 'Built_environment'),
 ('Geography', 'Environmental_determinism'),
 ('Geography', 'Epidemiology'),
 ('3', 'Professional_wrestling'),
 ('3', '4'),
 ('3', '2'),
 ('3', 'Kabbalah'),
 ('Design', 'Thought'),
 ('Design', 'Environment_(systems)'),
 ('4', '3'),
 ('4', '5'),
 ('4', 'Time'),
 ('4', 'Book_of_Genesis'),
 ('A', 'English_articles'),
 ('Data_visualization', 'Scientific_visualization'),
 ('Data_visualization', 'D3.js'),
 ('Data_visualization', 'Visual_communication'),
 ('Data_visualization', 'Information_visualization'),
 ('Data_visualization', 'Data'),
 ('Data_visualization', 'Information_gra

In [None]:
keep working on handling desambiguation pages
then elect nodes that will be used
and elect nodes for each learn content part

In [None]:
graph.edges()

In [25]:
lc_contents = unroll_lc_contents(ld[0])
candidates = list()
for i, lc in enumerate(lc_contents):
    print("Working on {}/{}".format(i+1, len(lc_contents)))
    candidates += get_text_article_candidates(lc.title)
    
wdb.save()

Working on 1/29
Working on 2/29
Working on 3/29
Working on 4/29
Working on 5/29
Working on 6/29
Working on 7/29
Working on 8/29
Working on 9/29
Working on 10/29
Working on 11/29
Working on 12/29
Working on 13/29
Working on 14/29
Working on 15/29
Working on 16/29
Working on 17/29
Working on 18/29
Working on 19/29
Working on 20/29
Working on 21/29
Working on 22/29
Working on 23/29
Working on 24/29
Working on 25/29
Working on 26/29
Working on 27/29
Working on 28/29
Working on 29/29


In [60]:
#candidates

In [48]:
#Generate graph
graph = nx.DiGraph()

candidates_urls = [get_wikipedia_page_url(cand.title()) for cand in candidates]

for cand_url, candidate in zip(candidates_urls, candidates):
    for link_href, link_text in candidate.links():
        if link_href in candidates_urls:
            graph.add_edge(cand_url, link_href)

In [61]:
print_lc_content(ld[0])

Data Visualization with d3.js
	 Chapter 1: Getting Started with d3.js
		 What is d3.js?
		 Setting up a play environment
		 A simple histogram
		 Summary
	 Chapter 2: A Primer on DOM, SVG, and CSS
		 DOM
		 SVG
		 CSS
		 Summary
	 Chapter 3: Making Data Useful
		 Thinking about data functionally
		 Loading data
		 Scales
		 Time
		 Geography
		 Summary
	 Chapter 4: Making Things Move
		 Animating with transitions
		 Interacting with the user
		 Summary
	 Chapter 5: Layouts – d3's Black Magic
		 What are layouts and why should you care
		 Summary
	 Chapter 6: Designing Good Visualizations
		 What is a visualization?
		 Some great examples
		 Summary


In [85]:
#sorted(candidates[4].links())

In [86]:
#graph.edges()

In [90]:
list(nx.weakly_connected_components(graph))

[{'1',
  '2',
  '3',
  '4',
  '5',
  '6',
  'Animation',
  'Cascading_Style_Sheets',
  'D3',
  'D3.js',
  'Data',
  'Data_visualization',
  'Histogram',
  'Scalable_Vector_Graphics',
  'Time',
  'Visualization'},
 {'Black', 'Black_magic', 'Geography', 'Interaction'},
 {'The', 'You'},
 {'Design', 'Thought'},
 {'Good', 'Utility'}]

In [77]:
for node_name in list(nx.weakly_connected_components(graph))[0]:
    print(node_name, len(graph.in_edges(node_name)))

D3 0
4 2
Visualization 0
6 1
Time 1
5 2
1 1
Histogram 1
2 1
D3.js 2
Animation 1
Data 2
3 3
Cascading_Style_Sheets 2
Data_visualization 3
Scalable_Vector_Graphics 2


In [87]:
#graph['D3.js']

In [88]:
#graph.edges()

In [47]:
#sorted(nx.in_degree_centrality(graph).items(), key=lambda a:a[1], reverse=True)

In [112]:
import requests

def _download_page_data(page, lang, timeout):
    """Function to retrieve a wikipedia page in html form, with its sections"""
    #import exceptions

    #assert isinstance(page, URL)

    # https://en.wikipedia.org/w/api.php?action=parse&redirects&page=fluid_mechanics

    req_params = [
        'action=parse',
        'redirects',
        'format=json',
        'prop=text|displaytitle|categories',
        'page=' + page
    ]

    wikipedia_api_url = "https://" + lang + ".wikipedia.org/w/api.php?" + "&".join(req_params)

    try:
        page_data = requests.get(wikipedia_api_url, timeout=timeout).json()
    except requests.exceptions.ConnectTimeout:
        raise exceptions.PageRequestTimeout(page, lang, timeout)

    #If the object parse is not in the json object, page does not exists
    if not 'parse' in page_data:
        raise exceptions.PageDoesNotExists(page, lang)

    page_title = page_data['parse']['title']
    page_id = page_data['parse']['pageid']
    page_html = page_data['parse']['text']['*']

    #return page, page_title, page_id, page_html
    return page_data

In [129]:
a = _download_page_data("mqtt", "en", 60)['parse']['categories']

In [130]:
a

[{'*': 'CS1_maint:_Multiple_names:_authors_list', 'hidden': '', 'sortkey': ''},
 {'*': 'Application_layer_protocols', 'sortkey': ''},
 {'*': 'Data_transmission', 'sortkey': ''},
 {'*': 'IBM_WebSphere', 'sortkey': 'MQ'},
 {'*': 'Message-oriented_middleware', 'sortkey': ''},
 {'*': 'Network_protocols', 'sortkey': ''},
 {'*': 'Telemetry', 'sortkey': ''}]

In [131]:
def is_disambiguation_page(categories):
    for cat in categories:
        if cat['*'] == 'Disambiguation_pages':
            return True
    return False

In [132]:
is_disambiguation_page(a)

False

In [91]:
we need to check how to handle disambiguation pages such
https://en.wikipedia.org/wiki/Dom
we could first compare the links that the page return and check if it matches the graph, if so, get the
N more matched article links

SyntaxError: invalid syntax (<ipython-input-91-4f244b6b473b>, line 1)