# Embedtree2
Notebook compiling the nvgtt research done so far.

In [1]:
import requests
import nltk
from bs4 import BeautifulSoup
import re
import urllib
import networkx as nx

from DictStorage import DictStorage
from ThreadPool import ThreadPool

### Function to get wikipedia data

In [2]:
def get_page(page, lang="en"):
    """Retrieve a Wikipedia page as parsed HTML, together with its sections.

    Calls the MediaWiki ``action=parse`` API with redirects enabled, e.g.
    https://en.wikipedia.org/w/api.php?action=parse&redirects&page=fluid_mechanics

    Args:
        page: Page name to request. It is inserted into the query string
            verbatim, so it is presumably expected to be percent-encoded
            already (e.g. "C%2B%2B") -- TODO confirm against callers.
        lang: Wikipedia language subdomain (defaults to "en").

    Returns:
        dict with the keys 'title' and 'pageid' (taken from the API
        response), 'full' (BeautifulSoup tree of the page HTML) and
        'sections' (the result of ``__splitIntoSections__`` on that tree).

    Raises:
        RuntimeError: if the API response contains no 'parse' entry.
        Exceptions raised by ``requests`` (e.g. a timeout after 60 seconds)
        are not caught here.
    """
    # Local import: ``urllib.unquote`` does not exist in Python 3, and the
    # ``urllib.parse`` submodule is not guaranteed to be loaded by a bare
    # ``import urllib``.
    from urllib.parse import unquote

    # NOTE(review): the query string is assembled by hand instead of using
    # the ``params=`` argument of requests.get -- presumably so that the page
    # name is not percent-encoded a second time. Confirm before changing.
    pageParams = [
        'action=parse',
        'redirects',
        'format=json',
        'prop=text|displaytitle',
        'page=' + page
    ]

    wikipediaApiUrl = "https://" + lang + ".wikipedia.org/w/api.php?" + "&".join(pageParams)

    pageData = requests.get(wikipediaApiUrl, timeout=60).json()

    if 'parse' not in pageData:
        # Dump the request and the raw response to help diagnose the failure
        print(page)
        print(unquote(page))
        print(pageData)
        # A plain string cannot be raised in Python 3; use a real exception
        raise RuntimeError("Error while getting page " + page)

    docHtml = BeautifulSoup(pageData['parse']['text']['*'], 'html.parser')

    #Split document by its sections
    docSections = __splitIntoSections__(docHtml)

    structPageData = {
        'title': pageData['parse']['title'],
        'pageid': pageData['parse']['pageid'],
        'full': docHtml,
        'sections': docSections
    }

    return structPageData


def __splitIntoSections__(htmlObj):
    """Split an html document into sections, using h2 tags as dividers.

    Args:
        htmlObj: Object whose ``children`` attribute iterates over nodes that
            expose a ``name`` attribute (a BeautifulSoup tree in this file).

    Returns:
        list of lists of nodes. The first list holds everything found before
        the first h2; every following list starts with the h2 node that
        opened it. Nodes whose ``name`` is None are dropped.
    """
    #Init var to store sections (the first one collects the content before any h2)
    sectionObjs = [[]]

    for tag in htmlObj.children:
        #Start new section in case the tag is h2
        if tag.name == 'h2':
            sectionObjs.append([])

        #Keep only valid tags (the ones without a name are skipped)
        if tag.name is not None:
            sectionObjs[-1].append(tag)

    return sectionObjs
    

In [3]:
#print get_page("C%2B%2B")['title']

In [4]:
def get_page_and_parse(page):
    """Download a page with get_page and strip boilerplate nodes from its HTML.

    The tree stored under 'full' is modified in place: the table of contents
    and the various info/navigation boxes are decomposed. The 'sections'
    entry is not recomputed afterwards.

    Args:
        page: Page name, passed straight to get_page.

    Returns:
        The dict returned by get_page, with its 'full' tree cleaned up.
    """
    page_data = get_page(page)
    document = page_data['full']

    # One findAll() filter per kind of node that must be thrown away
    unwanted_filters = (
        {'id': 'toc'},                 # table of contents
        {'class_': 'ambox'},           # top info table
        {'class_': 'infobox'},         # info box
        {'class_': 'vertical-navbox'}, # vertical box
        {'class_': 'navbox'},          # navigation box
    )

    for node_filter in unwanted_filters:
        for unwanted in document.findAll(**node_filter):
            unwanted.decompose()

    return page_data

In [5]:
blocked_link_terms = {
    "(disambiguation)",  # Not interested in disambiguation pages
    ":",  # Pages with a colon are often special pages. Not sure whether any article has a colon
}


def get_page_links(page_data):
    """Collect the internal wiki links found in a page.

    Args:
        page_data: dict whose 'full' entry supports ``findAll("a")`` (the
            structure returned by get_page).

    Returns:
        list of ``(href, text)`` tuples, in document order and with
        duplicates kept. ``href`` is the part of the link after "/wiki/",
        without any "#fragment"; ``text`` is the text of the anchor.
    """
    wiki_prefix = "/wiki/"
    found = []

    for anchor in page_data['full'].findAll("a"):
        # Anchors without a target are of no use
        if not anchor.has_attr("href"):
            continue

        # Only internal article links, i.e. the ones starting with "/wiki/"
        if not anchor['href'].startswith(wiki_prefix):
            continue

        # Drop the link as soon as one blocked term shows up in it
        if any(term in anchor['href'] for term in blocked_link_terms):
            continue

        # Cut the prefix by its length. Splitting on the last "/" would break
        # titles that contain a slash themselves, such as TCP/IP.
        target = anchor['href'][len(wiki_prefix):]

        # Discard the "#section" fragment, if any
        target = target.partition("#")[0]

        found.append((target, anchor.get_text()))

    return found

## Main()

In [6]:
# Module-level storages shared by the functions below. DictStorage is a
# project-local class; each instance is created with a name and is written
# out through .save() in save_page_data.
wikisyn = DictStorage("wikisyn") #Storage for link synonyms: link href -> set of link texts seen for it
href_to_pageid = DictStorage("href_to_pageid") #Lookup table from hrefs (and page titles) to page ids
pageid_to_title = DictStorage("pageid_to_title") #Storage for page titles, keyed by page id
pageid_to_href = DictStorage("pageid_to_href") #Lookup table from page id to the set of hrefs/titles that resolve to it
pageid_to_page_links = DictStorage("pageid_to_page_links") #Storage for page links: page id -> set of link hrefs
pageid_to_page_text = DictStorage("pageid_to_page_text") #Storage for page texts, keyed by page id

In [7]:
def save_page_data(page_href, page_data):
    """Register a downloaded page in every storage and save all of them.

    Args:
        page_href: The href/name the page was requested with.
        page_data: dict with the keys 'title', 'pageid' and 'full' (the
            structure returned by get_page).

    Side effects:
        Updates href_to_pageid, pageid_to_title, pageid_to_href,
        pageid_to_page_text, pageid_to_page_links and wikisyn, then calls
        ``save()`` on each of them.
    """
    title = page_data['title']
    pid = page_data['pageid']
    text = page_data['full'].get_text()

    # The requested href and the title reported by the API both map to the id
    for alias in (page_href, title):
        href_to_pageid[alias] = pid

    pageid_to_title[pid] = title

    # Reverse lookup: every name known to lead to this page id
    if pid not in pageid_to_href:
        pageid_to_href[pid] = set()
    for alias in (page_href, title):
        pageid_to_href[pid].add(alias)

    pageid_to_page_text[pid] = text

    # Outgoing links of the page, plus the texts used to refer to each target
    outgoing = get_page_links(page_data)
    pageid_to_page_links[pid] = set()
    for target, anchor_text in outgoing:
        pageid_to_page_links[pid].add(target)
        if target not in wikisyn:
            wikisyn[target] = set()
        wikisyn[target].add(anchor_text)

    # Save everything
    for storage in (
        href_to_pageid,
        pageid_to_title,
        pageid_to_href,
        pageid_to_page_text,
        pageid_to_page_links,
        wikisyn,
    ):
        storage.save()

In [8]:
def get_pageid(page):
    """Return the page id registered for ``page``, downloading it if unknown.

    Args:
        page: href or title used as key in href_to_pageid.

    Returns:
        The page id stored in href_to_pageid for ``page``.
    """
    # Already registered: nothing to download
    if page in href_to_pageid:
        return href_to_pageid[page]

    print("Page not found. Downloading and registering it...")
    save_page_data(page, get_page_and_parse(page))
    print("Done.")

    return href_to_pageid[page]

In [9]:
#Number of pages successfully processed by the last check_and_download call
check_and_download__done = 0
def check_and_download(pages):
    """Download a bunch of wikipedia pages at once if they are not present.

    Every page is passed to get_pageid from a ThreadPool worker, so pages
    that are already registered cost no download.

    Args:
        pages: Sized collection of page hrefs/names.

    Side effects:
        Resets and updates the module-level ``check_and_download__done``
        counter and prints a summary line. Pages whose request times out are
        reported and not counted; other exceptions are not handled here.
    """
    global check_and_download__done
    import threading

    check_and_download__done = 0

    n_tasks = len(pages)

    # ``+=`` on a global is a read-modify-write sequence; without the lock two
    # workers finishing together could lose an increment.
    counter_lock = threading.Lock()

    # Function to be executed in a thread
    def download_stuff(page):
        global check_and_download__done
        try:
            get_pageid(page)
        except requests.exceptions.Timeout:
            print("Failed to get page " + page + ". Timed out.")
        else:
            with counter_lock:
                check_and_download__done += 1
            #print("Done " + str(check_and_download__done) + "/" + str(n_tasks))

    # Instantiate the thread pool. The argument is presumably the number of
    # worker threads (the previous comment said 5 while the code passed 10).
    pool = ThreadPool(10)

    pool.map(download_stuff, pages)
    pool.wait_completion()

    print("Finishing downloading. Done tasks: " + str(check_and_download__done) + "/" + str(n_tasks))

In [10]:
#check_and_download(["Node.js"])

In [11]:
#target_page = urllib.quote("JavaScript")
#target_id = get_pageid(target_page)
#target_links = pageid_to_page_links[target_id]
#target_links_ids = set()

#for i, link in enumerate(target_links):
    #print("Working on " + link + ". " + str((i+1)) + "/" + str(len(target_links)))
    #target_links_ids.add(get_pageid(link))


In [12]:
def print_sorted_list(data, key, reverse=False):
    """Print the pairs of ``data`` in sorted order, one pair per line.

    Args:
        data: Iterable of two-item entries (e.g. ``dict.items()``).
        key: Sort key function, applied to each entry.
        reverse: Sort in descending order when True.
    """
    ordered = sorted(data, key=key, reverse=reverse)
    for entry in ordered:
        first, second = entry
        print(first, second)

In [13]:
def get_links_score(page):
    """Score the links of a page by how often their texts occur in the page text.

    For every link of the page, each text registered for that link in
    wikisyn is searched (case-insensitively, as a whole word) in the page
    text. The number of occurrences per link is then divided by the total
    number of occurrences over all links.

    Args:
        page: href or title of a page already present in href_to_pageid.

    Returns:
        dict mapping link href -> normalised score (float). Every score is
        0.0 when no link text occurs in the page text at all.

    Raises:
        KeyError: if ``page`` (or one of its links in wikisyn) is unknown --
            assuming the storages behave like dicts on missing keys.
    """
    pageid = href_to_pageid[page]
    page_links = pageid_to_page_links[pageid]
    page_text = pageid_to_page_text[pageid]

    #Ensure all page links are present
    check_and_download(page_links)

    links_score = dict()

    for link_href in page_links:
        occurrences = 0
        for l_text in wikisyn[link_href]:
            # An empty link text (e.g. an image-only anchor) would match
            # between arbitrary punctuation characters and inflate the score
            if not l_text.strip():
                continue
            # Lookarounds instead of consuming delimiter characters, so that
            # matches at the very start/end of the text and matches separated
            # by a single character are all counted
            pattern = (r'(?<![a-zA-Z0-9_])'
                       + re.escape(l_text)
                       + r'(?![a-zA-Z0-9_])')
            occurrences += len(re.findall(pattern, page_text, re.IGNORECASE))
        links_score[link_href] = occurrences

    norm_fact = sum(links_score.values())

    # Nothing matched: avoid dividing by zero and report all-zero scores
    if norm_fact == 0:
        return {link_href: 0.0 for link_href in links_score}

    return {
        link_href: float(count) / norm_fact
        for link_href, count in links_score.items()
    }

#v_sum = 0
#links_score = get_links_score("MQTT")
#for k, v in sorted(links_score.items(), key=lambda a:a[1], reverse=True):
    #print(k,wikisyn[k],v)
    #v_sum += v
#print v_sum

In [16]:
def get_node_edges_scores(page_href):
    """Build the scored edges that leave a page, to be placed in the graph.

    Args:
        page_href: href or title of the source page.

    Returns:
        dict mapping ``(source title, linked page title)`` -> score. Scores
        of links that resolve to the same title are summed.
    """
    edges = {}

    #Get main page data
    source_id = get_pageid(page_href)
    source_title = pageid_to_title[source_id]
    scored_links = get_links_score(page_href).items()

    for link_href, score in scored_links:
        target_id = get_pageid(link_href)
        edge = (source_title, pageid_to_title[target_id])

        #First time this edge shows up: store the score, otherwise accumulate it
        if edge not in edges:
            edges[edge] = score
        else:
            edges[edge] += score

    return edges
    

# Compute the scored edges of the "AutoCAD" page and list them, best score first
edges_scores = get_node_edges_scores("AutoCAD")
print_sorted_list(edges_scores.items(), key=lambda edge: edge[1], reverse=True)

#TODO: create a method that builds the graph up to a given depth
#TODO: maybe add a stop condition so that not every link gets downloaded
#TODO: check whether wikisyn is really reliable, given its errors; maybe keep track of how many times each word appears

Finishing downloading. Done tasks: 74/74
('AutoCAD', 'Autodesk') 0.1544502617801047
('AutoCAD', 'Macintosh') 0.11518324607329843
('AutoCAD', '.dwg') 0.07329842931937172
('AutoCAD', 'MacOS') 0.07329842931937172
('AutoCAD', 'Microsoft Windows') 0.06806282722513089
('AutoCAD', 'Computer-aided design') 0.05235602094240838
('AutoCAD', 'Cloud computing') 0.03664921465968586
('AutoCAD', 'Commercial software') 0.031413612565445025
('AutoCAD', 'Android (operating system)') 0.028795811518324606
('AutoCAD', 'Portable Document Format') 0.020942408376963352
('AutoCAD', 'Italian language') 0.01832460732984293
('AutoCAD', 'IOS') 0.01832460732984293
('AutoCAD', 'Architecture') 0.015706806282722512
('AutoCAD', 'Mac App Store') 0.015706806282722512
('AutoCAD', 'Mobile app') 0.015706806282722512
('AutoCAD', 'App Store (iOS)') 0.015706806282722512
('AutoCAD', 'Dexigner') 0.015706806282722512
('AutoCAD', 'AutoCAD Architecture') 0.013089005235602094
('AutoCAD', 'Mac OS X Lion') 0.013089005235602094
('AutoCA

In [16]:
#links_score = get_page_links_score(page_links, page_data['full'].get_text())

In [15]:
#total_links = 0
#for k, v in links_score.items():
    #total_links += v

#for k, v in sorted(links_score.items(), key=lambda a:a[1], reverse=True):
    #print(k,page_links[k],float(v),float(v)/total_links)

In [138]:
#page_data = get_links_score()
#for d in page_data.iteritems():
    #print(d)

In [238]:
#print(page_data['full'].get_text())