# Embedtree2
Notebook to compile the so far research of nvgtt.

In [1]:
import nltk
import re
from urllib.parse import quote, unquote
import networkx as nx
import sys
import pickle
import requests

from ThreadPool import ThreadPool

from wikipydia import dict_storage, wikidb
DictStorage = dict_storage.DictStorage

In [2]:
wiki_db = wikidb.WikiDb()

In [3]:
last_percent_reported = None
def download_progress_hook(count, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 5% change in download progress.
  """
  global last_percent_reported
  percent = int(count * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent

In [4]:
#wikisyn = DictStorage("wikisyn") #Storage for link synoms

In [5]:
class WikiSynBeta:
    def __init__(self, filename):
        self.filename = filename + ".pickle"
        
        self.submitted_pageids = set()
        self.hrefs = dict()
        
        try:
            with open(self.filename, mode='r+b') as pickle_file:
                saved_data = pickle.load(pickle_file)
                
                self.submitted_pageids = saved_data.submitted_pageids
                self.hrefs = saved_data.hrefs
                
        except IOError:
            print("Failed to open " + self.filename + ". Created empty wikisyn.")
        
    def save(self):
        with open(self.filename, mode='w+b') as pickle_file:
            pickle.dump(self, pickle_file)
    
    def submit_article(self, wikiart):
        if wikiart.page_id in self.submitted_pageids:
            return False
        
        for link_href, link_text in wikiart.links():
            #If the link_text is invalid (empty, spaces etc) skip it
            if not link_text:
                continue
            
            #Ensure link_text is lower case to compute it only once
            link_text = link_text.lower()
            
            #Init this href if it has not been initiated
            if link_href not in self.hrefs:
                self.hrefs[link_href] = dict()
            
            #Init this href text if it has not been initiated   
            if link_text not in self.hrefs[link_href]:
                self.hrefs[link_href][link_text] = 0
                
            self.hrefs[link_href][link_text] += 1 #Add the occurence of this text in this href
            
        self.submitted_pageids.add(wikiart.page_id)
        return True
             
    def get_synoms(self, href, norm=True):
        
        if href not in self.hrefs:
            return list()
        
        if not norm:
            return self.hrefs[href].items()
        
        norm_fact = 0
        for text, score in self.hrefs[href].items():
            norm_fact += score
            
        norm_synoms = list()
        for text, score in self.hrefs[href].items():
            norm_synoms.append((text, score / norm_fact))
        
        return norm_synoms
        
        #synoms = list()
        
        #norm_fact = 0
        

        
        #for l_text, l_score in self.hrefs[href].items():
            #norm_fact += l_score  
        
        #for link_text in self.hrefs[href]:
            #synoms.append(link_text.items())
        
        #return synoms
    
    def get_joined_synoms(self, page_hrefs, norm=True):
        
        synoms = dict()
        
        norm_fact = 0
        
        for href in page_hrefs:
            if href not in self.hrefs:
                continue
            for l_text, l_score in self.hrefs[href].items():
                
                if l_text not in synoms:
                    synoms[l_text] = 0
                
                synoms[l_text] += l_score
                norm_fact += l_score                
        
        #If we should not return scores normalized
        if not norm:
            return synoms.items()
        
        norm_synoms = list()
        for text, score in synoms.items():
            norm_synoms.append((text, score / norm_fact))
        
        return norm_synoms
            

In [6]:
synbeta = WikiSynBeta("wikisynbeta")

In [7]:
def test_wikisynbeta():
    
    syntest = WikiSynBeta("syntest")
    
    wikiart, _ = wiki_db.get_article_by_href("Node.js")
    print(syntest.submit_article(wikiart)) 
    print(syntest.submit_article(wikiart)) 
    
    print(syntest.submitted_pageids)
    #for href in synbeta.get_joined_synoms(["JavaScript"], False):
    for href in syntest.get_synoms("Angular_(application_platform)"):
        print(href)
    
    for h in syntest.hrefs.items():
        print(h)
        
def test_wikisynsave():
    syntest = WikiSynBeta("syntest")
    print(syntest.submitted_pageids)
    print(syntest.hrefs)
    
    wikiart, _ = wiki_db.get_article_by_href("Node.js")
    synbeta.submit_article(wikiart) 
    
    print(synbeta.submitted_pageids)
    print(synbeta.hrefs)
    
    synbeta.save()
   
#test_wikisynbeta()

#test_wikisynsave()

#synbeta = WikiSynBeta()
#test_wikisynbeta()
#test_wikisynbeta()

In [8]:
def get_wiki_article_by_href(href):
    wikiart, downloaded = wiki_db.get_article_by_href(href)
    #Populate wikisyn
    synbeta.submit_article(wikiart)
    
    #for link_href, link_text in wikiart.links():
        #if not link_href in wikisyn:
            #wikisyn[link_href] = set()
        #wikisyn[link_href].add(link_text)
        
    return wikiart

def get_wiki_article_by_title(title):
    return get_wiki_article_by_href(quote(title))

In [9]:
def get_all_wikisyns_by_href(href):
    wikiart = get_wiki_article_by_href(href)
    hrefs = wiki_db.pageid_to_href[wikiart.page_id]
    
    wikisyns = set()
    for page_href in hrefs:
        if page_href not in wikisyn:
            continue
        for syn in wikisyn[page_href]:
            if syn: #CHeck if the text is empty
                wikisyns.add(syn.lower()) 
    
    return wikisyns

def get_all_wikisyns_by_title(title):
    return get_all_wikisyns_by_href(quote(title))

#print(get_all_page_wikisyns("javascript"))

In [10]:
def save_everything():
    wiki_db.save()
    synbeta.save()
    #wikisyn.save()

In [11]:
def test_get_wiki_article_by_href(href):
    wikiart = get_wiki_article_by_href(href)
    save_everything()
    return wikiart

#print(test_get_wiki_article_by_href("c%2b%2b").title)


In [12]:
#Function to download bunch of wikipedia pages at once if they are not present
check_and_download__done = 0
def check_and_download(pages):
    global check_and_download__done
    check_and_download__done = 0
    
    n_tasks = len(pages)
    
    print("Tasks to go: " + str(n_tasks))
    
    # Function to be executed in a thread
    def download_stuff(page):
        global check_and_download__done
        #try:
        get_wiki_article_by_href(page)
        #except:
            #print("Failed to get page " + page + ". Timed out.")
        #finally:
        check_and_download__done += 1
        download_progress_hook(check_and_download__done, n_tasks)
            #percentage = round(check_and_download__done * 100 / n_tasks)
            #if percentage % 5 == 0:
                #print("{0}%.....".format(percentage), end="")
            #print("Done " + str(check_and_download__done) + "/" + str(n_tasks))

    # Instantiate a thread pool with 5 worker threads
    pool = ThreadPool(50)

    pool.map(download_stuff, pages)
    pool.wait_completion()
    
    print("\nFinishing downloading. Done tasks: " + str(check_and_download__done) + "/" + str(n_tasks))

In [13]:
#check_and_download(["Node.js"])

In [14]:
#target_page = urllib.quote("JavaScript")
#target_id = get_pageid(target_page)
#target_links = pageid_to_page_links[target_id]
#target_links_ids = set()

#for i, link in enumerate(target_links):
    #print("Working on " + link + ". " + str((i+1)) + "/" + str(len(target_links)))
    #target_links_ids.add(get_pageid(link))


In [15]:
def print_sorted_list(data, key, reverse=False):
    for k, v in sorted(data, key=key, reverse=reverse):
        print(k,v)  

In [26]:
def get_links_score(page):
    """Function to cross a list of links with a text, setting scores."""
    
    wikiart = get_wiki_article_by_href(page)
    
    pageid = wikiart.page_id
    #Ensure only one href is present
    page_links = set([link_href for link_href, link_text in wikiart.links()])
    page_text = wikiart.text()
    
    links_score = dict()
    
    norm_fact = 0 #norm factor to results sum to one
    
    #Ensure all page links are present
    #check_and_download(page_links)
    
    #The ideia is to get for each href the texts that use to follow these hrefs in wikipedia articles.
    #The more each term appears, it got increased weight.
    #Then we try to match each of the terms for each href to the wiki text, applying to each match the correspondent weight.
    #The weight stuff is good to avoid cases where the href_text have appeared only once in one article,
    #but it is a frequent term in other articles but offen does not mean the original href it pointed to
    
    for link_href in page_links:
        links_score[link_href] = 0
        for l_text, l_score in synbeta.get_synoms(link_href): #get_all_wikisyns_by_href(link_href): #wikisyn[link_href]:
            matches = re.findall('[^a-zA-Z0-9_]' + re.escape(l_text) + '[^a-zA-Z0-9_]', page_text, re.IGNORECASE)
            matches_score = len(matches) * l_score
            links_score[link_href] += matches_score
            norm_fact += matches_score
            
    norm_links_score = dict(map(lambda a: [a[0], float(a[1])/norm_fact], links_score.items()))
            
    return norm_links_score

#v_sum = 0
links_score = get_links_score("Convolutional neural network")
for k, v in sorted(links_score.items(), key=lambda a:a[1], reverse=True):
    print(k,synbeta.hrefs[k],v)
    #v_sum += v
#print v_sum

ArXiv {'arxiv': 308} 0.053275468476275255
Artificial_neural_network {'neural networks': 10, 'artificial neural network': 22, 'artificial neural networks': 13, 'neural network': 2} 0.050341650437531686
Overfitting {'overfitting': 35, 'overfitted': 1} 0.0457019950164126
Digital_object_identifier {'doi': 6141, 'digital object identifier': 6, 'doi number': 1} 0.04382396118086502
C_(programming_language) {'c': 343, 'c programming language': 44, 'c language': 14, 'c-language': 2, 'standard c': 1, 'c (programming language)': 4, 'c programs': 1, 'c programming languages': 1, 'c-style': 1} 0.041845680692399974
Convolution {'convolution': 46, 'convolutional': 3, 'convolutions': 2, 'convolving': 3, 'convolved in the normal way': 1, 'discrete convolution': 1, 'convolved': 2, 'applications of convolution': 1} 0.038190490363351856
Regularization_(mathematics) {'regularization': 19, 'regularizer': 2, 'regularization penalty': 1, 'regularizing': 1, 'regularization (mathematics)': 3, 'regularisation': 

In [17]:
def get_node_edges_scores(page_href, top=-1):
    """Function to get node edges to be placed in the graph. """
    
    edges = dict()
    
    wikiart = get_wiki_article_by_href(page_href)
    
    #Get main page data
    page_id = wikiart.page_id
    page_title = wikiart.title
    page_links = get_links_score(page_href).items()
    
    if top > -1:
        sorted_page_links = sorted(page_links, key=lambda a: a[1], reverse=True)
        page_links = sorted_page_links[:top]
        #print(page_links)
        
    #Ensure everything has been downloaded first    
    hrefsToDownload = [link_href for link_href, _ in page_links]        
    check_and_download(hrefsToDownload)
    
    for i, (link_href, score) in enumerate(page_links):
        
        #print("Working on link {0} {1}/{2}".format(link_href, i+1, len(page_links)))
        
        link_art = get_wiki_article_by_href(link_href)
        
        link_id = link_art.page_id
        link_title = link_art.title
        
        #If there is already a title already place, sum the scores
        if (page_title, link_title) in edges:
            edges[(page_title, link_title)] += score
        else:
            edges[(page_title, link_title)] = score        
        
    return edges



#print(get_links_score("JavaScript"))

#edges_scores = get_node_edges_scores("TensorFlow")
#print_sorted_list(edges_scores.items(), lambda a:a[1], True)

#save_everything()

#CREATE METHOD TO CREATE GRAPH BASED ON DEEPNESS --DONE
#MAYBE PLACE STOP CONDITION TO NOT DOWNLOAD EVERY LINK --DONE
#CHECK WHETHER WIKISYN IS REALLY GOOD BECAUSE OF ERRORS. MAYBE KEEP TRACK HOW MANY TIMES EACH WORD APPEARS DONE

In [18]:
def test_123(target, top=20):
    
    edges_scores = get_node_edges_scores(target, top)
    print_sorted_list(edges_scores.items(), lambda a:a[1], True)
    print("\n\n")
    links_score = get_links_score(target)
    print_sorted_list(links_score.items(), lambda a:a[1], True)
    
#test_123("MQTT")

In [19]:
#print(synbeta.hrefs["Publish%E2%80%93subscribe_pattern"])

In [20]:
def get_graph_edges(seed_href, deepness, top=-1):
    if deepness > 3:
        raise Exception("Not allowed more than 2 of deepness")
    
    edges = list()
    
    seed_title = get_wiki_article_by_href(seed_href).title
    
    done_titles = list()
    todo_queue = list()
    
    todo_queue.append(seed_title)
    
    for i in range(deepness):
        print("Working on batch {0} of {1}".format(i+1, deepness))
        
        current_todo_queue = todo_queue
        todo_queue = list()
        
        print("queue size: {0}".format(len(current_todo_queue)))
        
        while len(current_todo_queue) > 0:
            
            next_title = current_todo_queue.pop() 
        
            if next_title in done_titles:
                continue
        
            edges_scores = get_node_edges_scores(quote(next_title), top)
        
            done_titles.append(next_title)
            
            #Sort edges scores and take the top 5
            #sorted_edges = sorted(edges_scores.items(), key=lambda a: a[1], reverse=True)[:5]
            
            #Save new edges and next to do titles
            for edge, score in edges_scores.items():
                edges.append((edge, score))
                
                if not edge[1] in done_titles:
                    todo_queue.append(edge[1])
    
    #save_everything() 
    return edges           

    
#edges = get_graph_edges("JavaScript", 2, 10)
#save_everything()

#for e in edges:
    #print(e)


In [21]:
def get_graph(edges):
    #Create a directed graph
    graph = nx.DiGraph()
    for edge in edges:
        graph.add_edge(edge[0], edge[1])
    return graph

In [22]:
def extract_path_tuples(path):
    """Function that generates all path tuples from a list path."""
    path_tuples = []
    for i, _ in enumerate(path):
        if i == 0:
            continue
        path_tuples.append((path[i-1], path[i]))

    return path_tuples

def get_features(graph, seed_node, cutoff, deep_rank_scores=None):
    """
    Function to compute the prereq probabilities for every node.
    Compute deeprank and bidirection rank. TO BE EXPLAINED

    Returns: bidir_probs, deeprank_probs, n_paths, min_depths, max_depths

    """

    #Compute number of edges per node
    nodes_edges = dict()
    for n1, n2 in graph.edges():
        #init nodes dict if not initiated
        if not n1 in nodes_edges:
            nodes_edges[n1] = 0
        nodes_edges[n1] += 1

    #Compute initial edges probabilities
    #For bidirection rank we set 1 in case the edge is unidirectional and 0.5 in case bidirectional.
    #For deeprank we compute the fraction of the edge over all the edges in the same node.

    ### Later we need to create other methods to compute proper distribution for the cases above. ### 

    bidir_edges_values = dict()
    deeprank_edges_values = dict()

    #edges_values = dict()
    for n1, n2 in graph.edges():
        if deep_rank_scores == None:
            deeprank_edges_values[(n1, n2)] = 1.0 / nodes_edges[n1]
        
        if graph.has_edge(n2, n1):
            bidir_edges_values[(n1, n2)] = 0.5
        else:
            bidir_edges_values[(n1, n2)] = 1

    if deep_rank_scores != None:
        deeprank_edges_values = dict(deep_rank_scores)

    #Now compute all the paths to the target seed_node and sequence probabilities to each path
    bidir_probs = dict() #Probabilities of reach seed_node from each node based on bidir values
    deeprank_probs = dict() #Probabilities of reach seed_node from each node based on deeprank values
    min_depths = dict() #Each node min depth
    max_depths = dict() #Each node max depth
    ns_paths = dict() #Each node number of paths

    # create dicts for every feature extracted
    # try get insights from kmeans
    # try to find something to deploy FAST (the energy applied must be low!)

    n_nodes = nx.number_of_nodes(graph)

    #Iterate thru all the graph nodes
    for i, node in enumerate(graph.nodes()):

        print("Working on node {0}/{1}".format(i+1,n_nodes))

        #Init min max depth
        min_depth = cutoff + 2
        max_depth = 0

        #Skip seed_node since we do not want verify paths to itself
        if node == seed_node:
            continue

        n_paths = 0
        total_bidir_prob = 0
        total_deeprank_prob = 0

        for path in nx.all_simple_paths(graph, source=seed_node, target=node, cutoff=cutoff):

            n_paths += 1
            partial_bidir_prob = 1.0
            partial_deeprank_prob = 1.0

            #Computes min-max depth
            max_depth = max(max_depth, len(path))
            min_depth = min(min_depth, len(path))

            #Iterate the path tuples
            for i, edge_tuple in enumerate(extract_path_tuples(path)):
                partial_bidir_prob *= bidir_edges_values[edge_tuple]
                partial_deeprank_prob *= deeprank_edges_values[edge_tuple]

            total_bidir_prob += partial_bidir_prob
            total_deeprank_prob += partial_deeprank_prob

        #After processing all node paths, save the final values    
        bidir_probs[node] = total_bidir_prob / n_paths
        deeprank_probs[node] = total_deeprank_prob
        min_depths[node] = min_depth
        max_depths[node] = max_depth
        ns_paths[node] = n_paths 

    return bidir_probs, deeprank_probs, ns_paths, min_depths, max_depths

In [25]:
target_article = "Convolutional neural network"

scored_edges = get_graph_edges(target_article, 3, 10)
edges = [edge for edge,score in scored_edges]

article_title = get_wiki_article_by_href(target_article).title

#print(scored_edges)
#print(edges)
graph = get_graph(edges)

bidir_probs, deeprank_probs, ns_paths, min_depths, max_depths = get_features(graph, article_title, 4, scored_edges)

print("\n")
for title, dr in sorted(deeprank_probs.items(), key=lambda a: a[1], reverse=True):
    print("{0}        {1}        {2}        {3}        {4}        {5}" \
          .format(title, round(dr,3), round(bidir_probs[title],3), ns_paths[title], min_depths[title], max_depths[title]))
    #print(title, round(dr,3))

save_everything()

#MUST KEEP PLAYING WITH PARAMETERS
#COME TO A CONCLUSION

#CREATE MUSIC WITH MACHINE LEARNING


Working on batch 1 of 3
queue size: 1
Tasks to go: 10
10%20%30%60%70%50%80%90%100%
Finishing downloading. Done tasks: 10/10
Working on batch 2 of 3
queue size: 9
Tasks to go: 10
20%30%60%70%10%80%90%40%50%
Finishing downloading. Done tasks: 10/10
Tasks to go: 10
30%40%20%10%70%90%100%80%60%
Finishing downloading. Done tasks: 10/10
Tasks to go: 6
.50%..100%.
Finishing downloading. Done tasks: 6/6
Tasks to go: 10
10%20%30%50%40%60%80%90%70%100%
Finishing downloading. Done tasks: 10/10
Tasks to go: 10
10%20%30%40%80%50%90%70%100%
Finishing downloading. Done tasks: 10/10
Tasks to go: 10
10%20%40%50%60%30%80%90%100%
Finishing downloading. Done tasks: 10/10
Tasks to go: 10
10%20%30%100%40%50%60%90%70%80%
Finishing downloading. Done tasks: 10/10
Tasks to go: 10
20%30%40%10%70%60%50%100%90%
Finishing downloading. Done tasks: 10/10
Tasks to go: 10
10%20%40%30%60%80%90%70%100%50%
Finishing downloading. Done tasks: 10/10
Working on batch 3 of 3
queue size: 79
Tasks to go: 10
10%20%40%30%60%80%70%

In [238]:
#print(page_data['full'].get_text())