# Embedtree2
Notebook to compile the so far research of nvgtt.

In [182]:
import requests
import nltk
from bs4 import BeautifulSoup
import re

### Function to get wikipedia data

In [183]:
def get_page(page, lang="en"):
    """Function to retrieve a wikipedia page in html form, with its sections"""

    # https://en.wikipedia.org/w/api.php?action=parse&redirects&page=fluid_mechanics

    wikipediaApiUrl = "https://" + lang + ".wikipedia.org/w/api.php"

    pageParams = {
        'action': 'parse', 
        'redirects': True,
        'page': page,
        'format': 'json',
        'prop':'text|displaytitle'
    }

    pageData = requests.get(wikipediaApiUrl, pageParams).json()

    if not 'parse' in pageData:
        raise "Error while getting page " + page


    docHtml = BeautifulSoup(pageData['parse']['text']['*'], 'html.parser')

    #Split document by its sections
    docSections = __splitIntoSections__(docHtml)

    structPageData = {
        'title': pageData['parse']['title'],
        'pageid': pageData['parse']['pageid'],
        'full': docHtml,
        'sections': docSections
    }

    return structPageData


def __splitIntoSections__(htmlObj):
    """Function to split html document in sections (use h2 tags as divisors)"""

    #Init var to store sections
    sectionObjs = [[]]

    for tag in htmlObj.children:
        #Start new section in case the tag is h2
        if tag.name == 'h2':
            sectionObjs.append([])

        #If it is a valid tag (invalid tags has no 'name' property)
        if tag.name != None:
            sectionObjs[len(sectionObjs) - 1].append(tag)

    return sectionObjs
    

In [208]:
def get_page_and_parse(page):
    """Function to treat the data, remove unecessary things etc."""
    
    page_data = get_page(page)
    
    soup = page_data['full']
    
    #Remove lateral info table
    for t in soup.findAll("table", { "class" : "vertical-navbox" }):
        t.extract()
    
    for t in soup.findAll("table", { "class" : "infobox" }):
        t.extract()
        
    for t in soup.findAll("div", { "id" : "toc" }):
        t.extract()   
        
    return page_data

In [209]:
def get_page_links(page_data):
    """Function to get links """
    #page_data = get_page_and_parse(page)
    
    links = dict()
    
    for link in page_data['full'].findAll("a"):
        
        #If the a tag has no href attr, skip it
        if not link.has_attr("href"):
            continue
            
        #Get only wikipedia links
        #Remove pages that contains a colon (":"). Their offen are special pages. Not sure if there is articles with colon
        #Skip empty text links
        if link['href'].find("/wiki/") == 0 and link['href'].find(":") == -1:
            #We MUST NOT use last index of / to get the path cause some titles like TCP/IP, have bar in the title
            #We should use the '/wiki/' string length
            linkName = link['href'][6:]

            #Remove hashtag from url if any
            hashIndex = linkName.find("#")
            if hashIndex != -1:
                linkName = linkName[:hashIndex]
            
            if not links.has_key(linkName):
                links[linkName] = list()
            links[linkName].append(link.get_text())
            
    return links

In [210]:
def get_page_links_score(links, text):
    """Function to cross a list of links with a text, setting scores."""
    
    links_score = dict()
    
    for link_href, link_texts in links.items():
        for l_text in link_texts:
            GOT TO ENSURE l_text is not a stop word and is surrounded by non letters
            we made a project that do that, must find
            #Escape special regex chars and ignore case
            for match in re.findall(re.escape(l_text), text, re.IGNORECASE):
                if not links_score.has_key(link_href):
                    links_score[link_href] = 0
                links_score[link_href] += 1
            
    return links_score
    

In [211]:
page_data = get_page_and_parse("C++")

In [212]:
page_links = get_page_links(page_data)

In [213]:
links_score = get_page_links_score(page_links, page_data['full'].get_text())

In [214]:
for k, v in sorted(links_score.items(), key=lambda a:a[1], reverse=True):
    print(k,v)

(u'C_(programming_language)', 5464)
(u'International_Organization_for_Standardization', 1545)
(u'D_(programming_language)', 1462)
(u'D_language', 1462)
(u'International_Electrotechnical_Commission', 1311)
(u'International_Standard_Book_Number', 730)
(u'ISO_1', 724)
(u'ISO_2', 505)
(u'ISO_9', 315)
(u'Preferred_number', 275)
(u'ISO_6', 271)
(u'ISO_4', 266)
(u'ISO_5', 243)
(u'ISO_7', 231)
(u'C%2B%2B11', 144)
(u'Bjarne_Stroustrup', 118)
(u'Standardization', 90)
(u'Prolog', 70)
(u'C%2B%2B17', 63)
(u'C%2B%2B14', 59)
(u'Generic_programming', 59)
(u'Class_(computer_programming)', 44)
(u'Exception_handling', 43)
(u'Java_(programming_language)', 36)
(u'ISO_31-1', 35)
(u'A440_(pitch_standard)', 35)
(u'ISO_80000-1', 35)
(u'ISO_639-1', 35)
(u'ISO/IEC_8859-1', 35)
(u'ISO_3166-1', 35)
(u'ISO_15706-2', 34)
(u'MPEG-4_Part_2', 34)
(u'ISO_31-2', 34)
(u'ISO_80000-2', 34)
(u'ISO_3166-2', 34)
(u'ISO_639-2', 34)
(u'ISO_11940-2', 34)
(u'ISO/IEC_8859-2', 34)
(u'Programming_language', 32)
(u'C%2B%2B_Standard_Li

In [138]:
#page_data = get_links_score()
#for d in page_data.iteritems():
    #print(d)

In [155]:
#
print(page_data['full'].get_text())

"CXX" redirects here. For the Roman numerals, see 120 (number).

C++

Paradigm
Multi-paradigm: procedural, functional, object-oriented, generic[1]


Designed by
Bjarne Stroustrup


First appeared
1983; 34 years ago (1983)





Stable release

ISO/IEC 14882:2014 / 15 December 2014; 2 years ago (2014-12-15)



Typing discipline
Static, nominative, partially inferred


Implementation language
C++


OS
Cross-platform


Filename extensions
.cc .cpp .cxx .C .c++ .h .hh .hpp .hxx .h++


Website
isocpp.org


Major implementations


LLVM Clang, GCC, Microsoft Visual C++, Embarcadero C++Builder, Intel C++ Compiler, IBM XL C++


Influenced by


C, Simula, ALGOL 68, Ada, CLU, ML


Influenced


Ada 95, C99, C#,[2] Chapel,[3] D, Java,[4] Lua, Rust, Python, Perl, PHP




 C++ Programming at Wikibooks




C++ (pronounced cee plus plus /ˈsiː plʌs plʌs/) is a general-purpose programming language. It has imperative, object-oriented and generic programming features, while also providing facilities for low