In [1]:
import wikipedia as wp
import requests
import bs4
import os
from tqdm import tqdm_notebook as tqdm

# FUNCTIONS & GLOBALS

In [2]:
# Nested store for downloaded page text, filled by extract_content_pages:
# files[lang][page_name] -> page content.
files = {}

# One Wikipedia language code per line. Strip all surrounding whitespace
# (not just '\n', so '\r' from Windows line endings is removed too) and skip
# blank lines so an empty string is never used as a language code.
with open('input/languages.txt', 'r', encoding='utf-8') as file:
    languages = [line.strip() for line in file if line.strip()]

In [3]:
# Browser-like HTTP request headers. The user-agent value is assembled from
# adjacent string literals purely to keep the line length readable.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/77.0.3865.90 Safari/537.36'
    ),
}

In [9]:
def get_soup_url(url, headers=None, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    headers : dict, optional
        Extra HTTP headers (for example a browser-like user-agent) forwarded
        to ``requests.get``. ``None`` sends the library defaults.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response content parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error response instead of parsing the error page and
    # silently producing empty scrape results in the cells that follow.
    r.raise_for_status()
    return bs4.BeautifulSoup(r.content, 'html5lib')

# for influential people scrape
def get_name_from_element(tag):
    """Extract a name string from one list element of the influential-people page.

    Parameters
    ----------
    tag
        Element exposing ``find_all(name)`` and a ``text`` attribute
        (a BeautifulSoup tag in this notebook).

    Returns
    -------
    str
        The stripped text of the first ``<a>`` inside ``tag`` when there is
        one. Otherwise the element text before the first ``'('``, stripped;
        if that text contains ``'Menes'`` only its first space-separated
        word is returned.
    """
    # Check for links explicitly rather than relying on a bare ``except``:
    # that would also swallow KeyboardInterrupt and unrelated errors.
    anchors = tag.find_all('a')
    if anchors:
        return anchors[0].text.strip()

    # No link: fall back to the raw text preceding any parenthesised part.
    ret = tag.text.split('(')[0].strip()
    if 'Menes' in ret:
        # Special case kept from the original scrape: keep only the first word.
        return ret.split(' ')[0]
    return ret

def extract_content_pages(files, page_list, languages=languages):
    """Download Wikipedia article text for every name in every language.

    For each language the ``wikipedia`` module is switched to that language,
    each name is searched, and the content of the first search result is
    stored under ``files[lang][name]``.

    Parameters
    ----------
    files : dict
        Mapping ``lang -> {name -> page content}``. It is updated in place;
        entries from earlier calls are kept.
    page_list : iterable of str
        Names to look up.
    languages : iterable of str, optional
        Language codes to iterate over. The default is the module-level
        ``languages`` list as it existed when this function was defined.

    Returns
    -------
    dict
        The same ``files`` object that was passed in.
    """
    # iterate over languages
    for lang in languages:
        print(lang)
        wp.set_lang(lang)
        lang_pages = files.setdefault(lang, {})

        # iterate over page names
        for name in tqdm(page_list):
            try:
                page_content = wp.page(wp.search(name)[0]).content
            except Exception:
                # Best effort: skip names with no search hit or whose page
                # cannot be loaded. Catching Exception (not a bare except)
                # lets a kernel interrupt stop the long download loop.
                continue
            lang_pages[name] = page_content

    return files

# FIND 100 MOST INFLUENTIAL PEOPLE

In [10]:
# Fetch and parse the biographyonline.net page listing the 100 most
# influential people in history; the result is bound to `soup`.
soup = get_soup_url('https://www.biographyonline.net/people/100-most-influential.html')

In [11]:
# Keep the first 100 <li> matches (class_=None presumably restricts the search
# to items without a class attribute — confirm) and map each to a name string.
name_elements = soup.find_all('li', class_=None)[:100]
names = list(map(get_name_from_element, name_elements))

## EXTRACT CONTENT OF THEIR WIKIPEDIA PAGES

In [13]:
# Download the Wikipedia text for every scraped person name, in each language.
files = extract_content_pages(files, names)

en


HBox(children=(IntProgress(value=0), HTML(value='')))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))



de


HBox(children=(IntProgress(value=0), HTML(value='')))


hu


HBox(children=(IntProgress(value=0), HTML(value='')))


ro


HBox(children=(IntProgress(value=0), HTML(value='')))




# FIND 100 BEST CITIES

In [14]:
# Fetch and parse the bestcities.org "world's best cities" ranking page;
# the result replaces the previous `soup`.
soup = get_soup_url('https://www.bestcities.org/rankings/worlds-best-cities/')

In [15]:
# For every ranking entry, drop the first space-separated token of its text
# (presumably the rank number — confirm against the page) and keep the rest.
cities = [
    ' '.join(entry.text.strip().split(' ')[1:])
    for entry in soup.find_all('div', class_='rankings-cities-detail')
]

## EXTRACT CONTENT OF THEIR WIKIPEDIA PAGES

In [16]:
# Download the Wikipedia text for every scraped city name, in each language.
files = extract_content_pages(files, cities)

en


HBox(children=(IntProgress(value=0), HTML(value='')))


de


HBox(children=(IntProgress(value=0), HTML(value='')))


hu


HBox(children=(IntProgress(value=0), HTML(value='')))


ro


HBox(children=(IntProgress(value=0), HTML(value='')))




# FIND TOP 100 COMPANIES

In [17]:
# url of top 100 companies list
url = 'https://ceoworld.biz/2019/06/28/the-top-100-best-performing-companies-in-the-world-2019/'
# This request sends the browser-like `headers` defined earlier in the
# notebook. The timeout keeps a stalled connection from hanging the kernel.
r = requests.get(url, headers=headers, timeout=30)
# Surface HTTP errors here instead of scraping an error page into an empty list.
r.raise_for_status()
soup = bs4.BeautifulSoup(r.content, 'html5lib')

In [18]:
# Take the raw text of every table cell whose CSS class is "column-2"
# (presumably the company-name column — confirm against the page).
companies = [cell.text for cell in soup.find_all('td', class_='column-2')]

## EXTRACT CONTENT OF THEIR WIKIPEDIA PAGES

In [20]:
# Download the Wikipedia text for every scraped company name, in each language.
files = extract_content_pages(files, companies)

en


HBox(children=(IntProgress(value=0), HTML(value='')))


de


HBox(children=(IntProgress(value=0), HTML(value='')))


hu


HBox(children=(IntProgress(value=0), HTML(value='')))


ro


HBox(children=(IntProgress(value=0), HTML(value='')))




# FIND TOP 100 POP/ROCK BANDS

In [21]:
# url of top 100 pop/rock bands list (an IMDb list page)
soup = get_soup_url('https://www.imdb.com/list/ls076954447/')

In [22]:
# Keep only the last line of each list-item header's text, stripped of
# surrounding whitespace.
bands = [
    header.text.strip().split('\n')[-1].strip()
    for header in soup.find_all('h3', class_='lister-item-header')
]

## EXTRACT CONTENT OF THEIR WIKIPEDIA PAGES

In [23]:
# download band pages in every language
files = extract_content_pages(files, bands)

en


HBox(children=(IntProgress(value=0), HTML(value='')))


de


HBox(children=(IntProgress(value=0), HTML(value='')))


hu


HBox(children=(IntProgress(value=0), HTML(value='')))


ro


HBox(children=(IntProgress(value=0), HTML(value='')))




# FIND TOP SPORT FRANCHISES

In [24]:
# url of top sport franchises list (English Wikipedia article on Forbes' list
# of the most valuable sports teams)
soup = get_soup_url('https://en.wikipedia.org/wiki/Forbes%27_list_of_the_most_valuable_sports_teams')

In [25]:
# Gather the links found in each table cell, keeping only cells that contain
# at least two links.
a = [
    links
    for links in (cell.find_all('a') for cell in soup.find_all('td'))
    if len(links) > 1
]
# The text of the second link in each remaining cell is the franchise name.
# The last five matches are discarded (presumably non-team rows — TODO confirm).
sport_franchises = [links[1].text for links in a[:-5]]

## EXTRACT CONTENT OF THEIR WIKIPEDIA PAGES

In [26]:
# download sport franchise pages in every language
files = extract_content_pages(files, sport_franchises)

en


HBox(children=(IntProgress(value=0, max=165), HTML(value='')))


de


HBox(children=(IntProgress(value=0, max=165), HTML(value='')))


hu


HBox(children=(IntProgress(value=0, max=165), HTML(value='')))


ro


HBox(children=(IntProgress(value=0, max=165), HTML(value='')))




# FIND TOP 100 BOOKS

In [27]:
# url of the "100 books to read before you die" list (a Medium article)
soup = get_soup_url('https://medium.com/world-literature/creating-the-ultimate-list-100-books-to-read-before-you-die-45f1b722b2e5')

In [28]:
# Skip the first 8 and the last 3 <li> elements (presumably page chrome rather
# than list entries — TODO confirm). From each remaining item keep the text
# before ' —' and before ' by '.
books = [
    item.text.split(' —')[0].split(' by ')[0]
    for item in soup.find_all('li')[8:-3]
]

## EXTRACT CONTENT OF THEIR WIKIPEDIA PAGES

In [29]:
# download book pages in every language
files = extract_content_pages(files, books)

en


HBox(children=(IntProgress(value=0), HTML(value='')))


de


HBox(children=(IntProgress(value=0), HTML(value='')))


hu


HBox(children=(IntProgress(value=0), HTML(value='')))


ro


HBox(children=(IntProgress(value=0), HTML(value='')))




# KEEP ONLY PAGES WHICH ARE PRESENT IN ALL LANGUAGES

In [30]:
# Names for which a page was stored in every language: the intersection of
# the per-language key sets of `files`.
per_language_names = [set(pages) for pages in files.values()]
# Fall back to an empty set when nothing was downloaded, so `common_files`
# is always defined for the cells below.
common_files = set.intersection(*per_language_names) if per_language_names else set()

In [31]:
# number of page names present in every language
len(common_files)

452

# WRITE TO FILE

In [32]:
# Write one text file per (language, page name) under page_text/<lang>/.
# exist_ok=True replaces the check-then-create pattern and makes the cell
# safe to re-run.
os.makedirs('page_text', exist_ok=True)
for lang in languages:
    lang_dir = os.path.join('page_text', f'{lang}')
    os.makedirs(lang_dir, exist_ok=True)
    for doc in common_files:
        # File name: lower-cased page name, spaces turned into dashes and
        # slashes removed so the name cannot introduce a sub-directory.
        file_name = f'{doc.lower().replace(" ", "-").replace("/", "")}.txt'
        # Explicit UTF-8: the page text contains non-ASCII characters, and the
        # platform default encoding may be unable to represent them.
        with open(os.path.join(lang_dir, file_name), 'w', encoding='utf-8') as file:
            file.write(files[lang][doc])