In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata

## GET STARTING PAGE WITH UNIVERSITIES

In [2]:
# Fetch the institution list page that the rest of the notebook scrapes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the institution list.
r = requests.get(
    'https://www.felvi.hu/felveteli/egyetemek_foiskolak/!IntezmenyiOldalak/intezmeny_lista.php?elj=20a',
    timeout=30,
)
r.raise_for_status()

In [5]:
# Parse the raw bytes of the start page with the lxml parser.
soup = BeautifulSoup(r.content, features='lxml')

In [6]:
def extract_uni_and_href(soup, base_url='https://www.felvi.hu'):
    """Map institution names to the URLs of their program sub-pages.

    Every ``<tr>`` of the parsed page is scanned. A row is used only when its
    first ``<td>`` has a ``title`` attribute; in such a row the cells at
    positions 1 and 3 each contribute one entry whose key is the cell's
    ``title`` and whose value is ``base_url`` followed by the ``href`` of the
    cell's first ``<a>``. In every URL ``&oldal=2`` is inserted in front of
    ``&elj=20a``.

    Rows that contain no ``<td>`` at all, cells without a ``title`` and cells
    without a link are skipped instead of raising; entries already collected
    from the same row are kept.

    Args:
        soup: parsed document exposing ``find_all`` (a BeautifulSoup object).
        base_url: prefix put in front of every relative ``href``.

    Returns:
        dict mapping cell title -> absolute URL.
    """
    out = {}

    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')  # looked up once per row
        try:
            # Probe only: raises IndexError for a row without cells and
            # KeyError when the first cell has no title, which skips the row.
            tds[0]['title']
            # Cells 1 and 3 (when present) hold the institution links.
            for td in tds[1:4:2]:
                title = td['title']
                href = td.find_all('a')[0]['href']
                out[title] = f"{base_url}{href}".replace('&elj=20a', '&oldal=2&elj=20a')
        except (KeyError, IndexError):
            continue

    return out

In [7]:
# Institution name -> URL of its program sub-page, read from the parsed start page.
uni_links = extract_uni_and_href(soup=soup)

## GET SUBPAGES IF DEPARTMENTS HAVE DIFFERENT SUBPAGES

In [10]:
def try_extract_subpages(uni_links, base_url='https://www.felvi.hu', timeout=30):
    """Look for sub-page links on every institution page.

    Each URL in ``uni_links`` is downloaded and parsed. Every ``<a>`` whose text
    contains the institution name (the dict key) and that has an ``href`` is
    treated as a sub-page. For an institution with at least one such link the
    key is reported for deletion and each link is added to the result under its
    NFKD-normalised, stripped link text. In every URL ``&oldal=2`` is inserted
    in front of ``&elj=20a``.

    Matching anchors without an ``href`` attribute are ignored rather than
    raising ``KeyError``.

    Args:
        uni_links: dict mapping institution name -> page URL.
        base_url: prefix put in front of every relative ``href``.
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection cannot block the loop indefinitely.

    Returns:
        ``(new_uni_links, keys_to_delete)``: the dict of sub-page name -> URL
        and the list of keys of ``uni_links`` for which sub-pages were found.
    """
    new_uni_links = {}
    keys_to_delete = []
    for name, url in uni_links.items():
        response = requests.get(url, timeout=timeout)
        page = BeautifulSoup(response.content, 'lxml')
        subpages = [
            anchor for anchor in page.find_all('a')
            if name in anchor.text and anchor.get('href') is not None
        ]
        if not subpages:
            continue
        keys_to_delete.append(name)
        for anchor in subpages:
            label = unicodedata.normalize("NFKD", anchor.text).strip()
            new_uni_links[label] = f"{base_url}{anchor['href']}".replace('&elj=20a', '&oldal=2&elj=20a')
    return new_uni_links, keys_to_delete

In [11]:
# Visit every institution page (one HTTP request per entry) and collect sub-page links.
new_uni_links, keys_to_delete = try_extract_subpages(uni_links=uni_links)

In [12]:
# Keep only the institutions that are not listed in keys_to_delete,
# i.e. those for which no sub-page links were found.
uni_links = dict(
    (name, url)
    for name, url in uni_links.items()
    if name not in keys_to_delete
)

In [13]:
# Merge the remaining institution links with the sub-page links;
# on a duplicate name the sub-page URL wins.
total_uni_links = dict(uni_links)
total_uni_links.update(new_uni_links)

## GET TABLES FROM THESE PAGES

In [14]:
def get_tables(href, timeout=30):
    """Download ``href`` and return every HTML table on the page.

    Args:
        href: URL of the page to download.
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection cannot block the scrape indefinitely.

    Returns:
        list of DataFrames as produced by ``pd.read_html(..., header=0)``,
        or an empty list when the page contains no table at all.

    Raises:
        ValueError: any parsing error from pandas other than "No tables found".
    """
    r = requests.get(href, timeout=timeout)
    try:
        return pd.read_html(r.content, header=0)
    except ValueError as err:
        # pandas signals a table-less page with ValueError("No tables found");
        # report that as "nothing here" so one such page does not abort a
        # scrape of many pages. Anything else is re-raised unchanged.
        if 'No tables found' in str(err):
            return []
        raise

In [15]:
# Download and parse the tables of every collected page (one request per entry).
tables = dict(
    (name, get_tables(url))
    for name, url in total_uni_links.items()
)

In [16]:
# THROW AWAY FOREIGN UNIs AND MISSING DATA
# An entry is flagged when its table list has no element at index 5,
# i.e. when the page yielded fewer than six tables.
keys_to_remove = []
for k, v in tables.items():
    if len(v) <= 5:
        keys_to_remove.append(k)

In [17]:
# Drop every entry that was flagged in keys_to_remove.
tables = dict(
    (name, found)
    for name, found in tables.items()
    if name not in keys_to_remove
)

## FIND TABLE WITH THE PROGRAMS

In [18]:
def find_rel_table(table_list):
    """Return the tables of ``table_list`` that have a 'Képz.szint' column.

    The original order is kept and a new list is returned; it is empty when
    no table has that column.
    """
    return [candidate for candidate in table_list if 'Képz.szint' in candidate.columns]

In [19]:
# For every institution keep only the tables that have a 'Képz.szint' column.
pot_tables = dict(
    (name, find_rel_table(found))
    for name, found in tables.items()
)

In [21]:
# Institutions for which more than one matching table was found
# (not used again in the cells that follow).
more_tab = dict(
    (name, found)
    for name, found in pot_tables.items()
    if len(found) > 1
)

## ADD UNI ID AND CONCAT

In [27]:
# Tag every table with the name of the institution/faculty it belongs to, so the
# origin of each row survives the concatenation below.
# NOTE: this writes the new column into the DataFrames held in pot_tables in place.
for k, v in pot_tables.items():
    for tab in v:
        tab['Intézmény/Kar'] = k

In [29]:
# Flatten the per-institution lists of tables and stack them into a single
# frame with a fresh 0..n-1 index; column order is left unsorted.
out = pd.concat(
    [frame for frames in pot_tables.values() for frame in frames],
    sort=False,
    ignore_index=True,
)

In [32]:
# Sanity check: how often each value of the 'Munka-rend' column occurs in the combined frame.
out['Munka-rend'].value_counts()

N    4884
L    1719
E      59
T      34
Name: Munka-rend, dtype: int64

## POLISH AND WRITE TO FILE

In [35]:
# Bring every column label into Unicode NFKD form.
out.columns = list(
    map(lambda label: unicodedata.normalize("NFKD", label), out.columns)
)

In [36]:
# One or more parenthesised numbers, e.g. "(3)" or "(1)(12)", at the very end of the text.
_FOOTNOTE_SUFFIX = re.compile(r'(?:\(\d+\))+\Z')


def strip_numbers_from_end(string):
    """Remove trailing footnote markers such as ``(3)`` or ``(1)(12)``.

    Only complete ``(<digits>)`` groups at the very end of the text are
    removed. The previous implementation used ``str.rstrip``, which strips a
    *set of characters* rather than a suffix: it left ``'Name(1'`` for
    ``'Name(12)'``, left ``'Name(1)'`` for ``'Name(1)(2)'`` and removed a
    legitimate closing parenthesis whenever the text contained a digit
    elsewhere.

    Args:
        string: the text to clean. Non-string values (e.g. NaN) are returned
            unchanged.

    Returns:
        ``string`` without its trailing footnote markers.
    """
    if not isinstance(string, str):
        return string
    return _FOOTNOTE_SUFFIX.sub('', string)

In [37]:
# The column labels of `out` were converted to Unicode NFKD form in an earlier cell,
# so an accented label typed here only matches if it is in the same (visually
# indistinguishable) form. Normalising the labels explicitly makes the lookup
# independent of how the literals happen to be encoded in this file.
source_col = unicodedata.normalize("NFKD", 'Meghirdetett képzés')
clean_col = unicodedata.normalize("NFKD", 'Meghirdetett képzés - lábjegyzet nélkül')
out[clean_col] = [strip_numbers_from_end(f) for f in out[source_col]]

In [38]:
# Number of non-null values in each column.
out.notna().sum()

Képz.szint                                               6696
Munka-rend                                                6696
Fin.forma                                                 6696
Meghirdetett képzés                                     6696
Önköltség (félév)                                    6696
Képz. idő(félév)                                      6696
Kapacitásmin. < max.                                     6696
Személyes megjelenést igénylő vizsgaformák             21
Képz.terület                                            6696
Intézmény/Kar                                           6696
Érettségi vizsgakövetelmények a pontszámításhoz    6587
Pontsz. fels. okl. IGEN/NEM                               6675
Vizsgatárgyak                                              88
Meghirdetett képzés - lábjegyzet nélkül                 6696
dtype: int64

In [40]:
# Write the combined frame to an Excel workbook; the path is relative, so the file
# is created in the current working directory.
out.to_excel('felvi-2020-meghirdetett-kepzesek.xlsx')