# ČLK scraping
The aim of this notebook is to scrape data from the [ČLK](https://www.lkcr.cz/seznam-lekaru-426.html) website to obtain current data about doctors in the Czech Republic.

In [19]:
# download: when True the field/district dictionaries are scraped from the website,
# otherwise the cached copies are loaded from PATH_INTERMEDIATE.
download = False
# run_parallel: only referenced by the (commented-out) parallel run of obtain_links.
run_parallel = False

In [2]:
import json
import os
import random
import re
import string
import time
import warnings
from datetime import date
from itertools import islice

import numpy as np
import pandas as pd
import unidecode
from fake_useragent import UserAgent
from joblib import Parallel, delayed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select

warnings.filterwarnings('ignore')


In [3]:
# Location of the chromedriver binary. The machine-specific default can be
# overridden with the CHROMEDRIVER_PATH environment variable, so the notebook
# does not have to be edited on another computer.
PATH_CHROME = os.environ.get('CHROMEDRIVER_PATH', '/home/gary/Apps/chromedriver')
# Data folders, relative to the notebook location
PATH_RAW = '../../data/raw/'
PATH_INTERMEDIATE = '../../data/intermediate/'
PATH_FINAL = '../../data/final/'

## Get fields and districts
Obtain all fields and districts available on the website.

In [18]:
def _select_options_to_dict(select_element):
    '''
     Map the text of every <option> with non-empty text to its `value` attribute.
     Input[select_element]: WebElement of a <select>
     Output: dict, option text -> option value
    '''
    d_options = dict()
    for element in select_element.find_elements(By.TAG_NAME, "option"):
        if element.text:
            d_options[element.text] = element.get_attribute("value")
    return d_options


def get_fields_districts_dicts():
    '''
     Scrape the available fields of medicine and districts from the CLK search form
     and cache both dictionaries as .npy files in PATH_INTERMEDIATE (the folder the
     notebook loads them from when `download` is False).
     Returns:
         d_fields: dict, field name -> option value of `filterObor`
         d_districts: dict, district name -> option value of `filterOkresId`
    '''
    driver = webdriver.Chrome(PATH_CHROME)
    try:
        driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

        ## dict of fields - name:value
        d_fields = _select_options_to_dict(driver.find_element(By.NAME, 'filterObor'))

        ## dict of districts - name:value
        # the lookup may return more than one element with this name; the last one is used
        d_districts = _select_options_to_dict(driver.find_elements(By.NAME, 'filterOkresId')[-1])
    finally:
        # release the browser even when the page could not be parsed
        driver.quit()

    np.save(PATH_INTERMEDIATE+'dict_districts.npy', d_districts)
    np.save(PATH_INTERMEDIATE+'dict_fields.npy', d_fields)
    return d_fields, d_districts

In [19]:
if download:
    # scrape the option lists from the live website (opens a Chrome window)
    d_fields, d_districts = get_fields_districts_dicts()
else:
    # reuse the cached dictionaries; allow_pickle is required because the .npy files
    # hold pickled dicts. NOTE: only load pickles from files you created yourself.
    d_districts = np.load(PATH_INTERMEDIATE+'dict_districts.npy', allow_pickle=True).item()
    d_fields = np.load(PATH_INTERMEDIATE+'dict_fields.npy', allow_pickle=True).item()

## Google authentication

In [None]:
# # Not necessary to run this cell

# print('Gmail username and password')
# gmailId, passWord = map(str, input().split())
# try:
#     driver = webdriver.Chrome(PATH_CHROME)
#     driver.get(r'https://accounts.google.com/signin/v2/identifier?continue='+\
#     'https%3A%2F%2Fmail.google.com%2Fmail%2F&service=mail&sacu=1&rip=1'+\
#     '&flowName=GlifWebSignIn&flowEntry = ServiceLogin')
#     driver.implicitly_wait(15)
  
#     loginBox = driver.find_element_by_xpath('//*[@id ="identifierId"]')
#     loginBox.send_keys(gmailId)
  
#     nextButton = driver.find_elements_by_xpath('//*[@id ="identifierNext"]')
#     nextButton[0].click()
  
#     passWordBox = driver.find_element_by_xpath(
#         '//*[@id ="password"]/div[1]/div / div[1]/input')
#     passWordBox.send_keys(passWord)
  
#     nextButton = driver.find_elements_by_xpath('//*[@id ="passwordNext"]')
#     nextButton[0].click()
  
#     print('Login OK')
# except:
#     print('Login Failed')

## Obtain links

In [302]:
# # Uncomment when run for the first time
# # stores all retrieved records
# total_info = []

In [288]:
def set_viewport_size(driver, width, height):
    '''
        Resize the browser window so that the requested width and height apply to
        the page viewport. It can help to bypass CAPTCHA.
    '''
    # The script adds the size of the browser frame (outer - inner) to the requested size.
    js_outer_size = """
        return [window.outerWidth - window.innerWidth + arguments[0],
          window.outerHeight - window.innerHeight + arguments[1]];
        """
    outer_size = driver.execute_script(js_outer_size, width, height)
    driver.set_window_size(*outer_size)

    

def chunks(d, SIZE=10):
    '''
     Split dictionary into chunks of SIZE.
     Input[d]: dict to split
     Input[SIZE]: int, maximum number of items per chunk
     Yields: dicts holding consecutive items of `d` (the last one may be smaller)
    '''
    # iterate over the keys of the argument itself (not over an unrelated global)
    it = iter(d)
    for _ in range(0, len(d), SIZE):
        yield {k: d[k] for k in islice(it, SIZE)}
            

def save_progress(processed_letters, finished_letters):
    '''
     Store the progress of the search by surname in PATH_INTERMEDIATE.
     `finished_letters` is written as its string representation to a text file,
     `processed_letters` is saved with np.save.
    '''
    finished_path = PATH_INTERMEDIATE+'finished_letters.txt'
    processed_path = PATH_INTERMEDIATE+'processed_letters.npy'

    with open(finished_path, 'w') as out:
        out.write(str(finished_letters))

    np.save(processed_path, processed_letters)

def load_progress():
    '''
     Load info about already processed substrings in CLK search.
     Returns:
         processed_letters: already processed substrings
         finished_letters: completely processed first letters from the alphabet
    '''
    import ast
    # the text file holds the string representation of the collection written by save_progress
    with open(PATH_INTERMEDIATE+'finished_letters.txt','r') as f:
        finished_letters = ast.literal_eval(f.read())

    # allow_pickle is required because the .npy file wraps a pickled Python object;
    # NOTE: only load pickles from files you created yourself.
    processed_letters = np.load(PATH_INTERMEDIATE+'processed_letters.npy', allow_pickle=True).item()
    return processed_letters, finished_letters

    
def get_surname(full_name):
    '''
     Extract the surname from a full name with optional titles.
     Titles written after the name are separated by a comma
     (e.g. 'MUDr. Jan Novák, Ph.D.'), so the surname is the last word
     before the first comma, or the last word when there is no comma.
     Input[full_name]: str
     Output: str, surname ('' when no word precedes the comma)
    '''
    # split on the first comma only: works for detached commas ('Boušová , Ph.D.')
    # and for several titles after the name
    before_titles = full_name.split(',', 1)[0].split()
    return before_titles[-1] if before_titles else ''
    
## ------------------------------------------------------           
def obtain_links(d_districts, d_fields):
    '''
     Obtain urls by searching every combination of district and field of medicine.
     A new Chrome session with a random user agent and window size is opened for
     each combination.
     Input[d_districts]: dict, district name -> option value of `filterOkresId`
     Input[d_fields]: dict, field name -> option value of `filterObor`
     Return:
         l_info: list of records [url, district_name, field_name, *text lines of the item].
                 When the run fails or is interrupted, the records collected so far
                 are returned and the error is printed.
    '''
    l_info = [] # general info about doctors
    district_name = field_name = None # keeps the error message valid if the first step fails

    try:
        # iterate over districts
        for district_name, district_id in d_districts.items():

            # iterate over fields
            for field_name, field_id in d_fields.items():

                # https://stackoverflow.com/questions/58872451/how-can-i-bypass-the-google-captcha-with-selenium-and-python
                # set fake agent
                options = Options()
                user_agent = UserAgent().random
                options.add_argument(f'user-agent={user_agent}')
                # TODO check user_agent's validity 
                driver = webdriver.Chrome(executable_path=PATH_CHROME, chrome_options=options)

                try:
                    # mouseover actions
                    action_chains = ActionChains(driver)

                    # randomly change size of the webpage
                    set_viewport_size(driver, random.randrange(950, 1300), random.randrange(600, 800))

                    driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

                    time.sleep(1)

                    # mouseover to reject cookies
                    reject = driver.find_element(By.CLASS_NAME, 'cc-nb-reject')
                    ActionChains(driver).move_to_element(reject).click().perform()

                    # select district
                    wait = WebDriverWait(driver, 2)
                    select = wait.until(EC.element_to_be_clickable((By.XPATH, "//select[@name='filterOkresId']")))
                    action_chains.move_to_element(select).click().perform()
                    time.sleep(1)

                    select_d = Select(select)
                    select_d.select_by_value(district_id)

                    # select field of medicine
                    select = wait.until(EC.element_to_be_clickable((By.NAME, "filterObor")))
                    action_chains.move_to_element(select).click().perform()
                    time.sleep(1)

                    select_f = Select(select)
                    select_f.select_by_value(field_id)

                    # Confirm chosen options and search
                    wait.until(EC.element_to_be_clickable((By.NAME, "do[findLekar]=1")))
                    time.sleep(.5)
                    search = driver.find_element(By.NAME, 'do[findLekar]=1')
                    search.send_keys(Keys.RETURN)

                    time.sleep(4)

                    # Page counter
                    counter = 0
                    while True: 
                        # Stopping criteria
                        # NOTE(review): obtain_links_by_name looks for the HTML-escaped form
                        # 'Další&nbsp;&gt;&gt;' in page_source; confirm the literal below can match.
                        next_page_text = f'{counter*20+1}-{counter*20+20}'
                        if not next_page_text in driver.page_source and not 'Další >>' in driver.page_source:
                            break

                        driver.get(f'https://www.lkcr.cz/seznam-lekaru-426.html?paging.pageNo={counter}')
                        main = WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "form-clk"))
                        )
                        doc_list = main.find_element(by=By.CLASS_NAME, value='seznam-lekaru.item-list')

                        # the first 'item' element is skipped and the last text line of each item is dropped
                        for i in doc_list.find_elements(by=By.CLASS_NAME, value='item')[1:]:
                            info = i.text.split('\n')[:-1]
                            link = i.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
                            info = [link, district_name, field_name] + info 
                            l_info.append(info)

                        # next page
                        counter += 1

                finally:
                    # always release the browser, even when this combination failed
                    driver.quit()

    except (Exception, KeyboardInterrupt) as e:
        # best effort: report the problem and hand back what was collected so far
        print(f'[{district_name} / {field_name}] Error occured:', e)

    return l_info

In [None]:
# run_parallel = False

# if not run_parallel:
#     ## Sequential run
#     x = obtain_links(d_districts, d_fields)
#     total_info.append(x)

# else:
#     ## Parallel run
#     n_chunks = 10

#     dist_chunks = list(chunks(d_districts, round(len(d_districts)/n_chunks)))
#     key = 'alergologie a klinická imunologie'

#     res_list = Parallel(n_jobs=n_chunks)(delayed(obtain_links)(dist_chunks[x], {key:d_fields[key]}) for x in range(n_chunks))
#     total_info += res_list


In [None]:
# # Check how many pages the letter has (if >= 50 need to divide into smaller chunks)
# # processed_letters = dict()
# processed_letters, finished_letters = load_progress()

In [400]:
def obtain_links_by_name(prefix='', start_pos=0, end_pos=None):
    '''
     Obtain urls by surnames' prefix. The search string is `prefix` + letter + '%'
     for every letter of the selected part of the lowercase ASCII alphabet.
     Input[prefix]: str, beginning of the surname
     Input[start_pos]: int, position of the first following letter (eg. 0 = a, 4 = e etc.)
     Input[end_pos]: int or None, position of the last following letter (inclusive);
                     None means up to 'z'
     Return:
         l_info: scrapped info, list of records [url, *text lines of the item]
         processed_letters: dict with info how many pages prefixes have
     When the run fails or is interrupted, the error is printed and everything
     collected so far is returned.
    '''
    
    l_info = [] # general info about doctors
    processed_letters = dict()

    letters = string.ascii_lowercase[start_pos:]
    # `is not None` so that end_pos=0 (only the letter 'a') is honoured
    if end_pos is not None:
        letters = letters[:max(end_pos-start_pos+1, 0)]

    letter = '' # keeps the error message valid if no letter has been started yet

    try:

        for letter in letters:
            driver = webdriver.Chrome(executable_path=PATH_CHROME)

            try:
                # Randomly change size of the webpage
                set_viewport_size(driver, random.randrange(950, 1300), random.randrange(650, 900))

                driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

                time.sleep(1)

                # rejecting the cookie banner is best effort; a failure here is ignored
                try:
                    driver.find_element(By.CLASS_NAME, 'cc-nb-reject').click()
                except Exception:
                    pass

                search = driver.find_element(By.NAME, 'filterPrijmeni')
                search.send_keys(prefix + letter +'%')
                search.send_keys(Keys.RETURN)

                time.sleep(5)

                # Page counter
                counter = 0
                while True: 
                    # Stopping criteria
                    next_page_text = f'{counter*20+1}-{counter*20+20}'
                    if not (next_page_text in driver.page_source or 'Další&nbsp;&gt;&gt;' in driver.page_source):
                        break

                    driver.get(f'https://www.lkcr.cz/seznam-lekaru-426.html?paging.pageNo={counter}')
                    main = WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "form-clk"))
                    )
                    doc_list = main.find_element(by=By.CLASS_NAME, value='seznam-lekaru.item-list')

                    # the first 'item' element is skipped and the last text line of each item is dropped
                    for i in doc_list.find_elements(by=By.CLASS_NAME, value='item')[1:]:
                        info = i.text.split('\n')[:-1]
                        link = i.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
                        info = [link] + info 
                        l_info.append(info)

                    # Next page
                    counter += 1

                processed_letters[prefix + letter] = counter

            finally:
                # always release the browser, even when this letter failed
                driver.quit()

    except (Exception, KeyboardInterrupt) as e:
        print(f'[{prefix + letter}] Error occured:', e)

    return l_info, processed_letters
    


In [561]:
# letters = doctors.surname.apply(lambda x: x[0]).unique()
# letters.sort()

# for letter in letters:
#     recs = doctors[doctors.surname.apply(lambda x: x[0] == letter)].sort_values('surname')
#     n_recs = recs.shape[0]
#     last_rec = recs.iloc[-1, 1] #tail(1).surname
#     if not letter in finished_letters:
#         print(f'[{letter}] total: {n_recs}, {last_rec}')

In [None]:
# NOTE: `finished_letters`, `processed_letters` and `total_info` must already exist
# (load_progress() / the "first run" cell above) before this cell is executed.
prefix = '' 

if not prefix in finished_letters:
    # start_pos=8, end_pos=8 -> only the letter 'i' is searched in this run
    l_info, new_processed_letters = obtain_links_by_name(prefix, start_pos=8, end_pos=8)
    
    total_info += l_info
    processed_letters = {**processed_letters, **new_processed_letters}
    
else:
    # only `prefix` is known here; there is no current letter outside obtain_links_by_name
    print(f'Prefix [{prefix}] already finished.')


In [542]:
# print(processed_letters)
# finished_letters = set()

# for letter, pages in processed_letters.items():
#     if 0 < pages < 50:
#         finished_letters.add(letter)
        
# Mark the prefix used in the previous cell as completely processed.
finished_letters.add(prefix)

print(finished_letters)


## SAVE PROGRESS 
# Persist both structures so that a later session can continue via load_progress().
save_progress(processed_letters, finished_letters)
# processed_letters, finished_letters = load_progress()

{'s', 'a', 'b', 'o', 'i', 'x', 'v', 'l', 'm', 'j', 'g', 't', 'ch', 'u', 'z', 'r', 'f', 'c', 'q', 'd', 'n', 'k', 'e', 'w', 'y', 'p', 'h'}


In [548]:
# Every letter of the lowercase ASCII alphabet has to be marked as finished;
# the assertion message lists the letters that are still missing.
unfinished_letters = set(string.ascii_lowercase).difference(finished_letters)
assert not unfinished_letters, unfinished_letters

In [553]:
def parse_info(total_info):
    '''
     Parses values from list and converts it into the pd.DataFrame
     Input[total_info]: list of records [url, name, workplace, ...]
     Returns: pd.DataFrame with columns url, name, workplace1..workplaceN and surname
              (surname is transliterated to ASCII and lower-cased)
    '''

    # columns: the longest record determines the number of workplace columns;
    # an empty input yields an empty frame with the url and name columns
    n_cols = max(map(len, total_info), default=2)
    columns = ['url', 'name', *[f'workplace{x}' for x in range(1, n_cols-1)]]
    doctors = pd.DataFrame(total_info, columns=columns)

    # surname
    doctors['surname'] = doctors.name.apply(get_surname)
    doctors.surname = doctors.surname.apply(unidecode.unidecode)
    doctors.surname = doctors.surname.apply(lambda x: x.lower())

    # manual fixes of single records
    # the override uses the same normalised form (ASCII, lower case) as the rest of the column
    doctors.loc[doctors.name == 'MUDr. Karin Boušová , Ph.D.', 'surname'] = 'bousova'

    doctors.loc[(doctors.name == 'Nedal M. H. Abuasad') & (doctors.url.isna()), 'url'] = 'https://www.lkcr.cz/seznam-lekaru?filterId=MTE2MDU4NTE4NSwsTmVkYWwgTS4gSC4sLEFidWFzYWQ%3D&do[load]=1'

    return doctors

doctors = parse_info(total_info)
# save
# Records are appended to the accumulated file. The header is written only when
# the file is created (or empty), because the file is read back with
# pd.read_csv, which takes the first line as column names.
alphabetical_path = PATH_INTERMEDIATE + 'doctors_alphabetical.csv'
write_header = not os.path.exists(alphabetical_path) or os.path.getsize(alphabetical_path) == 0
doctors.to_csv(alphabetical_path, index=False, header=write_header, mode='a')


In [554]:
## drops duplicates by url
doctors_no_dupl = pd.read_csv(PATH_INTERMEDIATE + 'doctors_alphabetical.csv')
doctors_no_dupl = doctors_no_dupl.drop_duplicates('url', keep='last')
doctors_no_dupl.to_csv(PATH_INTERMEDIATE + 'doctors_alphabetical_nd.csv', index=False)

((56149, 5), (62242, 5))

In [23]:
d_districts.keys()

dict_keys(['Benešov', 'Beroun', 'Blansko', 'Brno-město', 'Brno-venkov', 'Bruntál', 'Břeclav', 'Česká Lípa', 'České Budějovice', 'Český Krumlov', 'Děčín', 'Domažlice', 'Frýdek-Místek', 'Havlíčkův Brod', 'Hodonín', 'Hradec Králové', 'Cheb', 'Chomutov', 'Chrudim', 'Jablonec nad Nisou', 'Jeseník', 'Jičín', 'Jihlava', 'Jindřichův Hradec', 'Karlovy Vary', 'Karviná', 'Kladno', 'Klatovy', 'Kolín', 'Kroměříž', 'Kutná Hora', 'Liberec', 'Litoměřice', 'Louny', 'Mělník', 'Mladá Boleslav', 'Most', 'Náchod', 'Nový Jičín', 'Nymburk', 'Olomouc', 'Opava', 'Ostrava-město', 'Pardubice', 'Pelhřimov', 'Písek', 'Plzeň-jih', 'Plzeň-město', 'Plzeň-sever', 'Praha hl.m.', 'Praha-východ', 'Praha-západ', 'Prachatice', 'Prostějov', 'Přerov', 'Příbram', 'Rakovník', 'Rokycany', 'Rychnov nad Kněžnou', 'Semily', 'Sokolov', 'Strakonice', 'Svitavy', 'Šumperk', 'Tábor', 'Tachov', 'Teplice', 'Trutnov', 'Třebíč', 'Uherské Hradiště', 'Ústí nad Labem', 'Ústí nad Orlicí', 'Vsetín', 'Vyškov', 'Zlín', 'Znojmo', 'Žďár nad Sázavou

In [20]:
# Compare the number of searches needed by both strategies.
# NOTE: `processed_letters` has to be loaded first (load_progress()); on a fresh
# kernel the second line raises NameError, as the stored output of this cell shows.
print('Districts x fields: ', len(d_districts) * len(d_fields), 'x')
print('Alphabet:', len(processed_letters), 'x')

Districts x fields:  9009 x


NameError: name 'processed_letters' is not defined

In [572]:
doctors

Unnamed: 0,url,name,workplace1,workplace2,surname
0,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,MUDr. Miroslav Baader,Liberec 1,,baader
1,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Jana Baarová,Kopřivnice 1,,baarova
2,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Monika Baarová,Hradec Králové,,baarova
3,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Věra Baarová,Havířov 1,,baarova
4,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Vladimíra Baarová,Klatovy,,baarova
...,...,...,...,...,...
62237,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Marie Mizerová,Olomouc 9,,mizerova
62238,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,MUDr. Jan Mizner,Praha 10,,mizner
62239,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,MUDr. Petr Mizner,Praha 5,,mizner
62240,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Barbora Miznerová,Opava 5,,miznerova


## Scrape obtained links

In [573]:
def save_to_json(rec, name):
    '''
        Dump `rec` as JSON into PATH_INTERMEDIATE/<name>.json (an existing file is overwritten).
    '''
    target = PATH_INTERMEDIATE + name + '.json'
    with open(target, "w") as json_file:
        json.dump(rec, json_file)
        
## ------------------------------------------

def _table_rows_to_dict(table):
    '''
        Convert the rows of a two-column table into a dict.
        The text of the first cell is the key; the text of the second cell is the
        value (a list of lines when it spans more lines, a single string otherwise).
        Rows with fewer than two cells are skipped.
    '''
    d_rows = dict()
    for row in table.find_elements(By.CSS_SELECTOR, 'tr'):
        cell = row.find_elements(By.CSS_SELECTOR, 'td')
        if len(cell) < 2:
            continue
        values = cell[1].text.split('\n')
        d_rows[cell[0].text] = values if len(values)>1 else values[0]
    return d_rows


def parse_doctor_tables(tables):
    '''
        Parse tables in records' url
        Input[tables]: list of WebElement
        Output[d_detail]: dict with the evidence number, the rows of the first two
                          tables and the list of workplaces (one dict per remaining table)
    '''
    d_detail = dict()
    
    # evidence number: the last whitespace-separated token of the element's text
    ev_num = tables[0].find_element(by=By.CLASS_NAME, value='evidencni-cislo').text.split()[-1]
    d_detail['Evidenční číslo'] = ev_num

    ## first + second table
    for table in tables[:2]:
        d_detail.update(_table_rows_to_dict(table))
    
    ## 3+ table (workplaces)
    d_detail['Pracoviště'] = [_table_rows_to_dict(workplace) for workplace in tables[2:]]
    
    return d_detail

## ------------------------------------------
def remove_asterisks(tables):
    '''
        Return the tables whose text does not contain an asterisk (original order kept).
    '''
    return [table for table in tables if '*' not in table.text]

def get_doctor_detail(url, driver):
    '''
        Return detail info about doctors.
        Input[url]: str, CLK url 
        Input[driver]: WebDriver used to open the url (it is left open)
        Output[d_detail]: dict, info about doctor; contains only what was parsed
                          before a failure (possibly nothing), the error is printed
    '''
    # defined up front so that a failure before parsing still returns a dict
    d_detail = dict()

    try:
        driver.get(url)

        # load main content
        main = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "detail-lekare"))
        )
        
        # basic info
        tables = main.find_elements(by=By.CLASS_NAME, value='text-box-lekar')
        tables = remove_asterisks(tables)
        d_detail.update(parse_doctor_tables(tables))

        # name
        name = main.find_element(by=By.CLASS_NAME, value='jmeno-lekare').text
        d_detail['Jméno'] = name
        
    except Exception as e:
        print(f'[{url}] Error occured:', e)

    return d_detail 

In [576]:
# l_doctors = []

In [577]:
## Loop for downloading websites
cnt = 1
n_url = doctors_no_dupl.url.nunique()
try:
    driver = webdriver.Chrome(PATH_CHROME)
    driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

    for url in doctors_no_dupl.url.unique():
        if cnt % 500 == 0:
            print('.', end='')
        if cnt % 5000 == 0:
            print(f' {cnt}/{n_url}')    
        cnt += 1

        d_detail = get_doctor_detail(url, driver)
        d_detail['url'] = url
        l_doctors.append(d_detail)

except Exception as e:
    print(f'[{url}] Error occured:', e)
        
finally:
    print(' Finished!')
    driver.close()

.......... 5000/56149
.......... 10000/56149
.......... 15000/56149
.......... 20000/56149
.......... 25000/56149
.......... 30000/56149
.......... 35000/56149
.......... 40000/56149
.......... 45000/56149
.......... 50000/56149
.......... 55000/56149
.. Finished!


In [578]:
# Store all downloaded detail records as PATH_INTERMEDIATE/doctors_all.json
save_to_json(l_doctors, 'doctors_all')

## Postprocessing

In [331]:
with open(PATH_INTERMEDIATE + 'doctors_all.json') as f:
    data = json.load(f)    

# NOTE: from here on `doctors` holds the detail records, not the list of links
doctors = pd.json_normalize(data)
print(doctors.columns)

# Rename by name rather than by position: the column order of json_normalize
# follows the order in which the keys appear in the data.
doctors = doctors.rename(columns={
    'Evidenční číslo': '_id',
    'Vysoká škola': 'university',
    'Rok promoce': 'graduated_year',
    'Diplom celoživotního vzdělávání': 'lifelong_studies',
    'Pracoviště': 'workplace',
    'Jméno': 'doctor_name',
    'url': 'doctor_url',
    'Dosažená odbornost': 'medical_specialty',
    'K výkonu soukromé praxe a lektora v oboru': 'private_practice',
    'Pro výkon funkce vedoucího lékaře a primáře v oboru': 'leading_doctor_licence',
    'Funkční licence pro léčebnou metodu': 'method_of_treatment_licence',
})

Index(['Evidenční číslo', 'Vysoká škola', 'Rok promoce',
       'Diplom celoživotního vzdělávání', 'Pracoviště', 'Jméno', 'url',
       'Dosažená odbornost', 'K výkonu soukromé praxe a lektora v oboru',
       'Pro výkon funkce vedoucího lékaře a primáře v oboru',
       'Funkční licence pro léčebnou metodu'],
      dtype='object')


In [332]:
# Drop records with an empty doctor name
doctors = doctors[doctors['doctor_name'] != ''].reset_index(drop=True)

# fix graduated years
# (the values are still strings here: one known typo is corrected, empty and
#  zero years become missing values and the remaining ones are converted to int)
doctors.loc[doctors.graduated_year == '9819', 'graduated_year'] = '1998'
doctors.loc[doctors.graduated_year == '', 'graduated_year'] = np.nan
doctors.loc[doctors.graduated_year == '0', 'graduated_year'] = np.nan
doctors.loc[~doctors.graduated_year.isna(), 'graduated_year'] = doctors.loc[~doctors.graduated_year.isna(), 'graduated_year'].apply(int)

# fix university
# (an empty string and 'neuvedena' are both treated as a missing value)
doctors.loc[doctors.university == '', 'university'] = np.nan
doctors.loc[doctors.university == 'neuvedena', 'university'] = np.nan

# fix lifelong studies
# (True only for the value 'ano'; everything else, including missing values, becomes False)
doctors['lifelong_studies'] = np.where(doctors['lifelong_studies'] == 'ano', True, False)

# workplaces
# (the count is taken before empty workplace lists are replaced by NaN)
doctors['n_doctor_workplaces'] = doctors.workplace.apply(len)
doctors.loc[~doctors.workplace.apply(bool), 'workplace'] = np.nan

In [333]:
# Age
# estimate an age based on students age at the time they graduated and their probabilities
students = pd.read_csv(PATH_FINAL+'students.csv', index_col=0)

dr = students[(students['graduated'] == True) & (students.major == 'Všeobecné lékařství')][['graduated', 'major', 'age_end']]

# empirical distribution of the age at graduation
ages = dr.age_end.value_counts().reset_index()
ages.columns = ['age','count']
n_records = sum(ages['count'])
ages['p'] = ages['count'] / n_records
# ages.head()

# A fixed seed makes the sampled ages (and the saved dataset) reproducible.
AGE_ESTIMATE_SEED = 42
rng = np.random.default_rng(AGE_ESTIMATE_SEED)

# draw a graduation age for every doctor from the empirical distribution
random_age = rng.choice(ages['age'].to_numpy(), p=ages['p'].to_numpy(), size=doctors.shape[0])
doctors['graduated_age_estimate'] = random_age
doctors['graduated_age_estimate'] = doctors['graduated_age_estimate'].apply(round)
# estimated age = age at graduation + years elapsed since graduation
doctors['age_estimate'] = doctors['graduated_age_estimate'] + (date.today().year - doctors['graduated_year']) 


In [334]:
# One row per (doctor, workplace); the index is renumbered afterwards
doctors = doctors.explode('workplace').reset_index(drop=True)

In [335]:
# Expand every workplace dict into columns. No explicit drop of the helper column
# `0` (created for missing workplaces) is needed: only the three named columns are
# kept below, and dropping `0` raised KeyError when no workplace was missing.
doctors_wp_df = doctors['workplace'].apply(pd.Series)
doctors_wp_df = doctors_wp_df[['Název zdravotnického zařízení:', 'Název pracoviště:', 'Adresa pracoviště:']]
doctors_wp_df.columns = ['workplace_name', 'workplace_hospital_ward', 'workplace_address']

In [336]:
# Replace the nested `workplace` column by the three flat workplace columns
# (both frames are aligned on the index).
doctors_df = pd.concat([doctors.drop(['workplace'], axis=1), doctors_wp_df], axis=1)

In [337]:
# parse address

def get_zip_code(address):
    '''
     Return the first run of five consecutive digits in `address` as int,
     np.nan when there is none.
    '''
    match = re.search(r"\d\d\d\d\d", address)
    return int(match.group(0)) if match else np.nan

def get_street(address):
    '''
     Return the part of `address` in front of the first ', ' that is followed
     by three digits, np.nan when the address does not contain such a separator
     (same fallback as get_zip_code and get_city).
    '''
    match = re.search(r", \d\d\d", address)
    if not match:
        return np.nan
    return address[:match.start()]

def get_city(address):
    '''
     Return everything that follows the first five-digit run and the space
     after it, np.nan when `address` has no such run.
    '''
    match = re.search(r"\d\d\d\d\d ", address)
    return address[match.end():] if match else np.nan
        

wp_na = doctors_df['workplace_address'].isna()

# parse only the rows with a known address; the others stay empty (NaN)
known_addresses = doctors_df.loc[~wp_na, 'workplace_address']
for address_column, address_parser in (('zip_code', get_zip_code),
                                       ('street', get_street),
                                       ('city', get_city)):
    doctors_df[address_column] = known_addresses.apply(address_parser)

In [338]:
# number of companies' workplaces
doctors_df_unique = doctors_df.drop_duplicates(['workplace_name', 'workplace_hospital_ward', 'workplace_address'])
n_workplaces = doctors_df_unique.groupby(['workplace_name', 'workplace_hospital_ward']).size().reset_index(name='n_workplaces')
doctors_df = pd.merge(doctors_df, n_workplaces, how='left')

# number of doctors in workplace
doctors_df_unique = doctors_df.drop_duplicates(['workplace_name', 'workplace_hospital_ward', '_id'])
n_doctors_in_workplace = doctors_df_unique.groupby(['workplace_name', 'workplace_hospital_ward']).size().reset_index(name='n_doctors_in_workplace')
doctors_df = pd.merge(doctors_df, n_doctors_in_workplace, how='left')


In [339]:
# fix uni

# full name of the medical faculty -> abbreviation used in the final dataset
_UNI_ABBREVIATIONS = {
    '1. LF Univerzity Karlovy v Praze' : '1LFUK',
    '2. LF Univerzity Karlovy v Praze' : '2LFUK',
    '3. LF Univerzity Karlovy v Praze' : '3LFUK',
    'LF Univerzity Karlovy v Plzni' : 'PLUK',
    'LF Univerzity Karlovy v Hradci Králové' : 'HKUK',
    'LF Masarykovy Univerzity v Brně' : 'MUNI',
    'LF Univerzity Palackého v Olomouci' : 'UPOL',
    'LF Ostravské univerzity v Ostravě' : 'OVA',
    'Vojenská lékařská akademie v Hradci Králové' : 'UNOB',
    'zahraniční lékařská fakulta' : 'ABROAD'
}

def uni_abbreviation(uni):
    '''
     Return the abbreviation of the faculty name `uni`, np.nan for an unknown name.
    '''
    if uni in _UNI_ABBREVIATIONS:
        return _UNI_ABBREVIATIONS[uni]
    return np.nan

# abbreviate only the known universities; missing values are left untouched
uni_na = doctors_df.university.isna()
has_university = ~uni_na
doctors_df.loc[has_university, 'university'] = doctors_df.loc[has_university, 'university'].apply(uni_abbreviation)

In [340]:
# fix one known typo and discard graduation years before 1945
doctors_df.loc[(doctors_df.graduated_year == 1194), 'graduated_year'] = 1994
doctors_df.loc[(doctors_df.graduated_year < 1945), 'graduated_year'] = np.nan

# age_estimate was computed from graduated_year before the corrections above,
# so it is recomputed here to keep both columns consistent
doctors_df['age_estimate'] = doctors_df['graduated_age_estimate'] + (date.today().year - doctors_df['graduated_year'])

In [341]:
# doctors_df = pd.read_csv(PATH_FINAL+'doctors.csv', index_col=0)
# doctors_df = doctors_df.explode('medical_specialty').reset_index(drop=True)

In [342]:
# Unification of field names: name found in the scraped data -> unified name.
# Used by convert_fields(); names that are not listed are kept as they are.
d_fields_unif = {
    'epidemiologie': 'hygiena a epidemiologie',
    'hygiena všeobecná a komunální': 'hygiena a epidemiologie',
    'hygiena obecná a komunální': 'hygiena a epidemiologie',
    'hygiena práce a nemoci z povolání': 'hygiena a epidemiologie',
    'hygiena záření': 'hygiena a epidemiologie',
    'hygiena výživy a předmětů běžného užívání': 'hygiena a epidemiologie',
    # NOTE(review): this key ends with a space - confirm it matches the scraped value
    'hygiena výživy ': 'hygiena a epidemiologie',
    'všeobecné lékařství': 'všeobecné praktické lékařství',
    'praktické lékařství pro dospělé': 'všeobecné praktické lékařství',
    'dětské lékařství': 'pediatrie',
    'praktické lékařství pro děti a dorost': 'pediatrie',
    'dorostové lékařství': 'pediatrie',
    'dětská onkologie a hematoonkologie': 'dětská onkologie a hematologie',
    'dětská gynekologie': 'gynekologie dětí a dospívajících',
    'hygiena dětí a dorostu': 'dětská hygiena',
    'dětská a dorostová psychiatrie': 'dětská psychiatrie',
    'diabetologie a endokrinologie': 'endokrinologie a diabetologie',
    'endokrinologie': 'endokrinologie a diabetologie',
    'diabetologie': 'endokrinologie a diabetologie',
    'audiologie a foniatrie': 'foniatrie',
    'audiologie': 'foniatrie',
    'stomatologická chirurgie': 'maxilofaciální chirurgie',
    'úrazová chirurgie (traumatologie)': 'úrazová chirurgie',
    'traumatologie': 'úrazová chirurgie',
    'otorinolaryngologie': 'otorinolaryngologie a chirurgie hlavy a krku',
    'patologická anatomie': 'patologie',
    'léčení alkoholismu a jiných toxikomanií': 'návykové nemoci',
    'paliativní medicína a léčba bolesti': 'paliativní medicína',
    'plicní chirurgie': 'hrudní chirurgie',
    'traumatologie pohybového ústrojí': 'ortopedie a traumatologie pohybového ústrojí',
    'ortopedie': 'ortopedie a traumatologie pohybového ústrojí',
    'ortopedická protetika': 'ortopedie a traumatologie pohybového ústrojí',
    'přenosné nemoci': 'infekční lékařství',
    # NOTE(review): the double quotes are part of this key - confirm the scraped
    # value really contains them
    '"fyziatrie, balneologie a léčebná rehabilitace"': 'rehabilitační a fyzikální medicína',
    'hyperbarická a letecká medicína': 'hyperbarická medicína a oxygenoterapie',
    'letecké lékařství': 'hyperbarická medicína a oxygenoterapie',
    'radiodiagnostika': 'radiologie a zobrazovací metody',
    'radioterapie': 'radiační onkologie',
    'anesteziologie a resuscitace': 'anesteziologie a intenzivní medicína',
    # NOTE(review): 'hematolologie' looks like a typo - presumably it mirrors the source data; verify
    'hematolologie a transfuzní služba': 'hematologie a transfuzní lékařství',
    'interní lékařství': 'vnitřní lékařství',
    'posudkové lékařství': 'soudní lékařství',
    'mikrobiologie': 'lékařská mikrobiologie',
    'mikrobiologie životního prostředí': 'lékařská mikrobiologie',
    'perinatologie': 'perinatologie a fetomaternální medicína',
}




In [343]:
def convert_fields(fields):
    """Map medical field name(s) to their unified names via `d_fields_unif`.

    Parameters
    ----------
    fields : list, str or float NaN
        A list of field names, a single field name, or NaN for a missing value.

    Returns
    -------
    list or float
        * NaN input is passed through as `np.nan`.
        * A list input yields the list of unified names without duplicates,
          in order of first appearance.
        * Any other input yields a one-element list holding its unified name.

    Names that have no entry in `d_fields_unif` are kept unchanged.
    """
    if isinstance(fields, list):
        # dict.fromkeys de-duplicates while keeping a deterministic order
        # (a set would return the names in arbitrary order).
        return list(dict.fromkeys(d_fields_unif.get(x, x) for x in fields))
    # Test the value, not object identity: a NaN produced by pandas is not
    # necessarily the `np.nan` singleton. (`np.NaN` no longer exists in NumPy 2.)
    if isinstance(fields, float) and np.isnan(fields):
        return np.nan
    return [d_fields_unif.get(fields, fields)]

# Unify the field names in every column that stores medical fields.
for col in ['medical_specialty', 'private_practice', 'leading_doctor_licence']:
    doctors_df[col] = doctors_df[col].apply(convert_fields)

In [345]:
# Persist the cleaned doctors table (the row index is written as the first column).
doctors_df.to_csv(f'{PATH_FINAL}doctors.csv')

## Merge Datasets

In [403]:
def _strip_if_str(value):
    """Return `value` without surrounding whitespace when it is a plain str."""
    if type(value) == str:
        return value.strip()
    return value


# Load both exports and stack them into a single frame.
doctors_df = pd.read_csv(PATH_FINAL+'doctors.csv', index_col=0)
dentists_df = pd.read_csv(PATH_FINAL+'dentists.csv', index_col=0)
docs_total = pd.concat([doctors_df, dentists_df])

# Trim whitespace in the free-text columns; non-string values are left untouched.
for col in ['workplace_hospital_ward', 'workplace_address', 'street', 'city']:
    docs_total[col] = docs_total[col].apply(_strip_if_str)

### Add districts

In [404]:
nrpzs = pd.read_csv(PATH_FINAL+'nrpzs.csv', index_col=0)

# Distinct, complete (zip code, district, district code) triples.
okresy = (
    nrpzs[['psc', 'okres', 'okres_code']]
    .drop_duplicates()
    .dropna()
    .rename(columns={'psc': 'zip_code', 'okres': 'district', 'okres_code': 'NUTS'})
)

# Zip codes that occur with more than one district, in ascending order.
zip_counts = okresy.zip_code.value_counts()
shared_zip_codes = zip_counts[zip_counts > 1].index
mult = (
    okresy[okresy.zip_code.isin(shared_zip_codes)]
    .sort_values('zip_code')
    .zip_code
    .unique()
)

In [405]:
# Attach district and NUTS code by zip code. A zip code listed with several
# districts in `okresy` produces one output row per district.
tmp = docs_total.merge(okresy, how='left', on='zip_code')

In [406]:
# Rows whose zip code belongs to several districts (listed in `mult`).
ambiguous_rows = tmp[tmp.zip_code.isin(mult)]

# Keep one district per city: the first one encountered for that city.
correct_city = ambiguous_rows[['city', 'district']].drop_duplicates(['city'], keep='first')
# Manual override for Poříčany.
correct_city.loc[correct_city.city == 'Poříčany', 'district'] = 'Kolín'

# Keep only the ambiguous rows that carry the chosen district. The merge is
# restricted to `ambiguous_rows`: merging all of `tmp` would also pick up rows
# with an unambiguous zip code that share a (city, district) pair, and those
# rows would then appear twice once the two parts are concatenated.
correct_df = pd.merge(ambiguous_rows, correct_city, on=['city', 'district'])

In [407]:
# Rows with an unambiguous zip code plus the resolved ambiguous rows.
# ignore_index gives the result unique row labels (both parts carry
# overlapping integer labels).
docs_final = pd.concat([tmp[~tmp.zip_code.isin(mult)], correct_df], ignore_index=True)

# Praha: zip codes starting with '1' that still have no district.
# The mask is evaluated once, before either column is written; recomputing it
# after `district` has been filled would match nothing and leave NUTS empty.
praha_missing = docs_final.zip_code.apply(lambda x: str(x).startswith('1')) & docs_final.district.isna()
docs_final.loc[praha_missing, 'district'] = 'Hlavní město Praha'
docs_final.loc[praha_missing, 'NUTS'] = 'CZ0100'

# Districts assigned from the city name. These run before the city names are
# normalised below, because they match on the raw spelling.
# Rychnov n/Kn.
docs_final.loc[docs_final.city == 'Rychnov n/Kn.', 'district'] = 'Rychnov nad Kněžnou'
# Kladruby u Vlašimi
docs_final.loc[docs_final.city == 'Kladruby u Vlašimi', 'district'] = 'Benešov'
# Okres Sokolov
docs_final.loc[docs_final.city.str.contains('u Sokolova', na=False), 'district'] = 'Sokolov'
# Okres KV
docs_final.loc[docs_final.city.str.contains('u Karlových', na=False), 'district'] = 'Karlovy Vary'
# Příbram (the district must be set while the city still reads 'Příbram V')
docs_final.loc[docs_final.city == 'Příbram V', 'district'] = 'Příbram'

# Normalise abbreviated or variant city spellings.
city_name_fixes = {
    'Příbram V': 'Příbram',
    'Ostrava l': 'Ostrava',
    'Frýdek Místek': 'Frýdek-Místek',
    'Val.Meziříčí': 'Valašské Meziříčí',
    'Hr.Králové': 'Hradec Králové',
    'Děčín II': 'Děčín',
    'Stará Boleslav': 'Brandýs nad Labem - Stará Boleslav',
    'Řevnice u Prahy': 'Řevnice',
    'Uh. Hradiště': 'Uherské Hradiště',
    'Holice v Č.': 'Holice',
    'Slatiny u Jičína': 'Slatiny',
}
for raw_name, fixed_name in city_name_fixes.items():
    docs_final.loc[docs_final.city == raw_name, 'city'] = fixed_name


In [408]:
# Municipality -> district lookup built from the raw code list.
city = pd.read_csv(
    PATH_RAW+'obce_okresy.csv',
    encoding='windows-1250',
    usecols=['TEXT1', 'TEXT2'],
)
city.columns = ['village_name', 'district_name']
city = city.drop_duplicates()
# A village name that appears with several districts keeps only its last entry.
city_dict = city.set_index('village_name')['district_name'].to_dict()

In [409]:
def _fill_district_from_city(frame, village_to_district):
    """Fill missing `district` values of `frame` in place, looked up by `city`.

    Only rows with a city and without a district are touched. Digits are removed
    from the city name and surrounding whitespace is stripped before the lookup;
    names that are not in `village_to_district` keep a NaN district.
    """
    missing = frame.city.notna() & frame.district.isna()
    frame.loc[missing, 'district'] = frame.loc[missing, 'city'].apply(
        # Raw string: '\d' inside a plain literal is an invalid escape sequence.
        lambda name: village_to_district.get(re.sub(r'\d', '', name).strip(), np.nan)
    )


# Keep only the part of the city name before the first comma.
city_cond = docs_final.city.str.contains(',', na=False)
docs_final.loc[city_cond, 'city'] = docs_final.loc[city_cond, 'city'].apply(lambda x: x.split(',')[0])

# Write ' - ' as a plain hyphen.
city_cond2 = docs_final.city.str.contains(' - ', na=False)
docs_final.loc[city_cond2, 'city'] = docs_final.loc[city_cond2, 'city'].apply(lambda x: x.replace(' - ', '-'))

# First pass: look districts up with the full (possibly hyphenated) city name.
_fill_district_from_city(docs_final, city_dict)

# Second pass: keep only the part before the first hyphen and look up again.
# NOTE(review): this shortens `city` for every hyphenated name, including rows
# that already have a district - confirm that this is intended.
city_cond3 = docs_final.city.str.contains('-', na=False)
docs_final.loc[city_cond3, 'city'] = docs_final.loc[city_cond3, 'city'].apply(lambda x: x.split('-')[0])

_fill_district_from_city(docs_final, city_dict)


#### Zip codes matched by their first four digits (zip code integer-divided by 10)

In [410]:
# Sources of the zip code list and the municipality code list:
# https://zvarik.cz/cs/databaze-psc
# https://apl.czso.cz/iSMS/cisdata.jsp?kodcis=43 # ciselnik obci

# NOTE(review): no encoding is passed, so the platform default is used - confirm
# the encoding of psc.txt.
with open(PATH_RAW+'psc.txt', 'r') as psc_file:
    raw_lines = psc_file.readlines()

# Each line is a quoted, ', '-separated record; the first field is dropped.
psc_list = []
for raw_line in raw_lines:
    unquoted = raw_line.replace('\'', '').replace('\n', '')
    psc_list.append(unquoted.split(', ')[1:])

psc = pd.DataFrame(psc_list, columns=['zip_code', 'city', 'district', 'region', 'country'])
psc['zip_code'] = pd.to_numeric(psc['zip_code'])

psc.to_csv(PATH_FINAL+'zip_codes.csv')
psc

Unnamed: 0,zip_code,city,district,region,country
0,36235,Abertamy,Karlovy Vary,Jihočeský a Západočeský,CZ
1,54232,Trutnov,Trutnov,Východočeský (+část Jihomoravského),CZ
2,67904,Adamov,Blansko,Jihomoravský,CZ
3,28601,Adamov,Kutná Hora,Středočeský,CZ
4,37371,Adamov,České Budějovice,Jihočeský a Západočeský,CZ
...,...,...,...,...,...
8055,76361,Žlutava,Zlín,Severomoravský,CZ
8056,37806,Nová Ves nad Lužnicí,Jindřichův Hradec,Jihočeský a Západočeský,CZ
8057,53836,Žumberk,Chrudim,Východočeský (+část Jihomoravského),CZ
8058,26301,Županovice,Příbram,Středočeský,CZ


In [411]:
# Reduce every zip code to zip_code // 10 and list the distinct
# (shortened zip code, district) pairs.
psc_part = (
    psc[['zip_code', 'district']]
    .drop_duplicates()
    .dropna()
    .assign(zip_code=lambda frame: frame.zip_code // 10)
    .drop_duplicates()
)

# Keep only shortened zip codes that belong to a single district.
prefix_counts = psc_part.zip_code.value_counts()
single_district_prefixes = prefix_counts[prefix_counts <= 1].index
psc_part = psc_part[psc_part.zip_code.isin(single_district_prefixes)].sort_values('zip_code')

psc_part_dict = psc_part.set_index('zip_code')['district'].to_dict()

In [412]:
# Rows that have a zip code but still no district.
needs_district = docs_final['zip_code'].notna() & docs_final['district'].isna()
# Explicit copy: the assignments below must not write into a slice of docs_final.
missing_zip_codes = docs_final[needs_district].copy()

# Text form of every zip code, computed once instead of once per prefix.
zip_texts = [str(code) for code in missing_zip_codes.zip_code]

# Assign the district of every shortened zip code that prefixes the zip code.
for key, value in psc_part_dict.items():
    prefix = str(key)
    matches = np.array([text.startswith(prefix) for text in zip_texts], dtype=bool)
    missing_zip_codes.loc[matches, 'district'] = value

# Put the (partly) completed rows back next to the untouched ones.
docs_final = pd.concat([docs_final[~needs_district], missing_zip_codes])

In [425]:
# District name -> NUTS code lookup.
nuts = (
    pd.read_excel(PATH_RAW + 'NUTS.xlsx')[['OKRES_LAUT,C,100', 'CZNUTS,C,15']]
    .rename(columns={'OKRES_LAUT,C,100': 'Okres', 'CZNUTS,C,15': 'NUTS'})
)
nuts_dict = nuts.set_index('Okres')['NUTS'].to_dict()
nuts_dict['Hlavní město Praha'] = 'CZ0100'

# Fill NUTS where it is missing but the district is known.
# Raises KeyError for a district name that is not in nuts_dict.
needs_nuts = docs_final.NUTS.isna() & ~docs_final.district.isna()
docs_final.loc[needs_nuts, 'NUTS'] = docs_final[needs_nuts].district.apply(lambda x: nuts_dict[x])

In [23]:
# Use the short district name for Prague.
docs_final.loc[docs_final.district == 'Hlavní město Praha', 'district'] = 'Praha'

# Quote the list item so that the stored text is a valid Python list literal.
for col in ['medical_specialty', 'private_practice']:
    docs_final.loc[docs_final[col] == '[stomatologie]', col] = "['stomatologie']"


In [24]:
# Write the merged doctors + dentists table without the row index.
docs_final.to_csv(f'{PATH_FINAL}doctors_all.csv', index=False)

In [429]:
# Columns present in only one of the two source tables.
doctor_columns = set(doctors_df.columns)
dentist_columns = set(dentists_df.columns)

print(f'Only in doctors.csv: {doctor_columns - dentist_columns}')
print(f'Only in dentists.csv: {dentist_columns - doctor_columns}')

Only in doctors.csv: {'lifelong_studies', 'leading_doctor_licence', 'university', 'method_of_treatment_licence', 'graduated_year', 'workplace_hospital_ward', 'age_estimate'}
Only in dentists.csv: {'IC', 'workplace_url', 'area'}


## Medical specialties

In [26]:
import ast

docs_final = pd.read_csv(PATH_FINAL+'doctors_all.csv')

# The CSV stores each list as text; parse the non-missing cells back into lists.
# Rows without a specialty stay NaN after the index-aligned assignment.
has_specialty = ~docs_final['medical_specialty'].isna()
parsed_specialty = docs_final[has_specialty]['medical_specialty'].apply(
    lambda raw: ast.literal_eval(str(raw))
)
docs_final['medical_specialty'] = parsed_specialty

In [63]:
# Sorted distinct specialty names, one row per specialty after exploding the lists.
exploded_specialties = docs_final.explode('medical_specialty')
medical_specialties = exploded_specialties.medical_specialty.value_counts().index.sort_values()

# 'Všechny obory' goes first, followed by the individual specialties.
all_fields_row = pd.DataFrame(['Všechny obory'], columns=['medical_specialty_name'])
specialty_rows = pd.DataFrame(medical_specialties, columns=['medical_specialty_name'])

# NOTE(review): both frames start at index label 0, so the written index column
# contains 0 twice - confirm whether a unique id (ignore_index=True) is wanted.
md_df = pd.concat([all_fields_row, specialty_rows])
md_df.to_csv(PATH_FINAL+'medical_specialty.csv')

## References

#### SELENIUM
https://www.youtube.com/watch?v=b5jt2bhSeXs&ab_channel=TechWithTim

https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webelement

https://stackoverflow.com/questions/58872451/how-can-i-bypass-the-google-captcha-with-selenium-and-python