# ČLK scraping
The aim of this notebook is to scrape data from the [ČLK](https://www.lkcr.cz/seznam-lekaru-426.html) website to obtain current data about doctors in the Czech Republic.

In [558]:
# When True, the field/district option lists are re-scraped from the website;
# when False, they are loaded from the cached .npy files.
download = False
# NOTE(review): only referenced from commented-out code in this notebook.
run_parallel = False

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options


from itertools import islice
from fake_useragent import UserAgent
from joblib import Parallel, delayed

import time
import pandas as pd
import numpy as np
import re
import json
import warnings
import json
import random
import string
import unidecode

warnings.filterwarnings('ignore')


In [3]:
# Path to the chromedriver executable handed to webdriver.Chrome.
# NOTE(review): absolute, machine-specific path - adjust it for your environment.
PATH_CHROME = '/home/gary/Apps/chromedriver'
# Directory for cached / intermediate artefacts (.npy, .txt, .csv, .json files).
PATH_INTERMEDIATE = '../../data/intermediate/'

## Get fields and districts
Obtain all fields and districts available on the website.

In [4]:
def get_fields_districts_dicts():
    '''
     Scrape the options of the field and district select boxes of the CLK search form
     and cache them as .npy files in PATH_INTERMEDIATE (the location they are loaded from
     when `download` is False).
     Returns:
         d_fields: dict, option text -> option value of the 'filterObor' select
         d_districts: dict, option text -> option value of the last 'filterOkresId' element
    '''
    def options_to_dict(select_element):
        # name:value of every option; options with an empty text are skipped
        return {
            option.text: option.get_attribute("value")
            for option in select_element.find_elements(By.TAG_NAME, "option")
            if option.text
        }

    driver = webdriver.Chrome(PATH_CHROME)
    try:
        driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

        ## dict of fields - name:value
        d_fields = options_to_dict(driver.find_element(By.NAME, 'filterObor'))

        ## dict of districts - name:value (the last element with this name is used)
        d_districts = options_to_dict(driver.find_elements(By.NAME, 'filterOkresId')[-1])
    finally:
        # shut the browser down even when the page layout changed and a lookup failed
        driver.quit()

    np.save(PATH_INTERMEDIATE+'dict_districts.npy', d_districts)
    np.save(PATH_INTERMEDIATE+'dict_fields.npy', d_fields)
    return d_fields, d_districts

In [560]:
# Either re-scrape the option lists or load the cached copies from PATH_INTERMEDIATE.
if download:
    d_fields, d_districts = get_fields_districts_dicts()
else:
    # The dicts are unwrapped from the loaded arrays with .item().
    # allow_pickle enables unpickling - only load files you trust.
    # NOTE(review): 'TRUE' is just a truthy string; a plain True would be clearer.
    d_districts = np.load(PATH_INTERMEDIATE+'dict_districts.npy',allow_pickle='TRUE').item()
    d_fields = np.load(PATH_INTERMEDIATE+'dict_fields.npy',allow_pickle='TRUE').item()

## Google authentication

In [None]:
# # Not necessary to run this cell

# print('Gmail username and password')
# gmailId, passWord = map(str, input().split())
# try:
#     driver = webdriver.Chrome(PATH_CHROME)
#     driver.get(r'https://accounts.google.com/signin/v2/identifier?continue='+\
#     'https%3A%2F%2Fmail.google.com%2Fmail%2F&service=mail&sacu=1&rip=1'+\
#     '&flowName=GlifWebSignIn&flowEntry = ServiceLogin')
#     driver.implicitly_wait(15)
  
#     loginBox = driver.find_element_by_xpath('//*[@id ="identifierId"]')
#     loginBox.send_keys(gmailId)
  
#     nextButton = driver.find_elements_by_xpath('//*[@id ="identifierNext"]')
#     nextButton[0].click()
  
#     passWordBox = driver.find_element_by_xpath(
#         '//*[@id ="password"]/div[1]/div / div[1]/input')
#     passWordBox.send_keys(passWord)
  
#     nextButton = driver.find_elements_by_xpath('//*[@id ="passwordNext"]')
#     nextButton[0].click()
  
#     print('Login OK')
# except:
#     print('Login Failed')

## Obtain links

In [302]:
# # Uncomment when run for the first time
# # stores all retrieved records
# total_info = []

In [288]:
def set_viewport_size(driver, width, height):
    '''
        Resize the browser window so that the page area (viewport) gets the requested
        width and height. The size of the window decorations is measured in the browser
        and added to the requested values. It can help to bypass CAPTCHA
    '''
    measure_script = """
        return [window.outerWidth - window.innerWidth + arguments[0],
          window.outerHeight - window.innerHeight + arguments[1]];
        """
    # [outer width, outer height] needed for the requested inner size
    outer_size = driver.execute_script(measure_script, width, height)
    driver.set_window_size(*outer_size)

    

def chunks(d, SIZE=10):
    '''
     Split dictionary into chunks of SIZE
     Input:
         d: dict to split
         SIZE: maximal number of items in one chunk (must be a positive integer)
     Yields:
         dicts with at most SIZE items, keeping the key order of d
    '''
    # iterate over the keys of the argument itself (not over a global variable)
    it = iter(d)
    for _ in range(0, len(d), SIZE):
        yield {k:d[k] for k in islice(it, SIZE)}
            

def save_progress(processed_letters, finished_letters):
    '''
     Store info about processed substrings in PATH_INTERMEDIATE.
     Input:
         processed_letters: saved with np.save as 'processed_letters.npy'
         finished_letters: its str() representation is written to 'finished_letters.txt'
    '''
    finished_path = PATH_INTERMEDIATE+'finished_letters.txt'
    processed_path = PATH_INTERMEDIATE+'processed_letters.npy'

    with open(finished_path,'w') as out:
        out.write(str(finished_letters))

    np.save(processed_path, processed_letters)

def load_progress():
    '''
     Read back the progress files stored in PATH_INTERMEDIATE.
     Returns:
         processed_letters: content of 'processed_letters.npy' (already processed substrings)
         finished_letters: literal parsed from 'finished_letters.txt'
                           (completely processed first letters from the alphabet)
    '''
    import ast

    finished_path = PATH_INTERMEDIATE+'finished_letters.txt'
    processed_path = PATH_INTERMEDIATE+'processed_letters.npy'

    # the text file holds a Python literal, literal_eval turns it back into an object
    with open(finished_path,'r') as src:
        raw_finished = src.read()
    finished_letters = ast.literal_eval(raw_finished)

    # .item() unwraps the stored object from the loaded array
    stored = np.load(processed_path,allow_pickle='TRUE')
    processed_letters = stored.item()

    return processed_letters, finished_letters

    
def get_surname(full_name):
    '''
     Extract the surname from a full name that may carry academic titles.
     Titles written after the name are separated from it by a comma
     (e.g. 'MUDr. Jan Novák, Ph.D.'), so the surname is the last word before the first comma.
     Input:
         full_name: str
     Returns:
         str, the surname; empty string when there is no word before the first comma
    '''
    # drop everything from the first comma on, then take the last remaining word;
    # this also covers a space before the comma and several titles after the name
    words = full_name.split(',')[0].split()
    return words[-1] if words else ''
    
## ------------------------------------------------------           
def obtain_links(d_districts, d_fields):
    '''
     Obtain urls by searching every district x field combination.
     A new browser with a random user agent and window size is started for each combination.
     The run is best-effort: when an error or a keyboard interrupt occurs, it is reported
     and the records collected so far are returned.
     Input:
         d_districts: dict, district name -> value of the 'filterOkresId' option
         d_fields: dict, field name -> value of the 'filterObor' option
     Return:
         l_info: list of [url, district name, field name, *text lines of the record]
    '''
    l_info = [] # general info about doctors
    current = '' # combination being processed, used in the error report

    try:
        # iterate over districts
        for district_name, district_id in d_districts.items():

            # iterate over fields
            for field_name, field_id in d_fields.items():
                current = f'{district_name} / {field_name}'

                # https://stackoverflow.com/questions/58872451/how-can-i-bypass-the-google-captcha-with-selenium-and-python
                # set fake agent
                options = Options()
                user_agent = UserAgent().random
                options.add_argument(f'user-agent={user_agent}')
                # TODO check user_agent's validity 
                driver = webdriver.Chrome(executable_path=PATH_CHROME, chrome_options=options)

                # the browser is shut down even if this combination fails
                try:
                    # mouseover actions
                    action_chains = ActionChains(driver)

                    # randomly change size of the webpage
                    set_viewport_size(driver, random.randrange(950, 1300), random.randrange(600, 800))

                    driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

                    time.sleep(1)

                    # mouseover to reject cookies
                    reject = driver.find_element(By.CLASS_NAME, 'cc-nb-reject')
                    ActionChains(driver).move_to_element(reject).click().perform()

                    # select district
                    wait = WebDriverWait(driver, 2)
                    select = wait.until(EC.element_to_be_clickable((By.XPATH, "//select[@name='filterOkresId']")))
                    action_chains.move_to_element(select).click().perform()
                    time.sleep(1)

                    Select(select).select_by_value(district_id)

                    # select field of medicine
                    select = wait.until(EC.element_to_be_clickable((By.NAME, "filterObor")))
                    action_chains.move_to_element(select).click().perform()
                    time.sleep(1)

                    Select(select).select_by_value(field_id)

                    # Confirm chosen options and search
                    wait.until(EC.element_to_be_clickable((By.NAME, "do[findLekar]=1")))
                    time.sleep(.5)
                    search = driver.find_element(By.NAME, 'do[findLekar]=1')
                    search.send_keys(Keys.RETURN)

                    time.sleep(4)

                    # Page counter
                    counter = 0
                    while True:
                        # Stopping criteria: neither the expected record range nor the
                        # "next page" label is present in the current page
                        # NOTE(review): obtain_links_by_name looks for the HTML-escaped label
                        # 'Další&nbsp;&gt;&gt;' instead - confirm which form page_source contains
                        next_page_text = f'{counter*20+1}-{counter*20+20}'
                        page_source = driver.page_source
                        if next_page_text not in page_source and 'Další >>' not in page_source:
                            break

                        driver.get(f'https://www.lkcr.cz/seznam-lekaru-426.html?paging.pageNo={counter}')
                        main = WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "form-clk"))
                        )
                        doc_list = main.find_element(by=By.CLASS_NAME, value='seznam-lekaru.item-list')

                        # the first 'item' element is skipped, the last text line of a record is dropped
                        for item in doc_list.find_elements(by=By.CLASS_NAME, value='item')[1:]:
                            info = item.text.split('\n')[:-1]
                            link = item.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
                            l_info.append([link, district_name, field_name] + info)

                        # next page
                        counter += 1
                finally:
                    driver.quit()

    except (Exception, KeyboardInterrupt) as e:
        # report the failure instead of hiding it; partial results are still returned
        print(f'[{current}] Error occured:', repr(e))

    return l_info



In [None]:
# run_parallel = False

# if not run_parallel:
#     ## Sequential run
#     x = obtain_links(d_districts, d_fields)
#     total_info.append(x)

# else:
#     ## Parallel run
#     n_chunks = 10

#     dist_chunks = list(chunks(d_districts, round(len(d_districts)/n_chunks)))
#     key = 'alergologie a klinická imunologie'

#     res_list = Parallel(n_jobs=n_chunks)(delayed(obtain_links)(dist_chunks[x], {key:d_fields[key]}) for x in range(n_chunks))
#     total_info += res_list


In [None]:
# # Check how many pages the letter has (if >= 50 need to divide into smaller chunks)
# # processed_letters = dict()
# processed_letters, finished_letters = load_progress()

In [400]:
def obtain_links_by_name(prefix='', start_pos=0, end_pos=None):
    ## Search by name
    '''
     Obtain urls by surnames' prefix. The search string is prefix + letter + '%' for every
     letter of the ASCII alphabet between start_pos and end_pos (both inclusive,
     0 = a, 4 = e etc.); end_pos=None means up to the end of the alphabet.
     The run is best-effort: when an error or a keyboard interrupt occurs, it is reported
     and the results collected so far are returned.
     Return:
         l_info: scrapped info, list of [url, *text lines of the record]
         processed_letters: dict with info how many pages prefixes have
    '''

    l_info = [] # general info about doctors
    processed_letters = dict()

    letters = string.ascii_lowercase[start_pos:]
    # explicit None check so that end_pos=0 (only the letter 'a') is honoured
    if end_pos is not None:
        letters = letters[:end_pos-start_pos+1]

    current = prefix # search prefix being processed, used in the error report

    try:

        for letter in letters:
            current = prefix + letter
            driver = webdriver.Chrome(executable_path=PATH_CHROME)

            # the browser is shut down even if this letter fails
            try:
                # Randomly change size of the webpage
                set_viewport_size(driver, random.randrange(950, 1300), random.randrange(650, 900))

                driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

                time.sleep(1)

                # reject cookies; best effort, the search goes on when the button cannot be clicked
                try:
                    driver.find_element(By.CLASS_NAME, 'cc-nb-reject').click()
                except Exception:
                    pass

                search = driver.find_element(By.NAME, 'filterPrijmeni')
                search.send_keys(current +'%')
                search.send_keys(Keys.RETURN)

                time.sleep(5)

                # Page counter
                counter = 0
                while True:
                    # Stopping criteria: neither the expected record range nor the
                    # "next page" label is present in the current page
                    next_page_text = f'{counter*20+1}-{counter*20+20}'
                    page_source = driver.page_source
                    if not (next_page_text in page_source or 'Další&nbsp;&gt;&gt;' in page_source):
                        break

                    driver.get(f'https://www.lkcr.cz/seznam-lekaru-426.html?paging.pageNo={counter}')
                    main = WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "form-clk"))
                    )
                    doc_list = main.find_element(by=By.CLASS_NAME, value='seznam-lekaru.item-list')

                    # the first 'item' element is skipped, the last text line of a record is dropped
                    for item in doc_list.find_elements(by=By.CLASS_NAME, value='item')[1:]:
                        info = item.text.split('\n')[:-1]
                        link = item.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
                        l_info.append([link] + info)

                    # Next page
                    counter += 1

                processed_letters[current] = counter
            finally:
                driver.quit()

    except (Exception, KeyboardInterrupt) as e:
        print(f'[{current}] Error occured:', repr(e))

    return l_info, processed_letters
    


In [561]:
# letters = doctors.surname.apply(lambda x: x[0]).unique()
# letters.sort()

# for letter in letters:
#     recs = doctors[doctors.surname.apply(lambda x: x[0] == letter)].sort_values('surname')
#     n_recs = recs.shape[0]
#     last_rec = recs.iloc[-1, 1] #tail(1).surname
#     if not letter in finished_letters:
#         print(f'[{letter}] total: {n_recs}, {last_rec}')

In [None]:
# Surname prefix to process; every searched string is prefix + one letter.
prefix = ''

if prefix not in finished_letters:
    # start_pos=8, end_pos=8 limits the run to a single following letter ('i')
    l_info, new_processed_letters = obtain_links_by_name(prefix, start_pos=8, end_pos=8)

    total_info += l_info
    processed_letters = {**processed_letters, **new_processed_letters}
#     print(total_info[-1])
#     print(processed_letters)

else:
    # only the prefix is known here; no single letter is being processed in this branch
    print(f'Prefix [{prefix}] already finished.')


In [542]:
# print(processed_letters)
# finished_letters = set()

# for letter, pages in processed_letters.items():
#     if 0 < pages < 50:
#         finished_letters.add(letter)
        
# Mark the current prefix as completely processed.
# NOTE(review): this happens unconditionally, even if the search cell reported an error - confirm before saving.
finished_letters.add(prefix)

print(finished_letters)


## SAVE PROGRESS (writes finished_letters.txt and processed_letters.npy)
save_progress(processed_letters, finished_letters)
# processed_letters, finished_letters = load_progress()

{'s', 'a', 'b', 'o', 'i', 'x', 'v', 'l', 'm', 'j', 'g', 't', 'ch', 'u', 'z', 'r', 'f', 'c', 'q', 'd', 'n', 'k', 'e', 'w', 'y', 'p', 'h'}


In [548]:
# Every ASCII lowercase letter has to be marked as finished; the missing ones are shown on failure.
missing_letters = set(string.ascii_lowercase).difference(finished_letters)
assert not missing_letters, missing_letters

In [553]:
def parse_info(total_info):
    '''
     Parses values from list and converts it into the pd.DataFrame
     Input:
         total_info: list of records [url, name, workplace lines...]; records may differ in length
     Returns: pd.DataFrame with columns url, name, workplace1..workplaceN and surname
              (surname is transliterated to ASCII and lower-cased)
    '''

    # columns: the longest record determines the number of workplace columns;
    # default=2 keeps an empty input working (frame with url, name and surname only)
    n_cols = max(map(len, total_info), default=2)
    columns = ['url', 'name', *[f'workplace{x}' for x in range(1, n_cols-1)]]
    doctors = pd.DataFrame(total_info, columns=columns)

    # surname
    doctors['surname'] = doctors.name.apply(get_surname)
    doctors.surname = doctors.surname.apply(unidecode.unidecode)
    doctors.surname = doctors.surname.apply(lambda x: x.lower())

    # manual corrections of known records
    # the name has a space before the comma; the value is stored in the same
    # normalised form (ASCII, lower case) as the rest of the column
    doctors.loc[doctors.name == 'MUDr. Karin Boušová , Ph.D.', 'surname'] = 'bousova'

    # record that was scraped without a url
    doctors.loc[(doctors.name == 'Nedal M. H. Abuasad') & (doctors.url.isna()), 'url'] = 'https://www.lkcr.cz/seznam-lekaru?filterId=MTE2MDU4NTE4NSwsTmVkYWwgTS4gSC4sLEFidWFzYWQ%3D&do[load]=1'

    return doctors

doctors = parse_info(total_info)
# save
# NOTE(review): mode='a' with header=False appends rows without a header line, so re-running
# this cell appends the same rows again, and the file has to contain a header already for
# the following read_csv (default header handling) to find the 'url' column - confirm.
doctors.to_csv(PATH_INTERMEDIATE+ 'doctors_alphabetical.csv', index=False, header=False, mode='a')


In [554]:
## keep one row per url (the last occurrence wins) and store the result in a separate file
doctors_no_dupl = (
    pd.read_csv(PATH_INTERMEDIATE + 'doctors_alphabetical.csv')
    .drop_duplicates('url', keep='last')
)
doctors_no_dupl.to_csv(PATH_INTERMEDIATE + 'doctors_alphabetical_nd.csv', index=False)

((56149, 5), (62242, 5))

In [570]:
# Size of the district x field grid versus the number of processed surname prefixes
n_district_field = len(d_districts) * len(d_fields)
n_prefixes = len(processed_letters)
print('Districts x fields: ', n_district_field, 'x')
print('Alphabet:', n_prefixes, 'x')

Districts x fields:  9009 x
Alphabet: 492 x


In [571]:
# Display the parsed records (DataFrame returned by parse_info)
doctors

Unnamed: 0,url,name,workplace1,workplace2,surname
0,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,MUDr. Miroslav Baader,Liberec 1,,baader
1,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Jana Baarová,Kopřivnice 1,,baarova
2,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Monika Baarová,Hradec Králové,,baarova
3,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Věra Baarová,Havířov 1,,baarova
4,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Vladimíra Baarová,Klatovy,,baarova
...,...,...,...,...,...
62237,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Marie Mizerová,Olomouc 9,,mizerova
62238,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,MUDr. Jan Mizner,Praha 10,,mizner
62239,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,MUDr. Petr Mizner,Praha 5,,mizner
62240,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,MUDr. Barbora Miznerová,Opava 5,,miznerova


## Scrape obtained links

In [133]:
def save_to_json(rec, name):
    '''
        Serialise rec as JSON into the file PATH_INTERMEDIATE + name + '.json'.
        An existing file is overwritten.
        Input[rec]: JSON serialisable object
        Input[name]: str, file name without the extension
    '''
    target_path = PATH_INTERMEDIATE + name + '.json'
    with open(target_path, "w") as json_file:
        json.dump(rec, json_file)
        
## ------------------------------------------

def _parse_table_rows(table):
    '''
        Convert the rows of one table into a dict.
        Key is the text of the first cell, value the text of the second cell
        (a list of lines when the text has more than one line).
        Rows with fewer than two cells are skipped.
        Input[table]: WebElement
        Output[d_rows]: dict
    '''
    d_rows = dict()
    for row in table.find_elements(By.CSS_SELECTOR, 'tr'):
        cells = row.find_elements(By.CSS_SELECTOR, 'td')
        if len(cells) < 2:
            # no key/value pair in this row
            continue
        values = cells[1].text.split('\n')
        d_rows[cells[0].text] = values if len(values)>1 else values[0]
    return d_rows


def parse_doctor_tables(tables):
    '''
        Parse tables in records' url
        Input[tables]: list of WebElements; the first two hold the general info,
                       every following one describes a workplace
        Output[d_detail]: dict with 'Evidenční číslo', the rows of the first two tables
                          and 'Pracoviště' (list of dicts, one per workplace table)
    '''
    d_detail = dict()

    # evidence number - last word of the 'evidencni-cislo' element in the first table
    ev_num = tables[0].find_element(by=By.CLASS_NAME, value='evidencni-cislo').text.split()[-1]
    d_detail['Evidenční číslo'] = ev_num

    ## first + second table
    for table in tables[:2]:
        d_detail.update(_parse_table_rows(table))

    ## 3+ table (workplaces)
    d_detail['Pracoviště'] = [_parse_table_rows(workplace) for workplace in tables[2:]]

    return d_detail

## ------------------------------------------
def remove_asterisks(tables):
    '''
        Keep only the tables whose text does not contain an asterisk.
        Input[tables]: iterable of elements with a `text` attribute
        Output: list of the kept elements in the original order
    '''
    return [table for table in tables if '*' not in table.text]

def get_doctor_detail(url, driver):
    '''
        Return detail info about doctors.
        Errors are reported and do not stop the caller: whatever was parsed before
        the failure is returned, i.e. an empty dict when nothing could be parsed.
        Input[url]: str, CLK url
        Input[driver]: webdriver used to open the url (it is left open)
        Output[d_detail]: dict, info about doctor
    '''
    # defined before the try block so that there is always a dict to return
    d_detail = dict()

    try:
        driver.get(url)

        # load main content
        main = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "detail-lekare"))
        )

        # basic info
        tables = main.find_elements(by=By.CLASS_NAME, value='text-box-lekar')
        tables = remove_asterisks(tables)
        d_detail = parse_doctor_tables(tables)

        # name
        name = main.find_element(by=By.CLASS_NAME, value='jmeno-lekare').text
        d_detail['Jméno'] = name

    except Exception as e:
        print(f'[{url}] Error occured:', e)

    return d_detail

In [125]:
# l_doctors = []

In [131]:
## Loop for downloading websites
# keep the records of a previous (interrupted) run, otherwise start with an empty list
try:
    l_doctors
except NameError:
    l_doctors = []

cnt = 1
n_url = doctors_no_dupl.url.nunique()

# the browser is created before the try block, so the clean-up below
# never refers to a driver that failed to start
driver = webdriver.Chrome(PATH_CHROME)
try:
    driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

    for url in doctors_no_dupl.url.unique():
        # progress: a dot for every 500 urls, the running total for every 5000 urls
        if cnt % 500 == 0:
            print('.', end='')
        if cnt % 5000 == 0:
            print(f' {cnt}/{n_url}')
        cnt += 1

        d_detail = get_doctor_detail(url, driver)
        d_detail['url'] = url
        l_doctors.append(d_detail)
finally:
    print(' Finished!')
    driver.quit()

.......... 1000/17989
.......... 2000/17989
.......... 3000/17989
.......... 4000/17989
.......... 5000/17989
.......... 6000/17989
.......... 7000/17989
.......... 8000/17989
.......... 9000/17989
.......... 10000/17989
.......... 11000/17989
.......... 12000/17989
.......... 13000/17989
.......... 14000/17989
.......... 15000/17989
.......... 16000/17989
.......... 17000/17989
.........

In [134]:
# Store all scraped detail records as 'doctors_alphabet.json' in PATH_INTERMEDIATE
save_to_json(l_doctors, 'doctors_alphabet')

In [126]:
# TODO create a package

## References

#### SELENIUM
https://www.youtube.com/watch?v=b5jt2bhSeXs&ab_channel=TechWithTim

https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webelement

https://stackoverflow.com/questions/58872451/how-can-i-bypass-the-google-captcha-with-selenium-and-python

#### SCRAPY
https://docs.scrapy.org/en/latest/topics/selectors.html

https://www.youtube.com/watch?v=s4jtkzHhLzY&ab_channel=JohnWatsonRooney

https://stackoverflow.com/questions/42947417/scrapy-extract-items-from-table

#### BASE64
https://stackoverflow.com/questions/3470546/how-do-you-decode-base64-data-in-python



#### OTHERS
https://www.edureka.co/blog/web-scraping-with-python/

https://realpython.com/beautiful-soup-web-scraper-python/

https://stackoverflow.com/questions/51007603/how-to-correctly-form-a-post-request-to-this-website-with-python-request

https://hackernoon.com/how-post-requests-with-python-make-web-scraping-easier-9i203511