# ČLK scraping
The aim of this notebook is to scrape data from the [ČLK](https://www.lkcr.cz/seznam-lekaru-426.html) website to obtain current data about doctors in the Czech Republic.

In [2]:
# Switch for the field/district lookups: True scrapes them from the website again,
# False loads the copies previously saved as .npy files in PATH_DATA.
download = True

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC

import time
import pandas as pd
import numpy as np
import re
import json
import warnings
import json
import random

warnings.filterwarnings('ignore')


In [4]:
# Location of the chromedriver executable handed to webdriver.Chrome.
# NOTE(review): absolute, machine-specific path -- adjust it to your local setup.
PATH_CHROME = '/home/gary/Apps/chromedriver'
# Relative folder where the intermediate files (.npy lookups, .json exports) are stored.
PATH_DATA = '../../data/intermediate/'

## Get fields and districts
Obtain all fields and districts available on the website.

In [5]:
def _options_to_dict(select_element):
    '''
        Map the visible text of every non-empty <option> to its value attribute.
        Input[select_element]: WebElement of a <select>
        Output: dict, option text -> option value
    '''
    return {
        option.text: option.get_attribute("value")
        for option in select_element.find_elements(By.TAG_NAME, "option")
        # placeholder options have no text and are not real choices
        if option.text
    }


def get_fields_districts_dicts():
    '''
        Scrape the field and district drop-downs of the CLK search form.

        Opens the search page in Chrome, reads the options of the 'filterObor'
        select (fields) and of the last 'filterOkresId' select (districts) and
        stores both mappings as .npy files in PATH_DATA.

        Output[d_fields]: dict, field name -> option value
        Output[d_districts]: dict, district name -> option value
    '''
    driver = webdriver.Chrome(PATH_CHROME)
    try:
        driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

        ## dict of fields - name:value
        d_fields = _options_to_dict(driver.find_element(By.NAME, 'filterObor'))

        ## dict of districts - name:value
        # more than one element may carry this name; the last one is used
        d_districts = _options_to_dict(driver.find_elements(By.NAME, 'filterOkresId')[-1])
    finally:
        # quit() ends the whole browser session, also when a lookup above fails
        driver.quit()

    np.save(PATH_DATA+'dict_districts.npy', d_districts)
    np.save(PATH_DATA+'dict_fields.npy', d_fields)
    return d_fields, d_districts

In [6]:
# Refresh the field/district lookups from the website, or reuse the copies saved on disk.
if download:
    d_fields, d_districts = get_fields_districts_dicts()
else:
    # The files hold dicts written with np.save, i.e. pickled objects: allow_pickle
    # must be the boolean True and .item() unwraps the stored object back into a dict.
    # NOTE: unpickling can execute code -- only load files you created yourself.
    d_districts = np.load(PATH_DATA+'dict_districts.npy', allow_pickle=True).item()
    d_fields = np.load(PATH_DATA+'dict_fields.npy', allow_pickle=True).item()

## Google authentication

In [7]:
# # Not necessary to run this cell

# print('Gmail username and password')
# gmailId, passWord = map(str, input().split())
# try:
#     driver = webdriver.Chrome(PATH_CHROME)
#     driver.get(r'https://accounts.google.com/signin/v2/identifier?continue='+\
#     'https%3A%2F%2Fmail.google.com%2Fmail%2F&service=mail&sacu=1&rip=1'+\
#     '&flowName=GlifWebSignIn&flowEntry = ServiceLogin')
#     driver.implicitly_wait(15)
  
#     loginBox = driver.find_element_by_xpath('//*[@id ="identifierId"]')
#     loginBox.send_keys(gmailId)
  
#     nextButton = driver.find_elements_by_xpath('//*[@id ="identifierNext"]')
#     nextButton[0].click()
  
#     passWordBox = driver.find_element_by_xpath(
#         '//*[@id ="password"]/div[1]/div / div[1]/input')
#     passWordBox.send_keys(passWord)
  
#     nextButton = driver.find_elements_by_xpath('//*[@id ="passwordNext"]')
#     nextButton[0].click()
  
#     print('Login OK')
# except:
#     print('Login Failed')

## Obtain links

In [10]:
# # lists where all the records are stored (when re-run)
# total_info = []

In [12]:
## TEST CASE (replace the code from 'try' to '# page counter' comment)
# search = driver.find_element_by_name('filterPrijmeni')
# search.send_keys('%')
# search.send_keys(Keys.RETURN)


links = [] # links to doctor's detail info
l_info = [] # general info about doctors

# `total_info` collects the records of every (re-)run of this cell. Create it on
# the first run so that the `finally` clause below cannot fail with a NameError,
# which would also hide the actual scraping error.
if 'total_info' not in globals():
    total_info = []

try:
    # iterate over districts
    for district_name, district_id in d_districts.items():
        # iterate over fields
        for field_name, field_id in d_fields.items():
            # a fresh browser is started for every district/field combination
            driver = webdriver.Chrome(PATH_CHROME)
            try:
                driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')
                # select field of medicine
                select = driver.find_element(By.NAME, 'filterObor')
                select_f = Select(select)
                # helps to avoid captcha?
                select_f.options
                select_f.select_by_value(field_id)
                time.sleep(1)


                # select district
                select = driver.find_elements(By.NAME, 'filterOkresId')[-1]
                select_d = Select(select)
                # helps to avoid captcha?
                select_d.options
                select_d.select_by_value(district_id)
                time.sleep(1)

                # "press the button" - confirm chosen options and search
#             tmp_inp = random.choice(['filterPrijmeni', 'filterJmeno'])
                search = driver.find_element(By.NAME, 'do[findLekar]=1') #tmp_int
                search.send_keys(Keys.RETURN)
                time.sleep(5)

                # page counter
                counter = 0
                while True:
                    # Stopping criteria: neither the expected record range nor
                    # the "next page" link is present in the current page
                    next_page_text = f'{counter*20+1}-{counter*20+20}'
                    print(next_page_text)
                    if next_page_text not in driver.page_source and 'Další >>' not in driver.page_source:
                        break

                    driver.get(f'https://www.lkcr.cz/seznam-lekaru-426.html?paging.pageNo={counter}')
                    main = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "form-clk"))
                    )
                    doc_list = main.find_element(by=By.CLASS_NAME, value='seznam-lekaru.item-list')

                    # the first 'item' element is skipped; each remaining one is a doctor
                    for item in doc_list.find_elements(by=By.CLASS_NAME, value='item')[1:]:
                        # drop the last text line of the item, keep name and workplaces
                        info = item.text.split('\n')[:-1]
                        link = item.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
                        info = [link, district_name, field_name] + info
                        l_info.append(info)

#                 # get all links from the page
#                 for x in doc_list.find_elements_by_css_selector('a')[::2]:
#                     links.append(x.get_property('attributes')[0]['value'])

#                 # -----------------------
#                 # Parsing part
#                 # parse table
#                 l_doctors = [re.split('\s\s+', x.lstrip().rstrip()) for x in doc_list.text.split('\n')]
#                 l_doctors = [[i for i in l if i != 'DETAIL'] for l in l_doctors]

#                 for l in l_doctors[1:]:
#                     # add new workplace (doctor has more than one)
#                     if len(l) == 1:
#                         if isinstance(l_info[-1][-1], list):
#                             l_info[-1][-1] += l
#                         else:
#                             l_info[-1][-1] = [l_info[-1][-1], *l]
#                     # only one workplace
#                     else:
#                         l_info.append(l)
#                     l_info[-1].append(district_name)
#                     l_info[-1].append(field_name)

                    # -----------------------

                    # next page
                    counter += 1
            finally:
                # always end the browser session, also when scraping fails midway
                driver.quit()

finally:
    # keep whatever was collected, even if the run was interrupted by an error
    total_info += l_info

1-20
21-40
1-20
1-20
21-40
1-20
21-40
1-20
1-20


WebDriverException: Message: chrome not reachable
  (Session info: chrome=96.0.4664.110)
Stacktrace:
#0 0x564530a5cee3 <unknown>
#1 0x56453052a49f <unknown>
#2 0x564530519244 <unknown>
#3 0x564530519c49 <unknown>
#4 0x56453051b9d2 <unknown>
#5 0x5645305140c6 <unknown>
#6 0x56453052ba02 <unknown>
#7 0x564530590ac8 <unknown>
#8 0x56453057e163 <unknown>
#9 0x564530553bfc <unknown>
#10 0x564530554c05 <unknown>
#11 0x564530a8ebaa <unknown>
#12 0x564530aa4651 <unknown>
#13 0x564530a8fb05 <unknown>
#14 0x564530aa5a68 <unknown>
#15 0x564530a8405f <unknown>
#16 0x564530ac0818 <unknown>
#17 0x564530ac0998 <unknown>
#18 0x564530adbeed <unknown>
#19 0x7fc0643a7609 <unknown>


## Scrape obtained links

In [13]:
# Rows of l_info differ in length (presumably one extra entry per additional
# workplace), so the widest row decides how many workplace columns are needed.
n_cols = max(len(row) for row in l_info)
workplace_cols = [f'workplace{x}' for x in range(1, n_cols-3)]
doctors = pd.DataFrame(
    l_info,
    columns=['url', 'district', 'field', 'name'] + workplace_cols,
)
doctors

Unnamed: 0,url,district,field,name,workplace1,workplace2
0,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,Benešov,alergologie a klinická imunologie,MUDr. Anna Koláčná,Benešov u Prahy,
1,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,Benešov,alergologie a klinická imunologie,MUDr. Jan Koláčný,Benešov u Prahy,Votice
2,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,Benešov,alergologie a klinická imunologie,MUDr. Elena Kolouchová,Benešov u Prahy,
3,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,Benešov,anesteziologie a intenzivní medicína,MUDr. Jiří Bráza,Benešov u Prahy,
4,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,Benešov,anesteziologie a intenzivní medicína,MUDr. Zbyněk Breburda,Benešov u Prahy,
5,https://www.lkcr.cz/seznam-lekaru?filterId=NTE...,Benešov,anesteziologie a intenzivní medicína,MUDr. Sylva Dolenská,Čerčany,
6,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,Benešov,anesteziologie a intenzivní medicína,MUDr. Michal Gozon,Benešov u Prahy,
7,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,Benešov,anesteziologie a intenzivní medicína,MUDr. Josef Hanáček,Benešov u Prahy,
8,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,Benešov,anesteziologie a intenzivní medicína,MUDr. Lukáš Lörinc,Benešov u Prahy,
9,https://www.lkcr.cz/seznam-lekaru?filterId=MTE...,Benešov,anesteziologie a intenzivní medicína,MUDr. Stanislav Maršík,Benešov u Prahy,


In [22]:
def save_to_json(rec, name):
    '''
        Write a record to a JSON file in the PATH_DATA folder.
        Input[rec]: JSON-serialisable object
        Input[name]: str, file name without the '.json' extension
    '''
    target = PATH_DATA + name + '.json'
    with open(target, "w") as json_file:
        json.dump(rec, json_file)
        
## ------------------------------------------

def _parse_table_rows(table):
    '''
        Turn the key/value rows of one detail table into a dict.
        Input[table]: WebElement containing <tr> rows with <td> cells
        Output[d_rows]: dict, text of the first cell -> text of the second cell
                        (a list of lines when the cell has more than one line)
    '''
    d_rows = dict()
    for row in table.find_elements(By.CSS_SELECTOR, 'tr'):
        cells = row.find_elements(By.CSS_SELECTOR, 'td')
        # a row without a key and a value cell carries no data; skip it
        # instead of failing with an IndexError
        if len(cells) < 2:
            continue
        lines = cells[1].text.split('\n')
        d_rows[cells[0].text] = lines if len(lines) > 1 else lines[0]
    return d_rows


def parse_doctor_tables(tables):
    '''
        Parse tables in records' url
        Input[tables]: list of WebElement, the tables of one doctor's detail page;
                       must not be empty (the first table is always accessed)
        Output[d_detail]: dict with the evidence number, the rows of the first
                          two tables and the list of workplaces ('Pracoviště')
    '''
    d_detail = dict()

    # Evidence number: last whitespace-separated token of the element's text
    ev_num = tables[0].find_element(By.CLASS_NAME, 'evidencni-cislo').text.split()[-1]
    d_detail['Evidenční číslo'] = ev_num

    ## first + second table: their rows go straight into the record
    for table in tables[:2]:
        d_detail.update(_parse_table_rows(table))

    ## 3+ table: every further table is stored as one workplace
    d_detail['Pracoviště'] = [_parse_table_rows(workplace) for workplace in tables[2:]]

    return d_detail

#     d_detail = dict()
    
#     for l in tables:
#         table = l.find_elements_by_css_selector('tr')
#         d_table = dict()

#         if '*' in table[0].text:
#             continue

#         for row in table[1:]:

#             if '*' in row. text:
#                 continue
            
#             info = [x.strip() for x in row.text.split(":")]
#             print(row.text)
#             # append new value for key from last cycle (eg if a doctor has more than one field)
#             if len(info) == 1:
#                 last_values = d_table[last_key]
#                 d_table[last_key] = last_values + info

#             # save to the dict
#             else:
#                 last_key = info[0]
#                 d_table[last_key] = info[1:]

#         # if dict d_table is not empty
#         if d_table:
#             for x, y in d_table.items():
#                 if len(y) == 1:
#                     d_table[x] = y[0]

#             key = table[0].text if table[0].text else 'INFORMACE'
#             key = 'PRACOVIŠTĚ' if 'PRACOVIŠTĚ' in key else key
            
#             if key in d_detail:
#                 val = d_detail[key]
#                 if isinstance(val, list):
#                     d_detail[key] = val.append(d_table)
#                 else:
#                     d_detail[key] = [val, d_table]
#             else:
#                 d_detail[key] = d_table

#     return d_detail

## ------------------------------------------

def get_doctor_detail(url):
    '''
        Return detail info about doctors.
        Input[url]: str, CLK url
        Output[d_detail]: dict, info about doctor

        Any error raised while loading or parsing the page is propagated to
        the caller; the browser session is ended in either case.
    '''
    # created outside the try block so the cleanup never touches an unbound name
    driver = webdriver.Chrome(PATH_CHROME)
    try:
        driver.get(url)

        # due to the bug in the opening
        driver.refresh()

        # load main content (waits up to 10 seconds for it to appear)
        main = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "detail-lekare"))
        )

        # basic info
        tables = main.find_elements(by=By.CLASS_NAME, value='text-box-lekar')
        d_detail = parse_doctor_tables(tables)

        # name
        name = main.find_element(by=By.CLASS_NAME, value='jmeno-lekare').text
        d_detail['Jméno'] = name

        # returning here (not in `finally`) keeps exceptions from being swallowed
        return d_detail
    finally:
        driver.quit()

In [23]:
## Download the detail page of every distinct doctor url
l_doctors = []
for url in doctors['url'].unique():
    # progress log: the url currently being processed
    print(url)
    d_detail = get_doctor_detail(url)
    l_doctors.append(d_detail)

https://www.lkcr.cz/seznam-lekaru?filterId=NTE0NzY2NTE3NywsQW5uYSwsS29sw6HEjW7DoQ%3D%3D&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=MTExNjEyMDE0OSwsSmFuLCxLb2zDocSNbsO9&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=NTEyNjM2MTE1NywsRWxlbmEsLEtvbG91Y2hvdsOh&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=MTExMDMwNzE0MiwsSmnFmcOtLCxCcsOhemE%3D&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=MTE1MjgxNzE3OCwsWmJ5bsSbaywsQnJlYnVyZGE%3D&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=NTE2MjYyODE1NSwsU3lsdmEsLERvbGVuc2vDoQ%3D%3D&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=MTE0NjU2MTE3NiwsTWljaGFsLCxHb3pvbg%3D%3D&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=MTEzNjIwMDE2NywsSm9zZWYsLEhhbsOhxI1law%3D%3D&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=MTE1NTcyNTE4MywsTHVrw6HFoSwsTMO2cmluYw%3D%3D&do[load]=1
https://www.lkcr.cz/seznam-lekaru?filterId=MTExOTI1MzE1MiwsU3RhbmlzbGF2LCxNYXLFocOtaw%3D%3D&do[load]=1
https://www.lkcr.cz/seznam-lek

In [24]:
# Display the detail records collected by the download loop.
l_doctors

[{'Evidenční číslo': '5147665177',
  'Vysoká škola': '3. LF Univerzity Karlovy v Praze',
  'Rok promoce': '2001',
  'Dosažená odbornost': ['alergologie a klinická imunologie',
   'dětské lékařství'],
  'Diplom celoživotního vzdělávání': 'ano',
  'K výkonu soukromé praxe a lektora v oboru': ['alergologie a klinická imunologie',
   'dětské lékařství'],
  'Pracoviště': [{'Název zdravotnického zařízení:': 'AKA, s.r.o.',
    'Název pracoviště:': 'pediatrie, alergologie a imunologie',
    'Adresa pracoviště:': 'Červené Vršky, 25601 Benešov u Prahy'}],
  'Jméno': 'MUDr. Anna Koláčná'},
 {'Evidenční číslo': '1116120149',
  'Vysoká škola': '3. LF Univerzity Karlovy v Praze',
  'Rok promoce': '1974',
  'Dosažená odbornost': ['dorostové lékařství',
   'lékařská imunologie',
   'pediatrie'],
  'Diplom celoživotního vzdělávání': 'ano',
  'K výkonu soukromé praxe a lektora v oboru': ['dorostové lékařství',
   'lékařská imunologie',
   'pediatrie',
   'praktické lékařství pro děti a dorost'],
  'Prac

In [25]:
# Store the collected detail records as 'doctors_sample.json' in PATH_DATA.
save_to_json(l_doctors, 'doctors_sample')

In [126]:
# TODO merge with d_detail and doctors

## References

#### SELENIUM
https://www.youtube.com/watch?v=b5jt2bhSeXs&ab_channel=TechWithTim

https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webelement

https://stackoverflow.com/questions/58872451/how-can-i-bypass-the-google-captcha-with-selenium-and-python

#### SCRAPY
https://docs.scrapy.org/en/latest/topics/selectors.html

https://www.youtube.com/watch?v=s4jtkzHhLzY&ab_channel=JohnWatsonRooney

https://stackoverflow.com/questions/42947417/scrapy-extract-items-from-table

#### BASE64
https://stackoverflow.com/questions/3470546/how-do-you-decode-base64-data-in-python



#### OTHERS
https://www.edureka.co/blog/web-scraping-with-python/

https://realpython.com/beautiful-soup-web-scraper-python/

https://stackoverflow.com/questions/51007603/how-to-correctly-form-a-post-request-to-this-website-with-python-request

https://hackernoon.com/how-post-requests-with-python-make-web-scraping-easier-9i203511