# ČLK scraping
The aim of this notebook is to scrap data from [ČLK](https://www.lkcr.cz/seznam-lekaru-426.html) website to obtain current data about doctors in Czech republic 

In [2]:
download = False

In [3]:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC

import time
import pandas as pd
import numpy as np
import re
import json
import warnings
warnings.filterwarnings('ignore')


In [4]:
PATH_CHROME = '/home/gary/Apps/chromedriver'
PATH_DATA = '../../data/intermediate/'

## Get fields and districts
Retrieve all field and districts available on the website

In [5]:
def get_fields_districts_dicts():
    driver = webdriver.Chrome(PATH_CHROME)
    driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

    ## dict of fields - name:value
    d_fields = dict()

    filterObor = driver.find_element_by_name('filterObor')
    options = [x for x in filterObor.find_elements_by_tag_name("option")]

    for element in options:
        if element.text:
            d_fields[element.text] = element.get_attribute("value")

    ## dict of districts - name:value
    d_districts = dict()

    filterOkresId = driver.find_elements_by_name('filterOkresId')[-1]

    options = [x for x in filterOkresId.find_elements_by_tag_name("option")]

    for element in options:
        if element.text:
            d_districts[element.text] = element.get_attribute("value")

    driver.close()
    
    np.save(PATH_DATA+'dict_districts.npy', d_districts) 
    np.save(PATH_DATA+'dict_fields.npy', d_fields) 
    return d_fields, d_districts

In [6]:
if download:
    d_fields, d_districts = get_fields_districts_dicts()
else:
    d_districts = np.load(PATH_DATA+'dict_districts.npy',allow_pickle='TRUE').item()
    d_fields = np.load(PATH_DATA+'dict_fields.npy',allow_pickle='TRUE').item()

## Authentication

In [7]:
# TODO necessary?

## Get links

In [15]:
driver = webdriver.Chrome(PATH_CHROME)

### 
# DOWNLOAD PART
###

driver.get('https://www.lkcr.cz/seznam-lekaru-426.html#seznam')

## TEST CASE
# search = driver.find_element_by_name('filterPrijmeni')
# search.send_keys('%')
# search.send_keys(Keys.RETURN)


links = [] # links to doctor's detail info
l_info = [] # general info about doctors

try:
    # iterate over districts
    for district_name, district_id in d_districts.items():
        # iterate over fields
        for field_name, field_id in d_fields.items():

            # select field of medicine
            select = driver.find_element_by_name('filterObor')
            select_f = Select(select)
            # helps to avoid captcha?
            select_f.options
            select_f.select_by_value(field_id)
            time.sleep(1)


            # select district
            select = driver.find_elements_by_name('filterOkresId')[-1]
            select_d = Select(select)
            # helps to avoid captcha?
            select_d.options
            select_d.select_by_value(district_id)
            time.sleep(1)

            # "press the button" - confirm chosen options and search
            search = driver.find_element_by_name('filterPrijmeni')
            search.send_keys(Keys.RETURN)
            time.sleep(5)

            # page counter
            counter = 0
            while True: 
                # TODO vylepsit, konci na 10 strance
                # stopping criterion
                next_page_text = f'{counter*20+1}-{counter*20+20}'
                if not next_page_text in driver.page_source and not 'Další >>' in driver.page_source:
                    break

                driver.get(f'https://www.lkcr.cz/seznam-lekaru-426.html?paging.pageNo={counter}')
                main = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "main"))
                )
                seznam2 = main.find_element(by=By.CLASS_NAME, value='seznam2')

                # get all links from the page
                for x in seznam2.find_elements_by_css_selector('a')[::2]:
                    links.append(x.get_property('attributes')[0]['value'])

                # parse table
                l_doctors = [re.split('\s\s+', x.lstrip().rstrip()) for x in seznam2.text.split('\n')]
                l_doctors = [[i for i in l if i != 'DETAIL'] for l in l_doctors]

                for l in l_doctors[1:]:
                    # add new workplace (doctor has more than one)
                    if len(l) == 1:
                        if isinstance(l_info[-1][-1], list):
                            l_info[-1][-1] += l
                        else:
                            l_info[-1][-1] = [l_info[-1][-1], *l]
                    # only one workplace
                    else:
                        l_info.append(l)

                # next page
                counter += 1

            # Append correct link to records
            for i, l in enumerate(l_info):
                l_info[i].append(links[i])
                l_info[i].append(district_name)
                l_info[i].append(field_name)
    
    # helps to avoid captcha?    
    time.sleep(5)

finally:
#     pass
    driver.close()


## Scrap obtained links

In [21]:
# TODO 
doctors = pd.DataFrame(l_info, columns=['name', 'workplace', 'url', 'district', 'field'])
doctors['url'] = 'https://www.lkcr.cz' + doctors['url']
doctors.head()

Unnamed: 0,name,workplace,url,district,field
0,Ahmed Abdalhafez,Nymburk 2,https://www.lkcr.cz/seznam-lekaru-426.html?fil...,Benešov,dětská dermatovenerologie
1,MUDr. Abdallah Abdallah,neuvedena,https://www.lkcr.cz/seznam-lekaru-426.html?fil...,Benešov,dětská dermatovenerologie
2,MUDr. Ismaeel Abedelrahman Ragheb Abdelaziz,neuvedena,https://www.lkcr.cz/seznam-lekaru-426.html?fil...,Benešov,dětská dermatovenerologie
3,MUDr. Adel Abdulghani,Duchcov,https://www.lkcr.cz/seznam-lekaru-426.html?fil...,Benešov,dětská dermatovenerologie
4,MUDr. Hamza Abduljabar,Turnov 1,https://www.lkcr.cz/seznam-lekaru-426.html?fil...,Benešov,dětská dermatovenerologie


In [128]:
def parse_doctor_tables(tables):
    d_detail = dict()
    for l in tables:
        table = l.find_elements_by_css_selector('tr')
        d_table = dict()

        if '*' in table[0].text:
            continue

        for row in table[1:]:

            if '*' in row. text:
                continue
            info = [x.strip() for x in row.text.split(":")]

            # append new value for key from last cycle (eg if a doctor has more than one field)
            if len(info) == 1:
                last_values = d_table[last_key]
                d_table[last_key] = last_values + info

            # save to the dict
            else:
                last_key = info[0]
                d_table[last_key] = info[1:]

        # if dict d_table is not empty
        if d_table:
            for x, y in d_table.items():
                if len(y) == 1:
                    d_table[x] = y[0]

            key = table[0].text if table[0].text else 'INFORMACE'
            key = 'PRACOVIŠTĚ' if 'PRACOVIŠTĚ' in key else key
            
            if key in d_detail:
                val = d_detail[key]
                if isinstance(val, list):
                    d_detail[key] = val.append(d_table)
                else:
                    d_detail[key] = [val, d_table]
            else:
                d_detail[key] = d_table

    return d_detail



In [129]:
d_detail.keys()

dict_keys(['INFORMACE', 'PLATNÉ LICENCE UDĚLENÉ ČESKOU LÉKAŘSKOU KOMOROU', 'PRACOVIŠTĚ  1/2', 'PRACOVIŠTĚ  2/2', 'JMÉNO', 'ČÍSLO'])

In [150]:
import json
def save_dict_to_json(d, name):
    with open(PATH_DATA + name + '.json', "w") as outfile:
        json.dump(d, outfile)

In [131]:
def get_doctor_detail(url):
    '''
        Return detail info about doctors.
        Input: CLK url 
    '''
    try:
        driver = webdriver.Chrome(PATH_CHROME)
        driver.get(url)

        # load main content
        main = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "main"))
        )

        # basic info
        tables = main.find_elements(by=By.CLASS_NAME, value='seznam')
        d_detail = parse_doctor_tables(tables)

        # name
        name = main.find_element(by=By.CLASS_NAME, value='akt_nadpis').text
        d_detail['JMÉNO'] = name
        # evidence number
        number = main.find_element(by=By.CLASS_NAME, value='evcislo').text
        d_detail['ČÍSLO'] = re.sub('[^0-9]','', number)

    finally:
        driver.close()

        return d_detail 


In [146]:
l_doctors = []
for url in doctors.url.unique():
    d_detail = get_doctor_detail(url)
    l_doctors.append(d_detail)
# TODO add remaining info from dataframe

UnboundLocalError: local variable 'd_detail' referenced before assignment

In [149]:
save_dict_to_json(d_detail, 'doctors_sample')

In [151]:
d_detail

{'INFORMACE': {'Vysoká škola': '2. LF Univerzity Karlovy v Praze',
  'Rok promoce': '2016',
  'Dosažená odbornost': 'všeobecné praktické lékařství',
  'Diplom celoživotního vzdělávání': 'ano'},
 'PLATNÉ LICENCE UDĚLENÉ ČESKOU LÉKAŘSKOU KOMOROU': {'K výkonu soukromé praxe a lektora v oboru': '-',
  'Pro výkon funkce vedoucího lékaře a primáře v oboru': '-',
  'Funkční licence pro léčebnou metodu': '-'},
 'PRACOVIŠTĚ': None,
 'JMÉNO': 'MUDr. Pavel Adámek',
 'ČÍSLO': '1162962190'}

## References

In [None]:
## SELENIUM
# https://www.youtube.com/watch?v=b5jt2bhSeXs&ab_channel=TechWithTim
# https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webelement
# https://stackoverflow.com/questions/58872451/how-can-i-bypass-the-google-captcha-with-selenium-and-python

## SCRAPY
# https://docs.scrapy.org/en/latest/topics/selectors.html
# https://www.youtube.com/watch?v=s4jtkzHhLzY&ab_channel=JohnWatsonRooney
# https://stackoverflow.com/questions/42947417/scrapy-extract-items-from-table

## BASE64
# https://stackoverflow.com/questions/3470546/how-do-you-decode-base64-data-in-python



## OTHERS
# https://www.edureka.co/blog/web-scraping-with-python/
# https://realpython.com/beautiful-soup-web-scraper-python/
# https://stackoverflow.com/questions/51007603/how-to-correctly-form-a-post-request-to-this-website-with-python-request
# https://hackernoon.com/how-post-requests-with-python-make-web-scraping-easier-9i203511