# ČSK scraping
The aim of this notebook is to scrape data from the [ČSK](https://www.dent.cz/zubni-lekari/abecedne) website to obtain current data about dentists in the Czech Republic.

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options


from itertools import islice
from fake_useragent import UserAgent
from joblib import Parallel, delayed

import time
import pandas as pd
import numpy as np
import re
import json
import warnings
import json
import random
import string
import unidecode

warnings.filterwarnings('ignore')


In [12]:
# Location of the chromedriver binary handed to `webdriver.Chrome(executable_path=...)`.
# NOTE(review): this is an absolute, machine-specific path — adjust it before running elsewhere.
PATH_CHROME = '/home/gary/Apps/chromedriver'
# Directory prefix for intermediate outputs; file names are appended by plain string
# concatenation, so the trailing slash is required.
PATH_INTERMEDIATE = '../../data/intermediate/'

## Get members

### Get urls

In [9]:
def get_member_links(n_pages=377):
    '''
    Walk the alphabetical dentist listing and collect every entry's text and link.

    Args:
        n_pages: number of listing pages to visit, starting with the first one.
            Defaults to 377, the page count this notebook was originally run with.

    Return:
        l_info: list of [name, link] pairs, where `name` is the visible text of a
            listing item and `link` is the href of its anchor. If scraping fails
            part-way, the error is printed and the pairs collected so far are returned.
    '''

    l_info = []  # [name, link] pairs collected so far
    driver = None

    try:
        driver = webdriver.Chrome(executable_path=PATH_CHROME)

        driver.get('https://www.dent.cz/zubni-lekari/abecedne')

        time.sleep(1)

        for page in range(n_pages):
            main = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "main"))
            )
            doc_list = main.find_element(by=By.CLASS_NAME, value='cross-dentists-alphabetical')

            for item in doc_list.find_elements(by=By.CLASS_NAME, value='cross-dentists-alphabetical__item.col-12.col-md-6.col-lg-4.u-mb-md'):
                name = item.text
                link = item.find_element(by=By.CSS_SELECTOR, value='a').get_attribute('href')
                l_info.append([name, link])

            # Only advance while there are pages left to read; nothing is scraped
            # after the last requested page, so clicking "next" there is pointless.
            if page < n_pages - 1:
                next_page_button = main.find_element(by=By.CLASS_NAME, value='box-pager__item.box-pager__btn.box-pager__btn--next')
                next_page_button.click()

                time.sleep(.5)

    except Exception as e:
        # Best effort: report the problem and fall through to return partial results.
        print('Error occured:', e)

    finally:
        # Always end the browser session, including after a failure or an interrupt.
        if driver is not None:
            driver.quit()

    return l_info
    


In [10]:
# Run the listing scrape (opens a Chrome window); yields a list of [name, url] pairs.
l_info = get_member_links()

In [11]:
# Tabulate the scraped pairs, discard entries whose name is empty,
# and collapse rows that are exact duplicates.
dentists = (
    pd.DataFrame(l_info, columns=['name', 'url'])
    .loc[lambda frame: frame['name'] != '']
    .drop_duplicates()
)
dentists

Unnamed: 0,name,url
0,Abdullah Anwer Lékař stomatolog,https://www.dent.cz/clen/78a69950-30d5-4948-84...
1,Abu Haija Moh'd,https://www.dent.cz/clen/04c897c9-9a89-426c-af...
2,Adamcová Libuše MUDr.,https://www.dent.cz/clen/fa37580d-5c03-4690-b1...
3,Abdulová Shirin MDDr.,https://www.dent.cz/clen/bf7597d9-8a65-4f57-86...
4,Ackermannová Michaela MDDr.,https://www.dent.cz/clen/55c14db6-4cc4-443a-81...
...,...,...
11296,Žůrková Michaela MDDr.,https://www.dent.cz/clen/4ce473e4-824f-4558-a7...
11298,Žmolíková Petra MUDr.,https://www.dent.cz/clen/4bbdf692-5cab-4c34-a2...
11301,Žmolová Klára MDDr.,https://www.dent.cz/clen/3405be9a-a390-4262-9a...
11304,Žmuráňová Lucia MDDr.,https://www.dent.cz/clen/d88f8d4b-b3de-4aab-8e...


In [13]:
# Persist the name/url table so the detail scrape can be resumed without repeating the listing crawl.
dentists.to_csv(PATH_INTERMEDIATE + 'dentists_url.csv')

### Get detail

In [14]:
def get_detail(driver, url):
    '''
    Scrape a single dentist profile page.

    Args:
        driver: an already started Selenium WebDriver; it is reused, not closed here.
        url: address of the profile page to load.

    Return:
        d_detail: dict that may contain
            'name'      - text of the page's heading row,
            'area'      - list of text lines from the main row that contain "OSK",
            'workplace' - list of {'name': ..., 'address': ...} dicts, one per
                          workplace entry ('address' is None when the entry has
                          only a single line of text).
            If scraping fails part-way, the error is printed and only the keys
            filled in up to that point are present.
    '''
    d_detail = dict()

    try:
        driver.get(url)
        time.sleep(1)
        # load main content
        main = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "main"))
        )

        # name
        d_detail['name'] = main.find_element(by=By.CLASS_NAME, value='d-flex.align-items-center').text

        # lines mentioning "OSK"
        text = main.find_element(by=By.CLASS_NAME, value='row-main').text
        d_detail['area'] = [line for line in text.split('\n') if "OSK" in line]

        # workplaces
        workplaces_info = []
        for workplace in main.find_elements(by=By.CLASS_NAME, value='bg-lightgrey.p-3.mb-3'):
            for wp in workplace.find_elements(by=By.CLASS_NAME, value='col-md-8'):
                info = wp.text.split('\n')
                # An entry with a single line has no address; keep it instead of
                # failing and losing every workplace of this dentist.
                wp_address = info[1] if len(info) > 1 else None
                workplaces_info.append({'name': info[0], 'address': wp_address})

        d_detail['workplace'] = workplaces_info

    except Exception as e:
        # Best effort: one broken page must not stop the whole crawl, but say so.
        print(f'Error occured for {url}:', e)

    return d_detail

In [79]:
# get_detail('https://www.dent.cz/clen/de737c67-3212-4905-b750-6b8de2d106a9')

In [None]:
# Visit every unique dentist URL with one shared browser session and collect the details.
l_detail = []

driver = webdriver.Chrome(executable_path=PATH_CHROME)
n_url = dentists.url.nunique()
cnt = 1

try:
    for url in dentists.url.unique():
        # Progress: a dot every 100 pages, a running total every 1000.
        if cnt % 100 == 0:
            print('.', end='')
        if cnt % 1000 == 0:
            print(f' {cnt}/{n_url}')
        cnt += 1

        d_detail = get_detail(driver, url)
        d_detail['url'] = url
        l_detail.append(d_detail)
finally:
    # End the browser session even when the long-running loop is interrupted.
    driver.quit()


.......... 1000/11293
.......... 2000/11293
.......... 3000/11293
.......... 4000/11293
.......... 5000/11293
.......... 6000/11293
.......... 7000/11293
.

In [None]:
def save_to_json(rec, name):
    '''
    Dump `rec` as JSON into the intermediate-data directory.

    Args:
        rec: JSON-serializable object to store.
        name: file name without extension; '.json' is appended.
    '''
    target = ''.join([PATH_INTERMEDIATE, name, '.json'])
    with open(target, "w") as fh:
        json.dump(rec, fh)

# Store the scraped dentist details as 'dentists_all.json' under PATH_INTERMEDIATE.
save_to_json(l_detail, 'dentists_all')

## Get workplaces
### Get urls

In [None]:
def get_workplace_links(n_pages=232):
    '''
    Walk the workplace listing and collect every entry's link text and link target.

    Args:
        n_pages: number of listing pages to visit, starting with the first one.
            Defaults to 232, the page count originally hard-coded in this notebook.

    Return:
        l_info: list of [name, link] pairs, where `name` is the text of the
            item's anchor and `link` is its href. If scraping fails part-way,
            the error is printed and the pairs collected so far are returned.
    '''

    l_info = []  # [name, link] pairs collected so far
    driver = None

    try:
        driver = webdriver.Chrome(executable_path=PATH_CHROME)

        driver.get('https://www.dent.cz/zubni-lekari')

        time.sleep(1)

        for page in range(n_pages):
            main = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "main"))
            )
            # NOTE(review): 'cross-cross-dentists-list' has a doubled 'cross-' prefix,
            # unlike the item class used below — possibly a typo; verify against the
            # live page markup before relying on this locator.
            wp_list = main.find_element(by=By.CLASS_NAME, value='cross-cross-dentists-list')

            for item in wp_list.find_elements(by=By.CLASS_NAME, value='cross-dentists-list__item.col-12.col-md-6.u-mb-md'):
                record = item.find_element(by=By.CSS_SELECTOR, value='a')
                l_info.append([record.text, record.get_attribute('href')])

            # Only advance while there are pages left to read; nothing is scraped
            # after the last requested page, so clicking "next" there is pointless.
            if page < n_pages - 1:
                next_page_button = main.find_element(by=By.CLASS_NAME, value='box-pager__item.box-pager__btn.box-pager__btn--next')
                next_page_button.click()

                time.sleep(.5)

    except Exception as e:
        # Best effort: report the problem and fall through to return partial results.
        print('Error occured:', e)

    finally:
        # Always end the browser session, including after a failure or an interrupt.
        if driver is not None:
            driver.quit()

    return l_info

In [None]:
# Scrape the workplace listing (not the member listing) into [name, url] pairs.
l_workplaces = get_workplace_links()

In [None]:
# One row per scraped listing entry: link text and link target.
workplaces = pd.DataFrame(l_workplaces, columns=['name', 'url'])
# Unlike the `dentists` frame, rows with an empty name and duplicate rows are kept
# (the filters below are disabled):
# workplaces = workplaces[workplaces.name != '']
# workplaces = workplaces.drop_duplicates()
workplaces

### Get detail

In [None]:
def get_wp_detail(driver, url):
    '''
    Scrape a single workplace detail page.

    Args:
        driver: an already started Selenium WebDriver; it is reused, not closed here.
        url: address of the workplace page to load.

    Return:
        d_detail: dict with 'name' (text of the page's <h1>) plus one entry per
            label/value pair found in the detail box: the label paragraph's text
            is the key, the value paragraph's text split into lines is the value.
            If scraping fails part-way, the error is printed and only the keys
            filled in up to that point are present.
    '''
    d_detail = dict()

    try:
        driver.get(url)
        time.sleep(1)
        # load main content
        main = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "main"))
        )

        # name
        d_detail['name'] = main.find_element(by=By.CSS_SELECTOR, value='h1').text

        # detail info: every <p> inside the box, expected to alternate label, value
        table = main.find_element(by=By.CLASS_NAME, value='box-detail__item')
        info = table.find_elements(by=By.CSS_SELECTOR, value='p')

        # The pairing below only makes sense for an even number of paragraphs.
        if len(info) % 2 != 0:
            raise ValueError(f'expected label/value pairs, got {len(info)} paragraphs')

        for feature, value in zip(info[::2], info[1::2]):
            d_detail[feature.text] = value.text.split('\n')

    except Exception as e:
        # Best effort: one broken page must not stop the whole crawl, but say so.
        print(f'Error occured for {url}:', e)

    return d_detail

In [None]:
# get_detail('https://www.dent.cz/clen/de737c67-3212-4905-b750-6b8de2d106a9')

In [None]:
# Visit every unique workplace URL with one shared browser session and collect the details.
l_wp_detail = []

driver = webdriver.Chrome(executable_path=PATH_CHROME)
n_url = workplaces.url.nunique()
cnt = 1

try:
    for url in workplaces.url.unique():
        # Progress: a dot every 100 pages, a running total every 1000.
        if cnt % 100 == 0:
            print('.', end='')
        if cnt % 1000 == 0:
            print(f' {cnt}/{n_url}')
        cnt += 1

        # Workplace pages need the workplace parser, not the dentist one.
        d_detail = get_wp_detail(driver, url)
        d_detail['url'] = url
        l_wp_detail.append(d_detail)
finally:
    # End the browser session even when the long-running loop is interrupted.
    driver.quit()


In [578]:
# Store the scraped workplace details (collected in `l_wp_detail`) as 'workplaces_all.json'.
save_to_json(l_wp_detail, 'workplaces_all')

## Postprocessing

In [49]:
# TODO parse and combine workplaces with doctors