# ČSK scraping
The aim of this notebook is to scrap data from [ČLK](https://www.dent.cz/zubni-lekari/abecedne) website to obtain current data about dentists in Czech republic 

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options


from itertools import islice
from fake_useragent import UserAgent
from joblib import Parallel, delayed

import time
import pandas as pd
import numpy as np
import re
import json
import warnings
import json
import random
import string
import unidecode

warnings.filterwarnings('ignore')


In [3]:
PATH_CHROME = '/home/gary/Apps/chromedriver'
PATH_INTERMEDIATE = '../../data/intermediate/'

## Get members

### Get urls

In [9]:
def get_member_links():
    '''
     Return:
         l_info: scrapped info
    '''
    
    l_info = [] # general info about dentists
        
    try:
        driver = webdriver.Chrome(executable_path=PATH_CHROME)

        driver.get('https://www.dent.cz/zubni-lekari/abecedne')

        time.sleep(1)

        # Page counter
        counter = 0
        n_pages = 377
        while counter < n_pages: 
            main = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "main"))
            )
            doc_list = main.find_element(by=By.CLASS_NAME, value='cross-dentists-alphabetical')

            for i in doc_list.find_elements(by=By.CLASS_NAME,value='cross-dentists-alphabetical__item.col-12.col-md-6.col-lg-4.u-mb-md'):
                name = i.text
                link = i.find_element_by_css_selector('a').get_attribute('href')
                l_info.append([name, link])
                
            # Next page
            next_page_button = main.find_element(by=By.CLASS_NAME, value='box-pager__item.box-pager__btn.box-pager__btn--next')
            next_page_button.click()
            
            time.sleep(.5)
            counter += 1

        driver.close()
            
    except Exception as e:
        print(f'Error occured:', e)
        
    finally:
        return l_info
    


In [10]:
l_info = get_member_links()

In [11]:
dentists = pd.DataFrame(l_info, columns=['name', 'url'])
dentists = dentists[dentists.name != '']
dentists = dentists.drop_duplicates()
dentists

Unnamed: 0,name,url
0,Abdullah Anwer Lékař stomatolog,https://www.dent.cz/clen/78a69950-30d5-4948-84...
1,Abu Haija Moh'd,https://www.dent.cz/clen/04c897c9-9a89-426c-af...
2,Adamcová Libuše MUDr.,https://www.dent.cz/clen/fa37580d-5c03-4690-b1...
3,Abdulová Shirin MDDr.,https://www.dent.cz/clen/bf7597d9-8a65-4f57-86...
4,Ackermannová Michaela MDDr.,https://www.dent.cz/clen/55c14db6-4cc4-443a-81...
...,...,...
11296,Žůrková Michaela MDDr.,https://www.dent.cz/clen/4ce473e4-824f-4558-a7...
11298,Žmolíková Petra MUDr.,https://www.dent.cz/clen/4bbdf692-5cab-4c34-a2...
11301,Žmolová Klára MDDr.,https://www.dent.cz/clen/3405be9a-a390-4262-9a...
11304,Žmuráňová Lucia MDDr.,https://www.dent.cz/clen/d88f8d4b-b3de-4aab-8e...


In [13]:
dentists.to_csv(PATH_INTERMEDIATE + 'dentists_url.csv')

### Get detail

In [14]:
def get_detail(driver, url):
    d_detail = dict()
    
    try:
        driver.get(url)
        time.sleep(1)
        # load main content
        main = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "main"))
        )
            
        # name
        name = main.find_element(by=By.CLASS_NAME, value='d-flex.align-items-center').text
        d_detail['name'] = name

        
        text = main.find_element(by=By.CLASS_NAME, value='row-main').text
        areas = [x for x in text.split('\n') if "OSK" in x]
        d_detail['area'] = areas
        
        
        # workplaces
        workplaces_info = []
        workplaces = main.find_elements(by=By.CLASS_NAME, value='bg-lightgrey.p-3.mb-3')
        for workplace in workplaces:
            wp_info = workplace.find_elements(by=By.CLASS_NAME, value='col-md-8')
            for wp in wp_info:
                info = wp.text.split('\n')
                name, address = info[0], info[1]
                workplaces_info.append({'name': name, 'address':address})
        
        d_detail['workplace'] = workplaces_info
                
    finally:
        return d_detail 

In [79]:
# get_detail('https://www.dent.cz/clen/de737c67-3212-4905-b750-6b8de2d106a9')

In [17]:
l_detail = []

driver = webdriver.Chrome(executable_path=PATH_CHROME)
n_url = dentists.url.nunique()
cnt = 1

for url in dentists.url.unique():
    if cnt % 100 == 0:
        print('.', end='')
    if cnt % 1000 == 0:
        print(f' {cnt}/{n_url}')    
    cnt += 1    
    
    d_detail = get_detail(driver, url)
    d_detail['url'] = url
    l_detail.append(d_detail)
    
driver.close()


.......... 1000/11293
.......... 2000/11293
.......... 3000/11293
.......... 4000/11293
.......... 5000/11293
.......... 6000/11293
.......... 7000/11293
.......... 8000/11293
.......... 9000/11293
.......... 10000/11293
.......... 11000/11293
..

In [18]:
def save_to_json(rec, name):
    with open(PATH_INTERMEDIATE + name + '.json', "w") as outfile:
        json.dump(rec, outfile)

save_to_json(l_detail, 'dentists_all')

In [46]:
again = []
for item in l_detail:
    area = item.get('area', None)
    if not isinstance(area, list):
        print(item)
        again.append(item['url'])

{'url': 'https://www.dent.cz/clen/437b5e52-17f3-4398-a299-0adc58314635'}
{'url': 'https://www.dent.cz/clen/66e309a8-2c2f-41a9-8907-98c356c501de'}
{'url': 'https://www.dent.cz/clen/b5cf05be-4f88-4e15-abac-f34f0aa80bbc'}
{'url': 'https://www.dent.cz/clen/fe73d29a-f176-493c-a115-825a3e33fe32'}
{'url': 'https://www.dent.cz/clen/2a113271-ed82-4ed3-b1af-1dd7e8116596'}
{'url': 'https://www.dent.cz/clen/b10c5a58-95d9-4c52-b7a6-029902266b91'}
{'name': 'MUDr. Jáňová Radmila', 'url': 'https://www.dent.cz/clen/9f8377a1-25b8-432b-a962-6c5da41f0496'}
{'name': 'Zubní lékař', 'url': 'https://www.dent.cz/clen/21574579-9c6e-46e5-9b97-729eea8a6b7b'}
{'url': 'https://www.dent.cz/clen/84aa6918-a9b4-478c-af8d-c7b70a498878'}
{'url': 'https://www.dent.cz/clen/d57b7dbb-d8fc-422f-af5f-4d2024fb54fc'}
{'url': 'https://www.dent.cz/clen/a4f5d6c8-65c9-4a1b-8e0d-328504b612d7'}
{'url': 'https://www.dent.cz/clen/8e2e3419-fd5d-4cef-941e-c782c4727525'}
{'url': 'https://www.dent.cz/clen/73de6dd3-7894-411f-a5ae-4c386a405eb

In [44]:
driver = webdriver.Chrome(executable_path=PATH_CHROME)

for url in again:
    d_detail = get_detail(driver, url)
    d_detail['url'] = url
    l_detail.append(d_detail)
driver.close()

In [45]:
dentists[dentists.url.isin(again)]

Unnamed: 0,name,url
242,Bartáková Iveta,https://www.dent.cz/clen/437b5e52-17f3-4398-a2...
711,Borovičková Helena MUDr.,https://www.dent.cz/clen/66e309a8-2c2f-41a9-89...
726,brabec jiri,https://www.dent.cz/clen/b5cf05be-4f88-4e15-ab...
1195,Černý Zdeněk,https://www.dent.cz/clen/fe73d29a-f176-493c-a1...
1450,Dostal Josef MUDr.,https://www.dent.cz/clen/2a113271-ed82-4ed3-b1...
1961,Freiberger Lukáš MDDr. MUDr.,https://www.dent.cz/clen/b10c5a58-95d9-4c52-b7...
3501,Jáňová Radmila MUDr.,https://www.dent.cz/clen/9f8377a1-25b8-432b-a9...
3811,Kafková Vlasta MUDr.,https://www.dent.cz/clen/21574579-9c6e-46e5-9b...
4041,Kedelidze Vladimír Lékař stomatolog,https://www.dent.cz/clen/84aa6918-a9b4-478c-af...
4729,Králíčková Daniela MDDr.,https://www.dent.cz/clen/d57b7dbb-d8fc-422f-af...


## Get workplaces
### Get urls

In [49]:
def get_workplace_links():
    
    l_info = [] # general info about dentists
        
    try:
        driver = webdriver.Chrome(executable_path=PATH_CHROME)

        driver.get('https://www.dent.cz/zubni-lekari')
        time.sleep(1)

        # Page counter
        counter = 0
        n_pages = 232
        while counter < n_pages: 
            main = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "main"))
            )
            wp_list = main.find_element(by=By.CLASS_NAME, value='cross-cross-dentists-list')

            for i in wp_list.find_elements(by=By.CLASS_NAME,value='cross-dentists-list__item.col-12.col-md-6.u-mb-md'):
                record = i.find_element_by_css_selector('a')
                name = record.text
                link = record.get_attribute('href')
                l_info.append([name, link])
                
            # Next page
            next_page_button = main.find_element(by=By.CLASS_NAME, value='box-pager__item.box-pager__btn.box-pager__btn--next')
            next_page_button.click()
            
            time.sleep(.5)
            counter += 1

        driver.close()
            
    except Exception as e:
        print(f'Error occured:', e)
        
    finally:
        return l_info

In [51]:
l_workplaces = get_workplace_links()

In [52]:
workplaces = pd.DataFrame(l_workplaces, columns=['name', 'url'])
# workplaces = workplaces[workplaces.name != '']
# dentists = dentists.drop_duplicates()
workplaces

Unnamed: 0,name,url
0,#staycool s.r.o.,https://www.dent.cz/zubar/df313eba-7447-4496-b...
1,0-100 DENT s.r.o.,https://www.dent.cz/zubar/41b6caf3-48b0-4e43-a...
2,0-100 DENT s.r.o.,https://www.dent.cz/zubar/c30146a2-8fc8-4a59-a...
3,1. LF Dental s.r.o.,https://www.dent.cz/zubar/48d9dea2-88ad-41b8-b...
4,1. LF Dental s.r.o.,https://www.dent.cz/zubar/55722298-9ec7-4b6a-8...
...,...,...
6937,Zubzit497 s.r.o.,https://www.dent.cz/zubar/8eec538b-98cf-428f-b...
6938,"ZUOP, s.r.o.",https://www.dent.cz/zubar/4158473a-ae6a-4199-9...
6939,ZvolanekDental s.r.o.,https://www.dent.cz/zubar/13723870-c043-4ace-a...
6940,ZvolanekDental s.r.o.,https://www.dent.cz/zubar/93fc7228-bc39-4c73-8...


In [53]:
workplaces.to_csv(PATH_INTERMEDIATE + 'dentists_workplaces.csv')

### Get detail

In [62]:
def get_wp_detail(driver, url):
    d_detail = dict()
    
    try:
        driver.get(url)
        time.sleep(1)
        # load main content
        main = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "main"))
        )
            
        # name
        name = main.find_element_by_css_selector('h1').text
        d_detail['name'] = name
        
        # detail info
        workplaces_info = []
        table = main.find_element(by=By.CLASS_NAME, value='box-detail__item')
        info = table.find_elements_by_css_selector('p')
        
        if len(info) % 2 != 0:
            print(f'Alert [{url}]: probably bad parsing')
        
        for feature, value in zip(info[::2], info[1::2]):
            key = feature.text
            val = value.text.split('\n')
            d_detail[key] = val
                
    finally:
        return d_detail 

In [63]:
# driver = webdriver.Chrome(executable_path=PATH_CHROME)
# get_wp_detail(driver, 'https://www.dent.cz/zubar/816e0470-99c2-445d-8dcf-788d213fc135')


In [67]:
l_wp_detail = []

driver = webdriver.Chrome(executable_path=PATH_CHROME)
n_url = workplaces.url.nunique()
cnt = 1

for url in workplaces.url.unique():
    if cnt % 100 == 0:
        print('.', end='')
    if cnt % 1000 == 0:
        print(f' {cnt}/{n_url}')    
    cnt += 1    
    
    d_detail = get_wp_detail(driver, url)
    d_detail['url'] = url
    l_wp_detail.append(d_detail)
    
driver.close()


Alert [https://www.dent.cz/zubar/0c142114-d1a1-46bf-bec6-0ef5ca78223b]: probably bad parsing
Alert [https://www.dent.cz/zubar/4f0e1f91-f7eb-443a-8701-49174d14bc7d]: probably bad parsing
Alert [https://www.dent.cz/zubar/d54cc2b1-6d4b-4942-b17b-b0efd35b2de4]: probably bad parsing
Alert [https://www.dent.cz/zubar/4d805d62-1bc9-44bf-882f-ab13c8b3cd97]: probably bad parsing
Alert [https://www.dent.cz/zubar/ad24ce7d-1fe2-4db2-8e09-3859e8b56a36]: probably bad parsing
Alert [https://www.dent.cz/zubar/1e655648-4d8e-4419-80ee-51a697df3256]: probably bad parsing
Alert [https://www.dent.cz/zubar/b27602cb-ea97-40a4-829f-911dd2039662]: probably bad parsing
Alert [https://www.dent.cz/zubar/8fa2d49e-dde7-4e8a-a14c-f412d6580013]: probably bad parsing
Alert [https://www.dent.cz/zubar/161ede3f-28b5-4f0d-b83c-aa6f11232f39]: probably bad parsing
Alert [https://www.dent.cz/zubar/a49839c5-e00e-4285-af59-fc82bd03ad34]: probably bad parsing
Alert [https://www.dent.cz/zubar/050e4adf-e9dc-49cd-aa89-78842b91399c]

In [68]:
save_to_json(l_wp_detail, 'workplaces_all')

In [69]:
workplaces = pd.DataFrame(l_wp_detail)
workplaces.columns =['workplace_name', 'workplace_address', 'dentist', 'workplace_url', 'insurance_companies']
workplaces = workplaces.explode('dentist').reset_index(drop=True)
workplaces

Unnamed: 0,name,Kontakty:,Zubní lékaři:,url,Pojišťovny:
0,#staycool s.r.o.,"[J.E.Purkyně 365, 686 06 Uherské Hradiště]",[MDDr. Stašková Alžběta],https://www.dent.cz/zubar/df313eba-7447-4496-b...,
1,0-100 DENT s.r.o.,"[Thákurova 534/10, 160 00 Praha 6]",[MUDr. Hůlková Petra],https://www.dent.cz/zubar/41b6caf3-48b0-4e43-a...,
2,0-100 DENT s.r.o.,"[Újezd 427/28, 118 00 Praha 1]",[],https://www.dent.cz/zubar/c30146a2-8fc8-4a59-a...,
3,1. LF Dental s.r.o.,"[ZS Jaderná elektrárna Temelín, 373 05 Temelín...",[MDDr. Fechtnerová Lenka],https://www.dent.cz/zubar/48d9dea2-88ad-41b8-b...,
4,1. LF Dental s.r.o.,"[Senovážná 864/1, 110 00 Praha 1]",[MDDr. Fechtnerová Lenka],https://www.dent.cz/zubar/55722298-9ec7-4b6a-8...,
...,...,...,...,...,...
6903,Zubzit497 s.r.o.,"[Žitavského 497, 156 00 Praha 5]","[MDDr. Kohoutová Tereza, MDDr. Sekavová Tereza...",https://www.dent.cz/zubar/8eec538b-98cf-428f-b...,
6904,"ZUOP, s.r.o.","[Holubická 7, 664 07 Pozořice, E-mail: info@z...","[MUDr. Schrumpfová Olga, MUDr. Nekudová Schrum...",https://www.dent.cz/zubar/4158473a-ae6a-4199-9...,"[RBP, VoZP, OZP, ČPZP, ZPMV, VZP]"
6905,ZvolanekDental s.r.o.,"[Alej Svobody 732/55, 323 00 Plzeň, E-mail: i...","[MDDr. Kabátová Veronika, MDDr. Petříková Tere...",https://www.dent.cz/zubar/13723870-c043-4ace-a...,
6906,ZvolanekDental s.r.o.,"[nám. Generála Píky 2703/27, 326 00 Plzeň]",[],https://www.dent.cz/zubar/93fc7228-bc39-4c73-8...,


## Postprocessing

In [1]:
# TODO parse and combine workplaces with doctors
workplaces[(workplaces.workplace_address.isna() )].loc[4489]
# pd.DataFrame(l_detail)

NameError: name 'workplaces' is not defined