In [1]:
### Storemagic Commands ###
# %store - Show list of all variables and their current values
# %store -z - Remove all variables from storage
# %store -r - Restore all variables from storage (overwrites current values in the namespace)

In [2]:
import csv
import os
import re
import time
import unicodedata
import urllib.request
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from IPython.utils import io
from selenium import webdriver

In [3]:
def remove_accents(input_str):
    """Return an ASCII-only, underscore-separated version of ``input_str``.

    The text is NFKD-normalised so accented letters split into a base letter
    plus combining marks; everything that is not ASCII is then dropped.
    Finally spaces and hyphens are both turned into underscores.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    ascii_text = decomposed.encode('ASCII', 'ignore').decode("ASCII")
    # One translation table handles both separators in a single pass.
    return ascii_text.translate(str.maketrans({" ": "_", "-": "_"}))

In [22]:
def get_soup(url, wait_seconds=10, driver_path=r'C:\chromedriver\chromedriver.exe'):
    """Load ``url`` in headless Chrome and return the page parsed by BeautifulSoup.

    Parameters:
        url: address of the page to load.
        wait_seconds: fixed pause after navigation before the page source is
            read (presumably to let client-side rendering finish - confirm).
        driver_path: location of the chromedriver executable.

    Returns:
        A ``BeautifulSoup`` object built from the page source with the
        "html.parser" backend.

    The browser is always shut down, even when navigation or reading the page
    source raises; the exception itself is propagated to the caller.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # NOTE(review): `executable_path` is the older Selenium calling convention;
    # confirm it is still accepted by the Selenium version in use.
    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    try:
        driver.get(url)
        time.sleep(wait_seconds)
        html = driver.page_source
    finally:
        # quit() ends the whole session, so a separate close() is not needed.
        driver.quit()
    return BeautifulSoup(html, "html.parser")

In [23]:
def get_contact(url):
    """Scrape one company detail page and return its contact fields.

    The page is fetched with ``get_soup``. The company name is read from the
    ``h1`` elements marked ``data-dot="premise/title"``; phone numbers,
    e-mails and websites are read from the first ``div`` with class
    ``infoWrap``.

    Returns:
        ``(url, name, email, web, phone)``. When a field has several values
        they are joined with ";". Whitespace runs in the name are collapsed
        to single spaces and all spaces are removed from the phone string.

    Raises:
        IndexError: the page contains no ``infoWrap`` block. The message
            names the URL so the failure is traceable in logs.
    """
    soup = get_soup(url)
    info_blocks = soup.find_all('div', {"class": "infoWrap"})
    if not info_blocks:
        # Same exception type as indexing an empty list, but with a useful message.
        raise IndexError('no "infoWrap" contact block found on ' + str(url))
    info = info_blocks[0]

    def joined_text(tags):
        # Concatenate the text of every matched tag, ";"-separated.
        return ';'.join(tag.get_text() for tag in tags)

    titles = soup.find_all('h1', {"data-dot": "premise/title"})
    name = ';'.join(" ".join(title.get_text().split()) for title in titles)
    phone = joined_text(info.find_all('span', {'data-dot': 'origin-phone-number'})).replace(" ", "")
    email = joined_text(info.find_all('a', {'data-dot': 'e-mail'}))
    web = joined_text(info.find_all('a', {'data-dot': 'show-website'}))
    return (url, name, email, web, phone)

In [24]:
def get_category(category):
    """Build the firmy.cz URLs and an accent-free label for one category entry.

    An entry containing "/" is treated as a category path; any other entry is
    used as a search query.

    Returns a tuple ``(link, link_page, name)``:
        link      - URL of the first listing page,
        link_page - template with two ``{}`` slots (category, page number),
        name      - result of ``remove_accents``; for a path entry only the
                    last path segment is used.
    """
    if "/" not in category:
        # Plain text: search query.
        return (
            "https://www.firmy.cz/?q=" + category,
            "https://www.firmy.cz/?q={}&page={}",
            remove_accents(category),
        )

    # Category path: the label comes from the last segment only.
    last_segment = category.rsplit("/", 1)[-1]
    return (
        "https://www.firmy.cz/" + category,
        "https://www.firmy.cz/{}?page={}",
        remove_accents(last_segment),
    )

In [25]:
# Categories to scrape, as a set of strings.
# get_category treats an entry containing "/" as a firmy.cz category path and
# any other entry as a search query.
categories = {
    "Velkoobchod-a-vyroba/Vyrobci-surovin-a-polotovaru/Vyrobci-kovovych-polotovaru"
}

In [26]:
### TESTING SET of categories ###
#categories = {
#    "Lakovny",
#    "Velkoobchod plastovych polotovaru",
#    "Zpracovani_Bioodpadu"
#}

In [27]:
# Work out how many listing pages each category has.
# The page count is derived from the total record count shown on the first
# listing page, assuming a fixed number of results per page.
RESULTS_PER_PAGE = 14

category_pages = dict()
for category in categories:
    link, link_page, name = get_category(category)
    soup = get_soup(link)
    divs = soup.find_all("div", {"class": "results"})
    if not divs:
        # No results summary on the page: fall back to a single page.
        page_count = 1
    else:
        # The second <strong> inside the results block is read as the total
        # number of records.
        total_records = divs[0].find_all('strong')[1]
        record_count = int(total_records.text)
        # Ceiling division: an exact multiple of RESULTS_PER_PAGE must not
        # produce an extra, empty page. Always visit at least one page.
        page_count = max(1, (record_count + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE)
        # An earlier revision capped page_count at 36; the cap is not applied.
    category_pages[category] = page_count
print(category_pages)

{'Velkoobchod-a-vyroba/Vyrobci-surovin-a-polotovaru/Vyrobci-kovovych-polotovaru': 80}


In [28]:
# Build the URL of every listing page, mapped to the accent-free name of its category.
links = dict()
for category, pages in category_pages.items():
    # The URL template and the name depend only on the category, so they are
    # resolved once per category instead of once per page.
    link, link_page, name = get_category(category)
    for page in range(1, pages + 1):
        category_link = link_page.format(category, page)
        links[category_link] = name
print('In TOTAL we have "'+str(len(links))+'" pages with "14" contacts on each of them.')

In TOTAL we have "80" pages with "14" contacts on each of them.


In [29]:
company_list = dict()

clinks = 0
for link, category in links.items():
    clinks += 1
    soup = get_soup(link)
    divs = soup.findAll("a", {"class": "companyTitle statCompanyDetail"})

    cdivs = 0
    for div in divs:
        cdivs += 1
        count = str(clinks)+'-'+str(cdivs)
        href = "https://www.firmy.cz" + div["href"]
        company_list[href] = [category, count]
        with io.capture_output() as captured:
            %store company_list
        print(category+' | '+str(href)+' | '+count)

Vyrobci_kovovych_polotovaru | https://www.firmy.cz/detail/12985232-profmultitec-s-r-o-praha-modrany.html | 1-1
Vyrobci_kovovych_polotovaru | https://www.firmy.cz/detail/666001-oh-ohyby-paskov.html | 1-2
Vyrobci_kovovych_polotovaru | https://www.firmy.cz/detail/655558-metal-fa-cz-s-r-o-ostrava-vitkovice.html | 80-12


In [30]:
### TESTING SET of company_list ###
#company_list=dict()
#company_list={
#    'https://www.firmy.cz/detail/238041-autolakovna-prikryl-hodslavice.html': ['Lakovny','1-1'],
#    'https://www.firmy.cz/detail/2670575-whb-polykarbonaty-ostrava-marianske-hory.html': ['Velkoobchod_plastovych_polotovaru','1-2'],
#    'https://www.firmy.cz/detail/12829605-gutta-cr-dolany.html': ['Velkoobchod_plastovych_polotovaru','1-3'],
#    'https://www.firmy.cz/detail/13083454-compag-votice-kompostarna-votice.html': ['Zpracovani_Bioodpadu','1-4']}

In [31]:
# One empty result list per category, keyed by the accent-free category name
# (the third item returned by get_category).
output = {get_category(category)[2]: [] for category in categories}

In [32]:
for url, category in company_list.items():
    try: 
        dt = datetime.now().strftime('%H:%M:%S')
        url, name, email, web, phone = get_contact(url)
        output[category[0]].append((name, email, web, phone))
        with io.capture_output() as captured:
            %store output
        LogRow = 'Done|'+str(category[0])+'|'+str(url)+'|'+str(dt)+'|'+category[1]
        print('Done | '+str(url)+' | '+category[1])     
    except Exception as e:
        dt = datetime.now().strftime('%H:%M:%S')
        LogRow = 'Failed|'+str(category[0])+'|'+str(url)+'|'+str(dt)+'|'+str(e)+'|'+ category[1]
        print('Failed | '+str(e)+' | '+ category[1]) 
    
    log = open("Output/LogFile.txt","a")
    log.write(LogRow +"\n")
    log.close()

Done | https://www.firmy.cz/detail/12985232-profmultitec-s-r-o-praha-modrany.html | 1-1
Done | https://www.firmy.cz/detail/666001-oh-ohyby-paskov.html | 1-2
Done | https://www.firmy.cz/detail/655558-metal-fa-cz-s-r-o-ostrava-vitkovice.html | 80-12


In [33]:
# Write the records into separate CSV files, one per category, inside the
# "Output" folder (created here if it does not exist yet).
# Each row is (name, email, web, phone), "|"-delimited.
os.makedirs("Output", exist_ok=True)

for cat, rows in output.items():
    with open("Output/"+cat+".csv","w", newline='',encoding='utf-8') as f:
        writer = csv.writer(f,delimiter="|",quotechar='"')
        seen = set()
        for row in rows:
            # De-duplicate on the e-mail column. A row without an e-mail is
            # keyed on the whole row instead, so distinct companies that have
            # no e-mail are all kept rather than collapsing into the first one.
            key = row[1] or row
            if key not in seen:
                seen.add(key)
                writer.writerow(row)