In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import os
import time
import pandas as pd
import numpy as np

In [None]:
# Configure Chrome to run headless with a fixed viewport size.
chrome_options = Options()
chrome_options.add_argument("--headless")
# Chrome's documented format for this switch is "width,height". The
# "1920x1080" spelling is, as far as we know, only understood by some
# headless builds, so the comma form is used to be safe everywhere.
chrome_options.add_argument("--window-size=1920,1080")

# The chromedriver binary is expected in the current working directory.
# os.path.join yields the same path as before on Windows and a valid one elsewhere.
chrome_driver = os.path.join(os.getcwd(), "chromedriver.exe")

In [None]:
# Start the browser session and open the company database search page.
driver = webdriver.Chrome(executable_path=chrome_driver, chrome_options=chrome_options)
driver.get("https://www.bayern-international.de/en/company-database/")

# Click the collapsed extended-search container (presumably this unfolds the
# region/district controls used below).
foldable_header = driver.find_element_by_css_selector(
    "div.foldable-container.keytech-search-form-extended.collapsed"
)
foldable_header.click()

# Choose the state/province option whose value is '091'.
select = Select(driver.find_element_by_name(name="tx_keytechrenew_keytech[state_province]"))
select.select_by_value('091')

# Fixed pause before touching the district controls
# (presumably to let them load after the selection above).
time.sleep(5)

# Clear every pre-selected district before ticking the wanted ones.
select_none = driver.find_element_by_css_selector("a.form-district-select-none")
select_none.click()

# Checkbox ids are "form_district_" followed by the district code.
district_codes = (
    "09171", "09173", "09172", "09174", "09175", "09176", "09177", "09178",
    "09179", "09180", "09161", "09181", "09182", "09183", "09184", "09162",
    "09185", "09186", "09187", "09163", "09188", "09189", "09190",
)
list_of_districts = ["form_district_" + code for code in district_codes]

# Only the first 13 districts are ticked in this pass: the search has to be
# executed twice, otherwise it returns too many results.
for district_id in list_of_districts[:13]:
    driver.find_element_by_id(district_id).click()

search_button = driver.find_element_by_name(name="tx_keytechrenew_keytech[search]")
search_button.click()

# Keep a screenshot of the result page for visual inspection.
driver.get_screenshot_as_file("capture.png")

In [None]:
from bs4 import BeautifulSoup
# Read the number of result pages from the pagination bar.
# NOTE(review): the XPath assumes the page count sits in the 5th <li> of the pager - confirm.
n_of_pages = int(driver.find_element_by_xpath("//*[@id='keytech_list_result']/div[1]/nav/ul/li[5]/a").text)

# Start with an empty list on the first run. When this cell is executed again
# for a further batch of districts, comment this line out so the links
# collected earlier are kept.
list_of_links = []

details_marker = 'en/company-database/company-details/'
for i in range(n_of_pages):
    url = "https://www.bayern-international.de/en/company-database/results/" + str(i) + "/"
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    # href=True skips anchors that have no href attribute; for those,
    # link.get('href') is None and the substring test would raise TypeError.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if details_marker in href:
            # Keep the second-to-last path segment
            # (presumably the company slug, with the href ending in '/').
            list_of_links.append(href.split('/')[-2])
    # Random, never-negative pause between page requests.
    time_to_wait = max(0, 1 + np.random.normal(0, 1))
    time.sleep(time_to_wait)

In [None]:
# Persist the collected links, one per line. Every line, including the last,
# is newline-terminated, so a reader that strips one trailing character per
# line gets every entry back intact.
with open("list_of_links.txt", 'w') as file:
    file.writelines(link + "\n" for link in list_of_links)

In [None]:
# Reload the saved links. Only the line terminator is removed, so the last
# entry is not truncated when the file has no final newline; blank lines are
# skipped.
with open("list_of_links.txt", 'r') as file:
    line = file.readlines()
list_of_links = [li.rstrip("\n") for li in line if li.strip()]

In [None]:
# Visit each company's detail page and collect its label/definition pairs.
stand = 'https://www.bayern-international.de/en/company-database/company-details/'
# XPaths of the three definition lists that are read on every detail page.
list_of_div = ["//*[@id='content']/div/article/ul[2]/li[1]/div/dl[1]","//*[@id='content']/div/article/ul[2]/li[1]/div/dl[2]","//*[@id='content']/div/article/ul[2]/li[1]/dl[1]"]
all_info = []
# NOTE(review): only the first four links are processed here; this looks like
# a trial-run limit - confirm before relying on all_info being complete.
for i, item in enumerate(list_of_links[:4]):
    url = stand + item
    driver.get(url)
    d = {}
    list_of_labels = []
    list_of_definitions = []
    with open('progress_log.txt', 'a') as file:
        file.write("\nSite number: " + str(i))
        file.write("\nSite name: " + str(url))
        try:
            for path in list_of_div:
                label = driver.find_element_by_xpath(path)
                for child in label.find_elements_by_css_selector("*"):
                    attr = child.get_attribute("class")
                    text = child.text
                    # Stop reading this list at the first element whose text mentions 'Map'.
                    if 'Map' in text:
                        break
                    elif attr == 'label':
                        list_of_labels.append(text)
                    elif attr == 'definition':
                        list_of_definitions.append(text)
            # Pair labels with definitions by position; a label without a
            # matching definition raises IndexError and marks the site as failed.
            for pos, label_text in enumerate(list_of_labels):
                d[label_text] = list_of_definitions[pos]
            all_info.append(d)
            time_to_wait = max(0, 3 + np.random.normal(0, 1))
            file.write("\n" + str(time_to_wait))
            time.sleep(time_to_wait)
        except Exception as exc:
            # Catch Exception only, so a kernel interrupt (KeyboardInterrupt)
            # still stops the crawl. The failed site is not added to all_info.
            file.write("\nSomething went wrong.")
            file.write("\nError: " + type(exc).__name__ + ": " + str(exc).strip())

After we obtain the data, it is time for the cleaning process. Because of German characters, we need to make some assumptions.

In [None]:
# Replace or drop problematic characters. It would be better to clean them
# directly in the dataframe though, as the changes could then always be redone.
def replace(values, searchFor):
    """Convert every text value that contains ``searchFor`` to latin-1 bytes, in place.

    For each affected value the dotless i (U+0131) is replaced by a plain
    ``i`` and the text is then encoded as latin-1 with ``errors='replace'``,
    so every other character latin-1 cannot represent becomes ``?``. The old
    and new forms are printed.

    Args:
        values: dict whose values are the texts to check; it is modified in place.
        searchFor: character (or substring) that marks a value as needing conversion.

    Returns:
        None. Affected entries of ``values`` hold ``bytes`` afterwards.
    """
    for k in values:
        value = values[k]
        # Values that are not str (for example ones already converted to
        # bytes by an earlier call) are left untouched.
        if not isinstance(value, str) or searchFor not in value:
            continue
        print("Old form:", value)
        # Convert once per value: repeating the conversion for every
        # occurrence would call str-style replace on the bytes result.
        values[k] = value.replace('\u0131', 'i').encode('latin-1', 'replace')
        print("New form:", values[k])
# The characters that get special treatment during cleaning. The list is
# defined here because the later clean-up cell that also assigns this name has
# not run yet when this cell is executed on a fresh kernel.
problematic_characters = ['\u2013', '\u2022', '\u202a', '\u2019', '\u202c', '\u0131']
for pr in problematic_characters:
    # ascii() prints the escape code; U+202A and U+202C are invisible
    # formatting characters and would show nothing when printed raw.
    print("Problematic is:", ascii(pr))

In [None]:
# Read-only counterpart of the replacement step: it only reports matches.
def search(values, searchFor):
    """Print every value of ``values`` that contains ``searchFor``.

    Each value is scanned element by element (character by character for a
    string). For every matching element the whole value is printed twice:
    once as-is and once encoded as latin-1 with unencodable characters
    dropped. Nothing is modified and nothing is returned.
    """
    for text in values.values():
        for element in text:
            if searchFor not in element:
                continue
            print("Original values:", text)
            print("If we ignore:", text.encode('latin-1', 'ignore'))

In [None]:
# Dry run: show every scraped value that contains an en dash (U+2013).
try:
    for item in all_info:
        search(item, '\u2013')
except Exception as exc:
    # Reaching this branch means the search itself failed (for example
    # because a value is no longer plain text), not that the data is clean,
    # so the failure is reported instead of claiming there were no problems.
    print("Search could not be completed:", type(exc).__name__, exc)

In [None]:
# Characters that latin-1 cannot represent and that were seen in the scraped data.
problematic_characters = ['\u2013', '\u2022', '\u202a', '\u2019', '\u202c','\u0131']
for cha in problematic_characters:
    for item in all_info:
        # Handle failures per record, so one bad record neither stops the
        # clean-up of the remaining ones nor hides what went wrong.
        try:
            replace(item, cha)
        except Exception as exc:
            print("Some problems!", type(exc).__name__, exc)

Look for "Something went wrong." entries in progress_log.txt! If data is missing, add it to the list (e.g. `all_info.insert(2045, add_site_info)`).

In [None]:
# Build the dataframe from the scraped records and attach the link of each
# site as an extra column. list_of_links is expected to line up one-to-one
# with the records in all_info.
df = pd.DataFrame(all_info).assign(**{"Bayer Internatinal Links": list_of_links})

In [None]:
# Phone / fax cleaning helpers
def number_clean(number):
    """Normalise a raw phone or fax string to a ``+<country code> ...`` form.

    Args:
        number: raw text of the number, or None. It may also be the ``str()``
            of a bytes value (``b'...'``), in which case the wrapper is removed.

    Returns:
        The cleaned number as a string, or ``"not available"`` when the input
        is None or shorter than 8 characters once surrounding whitespace (and
        a bytes wrapper) is removed. Numbers without a leading ``+`` are given
        the ``+49`` prefix.
    """
    if number is None:
        return "not available"
    clean = number.strip()
    # Unwrap only the exact b'...' / b"..." shape, so ordinary text that
    # merely starts with the letter "b" is not cut.
    if clean[:2] in ("b'", 'b"') and clean[-1] == clean[1]:
        clean = clean[2:-1].strip()
    if len(clean) < 8:
        return "not available"

    if clean[:2] == '49':
        clean = '+' + clean
    if clean[:4] == '0049':
        clean = '+49 ' + clean[4:]
    # Dots are separators, except in entries containing 'bzw' (e.g. "bzw."),
    # which the caller needs intact to split multiple numbers.
    if '.' in clean and 'bzw' not in clean:
        clean = clean.replace('.', ' ')

    # Order matters: separators are removed first, then the spacing around the
    # +49 prefix is normalised, then doubled or zero-prefixed pluses are fixed.
    for old, new in (
        ('-', ''),
        ('  ', ' '),
        ('(0)', ''),
        ('(', ''),
        (')', ''),
        ('/', ''),
        ('  ', ' '),
        ('+ 49', '+49'),
        ('+49', '+49 '),
        ('  ', ' '),
        ('++', '+'),
        ('+0', '+'),
    ):
        clean = clean.replace(old, new)

    if clean[:1] != '+' and clean != 'not available':
        # Drop the leading trunk '0' only when it is really there; discarding
        # the first character unconditionally loses a digit or, for input with
        # leading whitespace, produces a doubled "+49 +49" prefix.
        national = clean.lstrip()
        if national[:1] == '0':
            national = national[1:]
        clean = ('+49 ' + national).replace('  ', ' ')
    if clean[:2] == '+0':
        clean = '+49 ' + clean[2:]
    return clean

def format_number(number):
    """Format a cleaned number as ``<country code> <first 4 digits> <rest>``.

    All spaces are removed, then a space is inserted after a recognised
    country code (+49, +43, +48, +86, +7; a ``+0`` prefix is rewritten to
    ``+49``). The part after the code is split after its fourth character.

    Args:
        number: cleaned number text.

    Returns:
        The formatted number. When no recognised country code is present, the
        input is returned unchanged instead of raising IndexError.
    """
    value = number.replace(' ', '')
    for code in ('+49', '+43', '+48', '+86', '+7'):
        value = value.replace(code, code + ' ')
    value = value.replace('+0', '+49 ')
    parts = value.split(' ')
    if len(parts) < 2:
        # Nothing was split off, so there is no country code to format around.
        return number
    return parts[0] + ' ' + parts[1][:4] + ' ' + parts[1][4:]

# Cleaning the phone and fax numbers (and formatting them)
def _clean_number_cell(raw):
    """Clean one Phone/Fax cell; cells holding several numbers are joined with ' / '."""
    value = number_clean(str(raw))
    if value == "not available":
        return value
    if len(value) > 22:
        # Long entries presumably hold more than one number: turn the German
        # connectors into a separator, or split in front of a second +49.
        value = value.replace('oder', '/').replace('bzw.', '/')
        if '/' not in value:
            value = value.replace(' +49', ' / +49')
    splits = value.split('/')
    if len(splits) > 1:
        splits[1] = splits[1].replace('kostenfrei', '').replace('  ', '')
    formatted = []
    for part in splits:
        # Strip first: the pieces carry the spaces that surrounded the separator.
        part = number_clean(part.strip())
        if part == "not available":
            # A fragment too short to be a number; there is nothing to format.
            continue
        try:
            part = format_number(part)
        except IndexError:
            # No recognised country code: keep the cleaned but unformatted number.
            pass
        formatted.append(part)
    return ' / '.join(formatted) if formatted else "not available"


def _tidy_cell(cell):
    """Decode bytes cells, and mark missing or near-empty cells as 'not available'."""
    if isinstance(cell, bytes):
        return cell.decode("latin-1")
    if isinstance(cell, str):
        return cell if len(cell) >= 2 else "not available"
    if cell is None or (isinstance(cell, float) and cell != cell):
        # None / NaN: no value was scraped for this field.
        return "not available"
    return cell


# Whole columns are assigned at once rather than writing through
# df[column][row], which is chained indexing and may not update df itself.
string_to_clean = ["Phone", "Fax"]
for item in string_to_clean:
    df[item] = df[item].map(_clean_number_cell)

# Float cells in this column are the NaN placeholders of missing names.
df["Contact person"] = df["Contact person"].map(
    lambda cell: "not available" if isinstance(cell, float) else cell
)

for item in list(df.columns):
    df[item] = df[item].map(_tidy_cell)

In [None]:
# Export the cleaned table as a semicolon-separated, latin-1 encoded CSV
# without the row index.
df.to_csv(
    'Upper_Bavaria_info.csv',
    sep=';',
    encoding='latin-1',
    index=False,
    index_label=False,
)