# Crawling the Central Business Index

This notebook shows sample usage for crawling the [Central Business Index](https://www.zefix.ch/en/search) (Zefix). The basic idea is to extract:

* The UID (id for each firm)
* The name of the firm
* The revisor (auditor, *Revisionsstelle*)
* Members of the board, their functions and the history of the board

With this data we can then construct a network of firms, their boards and their revisors to see how the firms in the market are linked to one another. Maybe we can find some sort of systemic risk?

In [None]:
# General Stuff
import os
import gc
import pandas as pd
import numpy as np
import time
import sys

# Scraping
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from requests import get
from requests.exceptions import RequestException

# Parsing
import re
# Only `unescape` is bound here, so the `html` variable that later holds the
# page source is unaffected.
from html import unescape
from bs4 import BeautifulSoup

# Chrome & Driver params
CHROME_PATH = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
CHROMEDRIVER_PATH = '../../chromedriver'

# Custom 
sys.path.append('../utils/')
from utils_scraping import extract_html

In [None]:
# Sample firm-detail page used to develop the parsing steps below.
# NOTE(review): the URL uses the /de/ path, and the labels matched later in
# this notebook ('Revisionsstelle/n:') look German -- presumably the two have
# to stay in sync; confirm before switching language.
test_url = (
    'https://www.zefix.ch/de/search/entity/list/firm/1150001'
    '?name=Schweizerisch%20-%20Polnische%20Wirtschaftskammer%20PolChamber'
    '&searchType=exact'
)

In [None]:
# Browser configuration handed to the scraping helper: use the Chrome binary
# at CHROME_PATH and start it with the "--headless" argument.
chrome_options = Options()
chrome_options.binary_location = CHROME_PATH
chrome_options.add_argument("--headless")

In [None]:
# Fetch the sample page through the project helper from utils_scraping.
# NOTE(review): presumably this drives Chrome via the given driver/options and
# returns the page source as a string -- confirm against the helper.
html = extract_html(
    url=test_url,
    driver_path=CHROMEDRIVER_PATH,
    driver_options=chrome_options,
)

In [None]:
# Display the value returned by extract_html as the cell output, as a quick
# sanity check before parsing.
html

In [None]:
# Parse the fetched markup with Python's built-in HTML parser.
soup = BeautifulSoup(html, features='html.parser')

In [None]:
# First <div> whose ng-show attribute is 'firm.details.name'; every lookup in
# the cells below is scoped to this element.
firmContent = soup.find('div', attrs={'ng-show': 'firm.details.name'})

In [None]:
# Row labels (exactly as they appear on the page) whose values we want to keep.
wantItems = [
    'Revisionsstelle/n:',
    'UID:',
]

In [None]:
# Captures, non-greedily, the text between a '>' and the next '<', i.e. the
# text directly following a tag. '.' does not match newlines here, so text
# that spans several lines is not captured.
contentDelimiters = re.compile(r'>(.*?)<', flags=0)

In [None]:
# Print "<label> <value>" for every table row whose label is in `wantItems`.
for item in firmContent.find_all('tr'):
    for nameItem in item.find_all('td', {'class': 'ng-binding'}):
        labelMatch = contentDelimiters.search(str(nameItem))
        if labelMatch is None:
            # The pattern captured nothing (e.g. the cell text is wrapped onto
            # several lines), so there is no label to compare. Skip the cell
            # instead of raising IndexError.
            continue
        # str(tag) re-serialises the markup, so '&' comes back as '&amp;';
        # decode entities to recover the text as displayed.
        iterTerm = unescape(labelMatch.group(1))

        if iterTerm not in wantItems:
            continue

        for webItem in item.find_all('span', {'class': 'ng-binding'}):
            valueMatch = contentDelimiters.search(str(webItem))
            if valueMatch is None:
                # Span without capturable text: nothing to report.
                continue
            webIter = unescape(valueMatch.group(1))
            print(iterTerm, webIter)
        

In [None]:
# Print the firm name: the first text segment of each <strong> element inside
# the firm-details container.
for item in firmContent.find_all('strong'):
    # str(item) re-serialises the tag, which writes '&' as '&amp;' (and '<',
    # '>' as entities). Decode so names such as "A & B AG" are not stored as
    # "A &amp; B AG".
    firmName = unescape(re.findall(contentDelimiters, str(item))[0])
    print(firmName)

In [None]:
# Print the firm's alternative names: the first text segment of every <span>
# rendered by the "translation in firm.translation" ng-repeat.
for item in firmContent.find_all('span', {'ng-repeat': "translation in firm.translation"}):
    # str(item) re-serialises the tag, which writes '&' as '&amp;'; decode
    # entities so the names match what the page displays.
    otherNames = unescape(re.findall(contentDelimiters, str(item))[0])
    print(otherNames)

In [None]:
# SHAB entries: for every entry in the 'firm-shab-entries' section, pull out
# the location, the "SHAB: ..." reference and the "Tagesregister: ..."
# reference, then print the message body.
# A field that is missing from an entry is set to None instead of raising
# IndexError, and entries without a message body are skipped.
for item in firmContent.find_all('section', {'id': 'firm-shab-entries'}):
    for iterItem in item.find_all('div', {'ng-repeat': "entry in firm.details.shabPub | orderBy : '-shabDate'"}):
        entryHtml = str(iterItem)

        # find() returns None when the span is absent; str(None) then simply
        # fails to match and `ort` becomes None.
        ortMatch = contentDelimiters.search(
            str(iterItem.find('span', {'class': "visible-xs ng-binding"}))
        )
        ort = unescape(ortMatch.group(1)) if ortMatch else None

        shapMatch = re.search(r'(SHAB:.*?)<', entryHtml)
        shapEntry = unescape(shapMatch.group(1)) if shapMatch else None

        tagesMatch = re.search(r'(Tagesregister:.*?)<', entryHtml)
        tagesRegister = unescape(tagesMatch.group(1)) if tagesMatch else None

        messageBody = iterItem.find('p', {'class': "shab-message-body"})
        if messageBody is None:
            # No message paragraph in this entry: nothing to print.
            continue

        print(messageBody.prettify())