# Migros Location - Number of companies

In [None]:
# Check the robots.txt: https://www.swissyello.com/robots.txt

In [1]:
# Import libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="example app")
# https://towardsdatascience.com/pythons-geocoding-convert-a-list-of-addresses-into-a-map-f522ef513fd6#5352

In [None]:
# Extraction functions definition

# Field: Company name
def extr_company_name(element):
    try:
        company_name = element.find("h4").get_text()
    except:
        company_name = ""
        
    return company_name


# Field: Company address
def extr_company_address(element):
    try:
        company_address = element.find("div", {"class": "address"}).get_text()
    except:
        company_address = ""
    
    return company_address


# Field: Company latitude
def extr_company_ltd(element):
    try:
        company_ltd = element.find("a", {"class" : "mapmarker"}).get("data-ltd")
    except:
        company_ltd = ""
    
    return company_ltd


# Field: Company longitude
def extr_company_lng(element):
    try:
        company_lng = element.find("a", {"class" : "mapmarker"}).get("data-lng")
    except:
        company_lng = ""
    
    return company_lng


# Field: Company verified
def extr_company_verified(element):
    try:
        company_verified = element.find("u", {"class": "v"}).get_text()
    except:
        company_verified = ""
    
    return company_verified


# Field: Company coordinates
def extr_company_coordinates(address):
    
        try:
            data = geolocator.geocode(company_address)
            company_coordinates = data.point
        except:
            company_coordinates = ""
            
        return company_coordinates


In [None]:

# Initialize the objects needed for the scrape
url_base = "https://www.swissyello.com/location/Zurich/"
urls_failed = []
logs_dict = {}
logs_df = pd.DataFrame()
swissyello_df = pd.DataFrame()

for page_num in range(501, 600):
    
    # Create the url to scrape based on the page number
    url_scrape = url_base + str(page_num)
    
    # Request the page and load the response
    try:
        resp = requests.get(url_scrape) # , timeout=1
        soup = BeautifulSoup(resp.content, "html.parser")
    except:
        print("url get error")
        urls_failed.append(url_scrape)
        time.sleep(2)
    
    # Create a dict to append the data to a df
    swissyello_dict = {}
    
    # Download data of all the jobs in each page
    for element in soup.findAll("div", {"class": "company g_0"}):
        
        # Extract each field
        company_name     = extr_company_name(element)
        company_address  = extr_company_address(element)
        company_ltd      = extr_company_ltd(element)
        company_lng      = extr_company_lng(element)
        company_verified = extr_company_verified(element)
        
        # Extract coordinates from an address
        company_coordinates = extr_company_coordinates(company_address)

        swissyello_dict = {"company_name"        : company_name,
                           "company_address"     : company_address,
                           "company_ltd"         : company_ltd,
                           "company_lng"         : company_lng,
                           "company_verified"    : company_verified,
                           "company_coordinates" : company_coordinates
                          }
        
        # Insert rows in the dataframe
        swissyello_df = swissyello_df.append(swissyello_dict, True)
    
    # Register the logs in case of failure
    logs_dict = {"url": url_scrape}
    logs_df = logs_df.append(logs_dict, True)
    
    if page_num % 10 == 0:
        print("{}: OK".format(page_num))


In [None]:
swissyello_df.head()

In [None]:
swissyello_df.to_csv("companies_raw_data.csv", index=False)

# TESTS

In [3]:
company_address = "Schaffhauserstrasse 333, P.O. Box: 8050, Zurich	"
data = geolocator.geocode(company_address)
company_coordinates = data.point
company_coordinates

None


AttributeError: 'NoneType' object has no attribute 'point'

# WebScraping Glassdoor

In [None]:
# Import libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
