In [2]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import os
import re
import pandas as pd
import time

#lower this number if you do not fear getting IP banned and just want to see the results
#or increase it if you want to run this multiple times in a short period
sleep_time = 1

Two prominent sellers of coffee beans in the city of Bern were choosen for this assignment.
The names are Adrianos and Blaser. 

The Coffee shops rarely sell their product on websites of larger retailers (e.g. Migros, Coop, Galaxus).
Their product can be bought locally in one of their stores or online. 

I chose these nice products because their products are mainly sold on their websites. Which makes it difficult to compare to other products.



In [3]:
#the following are the two sites which are compared
url_blaser = 'https://www.blasercafe.ch/shop/category/8'
url_adrianos = 'https://adrianos.ch/shop/kaffee/bohnen'

In [4]:

#with this method, chromedriver opens an instance of chromium
#by opening an instance of chromium graphically instead of headless
#it cirumvents some traditional ways websites check for webcrawlers
driver = webdriver.Chrome()
driver.get(url_blaser)

#By waiting a bit, there is a smaller chance of getting blocked 
time.sleep(sleep_time)

In [5]:
#A quick check of some typical products concludes that most of the prices are for 250g packages
#there are some exceptions though: through a manual check, the product "Geschenkset Berner Gourmet" contains 
#a variety of tee and coffee beans. This difference can only be spotted on the product specific page. 
# Because this information is not available from the overview, I chose to ignore this detail.

soup = bs(driver.page_source, 'html.parser')

#saving the html just in case
with open('soup_blaser.txt', 'w', encoding='utf-8') as file:
    file.write(str(soup))

#after checking the html for the first time it is clear that all 
#products start after a div with the class "views-row"
#so all of them are saved into the variable content
content = soup.find_all("div", class_=re.compile("views-row"))

#this list saves all information from the site "blaser" for later
products_blaser = []

for c in content: 
    #sometimes the row does not contian a product. Therefore it is surrounded by a try catch
    try:
        #the name is in another div called "product-teaser-content"
        product = c.find("div", class_=re.compile("product-teaser-content"))

        #the name is then in an span within an a. 
        # By striping the text the name is revealed as shown on the site
        name = product.find("a").find("span").text.strip()
        
        #the name usual contains "Bohnenkaffe" (Coffee beans). This information is unnecessary as only this 
        #type of coffee is viewed on this page
        if name.find("Bohnenkaffee") > -1:
            name = name[:(name.find("Bohnenkaffee"))-1]

        #the price is in another div called "product-teaser-content-wrapper"
        product_wrapper = c.find("div", class_=re.compile("product-teaser-content-wrapper"))

        #the price is then in an span. 
        # By striping the text the price is revealed as shown on the site
        price = product_wrapper.find("span").text.strip()
        
        #the price has some unnecessary information
        price = price[-(len(price)-7):]

        #Adding a dictionary containing the relevant information
        products_blaser.append({'name':name, 'price': price, 'grams':"250", 'product': "blaser", 'site': 'blaser'})
        
        #as a test the latest entry into the list is shown
        print(products_blaser[len(products_blaser)-1])
    
    except Exception as e:
        #this row has no products, so ignore
        continue

{'name': 'Terra Vita Bio Fairtrade', 'price': '6.90', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Rösterei Spezial Maya', 'price': '7.90', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Coffee for Champions -', 'price': '8.50', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Blasercafé Degustations-Box', 'price': '45.–', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Marrone', 'price': '6.10', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Verde Bio Fairtrade', 'price': '6.70', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Purista Fairtrade', 'price': '6.10', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Lilla e Rose', 'price': '6.30', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Lussuria', 'price': '6.–', 'grams': '250', 'product': 'blaser', 'site': 'blaser'}
{'name': 'Rosso & Nero', 'price': '6.90', 'grams': '250', 'product': 'blaser', 

In [6]:
#in order to prevent the computer endlessly opening new windows
#the current window is closed
driver.close()

In [7]:

#with this method, chromedriver opens an instance of chromium
#by opening an instance of chromium graphically instead of headless
#it cirumvents some traditional ways websites check for webcrawlers
driver = webdriver.Chrome()
driver.get(url_adrianos)

In [8]:
#According to the top of the page all the displayed prices of the adrianos products are for 250g packages
#there are some exceptions though: through a manual check, the product "Kaffeebohnen - Probierset" contains 
#6 different packages of coffee beans each weighing around 125g. This difference can only be spotted on the 
#product specific page. Because this information is not available from the overview, 
# I chose to ignore this detail

soup = bs(driver.page_source, 'html.parser')

#saving the html just in case
with open('soup_adrianos.txt', 'w', encoding='utf-8') as file:
    file.write(str(soup))

#after checking the html for the first time it is clear that all 
#products start after an article with the class "product product_preview"
#so all of them are saved into the variable content
content = soup.find_all("article", class_=re.compile("product product_preview"))

#this list saves all information from the site "adrianos" for later
products_adrianos = []

for c in content: 
    #sometimes the row does not contian a product. Therefore it is surrounded by a try catch
    try:
        #the relevant information is in a div called "product__caption product__caption_preview"
        product = c.find("div", class_=re.compile("product__caption product__caption_preview"))

        # By striping the text within the h2 the name is revealed as shown on the site
        name = product.find("h2").text.strip()

        #the name usual contains "Bohnenkaffe" (Coffee beans). This information is unnecessary as only this 
        #type of coffee is viewed on this page
        if name.find("Filterkaffee ") > -1:
            name = name[:(name.find("Filterkaffee "))-1]

        if name.find("Kaffee ") > -1:
            name = name[:(name.find("Kaffee "))-1]

        # By striping the text within the span the price is revealed as shown on the site
        price = product.find("span").text.strip()
        
        #the price has some unnecessary information
        price = price[-(len(price)-3):]

        #Adding a dictionary containing the relevant information
        products_adrianos.append({'name':name, 'price': price, 'grams':"250", 'product': "adrianos", 'site': 'adrianos'})
        
        #as a test the latest entry into the list is shown
        print(products_adrianos[len(products_adrianos)-1])
    
    except Exception as e:
        #this row has no products, so ignore
        continue

{'name': 'Tueste', 'price': ' 15.00', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'Juanita', 'price': ' 13.00', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'Festive Blend 2023', 'price': ' 12.50', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'Don Victor', 'price': ' 14.00', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'Lila Sorto', 'price': ' 14.00', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'Ichamama Natural', 'price': ' 14.00', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'ICE-T', 'price': ' 14.00', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'Pablo Montgomery', 'price': ' 14.00', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'Cascara Geisha Kaffeekirschen Tee', 'price': ' 9.00', 'grams': '250', 'product': 'adrianos', 'site': 'adrianos'}
{'name': 'Robusta Rocket', 'price': ' 11.50', 'grams': '250',

In [9]:
#in order to prevent the computer endlessly opening new windows
#the current window is closed
driver.close()

In [10]:
#Additionally, for Blaser the webiste "kaffeezentrale.ch" offers some of the blaser products as well
#It yields an interesting comparison, which is nice.

url_kaffeezentrale = 'https://kaffeezentrale.ch/kaffeebohnen?p=1&properties=3f287e0ff57d42daad52e4ad2b2ae68d'



In [11]:
#with this method, chromedriver opens an instance of chromium
#by opening an instance of chromium graphically instead of headless
#it cirumvents some traditional ways websites check for webcrawlers
driver = webdriver.Chrome()
driver.get(url_kaffeezentrale)

In [12]:
#As before, the prices are for 250g packages

soup = bs(driver.page_source, 'html.parser')

#saving the html just in case
with open('soup_kaffeezentrale.txt', 'w', encoding='utf-8') as file:
    file.write(str(soup))

#after checking the html for the first time it is clear that all 
#products start after a div with the class "views-row"
#so all of them are saved into the variable content
content = soup.find_all("div", class_=re.compile("cms-listing-col col dc-listing-col"))

#this list saves all information from the site "blaser" for later
products_kaffeezentrale = []

for c in content: 
    #sometimes the row does not contian a product. Therefore it is surrounded by a try catch
    try:
        #the name is in another div called "product-name"
        product = c.find("div", class_=re.compile("product-name"))

        # By striping the text the name is revealed as shown on the site
        name = product.text.strip()
        
        #the name usual contains "Blasercafe". This information is unnecessary as only this 
        #Coffee Brand is viewed
        if name.find("Blasercafé,") > -1:
            name = name[-(len(name)-(len("Blasercafé")+2)):]
        
        if name.find("Blasercafé") > -1:
            name = name[-(len(name)-(len("Blasercafé")+1)):]

        #the price is in another div called "custom-price"
        custom_price = c.find("div", class_=re.compile("custom-price"))

        #the price is then in an span. 
        # By striping the text the price is revealed as shown on the site
        price = custom_price.find("span").text.strip()
        
        #correcting the format
        price = price.replace(",", ".")

        #Adding a dictionary containing the relevant information
        products_kaffeezentrale.append({'name':name, 'price': price, 'grams':"250", 'product': "blaser", 'site': 'kaffeezentrale'})
        
        #as a test the latest entry into the list is shown
        print(products_kaffeezentrale[len(products_kaffeezentrale)-1])
    
    except Exception as e:
        #this row has no products, so ignore
        continue

{'name': 'Classico', 'price': '6.50', 'grams': '250', 'product': 'blaser', 'site': 'kaffeezentrale'}
{'name': 'Rosso e Nero', 'price': '6.90', 'grams': '250', 'product': 'blaser', 'site': 'kaffeezentrale'}
{'name': 'Terra Vita', 'price': '6.90', 'grams': '250', 'product': 'blaser', 'site': 'kaffeezentrale'}
{'name': 'Terroir, Brazil Capoerinha Estate', 'price': '8.90', 'grams': '250', 'product': 'blaser', 'site': 'kaffeezentrale'}
{'name': 'Terroir, Orang Utan Sumatra', 'price': '12.50', 'grams': '250', 'product': 'blaser', 'site': 'kaffeezentrale'}
{'name': 'Terroir, Panama Finca Bonita Springs', 'price': '12.90', 'grams': '250', 'product': 'blaser', 'site': 'kaffeezentrale'}
{'name': 'OCOA Santo Domingo, Dieter Meier Edition', 'price': '19.90', 'grams': '250', 'product': 'blaser', 'site': 'kaffeezentrale'}
{'name': 'Marrone', 'price': '22.50', 'grams': '250', 'product': 'blaser', 'site': 'kaffeezentrale'}
{'name': 'Lilla e Rose', 'price': '23.30', 'grams': '250', 'product': 'blaser',

In [13]:
#in order to prevent the computer endlessly opening new windows
#the current window is closed
driver.close()

In [14]:
#After gathering all this information. Following comparisons can be made:
# - How much does coffee beans on average per package and per gram cost at the different shops.
# - Is there a difference between the price for Blaser products at the different sites.

#Altough these are interesting questions, they are out of scope of the assingment