# Tutti.ch Webscraper
## Mini Challenge 5 - wdb @ FHNW BSc Data Science
### Author: Lukas Reber

Script für das Webscraping von tutti.ch mittels Selenium. Das Script sucht sämtliche Inserate, welche mit dem definierten Suchbegriff gefunden werden, und speichert pro Inserat folgende Attribute:

* Inserat-ID
* Titel
* Beschreibung
* Kategorie
* Preis
* Ort
* Anzahl Aufrufe
* Benutzername
* Datum des Inserats

Damit das Script funktioniert, müssen Chrome und der dazu passende ChromeDriver installiert sein. Der ChromeDriver wird unter `./ChromeDriver/chromedriver` erwartet.

In [1]:
# using ChromeDriver 89

from selenium import webdriver
from selenium.webdriver.common.keys import Keys # used to send keys

# https://selenium-python.readthedocs.io/waits.html
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import re
import pandas as pd
from time import sleep

In [27]:

def get_tutti_ad_links(searchterm):
    """Get a list of links from all Tutti ads from the defined search term.
        This list can then be further used to scrape all information from the tutti ads.

    A Chrome session is started, the search is submitted on tutti.ch and the
    result pages are walked one by one. The browser is always shut down before
    the function returns or raises.

    Args:
        searchterm (String): String to search ads on tutti.ch

    Raises:
        Exception: Navigation could not access search bar
        Exception: Navigation could not access the number of search results
        selenium.common.exceptions.TimeoutException: The cookie disclaimer
            button did not become clickable within 10 seconds

    Returns:
        list: List of links to all Tutti ads returned from the specific search term.
            If a result page cannot be read, paging stops and the links
            collected up to that point are returned.
    """
    # imported locally so the narrow except clauses below do not depend on
    # the notebook's import cell
    from selenium.common.exceptions import WebDriverException

    # defining the browser to be used
    PATH = './ChromeDriver/chromedriver'
    driver = webdriver.Chrome(PATH)

    # everything after the browser launch runs under try/finally so that no
    # Chrome process is left behind, whatever goes wrong
    try:
        # Website to be used - this is hardcoded since the function wouldn't work for another website anyways
        driver.get("https://tutti.ch")

        # accept cookie disclaimer; wait for the button instead of assuming it
        # is already rendered when get() returns
        cookie = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
        )
        cookie.click()

        # access the search bar to search for defined term
        try:
            root = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "root"))
            )
            search = root.find_element(By.NAME, 'search')
            search.clear()
            search.send_keys(searchterm)
            search.send_keys(Keys.RETURN)
        except WebDriverException as err:
            raise Exception('Excecution failed on search bar navigation') from err

        # getting the number of search results the search term delivered
        try:
            root = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "root"))
            )

            # wait some more for everything to load
            sleep(5)

            # find number of search results by css selector (an xpath
            # contains() lookup on the heading text did not work here)
            num_results = root.find_element(By.CSS_SELECTOR, 'h1')

            # using regex to only get the number from the string;
            # n[0] raises IndexError when the heading holds no digits
            n = re.findall(r'\d+', num_results.text)
            print(f'number of search results: {n[0]}')
        except (WebDriverException, IndexError) as err:
            raise Exception('Excecution failed on getting the number of search results: no results') from err

        # getting the links from all ads, one result page at a time
        links = []
        while True:
            try:
                root = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "root"))
                )
                # find all titles by xpath search and take the href of each
                ads = root.find_elements(By.XPATH, '//a[@data-automation="ad-subject"]')
                links.extend(ad.get_property('href') for ad in ads)
            except WebDriverException:
                # the page could not be read: stop paging instead of looping
                # on a broken session, and keep what was collected so far
                break

            # move to next page until there are no more pages left
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div/div/div/div[2]/div/div[1]/div/div/div/div[1]/div/div[6]/nav/ul/li[last()-1]/a'))
                )
                element.click()
            except WebDriverException:
                # no next-page link found (or it could not be clicked)
                break

            # do some more waiting, just to be sure
            sleep(5)

        # return list of links
        return links
    finally:
        # quit() closes every window and ends the driver session
        driver.quit()




In [76]:
def get_ad_data(links):
    """Scrape the detail page of every ad in ``links`` into a DataFrame.

    One Chrome session is opened, the cookie disclaimer on tutti.ch is
    accepted once, and each link is then visited in turn. An ad whose page
    cannot be loaded or lacks one of the required elements is skipped with a
    printed notice; the remaining ads are still scraped. The browser is
    always shut down before the function returns or raises.

    Args:
        links (list): URLs of tutti.ch ad pages. The ad id is read from
            index 9 of the '/'-split URL.

    Raises:
        selenium.common.exceptions.TimeoutException: The cookie disclaimer
            button did not become clickable within 10 seconds

    Returns:
        pandas.DataFrame: One row per successfully scraped ad with the columns
            id, title, price, place, user, description, category, dateadded
            and views. ``user`` is None when the page shows no user element.
            An empty frame with these columns is returned for empty input.
    """
    columns = ['id', 'title', 'price', 'place', 'user', 'description', 'category', 'dateadded', 'views']

    # nothing to scrape: do not launch a browser at all
    if links is None or len(links) == 0:
        return pd.DataFrame([], columns=columns)

    # imported locally so the narrow except clauses below do not depend on
    # the notebook's import cell
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    # absolute xpaths of the elements every ad page is expected to contain
    required_xpaths = {
        'title': '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/div/div[1]/div[1]/div[1]/h1',
        'price': '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/div[1]/h2',
        'place': '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/table/tbody/tr[4]/td/dd',
        'description': '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/table/tbody/tr[2]/td',
        'category': '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[1]/div/a',
        'dateadded': '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/div[2]/div[1]/span',
        'views': '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/div[2]/div[2]/span',
    }
    # the user name is optional and handled separately
    user_xpath = '/html/body/div[2]/div/div/div/div[2]/div/div/div/div[1]/div[2]/div[2]/div[1]/div/div[2]/h4'

    # defining the browser to be used
    PATH = './ChromeDriver/chromedriver'
    driver = webdriver.Chrome(PATH)

    # rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0
    rows = []
    try:
        # open the start page once to accept the cookie disclaimer
        driver.get("https://tutti.ch")
        cookie = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
        )
        cookie.click()

        # iterate through all links and get the defined information
        for link in links:
            try:
                driver.get(link)
                root = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "root"))
                )
                row = {'id': link.split('/')[9]}
                for field, xpath in required_xpaths.items():
                    row[field] = root.find_element(By.XPATH, xpath).text
                try:
                    row['user'] = root.find_element(By.XPATH, user_xpath).text
                except NoSuchElementException:
                    row['user'] = None
            except (WebDriverException, IndexError) as err:
                # keep the session alive and carry on with the next ad
                # instead of discarding everything scraped so far
                print(f'skipping {link}: {type(err).__name__}')
                continue
            rows.append(row)
    finally:
        # quit() closes every window and ends the driver session
        driver.quit()

    return pd.DataFrame(rows, columns=columns)


In [28]:
# collect the links of all tutti.ch ads found for the search term
links = get_tutti_ad_links('bodenplatten aussen')

number of search results: 81


In [77]:
# scrape the ad details; the slice limits this run to the first two links
data = get_ad_data(links[0:2])


In [78]:
# display the scraped DataFrame as the cell output
data

Unnamed: 0,id,title,price,place,user,description,category,dateadded,views
0,31937422,Offroad Klappanhänger von TPV Böckmann,8'260.-,6017,,"Geht überall hin, wo dein Auto auch geht\nCamp...",Autozubehör,Heute 07:03,33
1,41359857,Bodenplatten - Terrassenplatten,89.-,9615,Aeden Platten & Natursteine,Emotion Grey Natural. Bodenplatten für aussen ...,Baumaterial,Heute 04:02,11
