# Web Scraping - Berlin

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import urllib.parse

In [2]:
# Entry URL of the immobilo.de listing search that this notebook scrapes
# (path segment 'mieten/wohnung/berlin').
website = 'https://www.immobilo.de/mieten/wohnung/berlin'

In [3]:
# Fetch the search page. Without an explicit timeout requests waits
# indefinitely, so a stalled server would hang this cell forever.
response = requests.get(website, timeout=30)

In [4]:
# Display the HTTP status code of the fetched page (200 in the recorded run).
response.status_code

200

In [5]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [7]:
# One <div> per listing; every field probe below searches inside these.
results = soup.find_all('div', class_='item__main-info clearfix')

In [37]:
# Title of the first listing: text of its title link, surrounding whitespace removed
(
    results[0]
    .find('a', class_='js-item-title-link ci-search-result__link')
    .get_text()
    .strip()
)

'Möbliertes 1 Zimmer Studio im Herzen Berlins'

In [14]:
# Location of the first listing, surrounding whitespace removed
(
    results[0]
    .find('div', class_='item__locality')
    .get_text()
    .strip()
)

'Lehrter Straße, 10557 Berlin'

In [16]:
# Room specification of the first listing (text is taken as-is, not stripped)
(
    results[0]
    .find('div', class_='item__spec item-spec-rooms')
    .get_text()
)

'1 Zi.'

In [20]:
# Area of the first listing: strip the ends, then drop embedded newlines
(
    results[0]
    .find('div', class_='item__spec item-spec-area')
    .get_text()
    .strip()
    .replace('\n', '')
)

'22,35 m2'

In [23]:
# Price of the first listing with the non-breaking spaces (\xa0) removed
(
    results[0]
    .find('div', class_='item__spec item-spec-price')
    .get_text()
    .replace('\xa0', '')
)

'1.532,00€'

In [26]:
# Price type label of the first listing ('Warmmiete' in the recorded run)
(
    results[0]
    .find('div', class_='small text-muted item-spec-price-type')
    .get_text()
    .strip()
)

'Warmmiete'

In [45]:
#Apartment type
# Locate the text node containing 'Immobilientyp:' and flatten it to one line.
# `string=` is the current name of this filter; `text=` is its deprecated alias.
results[0].find(string=re.compile('Immobilientyp:')).strip().replace('\n', '')

'Immobilientyp:    Wohnung, Wohngemeinschaft, Zimmer'

In [33]:
# Relative link: href attribute of the first listing's title link
relative_url = (
    results[0]
    .find('a', class_='js-item-title-link ci-search-result__link')
    .get('href')
)

In [34]:
# Site root that the relative listing links are resolved against.
root_url = 'https://www.immobilo.de'

In [35]:
# Resolve the href against the site root. Unlike plain string concatenation,
# urljoin also copes with an href that is already absolute or that lacks a
# leading slash, and it matches how the full scrape builds its URL list.
combine_url = urllib.parse.urljoin(root_url, relative_url)

In [36]:
# Display the absolute URL built for the first listing.
combine_url

'https://www.immobilo.de/immobilien/moebliertes-1-zimmer-studio-im-herzen-berlins-DL4THT'

In [53]:
# Iteration through multiple pages
#
# Walks result pages 1 through 400 and collects one entry per listing.
# The eight lists below are kept in lockstep: every listing appends exactly
# one value to each list, and a field that is missing from a listing is
# recorded as '' so that all lists keep the same length.

title_list = []
location_list = []
room_list = []
area_list = []
price_list = []
price_type_list = []
apartment_type_list = []
relative_link_list = []

title_link_class = 'js-item-title-link ci-search-result__link'
# Compiled once here instead of once per listing.
apartment_type_pattern = re.compile('Immobilientyp:')


def _text_or_empty(tag):
    """Return the text of *tag*, or '' when the lookup found nothing (None)."""
    return '' if tag is None else tag.get_text()


# A session reuses the underlying connection across the page requests.
with requests.Session() as session:
    for i in range(1, 401):

        website = 'https://www.immobilo.de/mieten/wohnung/berlin?page=' + str(i)

        # The timeout keeps a single stalled request from hanging the whole
        # run. A page that cannot be fetched (network error or HTTP error
        # status) is reported and skipped, so the listings collected so far
        # are kept.
        try:
            response = session.get(website, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            print(f'Skipping page {i}: {error}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        results = soup.find_all('div', {'class': 'item__main-info clearfix'})

        # Loop through results.
        # `find` returns None for an absent element; that case is handled
        # explicitly instead of with a bare `except:`, which would also
        # swallow KeyboardInterrupt and hide genuine programming errors.
        for result in results:

            # The title link supplies both the title text and the href.
            title_link = result.find('a', {'class': title_link_class})

            # Title
            title_list.append(_text_or_empty(title_link).strip())

            # Location
            location_tag = result.find('div', {'class': 'item__locality'})
            location_list.append(_text_or_empty(location_tag).strip())

            # Rooms (text is kept as-is, not stripped)
            rooms_tag = result.find('div', {'class': 'item__spec item-spec-rooms'})
            room_list.append(_text_or_empty(rooms_tag))

            # Area
            area_tag = result.find('div', {'class': 'item__spec item-spec-area'})
            area_list.append(_text_or_empty(area_tag).strip().replace('\n', ''))

            # Price (non-breaking spaces removed)
            price_tag = result.find('div', {'class': 'item__spec item-spec-price'})
            price_list.append(_text_or_empty(price_tag).replace('\xa0', ''))

            # Type of price
            price_type_tag = result.find('div', {'class': 'small text-muted item-spec-price-type'})
            price_type_list.append(_text_or_empty(price_type_tag).strip())

            # Type of apartment: a text node rather than a tag, so it is
            # matched with the `string=` filter (the current name of the
            # deprecated `text=` argument).
            apartment_type = result.find(string=apartment_type_pattern)
            apartment_type_list.append(
                '' if apartment_type is None
                else apartment_type.strip().replace('\n', '')
            )

            # Relative link; a missing link or missing href is recorded as ''.
            relative_link_list.append(
                '' if title_link is None else title_link.get('href', '')
            )

In [54]:
# Resolve every collected href against the site root to get absolute URLs.
url_list = [urllib.parse.urljoin(root_url, link) for link in relative_link_list]

In [55]:
# Assemble the scraped values into one table; each list supplies one column.
df = pd.DataFrame(
    {
        'Title': title_list,
        'Location': location_list,
        'Rooms': room_list,
        'Area': area_list,
        'Price': price_list,
        'Price type': price_type_list,
        'Apartment type': apartment_type_list,
        'Link': url_list,
    }
)

In [57]:
# Export the collected listings to an Excel workbook; the relative path means
# the file is created in the current working directory.
df.to_excel('Apartment_Berlin_Nov.xlsx')