In [1]:
# Import required libraries for scrapping
import urllib.request
from bs4 import BeautifulSoup
import smtplib
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import requests # library to handle requests


import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Regular Expressions to collect the addresses
import re

# Get the same number of District cells as Boroughs
from itertools import chain

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values



# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Class object has been created to store the credentials for safekeeping
#import credentials_oa_outlk

print('Libraries imported.')

Libraries imported.


## Getting the number of habitants per borough

In [2]:
url_hab = 'http://www.citypopulation.de/en/germany/dusseldorf/admin/'


In [3]:
# Getting the Coordinates for Düsseldorf

address = 'Düsseldorf, NRW'

geolocator = Nominatim(user_agent="duesseldorf_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Düsseldorf are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Düsseldorf are 51.2254018, 6.7763137.


# 
## Getting the Boroughs

In [4]:
url_wiki = 'https://de.wikipedia.org/wiki/Liste_der_Stadtbezirke_von_D%C3%BCsseldorf'

page = requests.get(url_wiki)
# Parsing through the URL
soup = BeautifulSoup(page.content, 'html.parser')


print("Data parsed.")

Data parsed.


In [5]:
#all_tables = soup.find_all("table")
right_table = soup.find('table', class_ = 'wikitable sortable')

# class="admin2" | class="rpop prio1"
# bors = soup.findAll(class_=['admin2'])

# 
### Assigning Districts and Boroughs to DataFrame

In [6]:
# SET THE DATAFRAME COLUMNS || CREATE EMPTY DATAFRAME WITH THE SAME STRUCTURE OF THE ORIGINAL
column_names = ['District','Borough','Fläche','Einwohner','Bewölkerungsdichte','Bezirkvorsteher','K','Karte']
df = pd.DataFrame(columns = column_names)

#####################################################
# LOAD OF THE DATAFRAME || CLEAR TO THE 2 COLUMNS NEEDEED
for tr_cell in right_table.find_all('tr'):
    row_data = []
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.rstrip())
        
    if len(row_data)>0:
        df.loc[len(df)] = row_data
        
df = df[['District', 'Borough']]
print(df.head(2))

#####################################################
# REMOVE NUMERIC DIGITS FROM THE BOROUGHS
df.set_index(['District', 'Borough'])  
df = df.replace(regex=r'[0-9]+', value=',')
district_numb = ['1','2','3','4','5','6','7','8','9','10']
df['district_numb'] = district_numb

print(df.head(2))

           District                                            Borough
0  Stadtbezirk 1[1]  011 Altstadt012 Carlstadt013 Stadtmitte014 Pem...
1  Stadtbezirk 2[2]     021 Flingern Süd022 Flingern Nord023 Düsseltal
           District                                            Borough  \
0  Stadtbezirk ,[,]  , Altstadt, Carlstadt, Stadtmitte, Pempelfort,...   
1  Stadtbezirk ,[,]           , Flingern Süd, Flingern Nord, Düsseltal   

  district_numb  
0             1  
1             2  


### Check the Import part

In [7]:
# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split(',')))

# calculate lengths of splits
lens = df['Borough'].str.split(',').map(len)

# create new dataframe, repeating or chaining as appropriate
res = pd.DataFrame({'District': np.repeat(df['District'], lens),
                    'Borough': chainer(df['Borough']),
                    'district_numb': np.repeat(df['district_numb'], lens)})
# Strip empty cells ""
res = res[res['Borough'].str.strip().astype(bool)]

In [8]:
res['Dis'] = res['District'].str[:11] 
res['District'] = res['Dis']+' '+res['district_numb']
df_boroughs = res[['District','Borough']]
df_boroughs.reset_index(drop=True)
print('Scrapping of Boroughs completed!\n')
print(df_boroughs.head(2))

Scrapping of Boroughs completed!

        District     Borough
0  Stadtbezirk 1    Altstadt
0  Stadtbezirk 1   Carlstadt


# 
## Scrapping housing information

In [9]:
# Set/access url to be scrapped
url_house = 'https://www.immobilienscout24.de/Suche/radius/wohnung-mieten?centerofsearchaddress=D%C3%BCsseldorf;;;1276010012;Nordrhein-Westfalen;&numberofrooms=2.0-&price=-2000.0&geocoordinates=51.23824;6.81513;3.0&pagenumber=3'
headers = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

page = requests.get(url_house, headers=headers)

# Parsing through the URL
soup = BeautifulSoup(page.content, 'html.parser')

print("Data parsed.")

Data parsed.


## ~~Getting the address~~

In [10]:
# Getting the address of each house
address  = soup.find_all('button')

address1 = []
for a in address:
    a = a.decode()
    stuff = re.findall(("[^<>]+\w+[A-Za-z].\\s+Düsseldorf"), a)
    if len(stuff) == 0:
        continue
    else:        
        address1.append(stuff)

#address1

## ~~Getting the price~~

In [11]:
# Getting the price for each house
price  = soup.find_all('dd')

price1 = []
for p in price:
    p = p.decode()
    stuff = re.findall(("[^<>]+\\s+€"),p)
    if len(stuff) == 0:
        continue
    else:        
        price1.append(stuff)

#price1

## ~~Getting the number of rooms~~

In [12]:
# Getting the number of rooms
rooms = soup.findAll(class_=['onlySmall'])

rooms1 = []
for r in rooms:
    r = r.decode()
    stuff = re.findall(("[>]+[0-9]+[<]"), r)
    if len(stuff) == 0:
        continue
    else:        
        rooms1.append(stuff)

#rooms1

# 
## Sequential scrapping

In [18]:
pnumber = 1 
num = 1 
address1 = []
price1 = []
rooms1 = []

while pnumber < 11:
    url_house = 'https://www.immobilienscout24.de/Suche/radius/wohnung-mieten?centerofsearchaddress=D%C3%BCsseldorf;;;1276010012;Nordrhein-Westfalen;&numberofrooms=2.0-&price=-2000.0&geocoordinates=51.23824;6.81513;3.0&pagenumber={}'.format(pnumber)
    headers = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

    page = requests.get(url_house, headers=headers)
    
    #Check if pages are being scraped
    print('Page {} scraped'.format(num))
    num = num + 1
    
    #Getting Adress
    address  = soup.find_all('button')
    
    for a in address:
        a = a.decode()
        stuff = re.findall(("[^<>]+\w+[A-Za-z].\\s+Düsseldorf"), a)
        if len(stuff) == 0:
            continue
        else:        
            address1.append(stuff)
    
    #Getting price
    price  = soup.find_all('dd')
    
    for p in price:
        p = p.decode()
        stuff = re.findall(("[^<>]+\\s+€"),p)
        if len(stuff) == 0:
            continue
        else:        
            price1.append(stuff)
    
    
    #Getting number of rooms
    rooms = soup.findAll(class_=['onlySmall'])
    
    for r in rooms:
        r = r.decode()
        stuff = re.findall(("[>]+[0-9]+[<]"), r)
        if len(stuff) == 0:
            continue
        else:   
            rooms1.append(stuff)
    
    
    pnumber = pnumber + 1
    

Page 1 scraped
Page 2 scraped
Page 3 scraped
Page 4 scraped
Page 5 scraped
Page 6 scraped
Page 7 scraped
Page 8 scraped
Page 9 scraped
Page 10 scraped


In [20]:
flat_address = []
flat_price = []
flat_rooms = []

for sublist in address1:
    for item in sublist:
        flat_address.append(item)

for sublist in price1:
    for item in sublist:
        flat_price.append(item)
        
for sublist in rooms1:
    for item in sublist:
        flat_rooms.append(item)

pd_address = pd.Series(flat_address)
pd_price = pd.Series(flat_price)
pd_rooms = pd.Series(flat_rooms)


In [23]:
pd_rooms = pd_rooms.replace(regex=)

0    >3<
1    >4<
2    >3<
3    >3<
4    >2<
dtype: object