# IBM Data Science Professional Certificate - Capstone Final Project - Oscar Antunes
##### 
### Analysis of Düsseldorf boroughs, Germany
##### 
#### Web scraping for Boroughs, Houses available, Housing prices, Inhabitants, Venues
##### 


In [1]:
# Import required libraries for scraping
import urllib.request
from bs4 import BeautifulSoup
import smtplib
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import requests # library to handle requests


import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Regular Expressions to collect the addresses
import re

# Get the same number of District cells as Boroughs
from itertools import chain

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values



# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Class object has been created to store the credentials for safekeeping
#import credentials_oa_outlk

print('Libraries imported.')

Libraries imported.


#### 
## Getting the number of inhabitants per borough

In [22]:
# Download the citypopulation.de administrative-division page for Düsseldorf.
url_hab = 'http://www.citypopulation.de/en/germany/dusseldorf/admin/'
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get(url_hab, timeout=30)
page.raise_for_status()

# Parse the downloaded HTML so the following cells can search it
soup = BeautifulSoup(page.content, 'html.parser')

In [23]:

# Table cells to scan: <td> elements matched by the class filter below
pop = soup.find_all('td', class_=['admin2', 'rpop prio1'])

# A digit followed by one or more digits, commas or dots (e.g. '85,914')
number_pattern = re.compile(r'[0-9][0-9,.]+')

# Gather every number-like token, in document order, from the HTML of each cell;
# cells without a match simply contribute nothing.
pop_2018 = []
for cell in pop:
    pop_2018.extend(number_pattern.findall(cell.decode()))

# Keep the values as strings (object dtype); they still contain thousands separators
pd_pop = pd.Series(pop_2018, dtype=object)
pd_pop.head(2)

0    85,914
1     2,404
dtype: object

In [29]:
# All elements carrying itemprop="name" on the page currently held in `soup`
name = soup.find_all(itemprop="name")

# For each element, capture the text between the first '>' and the last '<'
# of its HTML; elements without such text contribute nothing.
bez_name = []
for tag in name:
    bez_name.extend(re.findall((r">(.*)\<"), tag.decode()))

# Hard-coded positions: per the original author's note, the first three entries
# are continent/country names and entry 45 is an old borough that was suppressed.
# NOTE(review): these offsets depend on the page layout at scrape time - re-check
# them if the site changes.
bez_name = bez_name[3:45] + bez_name[46:]

# A Series built from a list already has a fresh 0..n-1 index, so the former
# `pd_name.reset_index(drop=True)` call (whose result was discarded) is not needed.
pd_name = pd.Series(bez_name, dtype=object)
pd_name.head(2)

0    Stadtbezirk 1
1         Altstadt
dtype: object

In [28]:
# Place names and population values side by side, aligned on their positional index;
# `keys` supplies the column labels directly.
population = pd.concat(
    [pd_name, pd_pop],
    axis=1,
    sort=False,
    keys=['Area', 'Population'],
)
population.head()

Unnamed: 0,Area,Population
0,Stadtbezirk 1,85914
1,Altstadt,2404
2,Carlstadt,2285
3,Derendorf,20610
4,Golzheim,12597


#### 
## Getting the Coordinates for the city of Düsseldorf

In [6]:
# Look up the coordinates of Düsseldorf with the Nominatim geocoder

address = 'Düsseldorf, NRW'

geolocator = Nominatim(user_agent="duesseldorf_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Düsseldorf are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Düsseldorf are 51.2254018, 6.7763137.


# 
## Getting the Boroughs

In [30]:
# German Wikipedia list of the city districts (Stadtbezirke) of Düsseldorf
url_wiki = 'https://de.wikipedia.org/wiki/Liste_der_Stadtbezirke_von_D%C3%BCsseldorf'

# The timeout avoids hanging on a stalled connection; raise_for_status() stops
# on an HTTP error instead of parsing an error page.
page = requests.get(url_wiki, timeout=30)
page.raise_for_status()
# Parse the downloaded HTML
soup = BeautifulSoup(page.content, 'html.parser')
print("Data parsed.")

# Identify the table to scrape: the first table whose class is 'wikitable sortable'
right_table = soup.find('table', class_ = 'wikitable sortable')
# find() returns None when nothing matches; only report success if a table exists
if right_table is None:
    raise ValueError("No table with class 'wikitable sortable' found at " + url_wiki)
print("Table located.")

Data parsed.
Table located.


# 
## Assigning Districts and Boroughs to DataFrame

In [31]:
# Column labels for the eight cells of each table row; only the first two are kept
column_names = ['District','Borough','Fläche','Einwohner','Bewölkerungsdichte','Bezirkvorsteher','K','Karte']

#####################################################
# Collect the text of every <td> cell, row by row. Rows without any <td>
# (e.g. a header row made of other cell types) yield an empty list and are skipped.
# The rows are gathered in a list and turned into a DataFrame once, instead of
# growing the DataFrame one row at a time.
rows = []
for tr_cell in right_table.find_all('tr'):
    row_data = [td_cell.text.rstrip() for td_cell in tr_cell.find_all('td')]
    if row_data:
        rows.append(row_data)

df = pd.DataFrame(rows, columns=column_names)

# Keep only the two columns needed
df = df[['District', 'Borough']]
print(df.head(2),'\n')

#####################################################
# Replace every run of digits with a comma. In the Borough column this turns the
# numeric codes in front of each borough name into separators; it also strips the
# district number from the District column.
df = df.replace(regex=r'[0-9]+', value=',')

# Since the number was removed from the District column, re-create it from the
# row position (1-based, as strings). Deriving it from len(df) keeps this valid
# for any number of scraped rows.
df['district_numb'] = [str(i) for i in range(1, len(df) + 1)]

print(df.head(2))

           District                                            Borough
0  Stadtbezirk 1[1]  011 Altstadt012 Carlstadt013 Stadtmitte014 Pem...
1  Stadtbezirk 2[2]     021 Flingern Süd022 Flingern Nord023 Düsseltal 

           District                                            Borough  \
0  Stadtbezirk ,[,]  , Altstadt, Carlstadt, Stadtmitte, Pempelfort,...   
1  Stadtbezirk ,[,]           , Flingern Süd, Flingern Nord, Düsseltal   

  district_numb  
0             1  
1             2  


In [32]:
### Get several District cells for each Borough
def chainer(s):
    """Split every string of Series `s` on ',' and return all pieces as one flat list.

    The pieces keep their original order and are not stripped, so empty strings
    and surrounding spaces are preserved.
    """
    pieces_per_row = s.str.split(',')
    return [piece for pieces in pieces_per_row for piece in pieces]

# How many comma-separated pieces each row's Borough string splits into
borough_parts = df['Borough'].str.split(',')
lens = borough_parts.map(len)

# One output row per piece: District and district_numb are repeated `lens` times
# for their row, while Borough holds the individual pieces.
repeated_district = np.repeat(df['District'], lens)
repeated_numb = np.repeat(df['district_numb'], lens)
res = pd.DataFrame({
    'District': repeated_district,
    'Borough': chainer(df['Borough']),
    'district_numb': repeated_numb,
})

# Drop rows whose Borough is empty or whitespace-only
has_text = res['Borough'].str.strip().astype(bool)
res = res[has_text]

In [42]:
# `res` comes from a boolean-mask selection; take an explicit copy before adding
# columns so the assignments below act on an independent frame (assigning to a
# filtered frame may otherwise raise SettingWithCopyWarning).
res = res.copy()

# Rebuild the District label with its number: keep the first 11 characters
# (the length of 'Stadtbezirk') and append the district number.
res['Dis'] = res['District'].str[:11]
res['District'] = res['Dis']+' '+res['district_numb']

# Finalize the DataFrame with Districts and Boroughs of Düsseldorf,
# renumbering the rows from 0
df_boroughs = res[['District','Borough']].reset_index(drop=True)

print('Scrapping of Districts and Boroughs completed!\n')
print(df_boroughs.head(2))

Scrapping of Districts and Boroughs completed!

        District     Borough
0  Stadtbezirk 1    Altstadt
1  Stadtbezirk 1   Carlstadt


# 
## Scraping housing information

In [12]:
# Scrape result pages 1-20 of an immobilienscout24.de rental search and collect,
# per matching tag, the list of regex matches for address, price and room count.
# Each of address1 / price1 / rooms1 ends up as a list of lists.

# Search URL; '{}' is filled with the result-page number
url_template = 'https://www.immobilienscout24.de/Suche/radius/wohnung-mieten?centerofsearchaddress=D%C3%BCsseldorf;;;1276010012;Nordrhein-Westfalen;&numberofrooms=2.0-&price=-2000.0&geocoordinates=51.23824;6.81513;3.0&pagenumber={}'
# Browser-like User-agent sent with every request (built once, not per page)
headers = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

# Patterns are compiled once. Raw strings give the same regexes as before while
# avoiding the invalid '\w' escape of a non-raw string literal.
address_re = re.compile(r"[^<>]+\w+[A-Za-z].\s+Düsseldorf")
price_re = re.compile(r"[^<>]+\s+€")
rooms_re = re.compile(r"[>]+[0-9]+[<]")


def _collect_matches(tags, pattern, out):
    """Append to `out` the list of `pattern` matches found in each tag's HTML, skipping tags without a match."""
    for tag in tags:
        found = pattern.findall(tag.decode())
        if found:
            out.append(found)


address1 = []
price1 = []
rooms1 = []

for pnumber in range(1, 21):
    url_house = url_template.format(pnumber)
    # The timeout keeps one stalled request from blocking the whole run
    page = requests.get(url_house, headers=headers, timeout=30)

    # Skip pages that came back with an HTTP error rather than parsing an error
    # page; the pages already collected are kept.
    if not page.ok:
        print('Page {} skipped (HTTP status {})'.format(pnumber, page.status_code))
        continue

    # Parse the downloaded HTML
    soup = BeautifulSoup(page.content, 'html.parser')

    # Progress message
    print('Page {} scraped'.format(pnumber))

    # Addresses are searched in <button> tags, prices in <dd> tags and room
    # counts in elements with class 'onlySmall'
    _collect_matches(soup.find_all('button'), address_re, address1)
    _collect_matches(soup.find_all('dd'), price_re, price1)
    _collect_matches(soup.find_all(class_=['onlySmall']), rooms_re, rooms1)
    

Page 1 scraped
Page 2 scraped
Page 3 scraped
Page 4 scraped
Page 5 scraped
Page 6 scraped
Page 7 scraped
Page 8 scraped
Page 9 scraped
Page 10 scraped
Page 11 scraped
Page 12 scraped
Page 13 scraped
Page 14 scraped
Page 15 scraped
Page 16 scraped
Page 17 scraped
Page 18 scraped
Page 19 scraped
Page 20 scraped


In [13]:
# Peek at the first five raw room matches; each entry is itself a list of matched strings
rooms1[:5]

[['>3<'], ['>2<'], ['>2<'], ['>2<'], ['>3<']]

In [14]:

# The scraping loop stored one list of matches per tag; flatten each
# list of lists into a single list, keeping the order.
flat_address = [match for matches in address1 for match in matches]
flat_price = [match for matches in price1 for match in matches]
flat_rooms = [match for matches in rooms1 for match in matches]

# Wrap the flat lists as object-dtype Series
pd_address, pd_price, pd_rooms = (
    pd.Series(values, dtype = object)
    for values in (flat_address, flat_price, flat_rooms)
)


In [15]:
# Finish cleaning the number of rooms: remove the '>' and '<' that the scraping
# regex left around each match (e.g. '>3<' -> '3').
# strip() keeps multi-digit counts intact and leaves already-clean values unchanged
# if the cell is re-run; the previous `.str[1]` kept only the second character.
pd_rooms = pd_rooms.str.strip('><')

In [16]:
# Put address, price and room count side by side, aligned on their positional index;
# `keys` supplies the column labels directly.
result = pd.concat(
    [pd_address, pd_price, pd_rooms],
    axis=1,
    sort=False,
    keys=['Address', 'Price', 'Num_Rooms'],
)
result.head()

Unnamed: 0,Address,Price,Num_Rooms
0,"Heideweg 5, Mörsenbroich, Düsseldorf","1.086,03 €",3
1,"Langerstraße 57, Flingern Süd, Düsseldorf",700 €,2
2,"Glockenstraße 35, Derendorf, Düsseldorf",600 €,2
3,"Mindener Straße 92, Oberbilk, Düsseldorf",899 €,2
4,"Mindener Straße 100, Oberbilk, Düsseldorf","1.403,27 €",3
