In [1]:
from bs4 import BeautifulSoup

# the second package we will need we already know it
import requests
import pandas as pd
import time
import random
import numpy as np
import re
import os
import seaborn as sns
import scipy.stats as st
import statistics

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

[WDM] - Downloading: 100%|██████████| 6.58M/6.58M [00:00<00:00, 11.5MB/s]


In [None]:
data_df = pd.read_csv('C:/Users/magavald/Desktop/data_project.csv', header=None)
data_df

In [None]:
target_attrs= {'Rental price':'listing-features__description listing-features__description--for_rent_price'
               ,'Deposit':'listing-features__description listing-features__description--deposit'
               ,'Number rooms':'listing-features__description listing-features__description--number_of_rooms'
               ,'Number floors':'listing-features__description listing-features__description--number_of_floors'
               ,'Number bathrooms':'listing-features__description listing-features__description--number_of_bathrooms'
               ,'Surface area':'listing-features__description listing-features__description--surface_area'
               ,'Interior':'listing-features__description listing-features__description--interior'
               ,'House Type':'listing-features__description listing-features__description--dwelling_type'
               ,'Construction Year':'listing-features__description listing-features__description--construction_period'
               ,'Balcony':'listing-features__description listing-features__description--balcony'}

In [None]:
attr_dict={}
for url in data_df[0]:
    attr_dict[url] = {}
    driver.get(url)
    time.sleep(random.uniform(0,5))
    apartment_page = driver.page_source
    
    soup = BeautifulSoup(apartment_page, 'lxml')
    
    for target_attr in target_attrs.keys():
        if soup.find('dd', attrs={'class':target_attrs[target_attr]}) is None:
            attr_dict[url][target_attr] = ''
        else:
            attr_dict[url][target_attr] = soup.find('dd', attrs={'class':target_attrs[target_attr]}).text.strip()

In [None]:
attr_df = pd.DataFrame(data=attr_dict)
attr_df = (attr_df.T).reset_index()
attr_df

In [None]:
#Data cleaning per column
attr_df['Rental price'] = attr_df['Rental price'].str.extract(r"(\d+\,\d+)").replace(',','', regex=True).astype(float)
attr_df['Deposit'] = attr_df['Deposit'].str.extract(r"(\d+\,\d+)").replace(',','', regex=True).astype(float)
attr_df['Deposit Rent ratio'] = attr_df['Deposit']/attr_df['Rental price']
attr_df['Surface area'] = attr_df['Surface area'].str.extract(r"(\d+\d+)").astype(float)
attr_df['Balcony'] = attr_df['Balcony'].replace({'Present':1,'Not present':0})
attr_df['Number rooms'] = attr_df['Number rooms'].str.extract(r"(\d+)").astype(float)
attr_df['Number floors'] = attr_df['Number floors'].str.extract(r"(\d+)").astype(float)
attr_df['Number bathrooms'] = attr_df['Number bathrooms'].str.extract(r"(\d+)").astype(float)
attr_df['Construction Year'] = attr_df['Construction Year'].str.extract(r"(\d+)").astype(float)

In [4]:
#reading file to acess data at the time starting the project
attr_df = pd.read_csv('final_data_set.csv')
attr_df.head()

Unnamed: 0.1,Unnamed: 0,index,Rental price,Deposit,Number rooms,Number floors,Number bathrooms,Surface area,Interior,House Type,Construction Year,Balcony,Deposit Rent ratio,1_x,1_y
0,0,https://www.pararius.com/apartment-for-rent/am...,2750.0,,4.0,3.0,2.0,92.0,,Apartment,1926.0,1,,Zuid,Zuid
1,1,https://www.pararius.com/apartment-for-rent/am...,2100.0,,2.0,1.0,1.0,75.0,Upholstered or furnished,Apartment,1910.0,1,,Zuid,Zuid
2,2,https://www.pararius.com/apartment-for-rent/am...,1800.0,3600.0,3.0,1.0,1.0,55.0,Upholstered,Apartment,1925.0,1,2.0,Zuid,Zuid
3,3,https://www.pararius.com/apartment-for-rent/am...,2400.0,4800.0,4.0,1.0,1.0,75.0,Furnished,Apartment,1921.0,0,2.0,Zuid,Zuid
4,4,https://www.pararius.com/apartment-for-rent/am...,4200.0,,4.0,2.0,1.0,130.0,,Apartment,1899.0,1,,Zuid,Zuid


In [5]:
#data cleaning on file read
attr_df['House Type'] = attr_df['House Type'].replace(to_replace = 'Appartement', value= 'Apartment')

In [8]:
#Confidence interval of Rental price
st.t.interval(0.95,len(attr_df['Rental price'].dropna())-1,loc=attr_df['Rental price'].dropna().mean(),scale=(np.std(attr_df['Rental price'].dropna(), ddof=1))/(np.sqrt(len(attr_df['Rental price'].dropna()))))

(2082.544959493127, 2611.131511095108)

In [9]:
#Confidence interval of Surface area
st.t.interval(0.95,len(attr_df['Surface area'])-1,loc=attr_df['Surface area'].mean(),scale=(np.std(attr_df['Surface area'], ddof=1))/(np.sqrt(len(attr_df['Surface area']))))

(81.96444250640613, 103.32127177930815)

In [10]:
price_m2 = attr_df['Rental price']/attr_df['Surface area']

In [11]:
st.t.interval(0.95,len(price_m2.dropna())-1,loc=price_m2.dropna().mean(),scale=(np.std(price_m2.dropna(), ddof=1))/(np.sqrt(len(price_m2.dropna()))))

(25.00949396433752, 28.678834466454763)

In [13]:
#Hypothesis testing
#H0: Average deposit is = 2 times rent
#H1: Average deposit is not = to 2 times rent
st.ttest_1samp(attr_df['Deposit Rent ratio'].dropna(),2)

Ttest_1sampResult(statistic=-1.9806452812789401, pvalue=0.0568649629323235)

In [14]:
#Hypothesis testing
#H0: Houses with >100m2 have the same price  or higher per m2 as rest of houses
#H1: Houses with >100m2 have a lower price per m2

attr_df_large = attr_df[attr_df['Surface area']>100]
attr_df_small = attr_df[attr_df['Surface area']<=100]

In [15]:
series_large = attr_df_large['Rental price']/attr_df_large['Surface area']
series_small = attr_df_small['Rental price']/attr_df_small['Surface area']
series_small = series_small.dropna()

In [16]:
st.ttest_ind(series_small,series_large, equal_var=False, alternative='greater') 

Ttest_indResult(statistic=2.19374310271379, pvalue=0.02031621396545005)

In [17]:
#Hypothesis testing
#H0: Newer houses (built after 1965) have the same price or lower per m2 than older ones
#H1: Newer houses have greater price per m2

attr_df_newer = attr_df[attr_df['Construction Year']>1964]
attr_df_older = attr_df[attr_df['Construction Year']<=1964]

In [19]:
series_newer = attr_df_newer['Rental price']/attr_df_newer['Surface area']
series_newer = series_newer.dropna()
series_older = attr_df_older['Rental price']/attr_df_older['Surface area']

In [20]:
st.ttest_ind(series_newer,series_older, equal_var=False, alternative='greater')

Ttest_indResult(statistic=-3.078057984296832, pvalue=0.9983287612275303)

In [21]:
#Hypothesis testing
#H0: Houses with balcony have a lower or equal price as houses without balcony
#H1: Houses with balcony have a higher price

attr_df_balcony = attr_df[attr_df['Balcony']=='1']
attr_df_no_balcony = attr_df[attr_df['Balcony']=='0']

In [22]:
series_balcony = attr_df_balcony['Rental price']/attr_df_balcony['Surface area']
series_no_balcony = attr_df_no_balcony['Rental price']/attr_df_no_balcony['Surface area']
series_no_balcony = series_no_balcony.dropna()

In [23]:
st.ttest_ind(series_balcony,series_no_balcony, equal_var=False, alternative='greater')

Ttest_indResult(statistic=0.6621407766310594, pvalue=0.25539768211268077)

In [24]:
#Confidence interval of % of Apartments
attr_df['House Type'].value_counts()

Apartment    58
House        10
Studio        2
Name: House Type, dtype: int64

In [25]:
a=58/70
a

0.8285714285714286

In [26]:
b = [1]*58
c = [0]*12
d = b + c

In [27]:
st.t.interval(0.95,len(d)-1,loc=a,scale=(np.std(d, ddof=1))/(np.sqrt(len(d))))

(0.7380579925797578, 0.9190848645630995)

In [28]:
#ANOVA test to compare differences across neighborhood
attr_df['1_x'].unique()

array(['Zuid', 'Centrum', 'Oost', 'Noord', 'Nieuw-West', 'West',
       'Zuidoost'], dtype=object)

In [29]:
attr_df['Rent per m2'] = attr_df['Rental price']/attr_df['Surface area']
attr_df.head()

Unnamed: 0.1,Unnamed: 0,index,Rental price,Deposit,Number rooms,Number floors,Number bathrooms,Surface area,Interior,House Type,Construction Year,Balcony,Deposit Rent ratio,1_x,1_y,Rent per m2
0,0,https://www.pararius.com/apartment-for-rent/am...,2750.0,,4.0,3.0,2.0,92.0,,Apartment,1926.0,1,,Zuid,Zuid,29.891304
1,1,https://www.pararius.com/apartment-for-rent/am...,2100.0,,2.0,1.0,1.0,75.0,Upholstered or furnished,Apartment,1910.0,1,,Zuid,Zuid,28.0
2,2,https://www.pararius.com/apartment-for-rent/am...,1800.0,3600.0,3.0,1.0,1.0,55.0,Upholstered,Apartment,1925.0,1,2.0,Zuid,Zuid,32.727273
3,3,https://www.pararius.com/apartment-for-rent/am...,2400.0,4800.0,4.0,1.0,1.0,75.0,Furnished,Apartment,1921.0,0,2.0,Zuid,Zuid,32.0
4,4,https://www.pararius.com/apartment-for-rent/am...,4200.0,,4.0,2.0,1.0,130.0,,Apartment,1899.0,1,,Zuid,Zuid,32.307692


In [30]:
st.f_oneway(attr_df[attr_df['1_x']=='Zuid']['Rent per m2'].dropna()
            ,attr_df[attr_df['1_x']=='Centrum']['Rent per m2'].dropna()
            ,attr_df[attr_df['1_x']=='Oost']['Rent per m2'].dropna()
            ,attr_df[attr_df['1_x']=='Noord']['Rent per m2'].dropna()
            ,attr_df[attr_df['1_x']=='Nieuw-West']['Rent per m2'].dropna()
            ,attr_df[attr_df['1_x']=='West']['Rent per m2'].dropna()
            ,attr_df[attr_df['1_x']=='Zuidoost']['Rent per m2'].dropna())

F_onewayResult(statistic=1.164751215636611, pvalue=0.3369486488598882)

In [31]:
# Hypothesis testing on survey
#H0: The expat community in Amsterdam that feel it's important or very important to contact rental agencies or landlords promptly after a property is listed to secure a viewing is =< 90%
#H1: The expat community in Amsterdam that feel it's important or very important to contact rental agencies or landlords promtptly after a property is listed to secure a viewing is > 90%
sample1 = 30*[1] + 0*[0]

In [32]:
st.ttest_1samp(np.array(sample1),0.9,alternative='greater')

Ttest_1sampResult(statistic=inf, pvalue=0.0)

In [33]:
#H0: The expat community in Amstrdam that say it's common or very common to encounter rental agencies or landlords that decline viewings due to having reached their maximum capacity for prospective tenants is =< 90%
#H0: The expat community in Amstrdam that say it's common or very common to encounter rental agencies or landlords that decline viewings due to having reached their maximum capacity for prospective tenants is > 90%

In [34]:
sample2 = 29*[1] + 1*[0]

In [35]:
st.ttest_1samp(np.array(sample2),0.9,alternative='greater')

Ttest_1sampResult(statistic=1.9999999999999996, pvalue=0.027471818591483614)