In [16]:
import scipy
import math
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats import ttest_1samp
from scipy.stats import poisson
from scipy.stats import chi2_contingency
from scipy.stats import ttest_1samp, ttest_rel, ttest_ind
pd.set_option('max_columns', 50)

### Load the dataset

Load the cleaned dataset.

In [17]:
rent = pd.read_csv("rent_clean.csv")
rent.drop(columns="Unnamed: 0",inplace=True)
airbnb = pd.read_csv("airbnb_final.csv")
airbnb.drop(columns="Unnamed: 0",inplace=True)

Let's have a quick look at the dataframe

In [18]:
rent.head(1)

Unnamed: 0,Zone,Rent,Area,Yearly rent,Number of bedrooms
0,1,564.0,56.0,6768.0,1


In [19]:
airbnb.head(1)

Unnamed: 0,id,host_id,room_type,bedrooms,price,count of reviews,days of occupancy,Yearly income,Zone,latitude,longitude,geometry
0,317273,1156398,Entire home/apt,1,155,7,42,6510,1,44.84734,-0.58034,POINT (-0.58034 44.84734)


# Question 1: Calcute a confidence interval as to how much income a potential investor can expect by renting an appartment on Airbnb?

A real-estate owner has several 1-bedroom appartments (the most common type of appartments in our data) and is contemplating switching from renting monthly to list them on Airbnb.


**Please calculate a confidence interval with a degree of confidence of 90%**

In [20]:
air_1b = airbnb["Yearly income"].loc[airbnb["bedrooms"]=='1']
air_1b.describe()
#mean 6558.769647696477

count     1476.000000
mean      6558.769648
std       5460.221302
min        624.000000
25%       2880.000000
50%       4848.000000
75%       8640.000000
max      64800.000000
Name: Yearly income, dtype: float64

In [21]:


mu, sigma = np.mean(air_1b), air_1b.std()
se= sigma/math.sqrt(len(air_1b))
conf_int = stats.norm.interval(0.90, loc=mu, 
    scale=se)
conf_int

(6324.996662297841, 6792.542633095112)

# Question 2: Goodness of fit

The distribution of incomes of airbnb and rented appartment seem to follow a Poisson distribution.


**Please check for the goodness of fit for both*

In [22]:
con = pd.read_csv("concatted.csv")
con.head(1)

Unnamed: 0.1,Unnamed: 0,id,host_id,room_type,bedrooms,price,count of reviews,days of occupancy,Yearly income,Zone,latitude,longitude,geometry,Rent,Area,Yearly rent
0,0,317273.0,1156398.0,Entire home/apt,1,155.0,7.0,42.0,6510.0,1,44.84734,-0.58034,POINT (-0.58034 44.84734),,,


In [23]:
#we need the mean sample for both samples (airbnb and rent)
mu = airbnb["Yearly income"].mean() #8292..
mu2 = rent["Yearly rent"].mean() #7718
print(airbnb["Yearly income"].shape[0])
print(rent["Yearly rent"].shape[0])
print(rent["Yearly rent"].index)

2373
904
RangeIndex(start=0, stop=904, step=1)


In [24]:
#Airbnb
observed = airbnb["Yearly income"]
proba = [stats.poisson.pmf(k,mu = mu) for k in range(len(observed) - 1)]
proba.append(1 - sum(proba))

expected = [p * sum(observed) for p in proba]


stats.chisquare(observed, f_exp=expected)



Power_divergenceResult(statistic=inf, pvalue=0.0)

In [25]:
observed = airbnb["Yearly income"]
proba = [stats.norm.pdf(k,loc = mu,scale=scipy.stats.sem(observed)) for k in range(len(observed) - 1)]
proba.append(1 - sum(proba))

expected = [p * sum(observed) for p in proba]


stats.chisquare(observed, f_exp=expected)

Power_divergenceResult(statistic=inf, pvalue=0.0)

In [26]:
#Rent

observed1 = rent["Yearly rent"]
proba1 = [stats.poisson.pmf(k,mu = mu2) for k in range(len(observed1) - 1)]
proba1.append(1 - sum(proba1))

expected1 = [p * sum(observed1) for p in proba1]


stats.chisquare(observed1, f_exp=expected1)

Power_divergenceResult(statistic=inf, pvalue=0.0)

In [27]:
observed1 = rent["Yearly rent"]
proba1 = [stats.norm.pdf(k,loc = mu,scale=scipy.stats.sem(observed1)) for k in range(len(observed1) - 1)]
proba1.append(1 - sum(proba1))

expected1 = [p * sum(observed1) for p in proba1]


stats.chisquare(observed1, f_exp=expected1)

Power_divergenceResult(statistic=inf, pvalue=0.0)

In [28]:
#There is no goodness of fit with Poisson or normal for our distributions

# Question 3: calculate the probability of an expected income

A real estate owner is contemplating switching his 1 bedroom appartment in zone 2 from monthly renting to airbnb listing. However, this would include some extra costs and organisations. Therefore, he is only considering switching if he can expect to make an axtra 3000E per year. He currently rents his appartment at the market price (average rent for this appartment in that zone.


**Please calculate a t_test_rel and the cdf in order to give a probability of reaching an extra 3000 per year*

In [32]:
#Ho
rent.dtypes

Zone                    int64
Rent                  float64
Area                  float64
Yearly rent           float64
Number of bedrooms     object
dtype: object

In [45]:
a=airbnb["Yearly income"].loc[(airbnb["Zone"]==2)&(airbnb["bedrooms"]=='1')]
b=rent["Yearly rent"].loc[(rent["Zone"]==2)&(rent["Number of bedrooms"]=='1')]
len(b)
a2=a.sample(n=77)
a2

1705     3000
54      23040
527      1536
929      4950
1725     6942
        ...  
1563     4992
1517     5202
1929     7800
659      5700
209     64800
Name: Yearly income, Length: 77, dtype: int64

In [46]:
scipy.stats.ttest_rel(a2, b)

Ttest_relResult(statistic=1.0201726671711338, pvalue=0.3108831216830079)

In [None]:
#cannot reject..

In [48]:
se =stats.sem(a)

In [52]:
prob =stats.norm.cdf(a,loc=0,scale=se)

In [53]:
prob

array([5.82689162e-040, 4.52557907e-030, 9.99999283e-001, 1.00000000e+000,
       1.00000000e+000, 2.41036096e-048, 2.42779756e-001, 2.20643685e-014,
       9.99999998e-001, 1.00000000e+000, 2.01919964e-034, 1.00000000e+000,
       6.26555884e-073, 1.00000000e+000, 9.67454207e-027, 2.42779756e-001,
       9.99951339e-001, 1.11457144e-007, 1.00000000e+000, 6.26555884e-073,
       7.30835905e-001, 1.00000000e+000, 2.09387869e-032, 7.30835905e-001,
       2.96131181e-094, 2.09387869e-032, 1.35124431e-001, 1.80996754e-078,
       1.35906805e-004, 3.95838826e-100, 1.00000000e+000, 1.00000000e+000,
       1.00000000e+000, 7.30835905e-001, 7.23588970e-085, 1.00000000e+000,
       1.28646167e-031, 1.00000000e+000, 4.32655183e-003, 8.64788527e-004,
       1.22836634e-095, 9.99989359e-001, 1.26834466e-106, 9.99999780e-001,
       2.92438202e-010, 4.75660012e-044, 2.18829794e-074, 4.75130099e-083,
       4.83638832e-001, 1.85597309e-045, 1.28591592e-023, 2.41036096e-048,
       8.79073225e-002, 2