In [3]:
import scipy
import math
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats import ttest_1samp
from scipy.stats import poisson
from scipy.stats import chi2_contingency
from scipy.stats import ttest_1samp, ttest_rel, ttest_ind
pd.set_option('max_columns', 50)

### Load the dataset

Load the cleaned dataset.

In [4]:
rent = pd.read_csv("rent_clean.csv")
rent.drop(columns="Unnamed: 0",inplace=True)
airbnb = pd.read_csv("airbnb_final.csv")
airbnb.drop(columns="Unnamed: 0",inplace=True)

Let's have a quick look at the dataframe

In [5]:
rent.head(1)

Unnamed: 0,Zone,Rent,Area,Yearly rent,Number of bedrooms
0,1,564.0,56.0,6768.0,1


In [6]:
airbnb.head(1)

Unnamed: 0,id,host_id,room_type,bedrooms,price,count of reviews,days of occupancy,Yearly income,Zone,latitude,longitude,geometry
0,317273,1156398,Entire home/apt,1,155,7,42,6510,1,44.84734,-0.58034,POINT (-0.58034 44.84734)


# Question 1: Calcute a confidence interval as to how much income a potential investor can expect by renting an appartment on Airbnb?

A real-estate owner has several 1-bedroom appartments (the most common type of appartments in our data) and is contemplating switching from renting monthly to list them on Airbnb.


**Please calculate a confidence interval with a degree of confidence of 90%**

In [7]:
air_1b = airbnb["Yearly income"].loc[airbnb["bedrooms"]=='1']
air_1b.describe()
#mean 6558.769647696477

count     1476.000000
mean      6558.769648
std       5460.221302
min        624.000000
25%       2880.000000
50%       4848.000000
75%       8640.000000
max      64800.000000
Name: Yearly income, dtype: float64

In [8]:
mu, sigma = np.mean(air_1b), air_1b.std()
se= sigma/math.sqrt(len(air_1b))
conf_int = stats.norm.interval(0.90, loc=mu, 
    scale=se)
conf_int

(6324.996662297841, 6792.542633095112)

In [9]:
rent.loc[rent["Number of bedrooms"]=='1'].mean()

Zone                     3.225248
Rent                   557.254950
Area                    50.524752
Yearly rent           6687.059406
Number of bedrooms            inf
dtype: float64

We calculated a confidence interval for the landlord's appartment expected yearly income if he put in on Airbnb, with a degree of certainty of 90%. The confidence intervals is between 6324 and 6792 € per year.
As this interval contains the average income for monthly-rented appartments (6687€), the landlord should keep things as they are now.

# Question 2: Goodness of fit

The distribution of incomes of airbnb and rented appartment seem to follow a Poisson distribution.


**Please check for the goodness of fit for both*

In [10]:
con = pd.read_csv("concatted.csv")
con.head(1)

Unnamed: 0.1,Unnamed: 0,id,host_id,room_type,bedrooms,price,count of reviews,days of occupancy,Yearly income,Zone,latitude,longitude,geometry,Rent,Area,Yearly rent
0,0,317273.0,1156398.0,Entire home/apt,1,155.0,7.0,42.0,6510.0,1,44.84734,-0.58034,POINT (-0.58034 44.84734),,,


In [11]:
#we need the mean sample for both samples (airbnb and rent)
mu = airbnb["Yearly income"].mean() #8292..
mu2 = rent["Yearly rent"].mean() #7718
print(airbnb["Yearly income"].shape[0])
print(rent["Yearly rent"].shape[0])
print(rent["Yearly rent"].index)

2373
904
RangeIndex(start=0, stop=904, step=1)


In [12]:
#Airbnb
observed = airbnb["Yearly income"]
proba = [stats.poisson.pmf(k,mu = mu) for k in range(len(observed) - 1)]
proba.append(1 - sum(proba))

expected = [p * sum(observed) for p in proba]


stats.chisquare(observed, f_exp=expected)



  terms = (f_obs - f_exp)**2 / f_exp


Power_divergenceResult(statistic=inf, pvalue=0.0)

In [13]:
observed = airbnb["Yearly income"]
proba = [stats.norm.pdf(k,loc = mu,scale=scipy.stats.sem(observed)) for k in range(len(observed) - 1)]
proba.append(1 - sum(proba))

expected = [p * sum(observed) for p in proba]


stats.chisquare(observed, f_exp=expected)

  terms = (f_obs - f_exp)**2 / f_exp


Power_divergenceResult(statistic=inf, pvalue=0.0)

In [14]:
#Rent

observed1 = rent["Yearly rent"]
proba1 = [stats.poisson.pmf(k,mu = mu2) for k in range(len(observed1) - 1)]
proba1.append(1 - sum(proba1))

expected1 = [p * sum(observed1) for p in proba1]


stats.chisquare(observed1, f_exp=expected1)

Power_divergenceResult(statistic=inf, pvalue=0.0)

In [15]:
observed1 = rent["Yearly rent"]
proba1 = [stats.norm.pdf(k,loc = mu,scale=scipy.stats.sem(observed1)) for k in range(len(observed1) - 1)]
proba1.append(1 - sum(proba1))

expected1 = [p * sum(observed1) for p in proba1]


stats.chisquare(observed1, f_exp=expected1)

Power_divergenceResult(statistic=inf, pvalue=0.0)

There is no goodness of fit with Poisson or normal distributions for both the yearly airbnb income or the yearly rent income.

# Question 3: calculate the probability of an expected income

A real estate owner is contemplating switching his 1 bedroom appartment in zone 2 from monthly renting to airbnb listing. However, this would include some extra costs and organisations. Therefore, he is only considering switching if he can expect to make an axtra 2000E per year. He currently rents his appartment at the market price (average rent for this appartment in that zone.


**Please calculate a t_test_rel and the cdf in order to give a probability of reaching an extra 2000 per year*

The current average yearly income for rented 1-bedroom appartments in zone 2 is 6688€ (see pivot table in exploratory data analysis). Let's consider this is the amount the landlord is currently receiving for this appartment.

Let's do some hypothesis testing.
Ho: On average, the yearly income for airbnb and monthly rental are about the same.

In [16]:
rent.dtypes

Zone                    int64
Rent                  float64
Area                  float64
Yearly rent           float64
Number of bedrooms     object
dtype: object

In [31]:
a=airbnb["Yearly income"].loc[(airbnb["Zone"]==2)&(airbnb["bedrooms"]=='1')]
b=rent["Yearly rent"].loc[(rent["Zone"]==2)&(rent["Number of bedrooms"]=='1')]
len(b)
a2=a.sample(n=77)
a2

1767    12960
1623     5700
1189     3600
493      3696
774      5742
        ...  
2145     4752
659      5700
2370     1566
1381     2898
815      3486
Name: Yearly income, Length: 77, dtype: int64

In [32]:
stats, pval = scipy.stats.ttest_rel(a2, b)
print(pval)

0.2793464867426043


Given the pvalue retrieved, we cannot reject the null hypothesis.

Let's calculate the probability of making an extra 2k a year by renting on AirBnb. (Reminder: we've proven that this sample doesn't fit a normal distribution.

Lets find out proportion of one-bedroom appartments in zone 2 that generate more than 8688k€ per year (that is the amount that she is currently making plus the 2k that she requires to make a change).


In [38]:
tot_1bd_zone2=len(airbnb["Yearly income"].loc[(airbnb["Zone"]==2)&(airbnb["bedrooms"]=='1')]) #317
filt_appts =len(airbnb["Yearly income"].loc[(airbnb["Zone"]==2)&(airbnb["bedrooms"]=='1')&(airbnb["Yearly income"]>8688)])
tot_1bd_zone2, filt_appts

(317, 95)

In [41]:
prob = round((filt_appts/tot_1bd_zone2),)

0.3

The probability for this owner to make an extra 2k a year by switching to airbnb in that zone is only 0.3