In [116]:
import numpy as np
import pandas as pd
import pylab
import scipy.stats, scipy

import matplotlib.pyplot  as plt
import seaborn as sns

<h2>Importing and exploring neighbourhood ranking</h2>

In [336]:
df_rank = pd.read_csv('neighbourhoods.csv')

In [337]:
df_rank.head()

Unnamed: 0,N_Barri,Area,Number of Air BnB Listings,Tourism,Numbers of Apartments,RANK
0,el Barri Gòtic,841905.08,1218,925,10479,20.5%
1,la Dreta de l'Eixample,2123404.58,1918,947,27051,10.6%
2,el Raval,1098392.91,1398,916,23285,9.9%
3,"Sant Pere, Santa Caterina i la Ribera",1114298.81,1041,302,14643,9.2%
4,la Barceloneta,1313867.98,406,228,9233,6.9%


In [338]:
df_rank.shape

(73, 6)

<h3>Calculating touristic apartments per neighbourhood/total touristic apartments in the city</h3>

In [340]:
df_rank['RANK'] = df_rank['RANK'].map(lambda x: x.rstrip('%'))
df_rank.head()

Unnamed: 0,N_Barri,Area,Number of Air BnB Listings,Tourism,Numbers of Apartments,RANK
0,el Barri Gòtic,841905.08,1218,925,10479,20.5
1,la Dreta de l'Eixample,2123404.58,1918,947,27051,10.6
2,el Raval,1098392.91,1398,916,23285,9.9
3,"Sant Pere, Santa Caterina i la Ribera",1114298.81,1041,302,14643,9.2
4,la Barceloneta,1313867.98,406,228,9233,6.9


In [366]:
df_rank['total'] = df_rank['Number of Air BnB Listings'] + df_rank['Tourism']
df_rank.head()

Unnamed: 0,N_Barri,Area,Number of Air BnB Listings,Tourism,Numbers of Apartments,RANK,total
0,el Barri Gòtic,841905.08,1218,925,10479,20.5,2143
1,la Dreta de l'Eixample,2123404.58,1918,947,27051,10.6,2865
2,el Raval,1098392.91,1398,916,23285,9.9,2314
3,"Sant Pere, Santa Caterina i la Ribera",1114298.81,1041,302,14643,9.2,1343
4,la Barceloneta,1313867.98,406,228,9233,6.9,634


In [367]:
df_rank['t_rank'] = df_rank['total']/df_rank['total'].sum()
df_rank.head()

Unnamed: 0,N_Barri,Area,Number of Air BnB Listings,Tourism,Numbers of Apartments,RANK,total,t_rank
0,el Barri Gòtic,841905.08,1218,925,10479,20.5,2143,0.080473
1,la Dreta de l'Eixample,2123404.58,1918,947,27051,10.6,2865,0.107585
2,el Raval,1098392.91,1398,916,23285,9.9,2314,0.086894
3,"Sant Pere, Santa Caterina i la Ribera",1114298.81,1041,302,14643,9.2,1343,0.050432
4,la Barceloneta,1313867.98,406,228,9233,6.9,634,0.023808


In [368]:
p_neigh = df_rank['t_rank'].as_matrix().astype(float)
p_neigh_n = p_neigh/p_neigh.sum(axis=0,keepdims=1)
p_neigh_n.sum()

  """Entry point for launching an IPython kernel.


1.0000000000000002

<h4>Generating trial neighbourhood sample</h4>

In [360]:
neigh = np.random.choice(np.arange(0,73), size=(10), p=p_neigh_n).astype(int)
neigh

array([14, 14, 55,  0, 17,  6, 36,  7,  0,  0])

<h2>Setting periods of accomodation as per 2017 data</h2>

In [344]:
one_to_four = [0.068, 0.139, 0.219, 0.192, 0.113]
five_and_more = (0.27/24) * np.ones(23)

print(one_to_four)
print(five_and_more)

[0.068, 0.139, 0.219, 0.192, 0.113]
[0.01125 0.01125 0.01125 0.01125 0.01125 0.01125 0.01125 0.01125 0.01125
 0.01125 0.01125 0.01125 0.01125 0.01125 0.01125 0.01125 0.01125 0.01125
 0.01125 0.01125 0.01125 0.01125 0.01125]


In [345]:
most_p = np.sum(one_to_four)+np.sum(five_and_more)
last_p = np.array(1 - most_p)
print(most)
last

0.9897500000000001


array(0.01025)

In [346]:
p_p = np.concatenate((one_to_four, five_and_more, [last]), axis=0)
p_p

array([0.068  , 0.139  , 0.219  , 0.192  , 0.113  , 0.01125, 0.01125,
       0.01125, 0.01125, 0.01125, 0.01125, 0.01125, 0.01125, 0.01125,
       0.01125, 0.01125, 0.01125, 0.01125, 0.01125, 0.01125, 0.01125,
       0.01125, 0.01125, 0.01125, 0.01125, 0.01125, 0.01125, 0.01125,
       0.01025])

In [347]:
np.sum(p_p)

0.9999999999999999

<h4>Generating trial periods sample</h4>

In [358]:
periods = np.random.choice(np.arange(1,30), size=(10), p=p_p).astype(int)

In [359]:
periods

array([ 4,  2,  3, 27,  2,  2, 10, 24,  4,  3])

<h2>Setting type of accomodation as per 2017 data</h2>

In [352]:
hotel_share = 0.51/0.837
apart_share = 0.179/0.837
hostel_share = 0.148/0.837
print('hotel share :', hotel_share)
print('apartshare :', apart_share)
print('hostel share :', hostel_share)

paid = hotel_share + apart_share + hostel_share
print('total paid acc:', paid)

five_s =  0.107 * hotel_share
regular_share = 0.531 * hotel_share
other_s = 0.363 * hotel_share

hotels = five_s + regular + other_s
print('hotels share 2:', hotels)

luxury_share = five_s + apart_share

budget_share = hostel_share + other_s

total_acc = luxury_share + regular_share + budget_share

print('Luxury share :', luxury_share)
print('Regular share :', regular_share)
print('Budget share :', budget_share)

budget_share_adj = budget_share - (total_acc-1)

total_acc_adj = luxury_share + regular_share + budget_share_adj
total_acc_adj

hotel share : 0.6093189964157707
apartshare : 0.21385902031063322
hostel share : 0.17682198327359616
total paid acc: 1.0
hotels share 2: 0.6099283154121865
Luxury share : 0.2790561529271207
Regular share : 0.32354838709677425
Budget share : 0.39800477897252096


1.0

In [353]:
p_acc = np.concatenate(([luxury_share], [regular_share], [budget_share_adj]), axis=0)
p_acc

array([0.27905615, 0.32354839, 0.39739546])

<h4>Generating type of accommodation sample</h4>

In [361]:
type_acc = np.random.choice([4,2,1], size=(10), p=p_acc).astype(int)
type_acc

array([1, 1, 2, 4, 2, 1, 1, 2, 1, 1])

<h2>Setting seasonality as per 2017 data</h2>

In [355]:
visitors = pd.read_csv('monthly_visitors_percentage.csv')
visitors

Unnamed: 0,Month,Percentage
0,January,0.061647
1,February,0.067146
2,March,0.083657
3,April,0.096346
4,May,0.097943
5,June,0.096428
6,July,0.101374
7,August,0.095854
8,September,0.088963
9,October,0.082776


In [356]:
visitors['Month'] = visitors.index
p_m = visitors['Percentage']
p_m

0     0.061647
1     0.067146
2     0.083657
3     0.096346
4     0.097943
5     0.096428
6     0.101374
7     0.095854
8     0.088963
9     0.082776
10    0.068251
11    0.059613
Name: Percentage, dtype: float64

<h4>Generating seasonality sample</h4>

In [362]:
month = np.random.choice(np.arange(0,12), size=(10), p=p_m).astype(int)
#month = np.random.normal(loc=6, scale=2.3, size=201).astype(int)
month

array([ 0,  6,  1,  9,  1,  3,  2,  2,  6, 11])

<h2>Building the dataset</h2>

In [313]:
df_n['Days'] = periods

In [314]:
df_n['Month'] = month

In [298]:
df_n['Type_of_Accomodation'] = type_acc

In [335]:
df_n['N_Barri'] = neigh
df_n.head()

Unnamed: 0,N_Barri,Days,Month,Type_of_Accomodation,Smart Tax Percentage
0,3,14,0,1,
1,27,4,3,0,
2,20,3,1,2,
3,11,4,3,1,
4,1,4,8,2,


In [None]:
df_n.to_csv('a_dataset.csv')

In [None]:
df_rank = pd.read_csv('neighbourhoods.csv')

In [369]:
def gentrynator (size, labels = True):
    
    df_gent = pd.DataFrame()
    
    df_gent['N_Barri'] = np.random.choice(np.arange(0,73), size=(size), p=p_neigh_n).astype(int)
    df_gent['Days'] = np.random.choice(np.arange(1,30), size=(size), p=p_p).astype(int)
    df_gent['Month'] = np.random.choice(np.arange(0,12), size=(size), p=p_m).astype(int)
    df_gent['Type_of_Accomodation'] = np.random.choice([4,2,1], size=(size), p=p_acc).astype(int)
   
    return df_gent

In [380]:
df = gentrynator(1000)
df.head()

Unnamed: 0,N_Barri,Days,Month,Type_of_Accomodation
0,14,5,10,4
1,39,4,6,1
2,7,1,6,1
3,0,3,11,4
4,14,2,6,2


In [383]:
df.to_csv('gen_dataset.csv')