In [1]:
import pandas as pd
import os
import sklearn as sk
from sklearn.linear_model import MultiTaskLasso, MultiTaskElasticNet, ElasticNet
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import sklearn.metrics as skmet
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, names
import datetime

In [2]:
listings = pd.read_csv('inside_airbnb/listings.csv')
list_summ = pd.read_csv('inside_airbnb/listings_summ.csv')
neighborhoods = pd.read_csv('inside_airbnb/neighbourhoods.csv')
reviews = pd.read_csv('inside_airbnb/reviews.csv')
reviews_summ = pd.read_csv('inside_airbnb/reviews_summ.csv')
calendar = pd.read_csv('inside_airbnb/calendar.csv')

In [3]:
listings.columns

Index([u'id', u'listing_url', u'scrape_id', u'last_scraped', u'name',
       u'summary', u'space', u'description', u'experiences_offered',
       u'neighborhood_overview', u'notes', u'transit', u'access',
       u'interaction', u'house_rules', u'thumbnail_url', u'medium_url',
       u'picture_url', u'xl_picture_url', u'host_id', u'host_url',
       u'host_name', u'host_since', u'host_location', u'host_about',
       u'host_response_time', u'host_response_rate', u'host_acceptance_rate',
       u'host_is_superhost', u'host_thumbnail_url', u'host_picture_url',
       u'host_neighbourhood', u'host_listings_count',
       u'host_total_listings_count', u'host_verifications',
       u'host_has_profile_pic', u'host_identity_verified', u'street',
       u'neighbourhood', u'neighbourhood_cleansed',
       u'neighbourhood_group_cleansed', u'city', u'state', u'zipcode',
       u'market', u'smart_location', u'country_code', u'country', u'latitude',
       u'longitude', u'is_location_exact', u'prope

In [4]:
def parse_columns(listings, cols):
    chars = "%$"
    for i in cols:
        listings[i].astype(str).map(lambda x: x.rstrip(chars))
        listings[i] = listings[i].apply(pd.to_numeric, errors='coerce')
        listings[i].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    return listings  

In [5]:
listings = parse_columns(listings, ['host_response_rate', 'cleaning_fee',\
                                     'host_acceptance_rate','extra_people',\
                                     'weekly_price', 'monthly_price', 'security_deposit'])

In [6]:
cal = calendar

In [7]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [13]:
cal['price'] = cal['price'].astype(str).map(lambda x: x.lstrip('$'))
cal['price'] = cal['price'].apply(pd.to_numeric, errors='coerce')
cal['price'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
cal['date'] = pd.to_datetime(cal['date'])
cal['month'] = cal['date'].apply(lambda x: x.month)
cal['day'] = cal['date'].apply(lambda x: x.day)
cal['day_of_week'] = cal['date'].dt.weekday_name
    
cl = calendar()
holidays = cl.holidays(start=cal['date'].min(), end=cal['date'].max())
    
cal['holiday'] = cal['date'].isin(holidays)
cal = cal[(cal['date']>'2016-07-06')&(cal['date']<'2016-10-06')]
    
c = cal.loc[cal.available!='f']
c = c[['listing_id','date','price','month','day_of_week','holiday']]
c=c.fillna(c.mean())
    
c_hol = c[c['holiday']==True]
c_wke = c[(c['holiday']==False)&((c['day_of_week']=='Sunday')|(c['day_of_week']=='Saturday'))]
c_wkd = c[(~c.isin(c_hol['date']))&(~c.isin(c_wke['date']))]


price_hol_dict = {'price': c_hol.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_hol.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_hol.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_hol.groupby(by='listing_id')['price'].fillna(0).std(),                 
                  'skew_of_price': c_hol.groupby(by='listing_id')['price'].fillna(0).skew()}


price_wke_dict = {'price': c_wke.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_wke.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_wke.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_wke.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c_wke.groupby(by='listing_id')['price'].fillna(0).skew()}


price_wkd_dict = {'price': c_wkd.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_wkd.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_wkd.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_wkd.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c_wkd.groupby(by='listing_id')['price'].fillna(0).skew()}


price_whole_dict = {'price': c.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c.groupby(by='listing_id')['price'].max(), 
                  'min_price': c.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c.groupby(by='listing_id')['price'].fillna(0).skew()}



price_hol = pd.DataFrame(price_hol_dict)
price_wke = pd.DataFrame(price_wke_dict)
price_wkd = pd.DataFrame(price_wkd_dict)
price_c = pd.DataFrame(price_whole_dict)    
    
price_hol = price_hol.reset_index()
price_wke = price_wke.reset_index()
price_wkd = price_wkd.reset_index()
price_c = price_c.reset_index()

listings_hol = listings.merge(price_hol, how='inner', left_on='id', right_on='listing_id')
listings_wke = listings.merge(price_wke, how='inner', left_on='id', right_on='listing_id')
listings_wkd = listings.merge(price_wkd, how='inner', left_on='id', right_on='listing_id')
listings_c = listings.merge(price_c, how='inner', left_on='id', right_on='listing_id')

In [14]:
listings_hol.shape, listings_wke.shape, listings_wkd.shape, listings_c.shape

((4000, 101), (5656, 101), (5753, 101), (5753, 101))

In [16]:
c_hol.shape, c_wke.shape, c_wkd.shape, c.shape

((4000, 6), (80532, 6), (288684, 6), (288684, 6))

In [22]:
price_hol

Unnamed: 0,listing_id,max_price,min_price,price,skew_of_price,stdev_of_price
0,9553,99.000000,99.000000,99.000000,1.916102,164.93312
1,14668,58.000000,58.000000,58.000000,1.916102,164.93312
2,38245,65.000000,65.000000,65.000000,1.916102,164.93312
3,45429,160.000000,160.000000,160.000000,1.916102,164.93312
4,54001,120.000000,120.000000,120.000000,1.916102,164.93312
5,62274,85.000000,85.000000,85.000000,1.916102,164.93312
6,63285,40.000000,40.000000,40.000000,1.916102,164.93312
7,69385,250.000000,250.000000,250.000000,1.916102,164.93312
8,75668,45.000000,45.000000,45.000000,1.916102,164.93312
9,79300,50.000000,50.000000,50.000000,1.916102,164.93312


In [23]:
price_hol_new = price_hol.rename(columns = {'max_price': 'hol_max_price', 'min_price': 'hol_min_price', 'price': 'hol_price',
                                           'skew_of_price': 'hol_skew_of_price', 'stdev_of_price': 'hol_stdev_of_price'})
price_wke_new = price_wke.rename(columns = {'max_price': 'wke_max_price', 'min_price': 'wke_min_price', 'price': 'wke_price',
                                           'skew_of_price': 'wke_skew_of_price', 'stdev_of_price': 'wke_stdev_of_price'})
price_wkd_new = price_wkd.rename(columns = {'max_price': 'wkd_max_price', 'min_price': 'wkd_min_price', 'price': 'wkd_price',
                                           'skew_of_price': 'wkd_skew_of_price', 'stdev_of_price': 'wkd_stdev_of_price'})

In [28]:
listings_all = listings_c.merge(price_hol_new, how='inner', left_on='listing_id', right_on='listing_id')
listings_all = listings_all.merge(price_wke_new, how='inner', left_on='listing_id', right_on='listing_id')
listings_all = listings_all.merge(price_wkd_new, how='inner', left_on='listing_id', right_on='listing_id')

In [29]:
listings_all

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,wke_max_price,wke_min_price,wke_price,wke_skew_of_price,wke_stdev_of_price,wkd_max_price,wkd_min_price,wkd_price,wkd_skew_of_price,wkd_stdev_of_price
0,11204286,https://www.airbnb.com/rooms/11204286,20160706203047,2016-07-07,Family friendly/California king,"Aquatica Waterpark, Sleep train Amphitheater, ...","Walking to Aquatica Waterpark, Sleep train Amp...","Aquatica Waterpark, Sleep train Amphitheater, ...",none,,...,49.0,49.000000,49.000000,1.89154,167.591639,49.0,49.000000,49.000000,1.930178,165.159132
1,7972006,https://www.airbnb.com/rooms/7972006,20160706203047,2016-07-07,Welcome to Sunset Suite,Your spacious room awaiting is with a Queen Si...,,Your spacious room awaiting is with a Queen Si...,none,Getting around is easy. Very close to Eastlake...,...,59.0,59.000000,59.000000,1.89154,167.591639,59.0,59.000000,59.000000,1.930178,165.159132
2,13124681,https://www.airbnb.com/rooms/13124681,20160706203047,2016-07-07,Townhome in Eastlake,This 2 Story TownHome is close to Otay Ranch ...,"My place is good for couples, business travele...",This 2 Story TownHome is close to Otay Ranch ...,none,"Located in eastern Chula Vista, Otay Ranch is ...",...,120.0,120.000000,120.000000,1.89154,167.591639,120.0,120.000000,120.000000,1.930178,165.159132
3,3469225,https://www.airbnb.com/rooms/3469225,20160706203047,2016-07-07,Bedroom suite in Large new home,Hello; we are offering a private secluded bedr...,"Beautiful, quiet award-winning suburban neighb...",Hello; we are offering a private secluded bedr...,none,"The quiet serenity; near Park and lakes, beaut...",...,80.0,70.000000,75.000000,1.89154,167.591639,80.0,70.000000,72.903226,1.930178,165.159132
4,877473,https://www.airbnb.com/rooms/877473,20160706203047,2016-07-07,Private Bedroom in Newer Home,"Private Bedroom with full size bed, + futon, p...","Private bedroom with full size Bed, in large n...","Private Bedroom with full size bed, + futon, p...",none,"It is quiet, serene, tranquil, suburban; yet c...",...,80.0,70.000000,74.666667,1.89154,167.591639,80.0,70.000000,72.641509,1.930178,165.159132
5,3124507,https://www.airbnb.com/rooms/3124507,20160706203047,2016-07-07,"Great view to mountains,lake,trails","In this beautiful home, located near the Olymp...",The location and view of lake and mountain,"In this beautiful home, located near the Olymp...",none,Like countryside and also near from downtown S...,...,90.0,90.000000,90.000000,1.89154,167.591639,90.0,90.000000,90.000000,1.930178,165.159132
6,3432507,https://www.airbnb.com/rooms/3432507,20160706203047,2016-07-07,2 bedrooms with shared bath,In a great house with view on Otay Lakes and ...,"the proximity to lake ,trail and Olympic train...",In a great house with view on Otay Lakes and ...,none,Freedom in nature,...,150.0,150.000000,150.000000,1.89154,167.591639,150.0,150.000000,150.000000,1.930178,165.159132
7,3249729,https://www.airbnb.com/rooms/3249729,20160706203047,2016-07-07,Near to Olympic Training center,Beautiful home in the Eastlake community with ...,Location on the border of the town with nice t...,Beautiful home in the Eastlake community with ...,none,"open , quiet, country feeling",...,90.0,90.000000,90.000000,1.89154,167.591639,90.0,90.000000,90.000000,1.930178,165.159132
8,13157364,https://www.airbnb.com/rooms/13157364,20160706203047,2016-07-07,Recamaras amuebladas con baño,"Habitaciones amuebladas con Closet, television...",,"Habitaciones amuebladas con Closet, television...",none,"Preciosa zona Residencial, en Eastlake Greens ...",...,110.0,110.000000,110.000000,1.89154,167.591639,110.0,110.000000,110.000000,1.930178,165.159132
9,9756570,https://www.airbnb.com/rooms/9756570,20160706203047,2016-07-07,Family Home in SoCal (Eastlake),Fully equipped home in a quiet family-friendly...,,Fully equipped home in a quiet family-friendly...,none,,...,300.0,300.000000,300.000000,1.89154,167.591639,300.0,300.000000,300.000000,1.930178,165.159132
