In [20]:
import datetime
import os

import nltk
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.metrics as skmet
from nltk.corpus import stopwords, names
from nltk.tokenize import sent_tokenize, word_tokenize
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import MultiTaskLasso, MultiTaskElasticNet, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [21]:
# Read the six CSV extracts from the inside_airbnb/ folder, in the same order
# as the names on the left-hand side.
listings, list_summ, neighborhoods, reviews, reviews_summ, calendar = (
    pd.read_csv(csv_path)
    for csv_path in (
        'inside_airbnb/listings.csv',
        'inside_airbnb/listings_summ.csv',
        'inside_airbnb/neighbourhoods.csv',
        'inside_airbnb/reviews.csv',
        'inside_airbnb/reviews_summ.csv',
        'inside_airbnb/calendar.csv',
    )
)

In [22]:
# Show the column index of the listings frame to see which fields are available.
listings.columns

Index([u'id', u'listing_url', u'scrape_id', u'last_scraped', u'name',
       u'summary', u'space', u'description', u'experiences_offered',
       u'neighborhood_overview', u'notes', u'transit', u'access',
       u'interaction', u'house_rules', u'thumbnail_url', u'medium_url',
       u'picture_url', u'xl_picture_url', u'host_id', u'host_url',
       u'host_name', u'host_since', u'host_location', u'host_about',
       u'host_response_time', u'host_response_rate', u'host_acceptance_rate',
       u'host_is_superhost', u'host_thumbnail_url', u'host_picture_url',
       u'host_neighbourhood', u'host_listings_count',
       u'host_total_listings_count', u'host_verifications',
       u'host_has_profile_pic', u'host_identity_verified', u'street',
       u'neighbourhood', u'neighbourhood_cleansed',
       u'neighbourhood_group_cleansed', u'city', u'state', u'zipcode',
       u'market', u'smart_location', u'country_code', u'country', u'latitude',
       u'longitude', u'is_location_exact', u'prope

In [23]:
def _strip_number_symbols(value):
    """Return ``value`` as text without surrounding blanks, a leading '$', a trailing '%' or commas."""
    return str(value).strip().lstrip('$').rstrip('%').replace(',', '')


def parse_columns(listings, cols):
    """Convert text columns holding amounts such as "$1,200.00" or "95%" to numbers.

    Parameters
    ----------
    listings : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    cols : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``listings`` object, with every column in ``cols`` numeric.
        Percentages keep their scale ("95%" becomes 95.0). Values that are
        still not numeric after the symbols are removed become NaN.
    """
    for col in cols:
        # Remove the symbols *before* converting: to_numeric(errors='coerce')
        # turns any string that still contains '$', '%' or ',' into NaN.
        cleaned = listings[col].map(_strip_number_symbols)
        listings[col] = pd.to_numeric(cleaned, errors='coerce')
    return listings

In [24]:
# Run parse_columns over the rate and fee columns of `listings`.
listings = parse_columns(
    listings,
    [
        'host_response_rate',
        'cleaning_fee',
        'host_acceptance_rate',
        'extra_people',
        'weekly_price',
        'monthly_price',
        'security_deposit',
    ],
)

In [25]:
# `cal` is a second name for the calendar DataFrame, not a copy: column
# assignments made through `cal` change the same object.
# NOTE(review): a later cell imports USFederalHolidayCalendar under the name
# `calendar`, so re-running this cell after that import would bind `cal` to the
# holiday class instead of the DataFrame. Run the notebook top to bottom.
cal = calendar

In [26]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [35]:
# Convert the price text to numbers. The '$' prefix and any thousands
# separators must be removed before conversion: with errors='coerce' a value
# such as "$1,200.00" would otherwise silently become NaN.
cal['price'] = pd.to_numeric(
    cal['price'].map(lambda raw: str(raw).strip().lstrip('$').replace(',', '')),
    errors='coerce',
)

# Parse the dates once, then derive the calendar features with the .dt accessor.
cal['date'] = pd.to_datetime(cal['date'])
cal['month'] = cal['date'].dt.month
cal['day'] = cal['date'].dt.day

# Series.dt.weekday_name no longer exists in pandas >= 1.0. Mapping dayofweek
# (Monday == 0) to English names gives the same labels on old and new pandas.
_WEEKDAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday', 'Sunday']
cal['day_of_week'] = cal['date'].dt.dayofweek.map(dict(enumerate(_WEEKDAY_NAMES)))
    
# Build the US federal holiday dates covering the full span of calendar dates.
# The class is referenced by its real name so this cell does not depend on the
# name `calendar`, which is a DataFrame in one cell and an import alias in another.
cl = USFederalHolidayCalendar()
holidays = cl.holidays(start=cal['date'].min(), end=cal['date'].max())

# True where the calendar date is one of the federal holidays.
cal['holiday'] = cal['date'].isin(holidays)

# Keep only dates strictly between the two bounds. .copy() makes the result an
# independent frame, so re-running the column assignments above on it does not
# trigger SettingWithCopyWarning.
cal = cal[(cal['date'] > '2016-07-06') & (cal['date'] < '2016-10-06')].copy()
    
# Drop the rows whose `available` flag is 'f' and keep only the columns used below.
c = cal.loc[cal.available != 'f',
            ['listing_id', 'date', 'price', 'month', 'day_of_week', 'holiday']].copy()

# Fill missing prices with the overall mean price. Only `price` is averaged:
# taking the mean of the whole frame also touches the date and day-name
# columns, which raises in recent pandas versions.
c['price'] = c['price'].fillna(c['price'].mean())

# Split the rows into three disjoint groups: holidays, non-holiday weekends,
# and the remaining weekdays.
is_weekend = c['day_of_week'].isin(['Saturday', 'Sunday'])
c_hol = c[c['holiday']]
c_wke = c[~c['holiday'] & is_weekend]
# Row-level boolean masks are required here. DataFrame.isin(Series) aligns on
# the index and returns a frame-shaped mask, which blanks individual cells
# instead of removing rows, so weekend and holiday prices would stay in the
# weekday subset.
c_wkd = c[~c['holiday'] & ~is_weekend]

def _price_stats(frame):
    """Summarise `price` per `listing_id` for the rows of ``frame``.

    Returns a dict of Series, each indexed by listing_id, with the keys
    'price' (mean), 'max_price', 'min_price', 'stdev_of_price',
    'skew_of_price' and 'kurtosis_of_price'.
    """
    prices = frame.groupby(by='listing_id')['price']
    return {'price': prices.mean(),
            'max_price': prices.max(),
            'min_price': prices.min(),
            'stdev_of_price': prices.std(),
            'skew_of_price': prices.skew(),
            # Kurtosis is taken on the price Series only. Applying
            # DataFrame.kurt to each group would also try to process the
            # date and day-name columns.
            'kurtosis_of_price': prices.apply(pd.Series.kurt)}


# Same statistics for each subset of days, and for all days together.
price_hol_dict = _price_stats(c_hol)
price_wke_dict = _price_stats(c_wke)
price_wkd_dict = _price_stats(c_wkd)
price_whole_dict = _price_stats(c)



# Turn each dict of per-listing Series into a table, moving the listing_id
# index into an ordinary column so it can be used as a join key.
price_hol = pd.DataFrame(price_hol_dict).reset_index()
price_wke = pd.DataFrame(price_wke_dict).reset_index()
price_wkd = pd.DataFrame(price_wkd_dict).reset_index()
price_c = pd.DataFrame(price_whole_dict).reset_index()


def _with_listing_details(price_table):
    """Inner-join `listings` (key `id`) with a price table (key `listing_id`)."""
    return listings.merge(price_table, how='inner', left_on='id', right_on='listing_id')


listings_hol = _with_listing_details(price_hol)
listings_wke = _with_listing_details(price_wke)
listings_wkd = _with_listing_details(price_wkd)
listings_c = _with_listing_details(price_c)

In [36]:
# Display the four merged frames together as one tuple.
# NOTE(review): this prints every row of all four frames as plain text;
# consider showing `.shape` or `.head()` for each frame instead.
listings_hol, listings_wke, listings_wkd, listings_c

(            id                            listing_url       scrape_id  \
 0     11204286  https://www.airbnb.com/rooms/11204286  20160706203047   
 1      7972006   https://www.airbnb.com/rooms/7972006  20160706203047   
 2     13124681  https://www.airbnb.com/rooms/13124681  20160706203047   
 3      3469225   https://www.airbnb.com/rooms/3469225  20160706203047   
 4       877473    https://www.airbnb.com/rooms/877473  20160706203047   
 5      3124507   https://www.airbnb.com/rooms/3124507  20160706203047   
 6      3432507   https://www.airbnb.com/rooms/3432507  20160706203047   
 7      3249729   https://www.airbnb.com/rooms/3249729  20160706203047   
 8     13157364  https://www.airbnb.com/rooms/13157364  20160706203047   
 9      9756570   https://www.airbnb.com/rooms/9756570  20160706203047   
 10    10687910  https://www.airbnb.com/rooms/10687910  20160706203047   
 11     7938260   https://www.airbnb.com/rooms/7938260  20160706203047   
 12    10779932  https://www.airbnb.co

In [37]:
# Display the listings joined with their holiday price statistics.
# In the rows shown below, max_price, min_price and price_y are equal and the
# stdev/skew/kurtosis columns are NaN. Presumably each of these listings has a
# single holiday row in the selected date window. TODO confirm.
listings_hol

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,listing_id,kurtosis_of_price,max_price,min_price,price_y,skew_of_price,stdev_of_price
0,11204286,https://www.airbnb.com/rooms/11204286,20160706203047,2016-07-07,Family friendly/California king,"Aquatica Waterpark, Sleep train Amphitheater, ...","Walking to Aquatica Waterpark, Sleep train Amp...","Aquatica Waterpark, Sleep train Amphitheater, ...",none,,...,f,2,4.57,11204286,,49.0,49.0,49.0,,
1,7972006,https://www.airbnb.com/rooms/7972006,20160706203047,2016-07-07,Welcome to Sunset Suite,Your spacious room awaiting is with a Queen Si...,,Your spacious room awaiting is with a Queen Si...,none,Getting around is easy. Very close to Eastlake...,...,f,1,0.76,7972006,,59.0,59.0,59.0,,
2,13124681,https://www.airbnb.com/rooms/13124681,20160706203047,2016-07-07,Townhome in Eastlake,This 2 Story TownHome is close to Otay Ranch ...,"My place is good for couples, business travele...",This 2 Story TownHome is close to Otay Ranch ...,none,"Located in eastern Chula Vista, Otay Ranch is ...",...,f,1,3.00,13124681,,120.0,120.0,120.0,,
3,3469225,https://www.airbnb.com/rooms/3469225,20160706203047,2016-07-07,Bedroom suite in Large new home,Hello; we are offering a private secluded bedr...,"Beautiful, quiet award-winning suburban neighb...",Hello; we are offering a private secluded bedr...,none,"The quiet serenity; near Park and lakes, beaut...",...,f,2,,3469225,,70.0,70.0,70.0,,
4,877473,https://www.airbnb.com/rooms/877473,20160706203047,2016-07-07,Private Bedroom in Newer Home,"Private Bedroom with full size bed, + futon, p...","Private bedroom with full size Bed, in large n...","Private Bedroom with full size bed, + futon, p...",none,"It is quiet, serene, tranquil, suburban; yet c...",...,f,2,0.63,877473,,70.0,70.0,70.0,,
5,3124507,https://www.airbnb.com/rooms/3124507,20160706203047,2016-07-07,"Great view to mountains,lake,trails","In this beautiful home, located near the Olymp...",The location and view of lake and mountain,"In this beautiful home, located near the Olymp...",none,Like countryside and also near from downtown S...,...,f,3,0.31,3124507,,90.0,90.0,90.0,,
6,3432507,https://www.airbnb.com/rooms/3432507,20160706203047,2016-07-07,2 bedrooms with shared bath,In a great house with view on Otay Lakes and ...,"the proximity to lake ,trail and Olympic train...",In a great house with view on Otay Lakes and ...,none,Freedom in nature,...,f,3,0.04,3432507,,150.0,150.0,150.0,,
7,3249729,https://www.airbnb.com/rooms/3249729,20160706203047,2016-07-07,Near to Olympic Training center,Beautiful home in the Eastlake community with ...,Location on the border of the town with nice t...,Beautiful home in the Eastlake community with ...,none,"open , quiet, country feeling",...,f,3,0.08,3249729,,90.0,90.0,90.0,,
8,13157364,https://www.airbnb.com/rooms/13157364,20160706203047,2016-07-07,Recamaras amuebladas con baño,"Habitaciones amuebladas con Closet, television...",,"Habitaciones amuebladas con Closet, television...",none,"Preciosa zona Residencial, en Eastlake Greens ...",...,f,1,,13157364,,110.0,110.0,110.0,,
9,9756570,https://www.airbnb.com/rooms/9756570,20160706203047,2016-07-07,Family Home in SoCal (Eastlake),Fully equipped home in a quiet family-friendly...,,Fully equipped home in a quiet family-friendly...,none,,...,f,1,0.32,9756570,,300.0,300.0,300.0,,
