In [170]:
import sys
import pandas as pd
import numpy as np

# Cleaning the listing data

In [171]:
# Columns of the raw listings export that the rest of the notebook works with.
listingColumns = [
    'id', 'description', 'host_is_superhost', 'zipcode', 'property_type',
    'room_type', 'bathrooms', 'bedrooms', 'beds', 'price',
    'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
    'review_scores_rating',
]

# Load the listings and keep only those columns, in the order given above.
df = pd.read_csv('../data/listings.csv')[listingColumns]

### Drop unknown zipcodes and unwanted property types

In [172]:
# Property types that are kept; every other type is filtered out below.
acceptedPropTypes = [
    'Apartment', 'House', 'Villa', 'Chalet', 'Condominium', 'Townhouse',
    'Other', 'Loft', 'Bungalow', 'Guesthouse', 'Cabin', 'Serviced apartment',
    'Earth House', 'Nature lodge',
]

# A listing is kept only when it has a zipcode, is rented as an entire
# home/apartment and has one of the accepted property types.
hasZipcode = df['zipcode'].notna()
isEntireHome = df['room_type'] == 'Entire home/apt'
isAcceptedType = df['property_type'].isin(acceptedPropTypes)

df = df[hasZipcode & isEntireHome & isAcceptedType]

### Cast the money values to floats and the flag columns to proper booleans

In [173]:
def moneyToFloat(values):
    """Convert a column of money strings such as '$1,200.00' into numbers.

    The values are cast to str, every '$' and ',' is removed and the text
    'nan' is replaced by '0' (NaN cells are expected to read 'nan' after the
    str cast), so a missing amount is counted as 0.

    Parameters
    ----------
    values : pandas.Series
        Column holding the raw money values.

    Returns
    -------
    numpy.ndarray
        The parsed numbers, in the same row order as `values`.

    Raises
    ------
    ValueError
        If a cleaned value still cannot be parsed as a number.
    """
    cleaned = (values.astype(str)
               .str.replace('$', '', regex=False)
               .str.replace(',', '', regex=False)
               .str.replace('nan', '0', regex=False))
    # Hand over the raw values so the result is assigned positionally.
    return pd.to_numeric(cleaned.values, errors='raise')


# Columns that arrive as money strings and have to become numeric.
moneyColumns = ['cleaning_fee', 'security_deposit', 'price', 'extra_people']

for moneyColumn in moneyColumns:
    converted = moneyToFloat(df[moneyColumn])
    # Drop and re-add the column so it moves to the end of the frame.
    df = df.drop([moneyColumn], axis=1)
    df[moneyColumn] = converted

# NOTE(review): the original author marked the two casts below as
# "WRONG CASTINGS". Missing host_is_superhost values presumably stay NaN
# instead of becoming False -- confirm the intended handling.
df['host_is_superhost'] = df['host_is_superhost'].str.contains('t', regex=False)
# The deposit amount is reduced to a flag: True when a deposit is charged.
df['security_deposit'] = df['security_deposit'] > 0

### Create seaViews and groupedPrice columns (including extras)

In [174]:
# Flag listings whose description mentions a sea view (the pattern lists
# Spanish, English and German phrasings), then drop the free text.
seaViews = df['description'].str.contains('vistas al mar|sea view|seaview|mit Meeresblick|Blick aufs Meer', regex=True)
df = df.drop(['description'], axis=1)
df['seaViews'] = seaViews

# TODO: should first include all the 'guests_included' AND 'extra_people'!!

# The price that gets grouped is the listed price plus the cleaning fee.
df['price'] = df['price'] + df['cleaning_fee']

# Bin edges at the 0th, 33rd, 67th and 100th percentile of that price.
a, b, c, d = np.percentile(df.price, [0, 33, 67, 100])

# include_lowest=True closes the first bin on the left. Without it the
# listing(s) priced exactly at the minimum (edge `a`) fall outside every
# bin and get NaN as their group.
df['groupedPrice'] = pd.cut(df['price'], [a, b, c, d],
                            labels=['lowCost', 'medium', 'premium'],
                            include_lowest=True)

### Deal with the missing reviews

### Extract the availability feature

In [175]:
# Load the calendar export and keep only the rows that belong to a listing
# still present in `df`.
dfcal = (
    pd.read_csv('../data/15_03_2017_calendar.csv')
      .loc[lambda calendar: calendar['listing_id'].isin(df['id'])]
)

IOError: File ../data/15_03_2017_calendar.csv does not exist

In [180]:
# Count, per listing, the calendar rows marked available ('t'). The flag is
# summed rather than filtering the rows first, so a listing with no available
# day at all still gets a row with a count of 0 instead of vanishing (and
# turning into NaN after the join).
availableFlags = dfcal.assign(available=(dfcal.available == 't').astype(int))
temp = availableFlags.groupby(['listing_id'], as_index=False).agg({'available': 'sum'})

# Share of available days as a percentage. 100.0 forces float arithmetic so
# the result is not truncated by integer division on Python 2.
# NOTE(review): assumes the calendar holds 365 rows per listing -- confirm.
temp['availability'] = temp.available * 100.0 / 365

### Join both dataframes into "final"

In [181]:
# Attach the availability figures to the listings: both frames are indexed by
# the listing id and joined on that index (listings without a match keep NaN).
listingsById = df.set_index('id')
availabilityById = temp.set_index('listing_id')
final = listingsById.join(availabilityById)

# Feature columns that make up the final table, in output order.
featureColumns = [
    'zipcode', 'property_type', 'bathrooms', 'bedrooms', 'beds',
    'guests_included', 'review_scores_rating', 'host_is_superhost',
    'security_deposit', 'seaViews', 'availability', 'groupedPrice',
]
final = final[featureColumns]

In [182]:
# Show the names of the feature columns as a plain Python list.
# (final.describe(include='all') gives a fuller summary when needed.)
final.columns.tolist()

['zipcode',
 'property_type',
 'bathrooms',
 'bedrooms',
 'beds',
 'guests_included',
 'review_scores_rating',
 'host_is_superhost',
 'security_deposit',
 'seaViews',
 'availability',
 'groupedPrice']

In [183]:
# Encode the three flag columns numerically: multiplying by 1 turns True into
# 1 and False into 0.
flagColumns = ['host_is_superhost', 'security_deposit', 'seaViews']
for flagColumn in flagColumns:
    final[flagColumn] = final[flagColumn].apply(lambda flag: flag * 1)

# Write the feature table; the index (the listing id) is not exported.
final.to_csv('../data/features.csv', index=False)