Data Preparation Notebook Consolidating All Data Prep code into linear process

In [1]:
import json
from StringIO import StringIO
import pandas as pd
import numpy as np
import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [2]:
listings_original = pd.read_csv('Datasources/inside_airbnb/listings.csv')

In [3]:
def parse_columns(listings, cols):
    chars = "%$"
    for i in cols:
        listings[i] = listings[i].astype(str).map(lambda x: x.rstrip(chars))
        listings[i] = listings[i].astype(str).map(lambda x: x.lstrip(chars))
        listings[i] = listings[i].apply(pd.to_numeric, errors='coerce')
        listings[i].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    return listings        

In [6]:
listings = parse_columns(listings_original, ['host_response_rate', 'cleaning_fee',
                                     'host_acceptance_rate','extra_people',
                                     'weekly_price', 'monthly_price', 'security_deposit','price'])

In [7]:
def encoder(listings, encoded_features):
    
    label_enc = LabelEncoder()
    
    for col in encoded_features:
        
        listings[col] = listings[col].astype(str)
        
        var_name = str(col) + '_enc'
        listings[var_name] = label_enc.fit_transform(listings[col])
    
    return listings

In [8]:
encoded_vars = ['host_response_time', 'calendar_updated', 'bed_type', 'jurisdiction_names', 'zipcode',
               'cancellation_policy']

In [9]:
new9 = listings.copy()
new9 = encoder(listings, encoded_vars)

In [10]:
#Caution!!! The input features are not dropped by the following to columns - they must be dropped as part of modeling
#5/23/18 the unwanted columns are detected and dropped in the featureExplorastio notebook

In [11]:
def binarizer(listings, binarized_features):
    
    label_enc = LabelBinarizer()
    
    for col in binarized_features:
        
        listings[col] = listings[col].astype(str)
        
        var_name = str(col) + '_bin'
        listings[var_name] = label_enc.fit_transform(listings[col])
    
    return listings

In [12]:
binarized_vars = ['host_is_superhost','is_location_exact','host_has_profile_pic','host_identity_verified',
                  'instant_bookable','require_guest_profile_picture','require_guest_phone_verification']

In [13]:
new10 = new9.copy()
new10 = binarizer(new10, binarized_vars)

In [14]:
#takes list of features that should be numeric and transforms them to float
#Also takes care of the topic features - these need not be input into the features parameter
def make_numeric(listings):
    #Taking Care of topics features
    topic_cols = listings.filter(regex='Topic').columns
    listings[topic_cols] = listings[topic_cols].astype(float)
    
    return listings

In [15]:
new11 = new10.copy()
new11 = make_numeric(new11)

In [16]:
new11 = new11.rename(columns = {'listing_id_x': 'id'})

In [17]:
from collections import defaultdict

In [18]:
col_counts = defaultdict(int)
col_ix = new11.first_valid_index()

In [19]:
cols = []
for col in new11.ix[col_ix].index:
    cnt = col_counts[col]
    col_counts[col] += 1
    suf = '_' + str(cnt) if cnt else ''
    cols.append(col + suf)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [20]:
new11.columns = cols
new11 = new11.drop([col_ix])

In [21]:
#Keep the below line just in case
#new11 = new11.drop(columns= ['id'])

In [22]:
num_features = ['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 
               'beds', 'guests_included', 'minimum_nights',
               'maximum_nights', 'availability_30', 'availability_60','availability_90',
               'availability_365', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
               'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 
               'reviews_per_month', 'id',
               'host_listings_count', 'host_total_listings_count']

In [23]:
new11 = parse_columns(new11,num_features)

In [24]:
def calendar_update_parse(x):
    if x == 'today':
        return 0
    
    elif x == 'yesterday':
        return 1    
    
    else:        
        b = x.split(' ')
        
        if 'days' in b:
            a = int(b[0])
            return a
        
        elif 'week' in b:
            return 7
        
        elif 'weeks' in b:
            a = 7*int(b[0])
            return a
        
        elif 'month' in b:
            a = 365.25/12.0
            return np.floor(a)
        
        elif 'months' in b:
            a = (365.25*float(b[0]))/12.0
            return np.floor(a)
    pass
            

In [25]:
def add_calendar_updated_cleaned(listings):
    listings["calendar_updated_numeric"] = listings["calendar_updated"].map(calendar_update_parse)
    listings["calendar_updated_numeric"] = listings["calendar_updated_numeric"].fillna(listings["calendar_updated_numeric"].max())
    return listings

In [26]:
new14 = new11.copy()
new14 = add_calendar_updated_cleaned(new14)

In [27]:
print "Listings Shape at Each Iteration"
print listings_original.shape
print listings.shape
print new9.shape
print new10.shape
print new11.shape
print new14.shape

Listings Shape at Each Iteration
(6608, 101)
(6608, 101)
(6608, 101)
(6608, 108)
(6607, 108)
(6607, 109)


In [29]:
new14.accommodates.dtypes

dtype('int64')

In [26]:
#Make sure to uncomment and update the count variable whenever needed

In [27]:
count = 0

In [28]:
import datetime
today = datetime.date.today()
count+=1
filename = 'Datasources/listings_augmented/listings_original_modified_' + str(today) + '_V' + str(count) + '.csv'

In [29]:
print filename

Datasources/listings_augmented/listings_original_modified_2018-06-04_V1.csv


In [30]:
new14.to_csv(filename, index=False)