Data Preparation Notebook Consolidating All Data Prep code into linear process

In [1]:
import json
from StringIO import StringIO
import pandas as pd

In [2]:
listings_original = pd.read_csv('Datasources/inside_airbnb/listings.csv')
calendar_original = pd.read_csv('Datasources/inside_airbnb/calendar.csv')

In [3]:
def parse_columns(listings, cols):
    chars = "%$"
    for i in cols:
        listings[i].astype(str).map(lambda x: x.rstrip(chars))
        listings[i] = listings[i].apply(pd.to_numeric, errors='coerce')
        listings[i].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    return listings        

In [4]:
listings = parse_columns(listings_original, ['host_response_rate', 'cleaning_fee',\
                                     'host_acceptance_rate','extra_people',\
                                     'weekly_price', 'monthly_price', 'security_deposit'])

In [5]:
#function to produce 4 listings dataframes (whole, holiday, wke, wkd) with listing mean price
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

def get_mean_price(cal, listings):
    
    cal['price'] = cal['price'].astype(str).map(lambda x: x.lstrip('$'))
    cal['price'] = cal['price'].apply(pd.to_numeric, errors='coerce')
    cal['price'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    cal['date'] = pd.to_datetime(cal['date'])
    cal['month'] = cal['date'].apply(lambda x: x.month)
    cal['day'] = cal['date'].apply(lambda x: x.day)
    cal['day_of_week'] = cal['date'].dt.weekday_name
    
    cl = calendar()
    holidays = cl.holidays(start=cal['date'].min(), end=cal['date'].max())
    
    cal['holiday'] = cal['date'].isin(holidays)
    cal = cal[(cal['date']>'2016-07-06')&(cal['date']<'2016-10-06')]
    
    c = cal.loc[cal.available!='f']
    c = c[['listing_id','date','price','month','day_of_week','holiday']]
    c=c.fillna(c.mean())
    
    c_hol = c[c['holiday']==True]
    c_wke = c[(c['holiday']==False)&((c['day_of_week']=='Sunday')|(c['day_of_week']=='Saturday'))]
    c_wkd = c[(~c.isin(c_hol['date']))&(~c.isin(c_wke['date']))]


    price_hol_dict = {'price': c_hol.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_hol.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_hol.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_hol.groupby(by='listing_id')['price'].fillna(0).std(),                 
                  'skew_of_price': c_hol.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_hol.groupby(by='listing_id')['price'].median()}


    price_wke_dict = {'price': c_wke.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_wke.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_wke.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_wke.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c_wke.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_wke.groupby(by='listing_id')['price'].median()}


    price_wkd_dict = {'price': c_wkd.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_wkd.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_wkd.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_wkd.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c_wkd.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_wkd.groupby(by='listing_id')['price'].median()}


    price_whole_dict = {'price': c.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c.groupby(by='listing_id')['price'].max(), 
                  'min_price': c.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c.groupby(by='listing_id')['price'].fillna(0).skew(),
                       'median_price': c.groupby(by='listing_id')['price'].median()}



    price_hol = pd.DataFrame(price_hol_dict)
    price_wke = pd.DataFrame(price_wke_dict)
    price_wkd = pd.DataFrame(price_wkd_dict)
    price_c = pd.DataFrame(price_whole_dict)    
    
    price_hol = price_hol.reset_index()
    price_wke = price_wke.reset_index()
    price_wkd = price_wkd.reset_index()
    price_c = price_c.reset_index()

    listings_hol = listings.merge(price_hol, how='inner', left_on='id', right_on='listing_id')
    listings_wke = listings.merge(price_wke, how='inner', left_on='id', right_on='listing_id')
    listings_wkd = listings.merge(price_wkd, how='inner', left_on='id', right_on='listing_id')
    listings_c = listings.merge(price_c, how='inner', left_on='id', right_on='listing_id')
    
    price_hol_new = price_hol.rename(columns = {'max_price': 'hol_max_price', 'min_price': 'hol_min_price', 'price': 'hol_price',
                                           'skew_of_price': 'hol_skew_of_price', 'stdev_of_price': 'hol_stdev_of_price'})
    price_wke_new = price_wke.rename(columns = {'max_price': 'wke_max_price', 'min_price': 'wke_min_price', 'price': 'wke_price',
                                           'skew_of_price': 'wke_skew_of_price', 'stdev_of_price': 'wke_stdev_of_price'})
    price_wkd_new = price_wkd.rename(columns = {'max_price': 'wkd_max_price', 'min_price': 'wkd_min_price', 'price': 'wkd_price',
                                           'skew_of_price': 'wkd_skew_of_price', 'stdev_of_price': 'wkd_stdev_of_price'})
    
    listings_c = listings_c.merge(price_hol_new, how='outer', left_on='listing_id', right_on='listing_id')
    listings_c = listings_c.merge(price_wke_new, how='outer', left_on='listing_id', right_on='listing_id')
    listings_c = listings_c.merge(price_wkd_new, how='outer', left_on='listing_id', right_on='listing_id')
    
    #len(cal['listing_id'].astype(str).unique())
    #count = len(c['listing_id'].astype(str).unique())
    
    #print('Due to the above filtering on calendar, the right total count of listings is: ' %(count))
    
    return listings_hol, listings_wke, listings_wkd, listings_c

In [6]:
listings_hol, listings_wke, listings_wkd, listings = get_mean_price(calendar_original, listings)

In [120]:
listings.loc[4325,:]

id                                                                            3667220
listing_url                                      https://www.airbnb.com/rooms/3667220
scrape_id                                                              20160706203047
last_scraped                                                               2016-07-07
name                                                       New House in Mission Hills
summary                             Brand New Home, 3 Master bedrooms with walk in...
space                               Brand New Home, 2300 sqft model home in the HE...
description                         Brand New Home, 3 Master bedrooms with walk in...
experiences_offered                                                              none
neighborhood_overview               Mission Hills is an amazing spot.  I just move...
notes                                                           10 min from Comic-Con
transit                             Airport is just 5-

In [7]:
import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
#use n components in place of n topics when using gridsearchcv
def create_topics(pdseries, listings):
        corpus = pdseries.fillna('none')
        
        vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )
        
        data_vectorized = vectorizer.fit_transform(corpus)
        
        lda_model = LatentDirichletAllocation(n_topics=20,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
        
        lda_output = lda_model.fit_transform(data_vectorized)

        # column names
        col_name = pd.DataFrame(pdseries).columns[0]
        topicnames = [str(col_name) + "-" + "Topic" + str(i) for i in range(lda_model.n_topics)]

        # index names
        docnames = [str(col_name) + "-" + "Doc" + str(i) for i in range(len(corpus))]

        # Make the pandas dataframe
        df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

        # Get dominant topic for each document
        dominant_topic = np.argmax(df_document_topic.values, axis=1)
        df_document_topic[str(col_name) + "-" + 'Dominant_Topic'] = dominant_topic
        
        df_document_topic.index = [i for i in range(len(df_document_topic))]
        
        df_document_topic = df_document_topic.fillna(0)
        
        out = df_document_topic.merge(listings, left_index=True, right_index=True)
        out = out.astype('str')
        return out

In [10]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new = listings.copy()
for i in text_features:
    new = create_topics(listings[i], new)



In [51]:
new.loc[4325,:]

house_rules-Topic0                                 0.0
house_rules-Topic1                                0.26
house_rules-Topic2                                 0.0
house_rules-Topic3                                 0.0
house_rules-Topic4                                 0.0
house_rules-Topic5                                 0.0
house_rules-Topic6                                 0.0
house_rules-Topic7                                 0.0
house_rules-Topic8                                 0.0
house_rules-Topic9                                0.68
house_rules-Topic10                                0.0
house_rules-Topic11                                0.0
house_rules-Topic12                                0.0
house_rules-Topic13                                0.0
house_rules-Topic14                                0.0
house_rules-Topic15                                0.0
house_rules-Topic16                                0.0
house_rules-Topic17                                0.0
house_rule

In [11]:
import nltk
from nltk.corpus import stopwords
import re

def create_txt_features(pdseries, listings):
    
    textLength = []
    textWordsPerc = []
    textPuncPerc = []
    textDigitsPerc = []

    for i in pdseries:
        tokens = re.findall(r"[\w']+|[.,!?;]", i)
        textLength.append(len(tokens))

        if len(tokens)==0:
            textWordsPerc.append(0)
            textPuncPerc.append(0)
            textDigitsPerc.append(0)

        else:
            textWordsPerc.append(len(i.split())/float(len(tokens)))
            textPuncPerc.append(len(''.join(c for c in i if c in string.punctuation))/float(len(tokens)))
            textDigitsPerc.append(len(''.join(c for c in i if c in string.digits))/float(len(tokens)))

    col_name = pd.DataFrame(pdseries).columns[0]
    
    textLength_varname = str(col_name) + '_TextLength'
    textWordsPerc_varname = str(col_name) + '_TextWordsPerc'
    textPuncPerc_varname = str(col_name) + '_TextPuncPerc'
    textDigitsPerc_varname = str(col_name) + '_TextDigitsPerc'
    
    listings[textLength_varname] = textLength
    listings[textWordsPerc_varname] = textWordsPerc
    listings[textPuncPerc_varname] = textPuncPerc
    listings[textDigitsPerc_varname] = textDigitsPerc
    
    return listings

In [12]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new2 = new.copy()
for i in text_features:
    new2 = create_txt_features(new[i], new2)

In [52]:
new2.loc[4325,:]

house_rules-Topic0                            0.0
house_rules-Topic1                           0.26
house_rules-Topic2                            0.0
house_rules-Topic3                            0.0
house_rules-Topic4                            0.0
house_rules-Topic5                            0.0
house_rules-Topic6                            0.0
house_rules-Topic7                            0.0
house_rules-Topic8                            0.0
house_rules-Topic9                           0.68
house_rules-Topic10                           0.0
house_rules-Topic11                           0.0
house_rules-Topic12                           0.0
house_rules-Topic13                           0.0
house_rules-Topic14                           0.0
house_rules-Topic15                           0.0
house_rules-Topic16                           0.0
house_rules-Topic17                           0.0
house_rules-Topic18                           0.0
house_rules-Topic19                           0.0


In [13]:
def lexical_diversity(pdseries, listings):
    
    col_name = pd.DataFrame(pdseries).columns[0]
    varname = str(col_name) + "_LexicalDiversity"
    
    lx_div = pd.Series([len(i)/len(set(i)) for i in pdseries])
    listings[varname] = lx_div
    
    return listings

In [14]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new3 = new2.copy()
for i in text_features:
    new3 = lexical_diversity(new2[i], new3)

In [53]:
new3.loc[4325,:]

house_rules-Topic0                              0.0
house_rules-Topic1                             0.26
house_rules-Topic2                              0.0
house_rules-Topic3                              0.0
house_rules-Topic4                              0.0
house_rules-Topic5                              0.0
house_rules-Topic6                              0.0
house_rules-Topic7                              0.0
house_rules-Topic8                              0.0
house_rules-Topic9                             0.68
house_rules-Topic10                             0.0
house_rules-Topic11                             0.0
house_rules-Topic12                             0.0
house_rules-Topic13                             0.0
house_rules-Topic14                             0.0
house_rules-Topic15                             0.0
house_rules-Topic16                             0.0
house_rules-Topic17                             0.0
house_rules-Topic18                             0.0
house_rules-

In [16]:
def extract_grammar(pdseries, listings):
    
    import nltk
    from nltk.tag import pos_tag, map_tag
    from collections import Counter
      
    df = pd.DataFrame()
    for text in pdseries:
        
        col_name = pd.DataFrame(pdseries).columns[0]
        
        
        tokenized_text = nltk.word_tokenize(text.decode('utf-8'))
        grammar = [i[1] for i in nltk.pos_tag(tokenized_text, tagset='universal')]
        
        counter = Counter(grammar)
        fr = pd.DataFrame(counter, index=[0])
        fr.columns = [str(col_name) + '_' + str(i) for i in fr.columns]
        
        fr2 = fr/len(tokenized_text)
        fr2.columns = [str(i) + '_tokens_sum_ratio' for i in fr2.columns]
        
        fr3 = pd.concat([fr, fr2], ignore_index=True)
        
        df = pd.concat([df, fr3], ignore_index=True)
        
        
        
    df = df.fillna(0)
        
    return listings.merge(df, left_index=True, right_index=True)

In [17]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new4 = new3.copy()
for i in text_features:
    new4 = extract_grammar(new3[i], new4)

In [54]:
new4.loc[4325,:]

house_rules-Topic0                    0.0
house_rules-Topic1                   0.26
house_rules-Topic2                    0.0
house_rules-Topic3                    0.0
house_rules-Topic4                    0.0
house_rules-Topic5                    0.0
house_rules-Topic6                    0.0
house_rules-Topic7                    0.0
house_rules-Topic8                    0.0
house_rules-Topic9                   0.68
house_rules-Topic10                   0.0
house_rules-Topic11                   0.0
house_rules-Topic12                   0.0
house_rules-Topic13                   0.0
house_rules-Topic14                   0.0
house_rules-Topic15                   0.0
house_rules-Topic16                   0.0
house_rules-Topic17                   0.0
house_rules-Topic18                   0.0
house_rules-Topic19                   0.0
house_rules-Dominant_Topic              9
interaction-Topic0                    0.0
interaction-Topic1                    0.0
interaction-Topic2                

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def kmeans_Clusterer(pdseries, listings):
    
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(pdseries)
    true_k = 10
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)
    
    col_name = pd.DataFrame(pdseries).columns[0]
    varname = str(col_name) + "_KmeansCluster"
    
    listings[varname] = pd.Series(model.labels_)
    listings[varname] = listings[varname].fillna(0)
    
    return listings

In [19]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new5 = new4.copy()
for i in text_features:
    new5 = kmeans_Clusterer(new4[i], new5)

In [55]:
new5.loc[4325,:]

house_rules-Topic0                      0.0
house_rules-Topic1                     0.26
house_rules-Topic2                      0.0
house_rules-Topic3                      0.0
house_rules-Topic4                      0.0
house_rules-Topic5                      0.0
house_rules-Topic6                      0.0
house_rules-Topic7                      0.0
house_rules-Topic8                      0.0
house_rules-Topic9                     0.68
house_rules-Topic10                     0.0
house_rules-Topic11                     0.0
house_rules-Topic12                     0.0
house_rules-Topic13                     0.0
house_rules-Topic14                     0.0
house_rules-Topic15                     0.0
house_rules-Topic16                     0.0
house_rules-Topic17                     0.0
house_rules-Topic18                     0.0
house_rules-Topic19                     0.0
house_rules-Dominant_Topic                9
interaction-Topic0                      0.0
interaction-Topic1              

Amenities

In [20]:
def string_to_set(x):
    c = set()
    for w in x[1:-1].split(","):
        c.add(w)
        
    return c

def has_amenity(x, amen_):
    if amen_ in x:
        return 1
    pass

In [21]:
def add_amenities(listings):
    listings['amenities_set'] = listings['amenities'].fillna('{}').map(string_to_set)
    all_amenities = set()
    
    for idx in listings['amenities'].fillna('{}').map(string_to_set).index:
        all_amenities = all_amenities.union(listings['amenities'].fillna('{}').map(string_to_set)[idx])
        
    for amen in all_amenities:
        listings['has' + amen] = 0
        listings['has' + amen] = listings['amenities_set'].map(lambda x: has_amenity(x, amen))

    has_amenties_list = []
    for amen in all_amenities:
        has_amenties_list.append('has' + amen)

    listings[has_amenties_list] = listings[has_amenties_list].fillna(0)
    
    return listings      

In [22]:
new6 = new5.copy()

In [23]:
new6 = add_amenities(new6)

In [56]:
new6.loc[4325,:]

house_rules-Topic0                                  0.0
house_rules-Topic1                                 0.26
house_rules-Topic2                                  0.0
house_rules-Topic3                                  0.0
house_rules-Topic4                                  0.0
house_rules-Topic5                                  0.0
house_rules-Topic6                                  0.0
house_rules-Topic7                                  0.0
house_rules-Topic8                                  0.0
house_rules-Topic9                                 0.68
house_rules-Topic10                                 0.0
house_rules-Topic11                                 0.0
house_rules-Topic12                                 0.0
house_rules-Topic13                                 0.0
house_rules-Topic14                                 0.0
house_rules-Topic15                                 0.0
house_rules-Topic16                                 0.0
house_rules-Topic17                             

In [24]:
def add_host_verifications(listings):
    a = listings['host_verifications'].map(lambda x: x[1:-1]).map(lambda j: j.split(',')).map(lambda k: set(k))
    all_host_verifications = set()
    
    for w in a.index:
        all_host_verifications = all_host_verifications.union(a[w])
        
    for w in all_host_verifications:
        listings['uses' + w] = 0
        listings['uses' + w] = a.map(lambda x: has_amenity(x, w))
    
    
    uses_verification_list = []
    
    for veri in all_host_verifications:
        uses_verification_list.append('uses' + veri)
    
    listings[uses_verification_list] = listings[uses_verification_list].fillna(0)
    
    return listings      

In [42]:
new7 = new6.copy()
new7 = add_host_verifications(new7)

In [57]:
new7.loc[4325,:]

house_rules-Topic0                  0.0
house_rules-Topic1                 0.26
house_rules-Topic2                  0.0
house_rules-Topic3                  0.0
house_rules-Topic4                  0.0
house_rules-Topic5                  0.0
house_rules-Topic6                  0.0
house_rules-Topic7                  0.0
house_rules-Topic8                  0.0
house_rules-Topic9                 0.68
house_rules-Topic10                 0.0
house_rules-Topic11                 0.0
house_rules-Topic12                 0.0
house_rules-Topic13                 0.0
house_rules-Topic14                 0.0
house_rules-Topic15                 0.0
house_rules-Topic16                 0.0
house_rules-Topic17                 0.0
house_rules-Topic18                 0.0
house_rules-Topic19                 0.0
house_rules-Dominant_Topic            9
interaction-Topic0                  0.0
interaction-Topic1                  0.0
interaction-Topic2                  0.1
interaction-Topic3                  0.0


In [43]:
def add_distance_from_ocean(listings):
    listings['distance_from_ocean'] = 0
    listings['distance_from_ocean'] = listings['distance_from_ocean'].astype('float')
    
    for w in listings.index:
        p = float(listings['latitude'][w])
        q = float(listings['longitude'][w])
        lon_diff = (q + 117.235585)*np.pi/180
        lat_diff = (p - 32.802458)*np.pi/180
        a = np.sin(lat_diff/2)**2 + np.cos(p*np.pi/180)*np.cos(32.802458*np.pi/180)*(np.sin(lon_diff/2)**2)
        c = np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        d = 6371.00*float(c)
        listings['distance_from_ocean'][w] = d
        
    return listings

In [44]:
new8 = new7.copy()
new8 = add_distance_from_ocean(new8)

In [59]:
new8.loc[4324:4326,:]

Unnamed: 0,house_rules-Topic0,house_rules-Topic1,house_rules-Topic2,house_rules-Topic3,house_rules-Topic4,house_rules-Topic5,house_rules-Topic6,house_rules-Topic7,house_rules-Topic8,house_rules-Topic9,...,uses 'facebook',uses 'amex',uses'phone',uses 'sent_id',uses 'jumio',uses 'google',uses 'manual_offline',uses 'kba',uses 'reviews',distance_from_ocean
4324,0.0,0.11,0.0,0.0,0.05,0.0,0.07,0.0,0.0,0.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4325,0.0,0.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4326,0.15,0.23,0.0,0.0,0.05,0.0,0.0,0.17,0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [92]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [93]:
def encoder(listings, encoded_features):
    
    label_enc = LabelEncoder()
    
    for col in encoded_features:
        
        listings[col] = listings[col].astype(str)
        
        var_name = str(col) + '_enc'
        listings[var_name] = label_enc.fit_transform(listings[col])
    
    return listings

In [94]:
encoded_vars = ['host_response_time', 'calendar_updated', 'bed_type', 'jurisdiction_names', 'zipcode']

In [209]:
new9 = new8.copy()
new9 = encoder(new9, encoded_vars)

In [210]:
#Caution!!! The input features are not dropped by the following to columns - they must be dropped as part of modeling

In [211]:
def binarizer(listings, binarized_features):
    
    label_enc = LabelBinarizer()
    
    for col in binarized_features:
        
        listings[col] = listings[col].astype(str)
        
        var_name = str(col) + '_bin'
        listings[var_name] = label_enc.fit_transform(listings[col])
    
    return listings

In [212]:
binarized_vars = ['host_is_superhost','is_location_exact','host_has_profile_pic','host_identity_verified',
                  'instant_bookable','require_guest_profile_picture','require_guest_phone_verification']

In [213]:
new10 = new9.copy()
new10 = binarizer(new10, binarized_vars)

In [199]:
#takes list of features that should be numeric and transforms them to float
#Also takes care of the topic features - these need not be input into the features parameter
def make_numeric(listings, features):
    #Taking Care of topics features
    topic_cols = listings.filter(regex='Topic').columns
    listings[topic_cols] = listings[topic_cols].astype(float)
    
    #Now transforming the other features:
    for i in features:
        listings[i] = listings[i].astype(float)
    
    return listings

In [200]:
listings.wke_max_price.isnull().sum()

97

In [201]:
def parse_columns_new(listings, cols):
    
    topic_cols = listings.filter(regex='Topic').columns
    listings[topic_cols] = listings[topic_cols].astype(float)
    
    for i in cols:
        listings[i] = listings[i].replace('$', '')
        #listings[i] = listings[i].replace('%', '')
        listings[i] = listings[i].apply(pd.to_numeric, errors='coerce')
        listings[i].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    return listings

In [202]:
num_features = ['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 
               'beds', 'guests_included', 'minimum_nights',
               'maximum_nights', 'availability_30', 'availability_60','availability_90',
               'availability_365', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
               'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 
               'reviews_per_month', 'max_price', 'median_price_x', 'min_price', 'price_y', 'skew_of_price',
               'stdev_of_price', 'hol_max_price', 'median_price_y', 'hol_min_price', 'hol_price', 'hol_skew_of_price', 
               'hol_stdev_of_price', 'wke_max_price', 'wke_min_price', 'wke_price', 'wke_skew_of_price', 'wke_stdev_of_price',
               'wkd_max_price', 'wkd_min_price', 'wkd_skew_of_price', 'wkd_stdev_of_price', 'space_KmeansCluster', 
               'description_KmeansCluster']

In [214]:
new11 = new10.copy()
new11 = parse_columns_new(new11, num_features)
#new11 = make_numeric(new11, num_features)

In [219]:
for i in new10.columns:
    print i

house_rules-Topic0
house_rules-Topic1
house_rules-Topic2
house_rules-Topic3
house_rules-Topic4
house_rules-Topic5
house_rules-Topic6
house_rules-Topic7
house_rules-Topic8
house_rules-Topic9
house_rules-Topic10
house_rules-Topic11
house_rules-Topic12
house_rules-Topic13
house_rules-Topic14
house_rules-Topic15
house_rules-Topic16
house_rules-Topic17
house_rules-Topic18
house_rules-Topic19
house_rules-Dominant_Topic
interaction-Topic0
interaction-Topic1
interaction-Topic2
interaction-Topic3
interaction-Topic4
interaction-Topic5
interaction-Topic6
interaction-Topic7
interaction-Topic8
interaction-Topic9
interaction-Topic10
interaction-Topic11
interaction-Topic12
interaction-Topic13
interaction-Topic14
interaction-Topic15
interaction-Topic16
interaction-Topic17
interaction-Topic18
interaction-Topic19
interaction-Dominant_Topic
access-Topic0
access-Topic1
access-Topic2
access-Topic3
access-Topic4
access-Topic5
access-Topic6
access-Topic7
access-Topic8
access-Topic9
access-Topic10
access-Topi

In [35]:
#Corrections

In [36]:
#count = 0

In [215]:
import datetime
today = datetime.date.today()
count+=1
filename = 'listings_augmented_' + str(today) + '_V' + str(count) + '.csv'

In [216]:
print filename

listings_augmented_2018-05-17_V5.csv


In [48]:
"""For the date, you can use datetime.date.today() or datetime.datetime.now().date().

For the time, you can use datetime.datetime.now().time()."""

'For the date, you can use datetime.date.today() or datetime.datetime.now().date().\n\nFor the time, you can use datetime.datetime.now().time().'

In [217]:
new11.to_csv(filename)

In [218]:
print "Listings Shape at Each Iteration"
print listings_original.shape
print listings.shape
print new.shape
print new2.shape
print new3.shape
print new4.shape
print new5.shape
print new6.shape
print new7.shape
print new8.shape
print new9.shape
print new10.shape
print new11.shape

Listings Shape at Each Iteration
(6608, 95)
(5753, 120)
(5753, 288)
(5753, 320)
(5753, 328)
(5753, 520)
(5753, 528)
(5753, 572)
(5753, 585)
(5753, 586)
(5753, 591)
(5753, 598)
(5753, 598)
