Data Preparation Notebook: consolidates all data-preparation code into a single linear process.

In [1]:
import json
from StringIO import StringIO
import pandas as pd

In [2]:
# Read the raw listings and calendar exports, in that order.
listings_original, calendar_original = [
    pd.read_csv(csv_path)
    for csv_path in ('Datasources/inside_airbnb/listings.csv',
                     'Datasources/inside_airbnb/calendar.csv')
]

In [3]:
def parse_columns(listings, cols):
    """Convert currency / percentage text columns to numeric dtype.

    Values such as "$1,200.00" or "95%" have their "$", "%" and thousands
    separators removed and are then parsed as numbers. Anything that still
    cannot be parsed (e.g. "N/A" or a missing value) becomes NaN.

    Parameters
    ----------
    listings : pandas.DataFrame
        Frame containing the columns named in ``cols``. It is not modified.
    cols : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A copy of ``listings`` in which every column in ``cols`` is numeric.
    """
    # Work on a copy so re-running the calling cell always starts from raw data.
    cleaned = listings.copy()
    for col in cols:
        # Strip the symbols BEFORE parsing; parsing first would coerce every
        # "$..." / "...%" string to NaN.
        as_text = cleaned[col].astype(str).replace(
            to_replace=r'[$%,]', value='', regex=True)
        cleaned[col] = pd.to_numeric(as_text.str.strip(), errors='coerce')
    return cleaned

In [4]:
# Columns that parse_columns should convert to a numeric dtype.
numeric_text_columns = [
    'host_response_rate',
    'cleaning_fee',
    'host_acceptance_rate',
    'extra_people',
    'weekly_price',
    'monthly_price',
    'security_deposit',
]
listings = parse_columns(listings_original, numeric_text_columns)

In [5]:
#function to produce 4 listings dataframes (holiday, weekend, weekday, whole period), each with the per-listing mean price
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

def get_mean_price(cal, listings, start_date='2016-07-06', end_date='2016-10-06'):
    """Attach per-listing mean calendar prices to ``listings``.

    Calendar rows are restricted to the open interval
    (``start_date``, ``end_date``) and to rows whose ``available`` flag is not
    'f'. Missing prices among those rows are filled with the mean price of the
    remaining rows. Each day is then classified as a US federal holiday, a
    non-holiday Saturday/Sunday, or a regular weekday.

    Parameters
    ----------
    cal : pandas.DataFrame
        Calendar rows with columns 'listing_id', 'date', 'available' and
        'price' (price as text such as "$1,200.00"). It is not modified.
    listings : pandas.DataFrame
        Listings with an 'id' column matching ``cal['listing_id']``.
    start_date, end_date : str or datetime-like, optional
        Exclusive bounds of the date window; defaults keep the original
        2016-07-06 .. 2016-10-06 window.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(listings_hol, listings_wke, listings_wkd, listings_c)``:
        ``listings`` inner-merged with the mean price over holidays, weekends,
        weekdays and the whole window respectively. Listings without rows in
        a given bucket are dropped from that frame. The merged mean is in the
        'price' column; if ``listings`` already has a 'price' column, pandas'
        default merge suffixes ('_x' / '_y') apply.
    """
    # Copy so the caller's calendar frame is left untouched.
    cal = cal.copy()

    # Remove "$" and thousands separators before parsing; otherwise values
    # like "$1,200.00" would be coerced to NaN.
    price_text = cal['price'].astype(str).replace(
        to_replace=r'[$,]', value='', regex=True)
    cal['price'] = pd.to_numeric(price_text, errors='coerce')
    cal['date'] = pd.to_datetime(cal['date'])

    holidays = calendar().holidays(start=cal['date'].min(), end=cal['date'].max())
    cal['holiday'] = cal['date'].isin(holidays)
    # dayofweek: Monday=0 ... Saturday=5, Sunday=6.
    cal['weekend'] = cal['date'].dt.dayofweek >= 5

    in_window = ((cal['date'] > pd.Timestamp(start_date)) &
                 (cal['date'] < pd.Timestamp(end_date)))
    is_available = cal['available'] != 'f'
    c = cal.loc[in_window & is_available,
                ['listing_id', 'date', 'price', 'holiday', 'weekend']].copy()
    c['price'] = c['price'].fillna(c['price'].mean())

    # Row-wise boolean masks. Selecting with DataFrame.isin would only blank
    # individual cells and leave weekend/holiday prices in the weekday mean.
    is_holiday = c['holiday']
    is_weekend = ~c['holiday'] & c['weekend']
    is_weekday = ~c['holiday'] & ~c['weekend']

    def _attach_mean_price(rows):
        # Mean price per listing over `rows`, inner-joined onto the listings.
        mean_price = rows.groupby(by='listing_id')['price'].mean().reset_index()
        return listings.merge(mean_price, how='inner',
                              left_on='id', right_on='listing_id')

    listings_hol = _attach_mean_price(c[is_holiday])
    listings_wke = _attach_mean_price(c[is_weekend])
    listings_wkd = _attach_mean_price(c[is_weekday])
    listings_c = _attach_mean_price(c)

    return listings_hol, listings_wke, listings_wkd, listings_c

In [6]:
# get_mean_price returns (holiday, weekend, weekday, whole-period) frames;
# `listings` is rebound to the whole-period frame.
mean_price_frames = get_mean_price(calendar_original, listings)
listings_hol, listings_wke, listings_wkd, listings = mean_price_frames

In [7]:
import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
#when tuning with GridSearchCV, the LDA parameter is named n_components (not n_topics)
def create_topics(pdseries, listings, n_topics=20):
    """Fit an LDA topic model on one text column and merge the topic weights in.

    Parameters
    ----------
    pdseries : pandas.Series
        Text column to model; missing values are replaced by the word 'none'.
        Its index is expected to match the index of ``listings``.
    listings : pandas.DataFrame
        Frame the topic columns are merged onto (by index).
    n_topics : int, optional
        Number of LDA topics (default 20, the original hard-coded value).

    Returns
    -------
    pandas.DataFrame
        ``listings`` preceded by '<col>-Topic<k>' columns (document-topic
        weights rounded to 2 decimals) and '<col>-Dominant_Topic' (index of
        the largest rounded weight). Every column is cast to str.
    """
    corpus = pdseries.fillna('none')

    vectorizer = CountVectorizer(analyzer='word',
                                 min_df=10,                        # minimum required occurrences of a word
                                 stop_words='english',             # remove stop words
                                 lowercase=True,                   # convert all words to lowercase
                                 token_pattern='[a-zA-Z0-9]{3,}',  # tokens of at least 3 alphanumeric chars
                                 # max_features=50000,             # max number of unique words
                                 )

    data_vectorized = vectorizer.fit_transform(corpus)

    # `n_components` replaces the `n_topics` argument, which scikit-learn removed.
    lda_model = LatentDirichletAllocation(n_components=n_topics,   # number of topics
                                          max_iter=10,             # max learning iterations
                                          learning_method='online',
                                          random_state=100,        # fixed seed for reproducibility
                                          batch_size=128,          # n docs in each learning iter
                                          evaluate_every=-1,       # never compute perplexity during fit
                                          n_jobs=-1,               # use all available CPUs
                                          )

    lda_output = lda_model.fit_transform(data_vectorized)

    # Column names are prefixed with the source column's name.
    col_name = pd.DataFrame(pdseries).columns[0]
    topicnames = [str(col_name) + "-" + "Topic" + str(k)
                  for k in range(lda_output.shape[1])]

    # Reuse the input's index so the index-based merge below pairs each
    # document with its own listing even when the index is not 0..n-1.
    df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                     columns=topicnames,
                                     index=corpus.index)

    # Dominant topic = position of the largest (rounded) weight per document.
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic[str(col_name) + "-" + 'Dominant_Topic'] = dominant_topic

    out = df_document_topic.merge(listings, left_index=True, right_index=True)
    # NOTE(review): this also turns numeric columns and NaN into text ('nan').
    # It is kept because later cells of this notebook iterate the text columns
    # as strings; revisit if numeric dtypes are needed downstream.
    out = out.astype('str')
    return out
        
        

In [40]:
#'experiences_offered' is left out because that column was all nulls
text_features = [
    'space',
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
    'access',
    'interaction',
    'house_rules',
]

# Add one set of topic columns per text column, accumulating them in `new`.
new = listings.copy()
for text_col in text_features:
    new = create_topics(listings[text_col], new)



In [41]:
# Inspect dtypes after the topic merge; create_topics casts its result to str,
# so every column is reported as object.
new.dtypes

house_rules-Topic0                  object
house_rules-Topic1                  object
house_rules-Topic2                  object
house_rules-Topic3                  object
house_rules-Topic4                  object
house_rules-Topic5                  object
house_rules-Topic6                  object
house_rules-Topic7                  object
house_rules-Topic8                  object
house_rules-Topic9                  object
house_rules-Topic10                 object
house_rules-Topic11                 object
house_rules-Topic12                 object
house_rules-Topic13                 object
house_rules-Topic14                 object
house_rules-Topic15                 object
house_rules-Topic16                 object
house_rules-Topic17                 object
house_rules-Topic18                 object
house_rules-Topic19                 object
house_rules-Dominant_Topic          object
interaction-Topic0                  object
interaction-Topic1                  object
interaction

In [42]:
import nltk
from nltk.corpus import stopwords
import re

def create_txt_features(pdseries, listings):
    """Add simple surface statistics of a text column to ``listings``.

    Each text is tokenised with the regex ``[\\w']+|[.,!?;]`` (runs of word
    characters/apostrophes, or one of the listed punctuation marks). Four
    columns, prefixed with the series' name, are written:

    * ``<col>_TextLength``     - number of tokens
    * ``<col>_TextWordsPerc``  - whitespace-separated words / tokens
    * ``<col>_TextPuncPerc``   - punctuation characters / tokens
    * ``<col>_TextDigitsPerc`` - digit characters / tokens

    Missing (None/NaN) and empty texts get 0 for all four values.

    Parameters
    ----------
    pdseries : pandas.Series
        Text column; values are matched to ``listings`` rows by position.
    listings : pandas.DataFrame
        Frame receiving the new columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``listings`` object, with the four columns added.
    """
    textLength = []
    textWordsPerc = []
    textPuncPerc = []
    textDigitsPerc = []

    for raw_text in pdseries:
        # Treat missing values as empty text instead of failing in re.findall.
        text = '' if pd.isnull(raw_text) else raw_text
        tokens = re.findall(r"[\w']+|[.,!?;]", text)
        n_tokens = float(len(tokens))
        textLength.append(len(tokens))

        if not tokens:
            # No tokens: avoid dividing by zero.
            textWordsPerc.append(0.0)
            textPuncPerc.append(0.0)
            textDigitsPerc.append(0.0)
            continue

        n_punct = sum(1 for ch in text if ch in string.punctuation)
        n_digits = sum(1 for ch in text if ch in string.digits)
        textWordsPerc.append(len(text.split()) / n_tokens)
        textPuncPerc.append(n_punct / n_tokens)
        textDigitsPerc.append(n_digits / n_tokens)

    col_name = str(pd.DataFrame(pdseries).columns[0])

    listings[col_name + '_TextLength'] = textLength
    listings[col_name + '_TextWordsPerc'] = textWordsPerc
    listings[col_name + '_TextPuncPerc'] = textPuncPerc
    listings[col_name + '_TextDigitsPerc'] = textDigitsPerc

    return listings

In [43]:
#'experiences_offered' is left out because that column was all nulls
text_features = [
    'space',
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
    'access',
    'interaction',
    'house_rules',
]

# Add the surface text statistics for every text column, accumulating in `new2`.
new2 = new.copy()
for text_col in text_features:
    new2 = create_txt_features(new[text_col], new2)

In [51]:
def lexical_diversity(pdseries, listings):
    """Add a word-level lexical-diversity score for a text column.

    The score is ``total words / distinct words`` over the
    whitespace-separated words of each text: 1.0 means no word repeats, and
    larger values mean more repetition. Missing (None/NaN) and empty texts
    score 0.0.

    Parameters
    ----------
    pdseries : pandas.Series
        Text column; values are matched to ``listings`` rows by position.
    listings : pandas.DataFrame
        Frame receiving the '<col>_LexicalDiversity' column. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``listings`` object, with the new column added.
    """
    col_name = pd.DataFrame(pdseries).columns[0]
    varname = str(col_name) + "_LexicalDiversity"

    scores = []
    for raw_text in pdseries:
        # Split into words: iterating the raw string would compare characters,
        # which measures alphabet usage rather than vocabulary.
        words = [] if pd.isnull(raw_text) else raw_text.split()
        if words:
            # float() keeps true division under Python 2 as well.
            scores.append(len(words) / float(len(set(words))))
        else:
            scores.append(0.0)

    # Assign a plain list (positional) so a non-default index on `listings`
    # cannot misalign the values.
    listings[varname] = scores

    return listings

In [52]:
#'experiences_offered' is left out because that column was all nulls
text_features = [
    'space',
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
    'access',
    'interaction',
    'house_rules',
]

# Add the lexical-diversity score for every text column, accumulating in `new3`.
new3 = new2.copy()
for text_col in text_features:
    new3 = lexical_diversity(new2[text_col], new3)

In [53]:
# List the columns of the final augmented frame before it is exported.
new3.columns

Index([u'house_rules-Topic0', u'house_rules-Topic1', u'house_rules-Topic2',
       u'house_rules-Topic3', u'house_rules-Topic4', u'house_rules-Topic5',
       u'house_rules-Topic6', u'house_rules-Topic7', u'house_rules-Topic8',
       u'house_rules-Topic9',
       ...
       u'house_rules_TextPuncPerc', u'house_rules_TextDigitsPerc',
       u'space_LexicalDiversity', u'description_LexicalDiversity',
       u'neighborhood_overview_LexicalDiversity', u'notes_LexicalDiversity',
       u'transit_LexicalDiversity', u'access_LexicalDiversity',
       u'interaction_LexicalDiversity', u'house_rules_LexicalDiversity'],
      dtype='object', length=305)

In [54]:
import datetime
# Timestamp the output so successive runs do not overwrite each other.
# str(datetime) would put spaces and colons in the name, which are invalid in
# Windows file names and awkward in shells, so use a compact strftime format.
now = datetime.datetime.now()
filename = 'listings_augmented_' + now.strftime('%Y%m%d_%H%M%S') + '.csv'

In [55]:
# Write the augmented listings to the timestamped CSV file.
new3.to_csv(path_or_buf=filename)