Data Preparation Notebook Consolidating All Data Prep code into linear process

In [2]:
import json
from StringIO import StringIO
import pandas as pd

In [3]:
listings_original = pd.read_csv('../Datasources/inside_airbnb/listings.csv')
calendar_original = pd.read_csv('../Datasources/inside_airbnb/calendar.csv')

In [3]:
def parse_columns(listings, cols):
    chars = "%$"
    for i in cols:
        listings[i] = listings[i].astype(str).map(lambda x: x.rstrip(chars))
        listings[i] = listings[i].astype(str).map(lambda x: x.lstrip(chars))
        listings[i] = listings[i].apply(pd.to_numeric, errors='coerce')
        listings[i].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    return listings        

In [4]:
listings = parse_columns(listings_original, ['host_response_rate', 'cleaning_fee',
                                     'host_acceptance_rate','extra_people',
                                     'weekly_price', 'monthly_price', 'security_deposit'])

In [5]:
#function to produce 4 listings dataframes (whole, holiday, wke, wkd) with listing mean price
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

def get_mean_price(cal, listings):
    
    cal['price'] = cal['price'].astype(str).map(lambda x: x.lstrip('$'))
    cal['price'] = cal['price'].apply(pd.to_numeric, errors='coerce')
    cal['price'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    cal['date'] = pd.to_datetime(cal['date'])
    cal['month'] = cal['date'].apply(lambda x: x.month)
    cal['day'] = cal['date'].apply(lambda x: x.day)
    cal['day_of_week'] = cal['date'].dt.weekday_name
    
    cl = calendar()
    holidays = cl.holidays(start=cal['date'].min(), end=cal['date'].max())
    
    cal['holiday'] = cal['date'].isin(holidays)
    cal = cal[(cal['date']>'2016-07-06')&(cal['date']<'2016-10-06')]
    
    c = cal.loc[cal.available!='f']
    c = c[['listing_id','date','price','month','day_of_week','holiday']]
    c=c.fillna(c.mean())
    
    c_hol = c[c['holiday']==True]
    c_wke = c[(c['holiday']==False)&((c['day_of_week']=='Sunday')|(c['day_of_week']=='Saturday'))]
    c_wkd = c[(~c.isin(c_hol['date']))&(~c.isin(c_wke['date']))]


    price_hol_dict = {'price': c_hol.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_hol.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_hol.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_hol.groupby(by='listing_id')['price'].fillna(0).std(),                 
                  'skew_of_price': c_hol.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_hol.groupby(by='listing_id')['price'].median()}


    price_wke_dict = {'price': c_wke.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_wke.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_wke.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_wke.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c_wke.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_wke.groupby(by='listing_id')['price'].median()}


    price_wkd_dict = {'price': c_wkd.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_wkd.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_wkd.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_wkd.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c_wkd.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_wkd.groupby(by='listing_id')['price'].median()}


    price_whole_dict = {'price': c.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c.groupby(by='listing_id')['price'].max(), 
                  'min_price': c.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c.groupby(by='listing_id')['price'].fillna(0).skew(),
                       'median_price': c.groupby(by='listing_id')['price'].median()}



    price_hol = pd.DataFrame(price_hol_dict)
    price_wke = pd.DataFrame(price_wke_dict)
    price_wkd = pd.DataFrame(price_wkd_dict)
    price_c = pd.DataFrame(price_whole_dict)    
    
    price_hol = price_hol.reset_index()
    price_wke = price_wke.reset_index()
    price_wkd = price_wkd.reset_index()
    price_c = price_c.reset_index()

    listings_hol = listings.merge(price_hol, how='inner', left_on='id', right_on='listing_id')
    listings_wke = listings.merge(price_wke, how='inner', left_on='id', right_on='listing_id')
    listings_wkd = listings.merge(price_wkd, how='inner', left_on='id', right_on='listing_id')
    listings_c = listings.merge(price_c, how='inner', left_on='id', right_on='listing_id')
    
    price_hol_new = price_hol.rename(columns = {'max_price': 'hol_max_price', 'min_price': 'hol_min_price', 'price': 'hol_price',
                                           'skew_of_price': 'hol_skew_of_price', 'stdev_of_price': 'hol_stdev_of_price',
                                               'median_price' : 'hol_median_price'})
    price_wke_new = price_wke.rename(columns = {'max_price': 'wke_max_price', 'min_price': 'wke_min_price', 'price': 'wke_price',
                                           'skew_of_price': 'wke_skew_of_price', 'stdev_of_price': 'wke_stdev_of_price',
                                               'median_price' : 'wke_median_price'})
    price_wkd_new = price_wkd.rename(columns = {'max_price': 'wkd_max_price', 'min_price': 'wkd_min_price', 'price': 'wkd_price',
                                           'skew_of_price': 'wkd_skew_of_price', 'stdev_of_price': 'wkd_stdev_of_price',
                                               'median_price' : 'wkd_median_price'})
    
    listings_c = listings_c.merge(price_hol_new, how='outer', left_on='id', right_on='listing_id')
    listings_c = listings_c.merge(price_wke_new, how='outer', left_on='id', right_on='listing_id')
    listings_c = listings_c.merge(price_wkd_new, how='outer', left_on='id', right_on='listing_id')
    
    L_hol = ['hol_max_price', 'hol_min_price', 'hol_price', 'hol_skew_of_price', 'hol_stdev_of_price', 'hol_median_price']
    L_wke = ['wke_max_price', 'wke_min_price', 'wke_price', 'wke_skew_of_price', 'wke_stdev_of_price', 'wke_median_price']
    L_wkd = ['wkd_max_price', 'wkd_min_price', 'wkd_price', 'wkd_skew_of_price', 'wkd_stdev_of_price', 'wkd_median_price']
    
    listings_c[L_hol + L_wke + L_wkd] = listings_c[L_hol + L_wke + L_wkd].fillna(0)
    listings_c = listings_c.drop(['listing_id_y'], axis = 1)
    listings_c['listing_id_x'] = listings_c['listing_id_x'].fillna(0)
    
    #len(cal['listing_id'].astype(str).unique())
    #count = len(c['listing_id'].astype(str).unique())
    
    #print('Due to the above filtering on calendar, the right total count of listings is: ' %(count))
    
    return listings_hol, listings_wke, listings_wkd, listings_c

In [6]:
listings_hol, listings_wke, listings_wkd, listings = get_mean_price(calendar_original, listings)

In [7]:
import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
#use n components in place of n topics when using gridsearchcv
def create_topics(pdseries, listings):
        corpus = pdseries.fillna('none')
        
        vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )
        
        data_vectorized = vectorizer.fit_transform(corpus)
        
        lda_model = LatentDirichletAllocation(n_topics=20,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
        
        lda_output = lda_model.fit_transform(data_vectorized)

        # column names
        col_name = pd.DataFrame(pdseries).columns[0]
        topicnames = [str(col_name) + "-" + "Topic" + str(i) for i in range(lda_model.n_topics)]

        # index names
        docnames = [str(col_name) + "-" + "Doc" + str(i) for i in range(len(corpus))]

        # Make the pandas dataframe
        df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

        # Get dominant topic for each document
        dominant_topic = np.argmax(df_document_topic.values, axis=1)
        df_document_topic[str(col_name) + "-" + 'Dominant_Topic'] = dominant_topic
        
        df_document_topic.index = [i for i in range(len(df_document_topic))]
        
        df_document_topic = df_document_topic.fillna(0)
        
        out = df_document_topic.merge(listings, left_index=True, right_index=True)
        out = out.astype('str')
        return out

In [10]:
#removing experiences offered as column was all nulls
"""text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']"""
#5/23/18
#using only description:
text_features = ['description']
new = listings.copy()
for i in text_features:
    new = create_topics(listings[i], new)



In [11]:
import nltk
from nltk.corpus import stopwords
import re

def create_txt_features(pdseries, listings):
    
    textLength = []
    textWordsPerc = []
    textPuncPerc = []
    textDigitsPerc = []

    for i in pdseries:
        tokens = re.findall(r"[\w']+|[.,!?;]", i)
        textLength.append(len(tokens))

        if len(tokens)==0:
            textWordsPerc.append(0)
            textPuncPerc.append(0)
            textDigitsPerc.append(0)

        else:
            textWordsPerc.append(len(i.split())/float(len(tokens)))
            textPuncPerc.append(len(''.join(c for c in i if c in string.punctuation))/float(len(tokens)))
            textDigitsPerc.append(len(''.join(c for c in i if c in string.digits))/float(len(tokens)))

    col_name = pd.DataFrame(pdseries).columns[0]
    
    textLength_varname = str(col_name) + '_TextLength'
    textWordsPerc_varname = str(col_name) + '_TextWordsPerc'
    textPuncPerc_varname = str(col_name) + '_TextPuncPerc'
    textDigitsPerc_varname = str(col_name) + '_TextDigitsPerc'
    
    listings[textLength_varname] = textLength
    listings[textWordsPerc_varname] = textWordsPerc
    listings[textPuncPerc_varname] = textPuncPerc
    listings[textDigitsPerc_varname] = textDigitsPerc
    
    return listings

In [12]:
new2 = new.copy()
for i in text_features:
    new2 = create_txt_features(new[i], new2)

In [13]:
def lexical_diversity(pdseries, listings):
    
    col_name = pd.DataFrame(pdseries).columns[0]
    varname = str(col_name) + "_LexicalDiversity"
    
    lx_div = pd.Series([len(i)/len(set(i)) for i in pdseries])
    listings[varname] = lx_div
    
    return listings

In [14]:
new3 = new2.copy()
for i in text_features:
    new3 = lexical_diversity(new2[i], new3)

In [15]:
def extract_grammar(pdseries, listings):
    
    import nltk
    from nltk.tag import pos_tag, map_tag
    from collections import Counter
      
    df = pd.DataFrame()
    for text in pdseries:
        
        col_name = pd.DataFrame(pdseries).columns[0]
        
        
        tokenized_text = nltk.word_tokenize(text.decode('utf-8'))
        grammar = [i[1] for i in nltk.pos_tag(tokenized_text, tagset='universal')]
        
        counter = Counter(grammar)
        fr = pd.DataFrame(counter, index=[0])
        fr.columns = [str(col_name) + '_' + str(i) for i in fr.columns]
        
        fr2 = fr/len(tokenized_text)
        fr2.columns = [str(i) + '_tokens_sum_ratio' for i in fr2.columns]
        
        fr3 = pd.concat([fr, fr2], ignore_index=True)
        
        df = pd.concat([df, fr3], ignore_index=True)
        
        
        
    df = df.fillna(0)
        
    return listings.merge(df, left_index=True, right_index=True)

In [16]:
new4 = new3.copy()
for i in text_features:
    new4 = extract_grammar(new3[i], new4)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def kmeans_Clusterer(pdseries, listings):
    
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(pdseries)
    true_k = 10
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)
    
    col_name = pd.DataFrame(pdseries).columns[0]
    varname = str(col_name) + "_KmeansCluster"
    
    listings[varname] = pd.Series(model.labels_)
    listings[varname] = listings[varname].fillna(0)
    
    return listings

In [18]:
new5 = new4.copy()
for i in text_features:
    new5 = kmeans_Clusterer(new4[i], new5)

Amenities

In [19]:
def string_to_set(x):
    c = set()
    for w in x[1:-1].split(","):
        c.add(w)
        
    return c

def has_amenity(x, amen_):
    if amen_ in x:
        return 1
    pass

def boolean_convert(z):
    if z:
        return 1
    else:
        return 0

In [20]:
def add_amenities(listings):
    
    a = listings['amenities'].fillna('{}')
    listings['amenities_set'] = a.map(string_to_set)
    all_amenities = set()
    
    for idx in listings['amenities_set'].index:
        all_amenities = all_amenities.union(listings['amenities_set'][idx])
        
    b = set(['', '"translation missing: en.hosting_amenity_49"','"translation missing: en.hosting_amenity_50"'])
    
    all_amenities = all_amenities.difference(b)
    
    for amen in all_amenities:
        
        b = listings['amenities_set'].map(lambda x: amen in x).map(boolean_convert)
        
        if len(amen.split(' ')) == 1:
            listings['has_' + amen] = b
            continue
            
        if "" in amen:
            amen = amen[1:-1].replace(' ', '_')
            
        listings['has_' + amen] = b
    
    return listings      

In [21]:
new6 = new5.copy()

In [22]:
new6 = add_amenities(new6)

In [23]:
def add_host_verifications(listings):
    a = listings['host_verifications'].map(lambda x: x[1:-1]).map(lambda j: j.split(',')).map(lambda k: set(k))
    all_host_verifications = set()
    
    for w in a.index:
        all_host_verifications = all_host_verifications.union(a[w])
    
    for w in all_host_verifications:
        
        b = a.map(lambda x: w in x).map(boolean_convert)
        
        if '' in w:
            w = w.strip()[1:-1].replace(' ', '_')
            
        listings['uses_' + w] = b
    
    return listings      

In [24]:
new7 = new6.copy()
new7 = add_host_verifications(new7)

In [25]:
#def new_add_distance_from_ocean(listings):
    #lst2 = []
    #for i,k in zip(new7['latitude'],new7['longitude']):
        #lon_diff = (float(k) + 117.235585)*np.pi/180
        #lat_diff = (float(i) - 32.802458)*np.pi/180
        #a = np.sin(lat_diff/2)**2 + np.cos(float(i)*np.pi/180)*np.cos(32.802458*np.pi/180)*(np.sin(lon_diff/2)**2)
        #c = np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        #d = 6371.00*float(c)
        #lst2.append(d)
        
    #listings['distance_from_ocean'] = lst2
    
    #return listings

In [26]:
contour_list = [(-117.12542727272728, 32.50603636363636),(-117.1239, 32.52068181818182),(-117.12237272727273, 32.53532727272727),(-117.12785, 32.550577272727274),
 (-117.13332727272727, 32.565827272727276),(-117.14155757575757, 32.58140454545455),(-117.14978787878788, 32.59698181818182),(-117.15801818181818, 32.61255909090909),
 (-117.14722727272728, 32.6186),(-117.13142727272728, 32.61995454545455), (-117.11562727272728, 32.621309090909094),(-117.16624848484848, 32.628136363636365),(-117.11614090909092, 32.6319),
 (-117.15394090909092, 32.632684090909095),(-117.11665454545455, 32.64249090909091),(-117.17447878787878, 32.643713636363636),(-117.16065454545455, 32.64676818181818),(-117.12490000000001, 32.6552),
 (-117.2549909090909, 32.65713636363636),(-117.18270909090909, 32.659290909090906), (-117.16736818181818, 32.66085227272727),(-117.24204545454546, 32.66253636363636),
 (-117.19277575757576, 32.66720303030303),(-117.13314545454546, 32.66790909090909),(-117.26173636363637, 32.67278181818182),(-117.24129545454545, 32.673590909090905),
 (-117.17408181818182, 32.67493636363636),(-117.20284242424242, 32.67511515151515),(-117.14139090909092, 32.68061818181818), (-117.2129090909091, 32.68302727272727),(-117.24054545454545, 32.68464545454545),
 (-117.18729545454545, 32.686263636363634), (-117.26848181818183, 32.688427272727274), (-117.15354363636364, 32.690034545454544),
 (-117.21238181818183, 32.69705454545455), (-117.2005090909091, 32.69759090909091), (-117.26729545454546, 32.698513636363636),
 (-117.16569636363637, 32.699450909090906), (-117.23467727272728, 32.700131818181816), (-117.2661090909091, 32.7086),
 (-117.17784909090909, 32.708867272727275), (-117.2288090909091, 32.71561818181818), (-117.19000181818183, 32.71828363636364),
 (-117.21548181818181, 32.721659090909085), (-117.26427272727273, 32.72405454545454), (-117.20215454545455, 32.7277), (-117.26243636363637, 32.73950909090909),
 (-117.26060000000001, 32.754963636363634), (-117.25876363636364, 32.77041818181818), (-117.26415909090909, 32.78511818181818),
 (-117.26955454545455, 32.79981818181818), (-117.27495, 32.81451818181819), (-117.28034545454545, 32.829218181818185), (-117.27120303030303, 32.83975454545455),
 (-117.26206060606061, 32.85029090909091), (-117.25291818181819, 32.86082727272727), (-117.24906363636364, 32.87129090909091),
 (-117.24953181818182, 32.88050454545454), (-117.25, 32.88971818181818), (-117.25397272727272, 32.90718909090909), (-117.25794545454545, 32.92466),
 (-117.26191818181819, 32.942130909090906), (-117.26589090909091, 32.95960181818182), (-117.26986363636364, 32.97707272727273),
 (-117.27609772727273, 32.993097727272726), (-117.28233181818182, 33.009122727272725), (-117.2885659090909, 33.025147727272724),
 (-117.29480000000001, 33.04117272727272), (-117.3010340909091, 33.05719772727273), (-117.30726818181819, 33.07322272727273), (-117.31350227272728, 33.08924772727273)]

In [27]:
def search_for_min_index(L, x):
    B = []
    count = 0
    for i, j in L:
        J = float(j)
        X = float(x)
        b = abs(J - X)
        B.append((count, b))
        count = count + 1
        
    B = sorted(B, key = lambda x: x[1])
    return (B[0][0], B[1][0])

In [28]:
def new_add_distance_from_ocean(listings):
    lst2 = []
    for i,k in zip(new7['latitude'],new7['longitude']):
        j_one, j_two = search_for_min_index(contour_list, i)
        val_one_i, val_one_ii = contour_list[j_one]
        val_two_i, val_two_ii = contour_list[j_two]
        val_ii_max = max(val_one_ii, val_two_ii)
        val_ii_min = min(val_one_ii, val_two_ii)
        
        #rat = (i - val_ii_min)/val_ii_max
        #val_i_max = max(val_one_i, val_two_i)
        #val_i_min = min(val_one_i, val_two_i)
        #val_i = val_i_min*rat + val_i_max*(1 - rat)
        
        lon_diff = (float(k) - val_one_i)*np.pi/180
        lat_diff = (float(i) - val_one_ii)*np.pi/180
        a = np.sin(lat_diff/2)**2 + np.cos(float(i)*np.pi/180)*np.cos(val_one_ii*np.pi/180)*(np.sin(lon_diff/2)**2)
        c_one = np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        d_one = 6371.00*float(c_one)
        
        lon_diff = (float(k) - val_two_i)*np.pi/180
        lat_diff = (float(i) - val_two_ii)*np.pi/180
        a = np.sin(lat_diff/2)**2 + np.cos(float(i)*np.pi/180)*np.cos(val_two_ii*np.pi/180)*(np.sin(lon_diff/2)**2)
        c_two = np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        d_two = 6371.00*float(c_two)
        
        #lon_diff = (float(k) - val_i)*np.pi/180
        #a = (np.cos(float(i)*np.pi/180)**2)*(np.sin(lon_diff/2)**2)
        #c_three = np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        #d_three = 6371.00*float(c_three)
        #d = min(d_one, d_two, d_three)
        
        d = min(d_one, d_two)
        lst2.append(d)
        
    listings['distance_from_ocean'] = lst2
    
    return listings

In [29]:
new8 = new7.copy()
new8 = new_add_distance_from_ocean(new8)

In [30]:
def street_zipcode_parse(x):
    st = ''
    
    if 'Mexico' in x:
            return '00000.0'
    
    for w in x:
        try:
            a = int(w)
        except ValueError:
            continue
        
        st = st + w
        
    return (st + '.0')

In [31]:
#Getting the cleansed zipcode
new8['zipcode_cleansed'] = new8['street'].map(street_zipcode_parse).map(lambda x: x[-7:]).astype('float')

In [32]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [33]:
def encoder(listings, encoded_features):
    
    label_enc = LabelEncoder()
    
    for col in encoded_features:
        
        listings[col] = listings[col].astype(str)
        
        var_name = str(col) + '_enc'
        listings[var_name] = label_enc.fit_transform(listings[col])
    
    return listings

In [34]:
encoded_vars = ['host_response_time', 'calendar_updated', 'bed_type', 'jurisdiction_names', 'zipcode',
               'cancellation_policy', 'zipcode_cleansed']

In [35]:
new9 = new8.copy()
new9 = encoder(new9, encoded_vars)

In [36]:
#Caution!!! The input features are not dropped by the following to columns - they must be dropped as part of modeling
#5/23/18 the unwanted columns are detected and dropped in the featureExplorastio notebook

In [37]:
def binarizer(listings, binarized_features):
    
    label_enc = LabelBinarizer()
    
    for col in binarized_features:
        
        listings[col] = listings[col].astype(str)
        
        var_name = str(col) + '_bin'
        listings[var_name] = label_enc.fit_transform(listings[col])
    
    return listings

In [38]:
binarized_vars = ['host_is_superhost','is_location_exact','host_has_profile_pic','host_identity_verified',
                  'instant_bookable','require_guest_profile_picture','require_guest_phone_verification']

In [39]:
new10 = new9.copy()
new10 = binarizer(new10, binarized_vars)

In [40]:
#takes list of features that should be numeric and transforms them to float
#Also takes care of the topic features - these need not be input into the features parameter
def make_numeric(listings):
    #Taking Care of topics features
    topic_cols = listings.filter(regex='Topic').columns
    listings[topic_cols] = listings[topic_cols].astype(float)
    
    return listings

In [41]:
new11 = new10.copy()
new11 = make_numeric(new11)

In [42]:
new11 = new11.rename(columns = {'listing_id_x': 'id'})

In [43]:
from collections import defaultdict

In [44]:
col_counts = defaultdict(int)
col_ix = new11.first_valid_index()

In [45]:
cols = []
for col in new11.ix[col_ix].index:
    cnt = col_counts[col]
    col_counts[col] += 1
    suf = '_' + str(cnt) if cnt else ''
    cols.append(col + suf)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [46]:
new11.columns = cols
new11 = new11.drop([col_ix])

In [47]:
#Keep the below line just in case
#new11 = new11.drop(columns= ['id'])

In [48]:
num_features = ['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 
               'beds', 'guests_included', 'minimum_nights',
               'maximum_nights', 'availability_30', 'availability_60','availability_90',
               'availability_365', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
               'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 
               'reviews_per_month', 'max_price','median_price','min_price','price_y','skew_of_price',
                'stdev_of_price','hol_max_price','hol_median_price','hol_min_price','hol_price',
                'hol_skew_of_price','hol_stdev_of_price','wke_max_price','wke_median_price',
                'wke_min_price','wke_price','wke_skew_of_price','wke_stdev_of_price','wkd_max_price',
                'wkd_median_price','wkd_min_price','wkd_price','wkd_skew_of_price','wkd_stdev_of_price', 'id',
               'host_listings_count', 'host_total_listings_count']

In [49]:
new11 = parse_columns(new11,num_features)

In [50]:
events = pd.read_csv('../Datasources/listings_events/listings_events_2018-05-29_V1.csv')
parks = pd.read_csv('../Datasources/listings_parks/listings_parks_2018-05-29_V1.csv')
#add businesses file

In [51]:
def take_equality(x):
    if x == 0:
        return True
    else:
        return False

In [52]:
events['all_events_within_1_km_at_min_distance'] = (events['event_count_1km'] - events['count_of_events_at_min_distance']).map(take_equality)

In [53]:
events['all_events_within_3_km_at_min_distance'] = (events['event_count_3km'] - events['count_of_events_at_min_distance']).map(take_equality)
events['all_events_within_5_km_at_min_distance'] = (events['event_count_5km'] - events['count_of_events_at_min_distance']).map(take_equality)

In [54]:
new11 = new11.merge(events, how = 'inner', left_on = 'id', right_on = 'listing_id')
new11 = new11.merge(parks, how = 'inner', left_on = 'id', right_on = 'listing_id')

In [55]:
new_enc_vars = ['closest_park_full_name', 'park_type']

In [56]:
#EVENTS HAS WAY TOO MANY ROWS -- FIX
events.shape

(6608, 9)

In [57]:
parks.shape

(6608, 8)

In [58]:
new12 = new11.copy()
new12 = encoder(new12, new_enc_vars)

In [59]:
new_bin_vars = ['all_events_within_1_km_at_min_distance', 'all_events_within_3_km_at_min_distance', 'all_events_within_5_km_at_min_distance']

In [60]:
new12 = binarizer(new12, new_bin_vars)

In [61]:
import datetime

In [62]:
def datetime_strip(x):
    x_date = datetime.datetime.strptime(x, "%Y-%m-%d")
    return x_date

In [63]:
def get_days(b):
    return b.days

In [64]:
def add_days_since_host(listings):
    listings['days_since_host'] = (listings['last_scraped'].map(datetime_strip) - listings['host_since'].map(datetime_strip)).map(get_days)
    return listings

In [65]:
new13 = new12.copy()
new13 = add_days_since_host(new13)

In [66]:
def calendar_update_parse(x):
    if x == 'today':
        return 0
    
    elif x == 'yesterday':
        return 1    
    
    else:        
        b = x.split(' ')
        
        if 'days' in b:
            a = int(b[0])
            return a
        
        elif 'week' in b:
            return 7
        
        elif 'weeks' in b:
            a = 7*int(b[0])
            return a
        
        elif 'month' in b:
            a = 365.25/12.0
            return np.floor(a)
        
        elif 'months' in b:
            a = (365.25*float(b[0]))/12.0
            return np.floor(a)
    pass
            

In [67]:
def add_calendar_updated_cleaned(listings):
    listings["calendar_updated_numeric"] = listings["calendar_updated"].map(calendar_update_parse)
    listings["calendar_updated_numeric"] = listings["calendar_updated_numeric"].fillna(listings["calendar_updated_numeric"].max())
    return listings

In [68]:
new14 = new13.copy()
new14 = add_calendar_updated_cleaned(new14)

In [69]:
high_quality_amenities_list = ['has_Other_pet(s)', 'has_Elevator_in_Building', 'has_Indoor_Fireplace', 'has_Buzzer/Wireless_Intercom',
 'has_Gym', 'has_Hot_Tub', 'has_Suitable_for_Events', 'has_Cat(s)', 'has_Pets_live_on_this_property', 'has_Safety_Card','has_Smoking_Allowed',
 'has_Pool', 'has_Pets_Allowed', 'has_Wheelchair_Accessible', 'has_Dog(s)', 'has_Breakfast', 'has_Doorman', 'has_Lock_on_Bedroom_Door']

In [70]:
mid_quality_amenities_list = ['has_Fire_Extinguisher', 'has_Cable_TV', 'has_Air_Conditioning', 'has_Hair_Dryer',
 'has_Iron', 'has_First_Aid_Kit', 'has_Laptop_Friendly_Workspace', 'has_24-Hour_Check-in']

In [71]:
low_quality_amenities_list = ['has_Essentials', 'has_Carbon_Monoxide_Detector', 'has_Internet', 'has_Washer', 'has_Hangers',
 'has_TV', 'has_Kitchen', 'has_Family/Kid_Friendly', 'has_Shampoo', 'has_Heating', 'has_Smoke_Detector', 'has_Free_Parking_on_Premises',
 'has_Dryer', 'has_Wireless_Internet']

In [72]:
def listings_quality_classification(listings):
    listings['amenity_level'] = ''
        
    for w in listings.index:
        
        for j in high_quality_amenities_list:
            a = listings[j][w]
            
            if a == 1:
                listings['amenity_level'][w] = 'high'
                break 
                
        if listings['amenity_level'][w] == 'high':
            continue
                
        for k in mid_quality_amenities_list:
            a = listings[k][w]
            
            if a == 1:
                listings['amenity_level'][w] = 'mid_level'
                break
                
        if listings['amenity_level'][w] == 'mid_level':
            continue
            
        else:
            listings['amenity_level'][w] = 'low'
            
    return listings

In [73]:
new15 = new14.copy()
new15 = listings_quality_classification(new15)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [74]:
new_make_num = ['uses_phone', 'uses_manual_online','has_Pets_Allowed','has_Wheelchair_Accessible',
               'has_First_Aid_Kit', 'has_Cat(s)','has_Pets_Allowed', 'has_24-Hour_Check-in',
               ]

In [75]:
new15 = parse_columns(new15,new_make_num)

In [76]:
new_enc = ['amenity_level']

In [77]:
new16 = new15.copy()
new16 = encoder(new16, new_enc)

In [78]:
print "Listings Shape at Each Iteration"
print listings_original.shape
print listings.shape
print new.shape
print new2.shape
print new3.shape
print new4.shape
print new5.shape
print new6.shape
print new7.shape
print new8.shape
print new9.shape
print new10.shape
print new11.shape
print new12.shape
print new13.shape
print new14.shape
print new15.shape
print new16.shape

Listings Shape at Each Iteration
(6608, 95)
(5753, 121)
(5753, 142)
(5753, 146)
(5753, 147)
(5753, 171)
(5753, 172)
(5753, 213)
(5753, 225)
(5753, 227)
(5753, 234)
(5753, 241)
(5752, 258)
(5752, 263)
(5752, 264)
(5752, 265)
(5752, 266)
(5752, 267)


In [79]:
#Make sure to uncomment and update the count variable whenever needed

In [82]:
count = 0

In [83]:
import datetime
today = datetime.date.today()
count+=1
filename = '../Datasources/listings_augmented/listings_augmented_' + str(today) + '_V' + str(count) + '.csv'

In [84]:
print filename

Datasources/listings_augmented/listings_augmented_2018-06-03_V1.csv


In [85]:
new16.to_csv(filename, index=False)