Data Preparation Notebook Consolidating All Data Prep code into linear process

In [1]:
import json
from StringIO import StringIO
import pandas as pd

In [2]:
listings_original = pd.read_csv('Datasources/inside_airbnb/listings.csv')
calendar_original = pd.read_csv('Datasources/inside_airbnb/calendar.csv')

In [3]:
def parse_columns(listings, cols):
    chars = "%$"
    for i in cols:
        listings[i] = listings[i].astype(str).map(lambda x: x.rstrip(chars))
        listings[i] = listings[i].astype(str).map(lambda x: x.lstrip(chars))
        listings[i] = listings[i].apply(pd.to_numeric, errors='coerce')
        listings[i].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    return listings        

In [4]:
listings = parse_columns(listings_original, ['host_response_rate', 'cleaning_fee',
                                     'host_acceptance_rate','extra_people',
                                     'weekly_price', 'monthly_price', 'security_deposit'])

In [5]:
#function to produce 4 listings dataframes (whole, holiday, wke, wkd) with listing mean price
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

def get_mean_price(cal, listings):
    
    cal['price'] = cal['price'].astype(str).map(lambda x: x.lstrip('$'))
    cal['price'] = cal['price'].apply(pd.to_numeric, errors='coerce')
    cal['price'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    cal['date'] = pd.to_datetime(cal['date'])
    cal['month'] = cal['date'].apply(lambda x: x.month)
    cal['day'] = cal['date'].apply(lambda x: x.day)
    cal['day_of_week'] = cal['date'].dt.weekday_name
    
    cl = calendar()
    holidays = cl.holidays(start=cal['date'].min(), end=cal['date'].max())
    
    cal['holiday'] = cal['date'].isin(holidays)
    cal = cal[(cal['date']>'2016-07-06')&(cal['date']<'2016-10-06')]
    
    c = cal.loc[cal.available!='f']
    c = c[['listing_id','date','price','month','day_of_week','holiday']]
    c=c.fillna(c.mean())
    
    c_hol = c[c['holiday']==True]
    c_wke = c[(c['holiday']==False)&((c['day_of_week']=='Sunday')|(c['day_of_week']=='Saturday'))]
    c_wkd = c[(~c.isin(c_hol['date']))&(~c.isin(c_wke['date']))]


    price_hol_dict = {'price': c_hol.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_hol.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_hol.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_hol.groupby(by='listing_id')['price'].fillna(0).std(),                 
                  'skew_of_price': c_hol.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_hol.groupby(by='listing_id')['price'].median()}


    price_wke_dict = {'price': c_wke.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_wke.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_wke.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_wke.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c_wke.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_wke.groupby(by='listing_id')['price'].median()}


    price_wkd_dict = {'price': c_wkd.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c_wkd.groupby(by='listing_id')['price'].max(), 
                  'min_price': c_wkd.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c_wkd.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c_wkd.groupby(by='listing_id')['price'].fillna(0).skew(),
                     'median_price': c_wkd.groupby(by='listing_id')['price'].median()}


    price_whole_dict = {'price': c.groupby(by='listing_id')['price'].mean(), 
                  'max_price': c.groupby(by='listing_id')['price'].max(), 
                  'min_price': c.groupby(by='listing_id')['price'].min(), 
                  'stdev_of_price': c.groupby(by='listing_id')['price'].fillna(0).std(),
                  'skew_of_price': c.groupby(by='listing_id')['price'].fillna(0).skew(),
                       'median_price': c.groupby(by='listing_id')['price'].median()}



    price_hol = pd.DataFrame(price_hol_dict)
    price_wke = pd.DataFrame(price_wke_dict)
    price_wkd = pd.DataFrame(price_wkd_dict)
    price_c = pd.DataFrame(price_whole_dict)    
    
    price_hol = price_hol.reset_index()
    price_wke = price_wke.reset_index()
    price_wkd = price_wkd.reset_index()
    price_c = price_c.reset_index()

    listings_hol = listings.merge(price_hol, how='inner', left_on='id', right_on='listing_id')
    listings_wke = listings.merge(price_wke, how='inner', left_on='id', right_on='listing_id')
    listings_wkd = listings.merge(price_wkd, how='inner', left_on='id', right_on='listing_id')
    listings_c = listings.merge(price_c, how='inner', left_on='id', right_on='listing_id')
    
    price_hol_new = price_hol.rename(columns = {'max_price': 'hol_max_price', 'min_price': 'hol_min_price', 'price': 'hol_price',
                                           'skew_of_price': 'hol_skew_of_price', 'stdev_of_price': 'hol_stdev_of_price',
                                               'median_price' : 'hol_median_price'})
    price_wke_new = price_wke.rename(columns = {'max_price': 'wke_max_price', 'min_price': 'wke_min_price', 'price': 'wke_price',
                                           'skew_of_price': 'wke_skew_of_price', 'stdev_of_price': 'wke_stdev_of_price',
                                               'median_price' : 'wke_median_price'})
    price_wkd_new = price_wkd.rename(columns = {'max_price': 'wkd_max_price', 'min_price': 'wkd_min_price', 'price': 'wkd_price',
                                           'skew_of_price': 'wkd_skew_of_price', 'stdev_of_price': 'wkd_stdev_of_price',
                                               'median_price' : 'wkd_median_price'})
    
    listings_c = listings_c.merge(price_hol_new, how='outer', left_on='id', right_on='listing_id')
    listings_c = listings_c.merge(price_wke_new, how='outer', left_on='id', right_on='listing_id')
    listings_c = listings_c.merge(price_wkd_new, how='outer', left_on='id', right_on='listing_id')
    
    L_hol = ['hol_max_price', 'hol_min_price', 'hol_price', 'hol_skew_of_price', 'hol_stdev_of_price', 'hol_median_price']
    L_wke = ['wke_max_price', 'wke_min_price', 'wke_price', 'wke_skew_of_price', 'wke_stdev_of_price', 'wke_median_price']
    L_wkd = ['wkd_max_price', 'wkd_min_price', 'wkd_price', 'wkd_skew_of_price', 'wkd_stdev_of_price', 'wkd_median_price']
    
    listings_c[L_hol + L_wke + L_wkd] = listings_c[L_hol + L_wke + L_wkd].fillna(0)
    listings_c = listings_c.drop(['listing_id_y'], axis = 1)
    listings_c['listing_id_x'] = listings_c['listing_id_x'].fillna(0)
    
    #len(cal['listing_id'].astype(str).unique())
    #count = len(c['listing_id'].astype(str).unique())
    
    #print('Due to the above filtering on calendar, the right total count of listings is: ' %(count))
    
    return listings_hol, listings_wke, listings_wkd, listings_c

In [6]:
listings_hol, listings_wke, listings_wkd, listings = get_mean_price(calendar_original, listings)

In [7]:
import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
#use n components in place of n topics when using gridsearchcv
def create_topics(pdseries, listings):
        corpus = pdseries.fillna('none')
        
        vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )
        
        data_vectorized = vectorizer.fit_transform(corpus)
        
        lda_model = LatentDirichletAllocation(n_topics=20,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
        
        lda_output = lda_model.fit_transform(data_vectorized)

        # column names
        col_name = pd.DataFrame(pdseries).columns[0]
        topicnames = [str(col_name) + "-" + "Topic" + str(i) for i in range(lda_model.n_topics)]

        # index names
        docnames = [str(col_name) + "-" + "Doc" + str(i) for i in range(len(corpus))]

        # Make the pandas dataframe
        df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

        # Get dominant topic for each document
        dominant_topic = np.argmax(df_document_topic.values, axis=1)
        df_document_topic[str(col_name) + "-" + 'Dominant_Topic'] = dominant_topic
        
        df_document_topic.index = [i for i in range(len(df_document_topic))]
        
        df_document_topic = df_document_topic.fillna(0)
        
        out = df_document_topic.merge(listings, left_index=True, right_index=True)
        out = out.astype('str')
        return out

In [10]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new = listings.copy()
for i in text_features:
    new = create_topics(listings[i], new)



In [11]:
import nltk
from nltk.corpus import stopwords
import re

def create_txt_features(pdseries, listings):
    
    textLength = []
    textWordsPerc = []
    textPuncPerc = []
    textDigitsPerc = []

    for i in pdseries:
        tokens = re.findall(r"[\w']+|[.,!?;]", i)
        textLength.append(len(tokens))

        if len(tokens)==0:
            textWordsPerc.append(0)
            textPuncPerc.append(0)
            textDigitsPerc.append(0)

        else:
            textWordsPerc.append(len(i.split())/float(len(tokens)))
            textPuncPerc.append(len(''.join(c for c in i if c in string.punctuation))/float(len(tokens)))
            textDigitsPerc.append(len(''.join(c for c in i if c in string.digits))/float(len(tokens)))

    col_name = pd.DataFrame(pdseries).columns[0]
    
    textLength_varname = str(col_name) + '_TextLength'
    textWordsPerc_varname = str(col_name) + '_TextWordsPerc'
    textPuncPerc_varname = str(col_name) + '_TextPuncPerc'
    textDigitsPerc_varname = str(col_name) + '_TextDigitsPerc'
    
    listings[textLength_varname] = textLength
    listings[textWordsPerc_varname] = textWordsPerc
    listings[textPuncPerc_varname] = textPuncPerc
    listings[textDigitsPerc_varname] = textDigitsPerc
    
    return listings

In [12]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new2 = new.copy()
for i in text_features:
    new2 = create_txt_features(new[i], new2)

In [13]:
def lexical_diversity(pdseries, listings):
    
    col_name = pd.DataFrame(pdseries).columns[0]
    varname = str(col_name) + "_LexicalDiversity"
    
    lx_div = pd.Series([len(i)/len(set(i)) for i in pdseries])
    listings[varname] = lx_div
    
    return listings

In [14]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new3 = new2.copy()
for i in text_features:
    new3 = lexical_diversity(new2[i], new3)

In [15]:
def extract_grammar(pdseries, listings):
    
    import nltk
    from nltk.tag import pos_tag, map_tag
    from collections import Counter
      
    df = pd.DataFrame()
    for text in pdseries:
        
        col_name = pd.DataFrame(pdseries).columns[0]
        
        
        tokenized_text = nltk.word_tokenize(text.decode('utf-8'))
        grammar = [i[1] for i in nltk.pos_tag(tokenized_text, tagset='universal')]
        
        counter = Counter(grammar)
        fr = pd.DataFrame(counter, index=[0])
        fr.columns = [str(col_name) + '_' + str(i) for i in fr.columns]
        
        fr2 = fr/len(tokenized_text)
        fr2.columns = [str(i) + '_tokens_sum_ratio' for i in fr2.columns]
        
        fr3 = pd.concat([fr, fr2], ignore_index=True)
        
        df = pd.concat([df, fr3], ignore_index=True)
        
        
        
    df = df.fillna(0)
        
    return listings.merge(df, left_index=True, right_index=True)

In [16]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new4 = new3.copy()
for i in text_features:
    new4 = extract_grammar(new3[i], new4)

LookupError: 
**********************************************************************
  Resource u'taggers/universal_tagset/en-ptb.map' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - 'C:\\Users\\sanka/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\Users\\sanka\\Anaconda2\\nltk_data'
    - 'C:\\Users\\sanka\\Anaconda2\\lib\\nltk_data'
    - 'C:\\Users\\sanka\\AppData\\Roaming\\nltk_data'
    - u''
**********************************************************************

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def kmeans_Clusterer(pdseries, listings):
    
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(pdseries)
    true_k = 10
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)
    
    col_name = pd.DataFrame(pdseries).columns[0]
    varname = str(col_name) + "_KmeansCluster"
    
    listings[varname] = pd.Series(model.labels_)
    listings[varname] = listings[varname].fillna(0)
    
    return listings

In [18]:
#removing experiences offered as column was all nulls
text_features = ['space', 'description', 
                 'neighborhood_overview', 'notes', 'transit', 
                 'access', 'interaction', 'house_rules']
new5 = new4.copy()
for i in text_features:
    new5 = kmeans_Clusterer(new4[i], new5)

Amenities

In [19]:
def string_to_set(x):
    c = set()
    for w in x[1:-1].split(","):
        c.add(w)
        
    return c

def has_amenity(x, amen_):
    if amen_ in x:
        return 1
    pass

In [20]:
def add_amenities(listings):
    listings['amenities_set'] = listings['amenities'].fillna('{}').map(string_to_set)
    all_amenities = set()
    
    for idx in listings['amenities'].fillna('{}').map(string_to_set).index:
        all_amenities = all_amenities.union(listings['amenities'].fillna('{}').map(string_to_set)[idx])
    
    for amen in all_amenities:
        
        if len(amen.split(' ')) == 1:
            listings['has_' + amen] = 0
            listings['has_' + amen] = listings['amenities_set'].map(lambda x: has_amenity(x, amen))
            continue
            
        if "" in amen:
            amen = amen[1:-1].replace(' ', '_')
            
        listings['has_' + amen] = 0
        listings['has_' + amen] = listings['amenities_set'].map(lambda x: has_amenity(x, amen))
        
    
    has_amenties_list = []
    for amen in all_amenities:
        
        if len(amen.split(' ')) == 1:
            has_amenties_list.append('has_' + amen)
            continue
            
        if "" in amen:
            amen = amen[1:-1].replace(' ', '_')
            
        has_amenties_list.append('has_' + amen)
        
    listings[has_amenties_list] = listings[has_amenties_list].fillna(0)
    
    return listings      

In [21]:
new6 = new5.copy()

In [22]:
new6 = add_amenities(new6)

In [23]:
def add_host_verifications(listings):
    a = listings['host_verifications'].map(lambda x: x[1:-1]).map(lambda j: j.split(',')).map(lambda k: set(k))
    all_host_verifications = set()
    
    for w in a.index:
        all_host_verifications = all_host_verifications.union(a[w])
        
    for w in all_host_verifications:
        
        if '' in w:
            w = w.strip()[1:-1].replace(' ', '_')
            
        listings['uses_' + w] = 0
        listings['uses_' + w] = a.map(lambda x: has_amenity(x, w))
        
    
    uses_verification_list = []
    for veri in all_host_verifications:
        
        if '' in veri:
            veri = veri.strip()[1:-1].replace(' ', '_')
            
        uses_verification_list.append('uses_' + veri)
        
    listings[uses_verification_list] = listings[uses_verification_list].fillna(0)
    
    return listings      

In [24]:
new7 = new6.copy()
new7 = add_host_verifications(new7)

In [25]:
def add_distance_from_ocean(listings):
    listings['distance_from_ocean'] = 0
    listings['distance_from_ocean'] = listings['distance_from_ocean'].astype('float')
    
    for w in listings.index:
        p = float(listings['latitude'][w])
        q = float(listings['longitude'][w])
        lon_diff = (q + 117.235585)*np.pi/180
        lat_diff = (p - 32.802458)*np.pi/180
        a = np.sin(lat_diff/2)**2 + np.cos(p*np.pi/180)*np.cos(32.802458*np.pi/180)*(np.sin(lon_diff/2)**2)
        c = np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        d = 6371.00*float(c)
        listings['distance_from_ocean'][w] = d
        
    return listings

In [26]:
new8 = new7.copy()
new8 = add_distance_from_ocean(new8)

In [27]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [28]:
def encoder(listings, encoded_features):
    
    label_enc = LabelEncoder()
    
    for col in encoded_features:
        
        listings[col] = listings[col].astype(str)
        
        var_name = str(col) + '_enc'
        listings[var_name] = label_enc.fit_transform(listings[col])
    
    return listings

In [29]:
encoded_vars = ['host_response_time', 'calendar_updated', 'bed_type', 'jurisdiction_names', 'zipcode']

In [30]:
new9 = new8.copy()
new9 = encoder(new9, encoded_vars)

In [31]:
#Caution!!! The input features are not dropped by the following to columns - they must be dropped as part of modeling

In [32]:
def binarizer(listings, binarized_features):
    
    label_enc = LabelBinarizer()
    
    for col in binarized_features:
        
        listings[col] = listings[col].astype(str)
        
        var_name = str(col) + '_bin'
        listings[var_name] = label_enc.fit_transform(listings[col])
    
    return listings

In [33]:
binarized_vars = ['host_is_superhost','is_location_exact','host_has_profile_pic','host_identity_verified',
                  'instant_bookable','require_guest_profile_picture','require_guest_phone_verification']

In [34]:
new10 = new9.copy()
new10 = binarizer(new10, binarized_vars)

In [35]:
#takes list of features that should be numeric and transforms them to float
#Also takes care of the topic features - these need not be input into the features parameter
def make_numeric(listings):
    #Taking Care of topics features
    topic_cols = listings.filter(regex='Topic').columns
    listings[topic_cols] = listings[topic_cols].astype(float)
    
    return listings

In [36]:
new11 = new10.copy()
new11 = make_numeric(new11)

In [37]:
new11 = new11.rename(columns = {'listing_id_x': 'id'})

In [38]:
from collections import defaultdict

In [39]:
col_counts = defaultdict(int)
col_ix = new11.first_valid_index()

In [40]:
cols = []
for col in new11.ix[col_ix].index:
    cnt = col_counts[col]
    col_counts[col] += 1
    suf = '_' + str(cnt) if cnt else ''
    cols.append(col + suf)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


In [41]:
new11.columns = cols
new11 = new11.drop([col_ix])

In [42]:
#Keep the below line just in case

#new11 = new11.drop(columns= ['id'])

In [43]:
num_features = ['latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 
               'beds', 'guests_included', 'minimum_nights',
               'maximum_nights', 'availability_30', 'availability_60','availability_90',
               'availability_365', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
               'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
               'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 
               'reviews_per_month', 'max_price','median_price','min_price','price_y','skew_of_price',
                'stdev_of_price','hol_max_price','hol_median_price','hol_min_price','hol_price',
                'hol_skew_of_price','hol_stdev_of_price','wke_max_price','wke_median_price',
                'wke_min_price','wke_price','wke_skew_of_price','wke_stdev_of_price','wkd_max_price',
                'wkd_median_price','wkd_min_price','wkd_price','wkd_skew_of_price','wkd_stdev_of_price', 'id']

In [44]:
new11 = parse_columns(new11,num_features)

In [45]:
events = pd.read_csv('listings_events_information_two.csv')
parks = pd.read_csv('listings_parks_information_two.csv')

In [46]:
new11 = new11.merge(events, how = 'inner', left_on = 'id', right_on = 'listing_id')
new11 = new11.merge(parks, how = 'inner', left_on = 'id', right_on = 'listing_id')

In [47]:
new11.columns

Index([u'house_rules-Topic0', u'house_rules-Topic1', u'house_rules-Topic2',
       u'house_rules-Topic3', u'house_rules-Topic4', u'house_rules-Topic5',
       u'house_rules-Topic6', u'house_rules-Topic7', u'house_rules-Topic8',
       u'house_rules-Topic9',
       ...
       u'events_within_16_km', u'distance_of_closest_park_x', u'Unnamed: 0_y',
       u'listing_id_y', u'parks_within_1_km', u'parks_within_3_km',
       u'parks_within_5_km', u'parks_within_10_km', u'parks_within_16_km',
       u'distance_of_closest_park_y'],
      dtype='object', length=422)

In [48]:
new11.loc[4325,:]

house_rules-Topic0                             0.16
house_rules-Topic1                             0.29
house_rules-Topic2                                0
house_rules-Topic3                                0
house_rules-Topic4                             0.05
house_rules-Topic5                             0.07
house_rules-Topic6                                0
house_rules-Topic7                             0.15
house_rules-Topic8                              0.1
house_rules-Topic9                                0
house_rules-Topic10                               0
house_rules-Topic11                            0.09
house_rules-Topic12                               0
house_rules-Topic13                               0
house_rules-Topic14                               0
house_rules-Topic15                               0
house_rules-Topic16                            0.06
house_rules-Topic17                               0
house_rules-Topic18                               0
house_rules-

In [49]:
new11

Unnamed: 0,house_rules-Topic0,house_rules-Topic1,house_rules-Topic2,house_rules-Topic3,house_rules-Topic4,house_rules-Topic5,house_rules-Topic6,house_rules-Topic7,house_rules-Topic8,house_rules-Topic9,...,events_within_16_km,distance_of_closest_park_x,Unnamed: 0_y,listing_id_y,parks_within_1_km,parks_within_3_km,parks_within_5_km,parks_within_10_km,parks_within_16_km,distance_of_closest_park_y
0,0.00,0.00,0.12,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,2426,3.499712,6594,7972006.0,0.0,0.0,14,113,196,3.176838
1,0.16,0.04,0.00,0.05,0.02,0.03,0.00,0.05,0.05,0.11,...,2254,4.552351,6607,13124681.0,0.0,0.0,5,86,169,4.063204
2,0.01,0.49,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,2208,5.131095,6596,3469225.0,0.0,0.0,1,76,165,4.460993
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.54,...,2208,5.107075,6581,877473.0,0.0,0.0,1,76,165,4.419178
4,0.01,0.59,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,2142,5.321442,6595,3124507.0,0.0,0.0,1,70,162,4.532876
5,0.02,0.35,0.02,0.02,0.02,0.35,0.02,0.02,0.02,0.02,...,2181,5.275917,6599,3432507.0,0.0,0.0,1,72,162,4.514606
6,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,2181,5.312318,6582,3249729.0,0.0,0.0,1,71,162,4.541011
7,0.99,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,2261,4.196094,6597,11756336.0,0.0,0.0,8,88,170,3.699495
8,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,2254,4.439086,6589,13157364.0,0.0,0.0,7,84,168,3.807714
9,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,2258,4.092862,6590,9756570.0,0.0,0.0,8,84,168,3.315470


In [50]:
#Make sure to uncomment and update the count variable whenever needed

In [51]:
count = 2

In [52]:
import datetime
today = datetime.date.today()
count+=1
filename = 'listings_augmented_' + str(today) + '_V' + str(count) + '.csv'

In [53]:
print filename

listings_augmented_2018-05-19_V3.csv


In [54]:
listings_augmented_2018_05_18_V3 = pd.read_csv('listings_augmented_2018-05-18_V3.csv')

In [55]:
L = []

for w in listings_augmented_2018_05_18_V3.columns:
    if w not in new11.columns:
        L.append(w)

In [56]:
K = L[:194]

In [57]:
new11 = new11.merge(listings_augmented_2018_05_18_V3[K], how = 'outer', left_on = 'id', right_on = 'listing_id_x_1')

In [58]:
new11

Unnamed: 0,house_rules-Topic0,house_rules-Topic1,house_rules-Topic2,house_rules-Topic3,house_rules-Topic4,house_rules-Topic5,house_rules-Topic6,house_rules-Topic7,house_rules-Topic8,house_rules-Topic9,...,house_rules_NUM,house_rules_NUM_tokens_sum_ratio,house_rules_PRON,house_rules_PRON_tokens_sum_ratio,house_rules_PRT,house_rules_PRT_tokens_sum_ratio,house_rules_VERB,house_rules_VERB_tokens_sum_ratio,house_rules_X,house_rules_X_tokens_sum_ratio
0,0.00,0.00,0.12,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000000,0.0,0.075758,0.0,0.015152,0.0,0.106061,0.0,0.000000
1,0.16,0.04,0.00,0.05,0.02,0.03,0.00,0.05,0.05,0.11,...,1.0,0.000000,0.0,0.000000,0.0,0.000000,6.0,0.000000,0.0,0.000000
2,0.01,0.49,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.0,0.050000,0.0,0.000000,0.0,0.000000,0.0,0.300000,0.0,0.000000
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.54,...,4.0,0.000000,3.0,0.000000,4.0,0.000000,20.0,0.000000,0.0,0.000000
4,0.01,0.59,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.0,0.041237,0.0,0.030928,0.0,0.041237,0.0,0.206186,0.0,0.000000
5,0.02,0.35,0.02,0.02,0.02,0.35,0.02,0.02,0.02,0.02,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
6,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
7,0.99,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,2.0,0.000000,12.0,0.000000,2.0,0.000000,17.0,0.000000,0.0,0.000000
8,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.0,0.024096,0.0,0.144578,0.0,0.024096,0.0,0.204819,0.0,0.000000
9,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000


In [59]:
new11.to_csv(filename)

In [60]:
print "Listings Shape at Each Iteration"
print listings_original.shape
print listings.shape
print new.shape
print new2.shape
print new3.shape
print new4.shape
print new5.shape
print new6.shape
print new7.shape
print new8.shape
print new9.shape
print new10.shape
print new11.shape

Listings Shape at Each Iteration
(6608, 95)
(5753, 121)
(5753, 289)
(5753, 321)
(5753, 329)
(5753, 329)
(5753, 337)
(5753, 381)
(5753, 393)
(5753, 394)
(5753, 399)
(5753, 406)
(5752, 616)
