In [92]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer

# Detailed Airbnb listing CSVs (one file per city) that the rest of the notebook processes.
# Paths are relative to the notebook's working directory.
airbnb_files = ['data/raw_data/Austin_listings.csv', 'data/raw_data/Boston_listings.csv', 'data/raw_data/Asheville_listings.csv']


In [205]:
def drop_airbnb_cols(filename):
    '''
        Loads a detailed airbnb listing file and removes the columns that are not used as features.

        input: a filename (anything pd.read_csv accepts) for detailed airbnb listing data
        output: a pandas dataframe with unnecessary columns dropped

        Columns from the drop list that a particular file does not contain are skipped
        instead of raising a KeyError, because the listing files do not all share the same columns.
    '''

    unused_cols = [
        'id', 'listing_url', 'scrape_id', 'last_scraped', 'thumbnail_url', 'medium_url', 'picture_url',
        'xl_picture_url', 'host_id', 'host_url', 'host_thumbnail_url', 'host_picture_url', 'calendar_last_scraped',
        'weekly_price', 'monthly_price', 'neighbourhood_cleansed', 'license', 'jurisdiction_names', 'square_feet',
        'neighbourhood', 'calculated_host_listings_count', 'first_review', 'last_review', 'country', 'country_code',
        'latitude', 'longitude', 'host_name', 'host_location', 'market', 'state', 'city', 'is_location_exact',
        'smart_location', 'has_availability', 'calendar_updated', 'host_listings_count', 'experiences_offered',
        'host_since',
    ]

    df = pd.read_csv(filename)

    # errors='ignore' lets one drop list serve every city's file even when some columns are missing;
    # returning the result (rather than inplace=True) keeps the freshly read frame untouched.
    return df.drop(unused_cols, axis=1, errors='ignore')


In [71]:
def get_col_names(files):
    '''
        Finds the column names shared by every listing file, after the unused columns are dropped.

        input: a list of detailed airbnb listing data files
        output: a list of the column names that occur in all of the files
    '''

    # tally, for each column name, how many of the files contain it
    seen_in = {}
    for path in files:
        for name in drop_airbnb_cols(path).columns:
            seen_in[name] = seen_in.get(name, 0) + 1

    # a column is common exactly when its tally equals the number of files
    n_files = len(files)
    return [name for name in seen_in if seen_in[name] == n_files]

In [72]:
def segment(vector, train, dev, test):
    '''
        Separates a vector into train, dev, and test data in 80%/10%/10% divisions.

        The vector is cut at two integer split points so that every row lands in exactly one
        of the three sets. When the length is not a multiple of 10, train gets floor(80%),
        test gets floor(10%), and dev receives whatever is left in between.

        input:
            - a vector of featurized data (a sliceable sequence)
            - a list of existing featurized training data
            - a list of existing featurized development data
            - a list of existing featurized testing data

        output:
            - train, dev, and test lists that include the existing featurized dataset (from the input)
                appended with data from the input vector
    '''

    count = len(vector)

    # Integer arithmetic avoids float rounding. Positive indices avoid the vector[-0:] trap,
    # where a zero-sized tail slice would select the whole vector.
    train_end = (count * 8) // 10
    test_start = count - count // 10

    train += vector[:train_end]
    dev += vector[train_end:test_start]
    test += vector[test_start:]

    return train, dev, test


In [73]:
def featurize(df):
    '''
        old featurize function from milestone 2, will be replaced with individual functions for each col category

        Turns the price column into numbers, moves it to the front, and keeps only the numeric columns.

        input: a pandas dataframe of listing data with a 'price' column (strings such as '$1,200.00')
        output: a list of rows (each a list) of the float64/int64 columns, price first, with NAs set to 0

        The input dataframe is left unmodified, so the function can be re-run on the same frame.
        A missing price passes through as NA and therefore ends up as 0, like every other numeric NA.
    '''

    def clean_price(value):
        # values that are already numeric (including NaN for a missing price) have nothing to strip
        if isinstance(value, (int, float, np.number)):
            return value
        return value.replace('$', "").replace(',', "")  # strip $'s and ,'s from the price field

    cleaned = df.copy()  # work on a copy so the caller's frame keeps its original price strings
    cleaned['price'] = pd.to_numeric(cleaned['price'].map(clean_price))  # cleaned price col becomes a number col

    cols = cleaned.columns.tolist()
    cols.remove('price')
    cleaned = cleaned[['price'] + cols]  # put price as the 0th field in the rows

    # get number fields (naively) and fill all NA number fields with 0
    num_cols = cleaned.select_dtypes(include=['float64', 'int64']).fillna(0)

    # .values replaces as_matrix(), which newer pandas no longer provides
    return [list(row) for row in num_cols.values]  # list of lists to write to csv file

In [74]:
def featurize_categorical(df): 
    '''
        Placeholder for the categorical-column featurizer.

        Not implemented yet: the body is empty, so the call returns None.
        TODO: return one featurized row per listing.
    '''
    pass

In [75]:
def featurize_text(df): 
    '''
        Placeholder for the text-column featurizer.

        Not implemented yet: the body is empty, so the call returns None.
        TODO: return one featurized row per listing.
    '''
    pass

In [76]:
def featurize_num(df): 
    '''
        Placeholder for the numeric-column featurizer.

        Not implemented yet: the body is empty, so the call returns None.
        TODO: return one featurized row per listing.
    '''
    pass

In [212]:
def separate_cols(files):
    '''
        Separates out the different column types into 3 lists of column names.

        input: list of airbnb data files (kept for interface compatibility; the lists are
               fixed, so no file is read)
        output: a tuple of length three: (categorical_cols, num_cols, text_cols)
    '''

    # ones that are never null (per the original author's note)
    categorical_cols = ['require_guest_profile_picture', 'require_guest_phone_verification', 'requires_license', 'instant_bookable', 'bed_type', 'cancellation_policy', 'room_type']
    num_cols = ['number_of_reviews', 'accommodates', 'minimum_nights', 'maximum_nights', 'guests_included', 'availability_30', 'availability_60', 'availability_90', 'availability_365']

    # text columns: nulls are meant to be converted to "" (per the original author's note)
    text_cols = ['name', 'neighborhood_overview', 'summary', 'transit', 'street', 'host_neighbourhood', 'notes', 'space', 'description']

    return categorical_cols, num_cols, text_cols


In [60]:
def save_datasets(col_type, train, dev, test):
    '''
        Writes the three splits for one column type to csv files under data/.

        input:
            - col_type: categorical, text, num
            - train, dev, and test lists of featurized vectors

        Saves 3 csv files: train, dev, and test, each file name starting with col_type
    '''

    # pair every file-name suffix with the rows that belong in that file
    outputs = (('train.csv', train), ('dev.csv', dev), ('test.csv', test))

    for suffix, rows in outputs:
        with open('data/' + col_type + suffix, 'w') as handle:
            csv.writer(handle).writerows(rows)

In [61]:
def create_datasets():
    '''
        Creates train, dev, and test files for each column type (categorical, num, text)

        Every file in airbnb_files is read, sliced into the three column groups, featurized,
        and split with segment(); the accumulated splits are written out with save_datasets().
    '''

    kinds = ('categorical', 'num', 'text')
    featurizers = {
        'categorical': featurize_categorical,
        'num': featurize_num,
        'text': featurize_text,
    }

    # one (train, dev, test) triple of accumulating lists per column type
    splits = {kind: ([], [], []) for kind in kinds}

    # separate_cols returns its lists in the same order as `kinds`
    columns = dict(zip(kinds, separate_cols(airbnb_files)))

    for path in airbnb_files:
        listings = pd.read_csv(path, dtype={'zipcode': 'str'})

        # slice all three column groups first, then featurize, then split
        frames = [listings[columns[kind]] for kind in kinds]
        vectors = [featurizers[kind](frame) for kind, frame in zip(kinds, frames)]

        for kind, vector in zip(kinds, vectors):
            splits[kind] = segment(vector, *splits[kind])

    for kind in kinds:
        save_datasets(kind, *splits[kind])

In [62]:
# Build the train/dev/test splits for every column type and write them under data/.
create_datasets()

['United States' 'Spain']
['United States']
['United States']


In [25]:
# Show the columns common to every listing file; the parenthesized print runs on Python 2 and 3.
print(get_col_names(airbnb_files))

['review_scores_accuracy', 'reviews_per_month', 'bathrooms', 'host_identity_verified', 'cancellation_policy', 'transit', 'room_type', 'accommodates', 'host_neighbourhood', 'street', 'review_scores_communication', 'host_acceptance_rate', 'country_code', 'is_location_exact', 'review_scores_cleanliness', 'neighborhood_overview', 'availability_365', 'host_location', 'market', 'city', 'property_type', 'space', 'availability_90', 'availability_60', 'zipcode', 'host_since', 'host_has_profile_pic', 'state', 'amenities', 'host_listings_count', 'maximum_nights', 'latitude', 'review_scores_location', 'requires_license', 'security_deposit', 'instant_bookable', 'description', 'experiences_offered', 'price', 'bedrooms', 'extra_people', 'smart_location', 'host_verifications', 'number_of_reviews', 'host_response_rate', 'host_about', 'availability_30', 'review_scores_rating', 'name', 'bed_type', 'country', 'notes', 'has_availability', 'calendar_updated', 'longitude', 'summary', 'beds', 'minimum_nights'