In [1]:
import pandas as pd
import csv
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

airbnb_files = ['data/raw_data/Austin_listings.csv', 'data/raw_data/Boston_listings.csv', 'data/raw_data/Asheville_listings.csv']


In [70]:
def drop_airbnb_cols(filename):
    df = pd.read_csv(filename)
    df.drop(['id', 'listing_url', 'scrape_id', 'last_scraped', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_thumbnail_url', 'host_picture_url', 'calendar_last_scraped', 'weekly_price', 'monthly_price', 'neighbourhood_cleansed', \
             'license', 'jurisdiction_names', 'square_feet', 'neighbourhood', 'calculated_host_listings_count'], axis=1, inplace=True)
    return df

def get_col_names(files):
    counts = {}
    
    for f in files: 
        for c in drop_airbnb_cols(f).columns: 
            if c in counts: 
                counts[c] += 1
            else: 
                counts[c] = 1

    cols = []
    for c in counts: 
        if counts[c] == 3:
            cols.append(c)
    
    return cols

def segment(vector, train, dev, test): 
    count = len(vector)
    test += vector[-1*int(count*.1):]
    dev += vector[-1*int(count*.2):-1*int(count*.1)]
    train += vector[:int(count*.8)]
    
    return train, dev, test
    
def featurize(df):
    
    df['price'] = df['price'].map(lambda x: x.replace('$', "").replace(',',""))
    df[['price']] = df[['price']].apply(pd.to_numeric) # turn the price col into a number col
    text_cols = ['neighborhood_overview', 'space', 'description', 'host_about', 'notes', 'summary']
    X = [[] for i in range(len(df))]
    for col in text_cols:
        corpus = df[col].fillna(value="").values
        vectorizer = CountVectorizer()
        x = vectorizer.fit_transform(corpus) #TODO: clean text
#         X[0] += x[0] #can't iterate or join sparse matrices
#         print X[0]
#         x = x[:50].toarray().tolist()
        for i, bow in enumerate(x[:50]):
            indices = bow.indices
            data = bow.data
            X[i] += zip(indices, data)
    print 'CHECK: '
    return X
    
    # call dictvectorizor on that list --> numpy array of features
    # convert rest of df into numpy array with price as first value
    # append 2 numpy arrays by index
    
    # return big numpy array
    




In [71]:
def create_datasets():
    col_names = get_col_names(airbnb_files)

    train = []
    dev = []
    test = []
    
    for f in airbnb_files:   
        df = pd.read_csv(f, dtype={'zipcode': 'str'})
        df = df[col_names]
        vector = featurize(df)
        train, dev, test = segment(vector, train, dev, test) # put data from f into train, dev, test
        
    return train, dev, test

In [72]:
create_datasets()

<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
CHECK: 
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
CHECK: 
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
CHECK: 


([[(3791, 1),
   (446, 2),
   (1612, 1),
   (529, 1),
   (3548, 1),
   (2116, 2),
   (268, 1),
   (6375, 2),
   (6291, 1),
   (3784, 1),
   (3098, 1),
   (3036, 1),
   (3400, 1),
   (2030, 1),
   (6972, 2),
   (4104, 1),
   (5273, 1),
   (4222, 1),
   (6292, 1),
   (4028, 1),
   (5600, 1),
   (2922, 1),
   (5660, 1),
   (587, 1),
   (3683, 1),
   (1935, 1),
   (4504, 1),
   (6801, 1),
   (1959, 1),
   (6038, 1),
   (4629, 1),
   (3309, 1),
   (5040, 1),
   (4307, 1),
   (11032, 3),
   (5083, 2),
   (10208, 1),
   (9460, 1),
   (1600, 1),
   (3691, 1),
   (11210, 1),
   (8126, 1),
   (9227, 1),
   (1586, 1),
   (4472, 1),
   (8446, 1),
   (3669, 2),
   (9479, 1),
   (4001, 1),
   (3200, 1),
   (4380, 1),
   (8229, 1),
   (7577, 1),
   (8687, 2),
   (5712, 2),
   (25, 1),
   (1824, 2),
   (80, 1),
   (7271, 1),
   (113, 1),
   (1506, 1),
   (7223, 1),
   (3556, 1),
   (7172, 1),
   (7227, 1),
   (1598, 1),
   (5075, 1),
   (5007, 1),
   (1525, 2),
   (1291, 1),
   (10152, 1),
   (7300, 1

In [133]:
def save_datasets(train, dev, test):
    with open('data/train.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(train)

    with open('data/dev.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(dev)

    with open('data/test.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(test)

In [134]:
save_datasets(*create_datasets())

In [None]:
def save_sparse_csr(train, dev, test):
    