In [41]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

airbnb_files = ['data/raw_data/Austin_listings.csv', 'data/raw_data/Boston_listings.csv', 'data/raw_data/Asheville_listings.csv']


In [120]:
def drop_airbnb_cols(filename):
    df = pd.read_csv(filename)
    df.drop(['id', 'listing_url', 'scrape_id', 'last_scraped', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_thumbnail_url', 'host_picture_url', 'calendar_last_scraped', 'weekly_price', 'monthly_price', 'neighbourhood_cleansed', 'license', 'jurisdiction_names'], axis=1, inplace=True)
    return df

def get_col_names(files):
    counts = {}
    
    for f in files: 
        for c in drop_airbnb_cols(f).columns: 
            if c in counts: 
                counts[c] += 1
            else: 
                counts[c] = 1

    cols = []
    for c in counts: 
        if counts[c] == 3:
            cols.append(c)
    
    return cols

def segment(vector, train, dev, test): 
    count = len(vector)
    test += vector[-1*int(count*.1):]
    dev += vector[-1*int(count*.2):-1*int(count*.1)]
    train += vector[:int(count*.8)]
    
    return train, dev, test
    
def featurize(df):
    
    df['price'] = df['price'].map(lambda x: x.replace('$', "").replace(',',""))
    df[['price']] = df[['price']].apply(pd.to_numeric) # turn the price col into a number col
    cols = df.columns.tolist()
    cols.remove('price')
    cols = ['price'] + cols
    df = df[cols]
    text_cols = df.select_dtypes(exclude=['float64', 'int64'])
#     for col in text_cols: 
#         corpus = df[col].fillna(value="").values
#         print corpus
#         vectorizer = CountVectorizer()
#         X = vectorizer.fit_transform(corpus)
#         print X
    num_cols = df.select_dtypes(include=['float64', 'int64'])
    num_cols.fillna(value=0, inplace=True)
    return [list(i) for i in num_cols.as_matrix()]
    
    # call dictvectorizor on that list --> numpy array of features
    # convert rest of df into numpy array with price as first value
    # append 2 numpy arrays by index
    
    # return big numpy array
    

In [121]:
def create_datasets():
    col_names = get_col_names(airbnb_files)

    train = []
    dev = []
    test = []
    
    for f in airbnb_files:   
        df = pd.read_csv(f, dtype={'zipcode': 'str'})
        df = df[col_names]
        vector = featurize(df)
        train, dev, test = segment(vector, train, dev, test) # put data from f into train, dev, test
        
    return train, dev, test

In [122]:
train, dev, test = create_datasets()

0        300.00
1         99.00
2        100.00
3        100.00
4        599.00
5        100.00
6         54.00
7         40.00
8        130.00
9         44.00
10        99.00
11        50.00
12        49.00
13        50.00
14        55.00
15       180.00
16        36.00
17       125.00
18        59.00
19        49.00
20        40.00
21       150.00
22       175.00
23        50.00
24       120.00
25       450.00
26        49.00
27        36.00
28        75.00
29        50.00
         ...   
5805     400.00
5806      90.00
5807     999.00
5808     250.00
5809     170.00
5810      99.00
5811     214.00
5812     600.00
5813     110.00
5814     275.00
5815      49.00
5816     250.00
5817    1350.00
5818     250.00
5819     185.00
5820     172.00
5821     150.00
5822     250.00
5823     150.00
5824     350.00
5825     500.00
5826     250.00
5827     500.00
5828     300.00
5829     125.00
5830     179.00
5831      85.00
5832      66.00
5833      25.00
5834     120.00
Name: price, dtype: obje

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [125]:
import csv

with open('data/train.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(train)
    
with open('data/dev.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(dev)

with open('data/test.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(test)