In [17]:
import pandas as pd
import numpy as np
import geocoder
import math
import re
from scipy.stats import spearmanr, kendalltau, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [18]:
train = pd.read_json('data/train.json')
test = pd.read_json('data/test.json')
train_test = pd.concat((train.drop(['interest_level'], axis=1), test), axis=0).reset_index(drop=True)

In [6]:
train_test.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [16]:
#photos
photos = pd.read_csv('data/photos2.csv')
photo_col = list(photos)[1:]
photos = photos[photo_col]
train_test_photos = pd.merge(train_test[['listing_id', 'photos']], photos, how = 'left', on = 'listing_id')
for i in photo_col[1:]:
    train_test_photos.loc[train_test_photos[i].isnull(), i] = [0]*train_test_photos[i].isnull().sum()
train_test_photos['num_photos'] = train_test_photos['photos'].apply(len)
train_test_photos.head()

Unnamed: 0,listing_id,photos,photo_len_mean,photo_wid_mean,photo_rat_mean,photo_pix_mean,photo_bri_mean,photo_len_max,photo_wid_max,photo_rat_max,photo_pix_max,photo_bri_max,photo_len_min,photo_wid_min,photo_rat_min,photo_pix_min,photo_bri_min,num_photos
0,7211212,[https://photos.renthop.com/2/7211212_1ed4542e...,597.2,468.8,1.335003,817920.0,167.577018,640.0,640.0,1.502347,817920.0,185.544487,426.0,426.0,0.665625,817920.0,147.076801,5
1,7150865,[https://photos.renthop.com/2/7150865_be3306c5...,596.636364,417.0,1.446305,744475.363636,148.479967,640.0,501.0,1.904762,819840.0,203.485281,443.0,336.0,0.884232,574500.0,87.68765,11
2,6887163,[https://photos.renthop.com/2/6887163_de85c427...,499.125,619.875,0.821897,919680.0,129.902187,640.0,640.0,1.336117,919680.0,163.543636,479.0,479.0,0.748437,919680.0,87.584701,8
3,6888711,[https://photos.renthop.com/2/6888711_6e660cee...,586.666667,533.333333,1.138889,921600.0,115.422773,640.0,640.0,1.333333,921600.0,119.627829,480.0,480.0,0.75,921600.0,108.700869,3
4,6934781,[https://photos.renthop.com/2/6934781_1fa4b41a...,586.666667,533.333333,1.138889,921600.0,103.477746,640.0,640.0,1.333333,921600.0,109.261442,480.0,480.0,0.75,921600.0,92.266452,3


In [177]:
#saving photos
train_photos = pd.merge(train[['interest_level', 'listing_id']], train_test_photos.drop(['photos'], axis = 1), how = 'left', on = 'listing_id')
test_photos = pd.merge(test[['listing_id']], train_test_photos.drop(['photos'], axis = 1), how = 'left', on = 'listing_id')
train_photos.to_csv('train_photos.csv', index = False)
test_photos.to_csv('test_photos.csv', index = False)

In [102]:
print train_photos.shape[0], test_photos.shape[0]
for i in (train_test_photos):
    print i + ': ' str(sum(train_test_photos[i].isnull()))

49352 74659
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [19]:
#fill in missing lat. and long.
latceil = math.ceil(np.mean(train_test.latitude) + 2*np.std(train_test.latitude))
latfloor = math.floor(np.mean(train_test.latitude) - 2*np.std(train_test.latitude))
longceil = math.ceil(np.mean(train_test.longitude) + 2*np.std(train_test.longitude))
longfloor = math.floor(np.mean(train_test.longitude) - 2*np.std(train_test.longitude))
train_test_location = train_test[['listing_id', 'display_address', 'street_address', 'latitude', 'longitude']]
missingCoords = train_test_location[(train_test_location.longitude > longceil) | (train_test_location.longitude < \
                longfloor) | (train_test_location.latitude > latceil) | (train_test_location.latitude < latfloor)]
missingGeoms = (missingCoords.street_address + ' New York').apply(geocoder.google)
train_test_location.loc[(train_test_location.longitude > longceil) | (train_test_location.longitude < longfloor) | \
                        (train_test_location.latitude > latceil) | (train_test_location.latitude < latfloor), \
                        'latitude'] = missingGeoms.apply(lambda x: x.lat)
train_test_location.loc[(train_test_location.longitude > longceil) | (train_test_location.longitude < longfloor) | \
                        (train_test_location.latitude > latceil) | (train_test_location.latitude < latfloor), \
                        'longitude'] = missingGeoms.apply(lambda x: x.lng)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
#check for missing lat and long
sum(train_test_location.latitude.isnull())

1

In [21]:
missingCoords = train_test_location[train_test_location.latitude.isnull()]
missingGeoms = (missingCoords.street_address + ' New York').apply(geocoder.google)
train_test_location.loc[train_test_location.latitude.isnull(), 'latitude'] = missingGeoms.apply(lambda x: x.lat)
train_test_location.loc[train_test_location.longitude.isnull(), 'longitude'] = missingGeoms.apply(lambda x: x.lng)
sum(train_test_location.latitude.isnull())

1

In [162]:
#neighborhoods
latlist = []
for n in range(1, int(latceil - latfloor)*40+1):
    x = latfloor + n/40.
    latlist.append([x, str(x)])
longlist = []
for n in range(0, int(longceil - longfloor)*40+1):
    x = longfloor + n/40.
    longlist.append([x, str(x)])
def getlat(coord):
    if coord > latfloor:
        for i in latlist:
            if coord < i[0]: 
                return i[1]
    return 'z'
def getlong(coord):
    if coord > longfloor:
        for i in longlist:
            if coord < i[0]: 
                return i[1]
    return 'z'
#grouping neighborhoods by percentile
train_test_location.loc[:,'neighborhood'] = train_test_location.latitude.apply(getlat) + train_test_location.longitude.apply(getlong)
neighborhood_count = train_test_location.neighborhood.value_counts()
train_test_location['top_1_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 99)] else 0)
train_test_location['top_2_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 98)] else 0)
train_test_location['top_5_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 95)] else 0)
train_test_location['top_10_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 90)] else 0)
train_test_location['top_20_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 80)] else 0)
train_test_location['top_25_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 75)] else 0)
train_test_location['top_30_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 70)] else 0)
train_test_location['top_40_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 60)] else 0)
train_test_location['top_50_neighborhood'] = train_test_location['neighborhood'].apply(lambda x: 1 if x in 
    neighborhood_count.index.values[neighborhood_count.values >= np.percentile(neighborhood_count.values, 50)] else 0)
#getting rid of lower limit for neighborhoods
train_test_location['neighborhood_cleaned'] = train_test_location['neighborhood'].apply(lambda x: x if x in 
    neighborhood_count.index.values[neighborhood_count.values >= 50] else 'notapopularneighborhood')

In [22]:
#grouping address
def address(x):
    x = clean(x)
    addr = []
    for i in x:
        if i in ['n', 'n.', 'north']:
            addr += ['north']
        elif i in ['s', 's.', 'south']:
            addr += ['south']
        elif i in ['e', 'e.', 'east']:
            addr += ['east']
        elif i in ['w', 'w.', 'west']:
            addr += ['west']
        elif i in ['st', 'st.', 'street']:
            addr += ['street']
        elif i in ['ave', 'ave.', 'avenue']:
            addr += ['avenue']
        elif len(i.split()) > 0 : 
            addr += ['other']
    return list(set(addr))
clean = lambda x, a =' ': [i.strip() for i in x.lower().replace('*', '+').split(a)]    
flatten = lambda l: [item for sublist in l for item in sublist]
train_test_location.loc[:, 'address_components'] = [address(w) for w in train_test_location.loc[:, 'street_address']]
train_test_location.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


Unnamed: 0,listing_id,display_address,street_address,latitude,longitude,address_components
0,7211212,Metropolitan Avenue,792 Metropolitan Avenue,40.7145,-73.9425,"[other, avenue]"
1,7150865,Columbus Avenue,808 Columbus Avenue,40.7947,-73.9667,"[other, avenue]"
2,6887163,W 13 Street,241 W 13 Street,40.7388,-74.0018,"[west, other, street]"
3,6888711,East 49th Street,333 East 49th Street,40.7539,-73.9677,"[east, other, street]"
4,6934781,West 143rd Street,500 West 143rd Street,40.8241,-73.9493,"[west, other, street]"


In [27]:
c = train_test_location[['listing_id', 'address_components']].set_index(['listing_id'])
c = c['address_components'].str.join('@').str.get_dummies('@').reset_index()
train_test_location = pd.merge(train_test_location, c, how = 'left', on = 'listing_id')

In [41]:
#saving locations
train_location = pd.merge(train[['interest_level', 'listing_id']], \
                          train_test_location.drop(['display_address', 'street_address', 'neighborhood', \
                          'address_components'], axis = 1), how = 'left', on = 'listing_id')
test_location = pd.merge(test[['listing_id']], train_test_location.drop(['display_address', \
                         'street_address', 'neighborhood', 'address_components'], axis = 1), \
                         how = 'left', on = 'listing_id')
train_location.to_csv('train_location.csv', index = False)
test_location.to_csv('test_location.csv', index = False)

In [44]:
train_test_location.head()

Unnamed: 0,listing_id,latitude,longitude,top_1_neighborhood,top_2_neighborhood,top_5_neighborhood,top_10_neighborhood,top_20_neighborhood,top_25_neighborhood,top_30_neighborhood,...,top_50_neighborhood,neighborhood_cleaned,address_components,avenue,east,north,other,south,street,west
0,7211212,40.7145,-73.9425,0,0,0,1,1,1,1,...,1,40.725-73.925,"['other', 'avenue']",1,0,0,1,0,0,0
1,7150865,40.7947,-73.9667,0,1,1,1,1,1,1,...,1,40.8-73.95,"['other', 'avenue']",1,0,0,1,0,0,0
2,6887163,40.7388,-74.0018,0,0,1,1,1,1,1,...,1,40.75-74.0,"['west', 'other', 'street']",0,0,0,1,0,1,1
3,6888711,40.7539,-73.9677,1,1,1,1,1,1,1,...,1,40.775-73.95,"['east', 'other', 'street']",0,1,0,1,0,1,0
4,6934781,40.8241,-73.9493,0,0,0,1,1,1,1,...,1,40.825-73.925,"['west', 'other', 'street']",0,0,0,1,0,1,1


In [45]:
print train_location.shape[0], test_location.shape[0]
for i in (train_test_location):
    print i + ': ' + str(sum(train_test_location[i].isnull()))

49352 74659
listing_id: 0
latitude: 0
longitude: 0
top_1_neighborhood: 0
top_2_neighborhood: 0
top_5_neighborhood: 0
top_10_neighborhood: 0
top_20_neighborhood: 0
top_25_neighborhood: 0
top_30_neighborhood: 0
top_40_neighborhood: 0
top_50_neighborhood: 0
neighborhood_cleaned: 0
address_components: 0
avenue: 0
east: 0
north: 0
other: 0
south: 0
street: 0
west: 0


In [94]:
#building and manager_ids
train_test_ids = train_test[['listing_id', 'building_id', 'manager_id']]
manager_count = train_test_ids['manager_id'].value_counts()
building_count = train_test_ids['building_id'].value_counts()

In [95]:
#grouping neighborhoods by percentile
train_test_ids['top_1_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 99)] else 0)
train_test_ids['top_2_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 98)] else 0)
train_test_ids['top_5_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 95)] else 0)
train_test_ids['top_10_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 90)] else 0)
train_test_ids['top_20_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 80)] else 0)
train_test_ids['top_25_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 75)] else 0)
train_test_ids['top_30_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 70)] else 0)
train_test_ids['top_40_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 60)] else 0)
train_test_ids['top_50_manager'] = train_test_ids['manager_id'].apply(lambda x: 1 if x in 
    manager_count.index.values[manager_count.values >= np.percentile(manager_count.values, 50)] else 0)
#getting rid of lower limit for manager_ids
train_test_ids['manager_id_cleaned'] = train_test_ids['manager_id'].apply(lambda x: x if x in 
    manager_count.index.values[manager_count.values >= 50] else 'notapopularmanager_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#i

In [96]:
train_test_ids['top_1_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 99)] else 0)
train_test_ids['top_2_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 98)] else 0)
train_test_ids['top_5_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 95)] else 0)
train_test_ids['top_10_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 90)] else 0)
train_test_ids['top_20_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 80)] else 0)
train_test_ids['top_25_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 75)] else 0)
train_test_ids['top_30_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 70)] else 0)
train_test_ids['top_40_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 60)] else 0)
train_test_ids['top_50_building'] = train_test_ids['building_id'].apply(lambda x: 1 if x in 
    building_count.index.values[building_count.values >= np.percentile(building_count.values, 50)] else 0)
train_test_ids['building_id_cleaned'] = train_test_ids['building_id'].apply(lambda x: x if x in 
    building_count.index.values[building_count.values >= 50] else 'notapopularbuildingid')

In [178]:
#saving ids
train_ids = pd.merge(train[['interest_level', 'listing_id']], train_test_ids.drop(['building_id', 'manager_id'], \
                    axis = 1), how = 'left', on = 'listing_id')
test_ids = pd.merge(test[['listing_id']], train_test_ids.drop(['building_id', 'manager_id'], axis = 1), \
                    how = 'left', on = 'listing_id')
train_ids.to_csv('train_ids.csv', index = False)
test_ids.to_csv('test_ids.csv', index = False)

In [170]:
train_test_ids.head()

Unnamed: 0,listing_id,building_id,manager_id,top_1_manager,top_2_manager,top_5_manager,top_10_manager,top_20_manager,top_25_manager,top_30_manager,...,top_1_building,top_2_building,top_5_building,top_10_building,top_20_building,top_25_building,top_30_building,top_40_building,top_50_building,building_id_cleaned
0,7211212,53a5b119ba8f7b61d4e010512e0dfc85,5ba989232d0489da1b5f2c45f6688adc,0,1,1,1,1,1,1,...,0,0,0,0,0,0,1,1,1,notapopularbuildingid
1,7150865,c5c8a357cba207596b04d1afd1e4f130,7533621a882f71e25173b27e3139d83d,0,0,1,1,1,1,1,...,0,0,1,1,1,1,1,1,1,c5c8a357cba207596b04d1afd1e4f130
2,6887163,c3ba40552e2120b0acfc3cb5730bb2aa,d9039c43983f6e564b1482b273bd7b01,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,c3ba40552e2120b0acfc3cb5730bb2aa
3,6888711,28d9ad350afeaab8027513a3e52ac8d5,1067e078446a7897d2da493d2f741316,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,28d9ad350afeaab8027513a3e52ac8d5
4,6934781,0,98e13ad4b495b9613cef886d79a6291f,0,0,0,0,1,1,1,...,1,1,1,1,1,1,1,1,1,0


In [114]:
print train_ids.shape[0], test_ids.shape[0]
for i in (train_test_ids):
    print i + ': ' str(sum(train_test_ids[i].isnull()))

49352 74659
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [179]:
#created
train_test_created = train_test[['listing_id', 'created']]
train_test_created["created"] = pd.to_datetime(train_test_created["created"])
train_test_created["created_month"] = train_test_created["created"].dt.month
train_test_created["created_day"] = train_test_created["created"].dt.day
train_test_created["created_hour"] = train_test_created["created"].dt.hour
train_test_created["created_weekday"] = train_test_created["created"].dt.weekday
train_test_created["created_month_cat"] = train_test_created["created"].dt.month.astype(str)
train_test_created["created_day_cat"] = train_test_created["created"].dt.day.astype(str)
train_test_created["created_hour_cat"] = train_test_created["created"].dt.hour.astype(str)
train_test_created["created_weekday_cat"] = train_test_created["created"].dt.weekday.astype(str)
#saving time
train_created = pd.merge(train[['interest_level', 'listing_id']], train_test_created, how = 'left', on = 'listing_id')
test_created= pd.merge(test[['listing_id']], train_test_created, how = 'left', on = 'listing_id')
train_created.to_csv('train_created.csv', index = False)
test_created.to_csv('test_created.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-

In [171]:
train_test_created.head()

Unnamed: 0,listing_id,created,created_month,created_day,created_hour,created_weekday,created_month_cat,created_day_cat,created_hour_cat,created_weekday_cat
0,7211212,2016-06-24 07:54:24,6,24,7,4,6,24,7,4
1,7150865,2016-06-12 12:19:27,6,12,12,6,6,12,12,6
2,6887163,2016-04-17 03:26:41,4,17,3,6,4,17,3,6
3,6888711,2016-04-18 02:22:02,4,18,2,0,4,18,2,0
4,6934781,2016-04-28 01:32:41,4,28,1,3,4,28,1,3


In [118]:
print train_created.shape[0], test_created.shape[0]
for i in (train_test_created):
    print i + ': ' str(sum(train_test_created[i].isnull()))

49352 74659
0
0
0
0
0
0
0
0
0
0


In [180]:
#basics
train_test_basics = train_test[['listing_id', 'bathrooms', 'bedrooms', 'price']]
train_test_basics['bedbath_ratio'] = train_test_basics['bedrooms']/train_test_basics['bathrooms']
train_test_basics['bathprice_ratio'] = train_test_basics['bathrooms']/train_test_basics['price']
train_test_basics['bedprice_ratio'] = train_test_basics['bedrooms']/train_test_basics['price']
train_test_basics.loc[train_test_basics.bathrooms == 0, 'bedbath_ratio'] = train_test_basics.loc[train_test_basics.bathrooms ==0, 'bedrooms']
train_test_basics.loc[train_test_basics.bedbath_ratio == 0, 'bedbath_ratio'] = train_test_basics.loc[train_test_basics.bedbath_ratio ==0, 'bedbath_ratio'] + 1
train_test_basics['bath_cat'] = [str(5) if x > 4 else str(x) for x in train_test_basics.bathrooms.apply(round).map(int)]
train_test_basics['bed_cat'] = [str(5) if x > 4 else str(x) for x in train_test_basics.bedrooms]
#saving time
train_basics = pd.merge(train[['interest_level', 'listing_id']], train_test_basics, how = 'left', on = 'listing_id')
test_basics= pd.merge(test[['listing_id']], train_test_basics, how = 'left', on = 'listing_id')
train_basics.to_csv('train_basics.csv', index = False)
test_basics.to_csv('test_basics.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-

In [172]:
train_test_basics.head()

Unnamed: 0,listing_id,bathrooms,bedrooms,price,bedbath_ratio,bathprice_ratio,bedprice_ratio,bath_cat,bed_cat
0,7211212,1.5,3,3000,2.0,0.0005,0.001,2,3
1,7150865,1.0,2,5465,2.0,0.000183,0.000366,1,2
2,6887163,1.0,1,2850,1.0,0.000351,0.000351,1,1
3,6888711,1.0,1,3275,1.0,0.000305,0.000305,1,1
4,6934781,1.0,4,3350,4.0,0.000299,0.001194,1,4


In [153]:
print train_basics.shape[0], test_basics.shape[0]
for i in (train_test_basics):
    print i + ': ' str(sum(train_test_basics[i].isnull()))

49352 74659
0
0
0
0
0
0
0
0
0


In [59]:
#functions for changing features
clean = lambda x, a =' ': [i.strip() for i in x.lower().replace('*', '+').split(a)]    
flatten = lambda l: [item for sublist in l for item in sublist]
def changefeatures(x):
    feat = []
    for i in x:
        if ('wood' in i) and ('floor' in i):
            feat += ['hardwood']
        elif any(x in i for x in ['pre-war', 'prewar', 'pre war']):
            feat += ['prewar']
        elif 'dishwasher' in i:
            feat += ['dishwasher']
        elif any(x in i for x in ['laundry', 'dryer', 'washer', 'w/d']):
            feat += ['laundry']
        elif 'roof' in i:
            feat += ['roof']
        elif any(x in i for x in ['outdoor', 'yard']):
            feat += ['outdoor']
        elif 'garden' in i:
            feat += ['garden']
        elif any(x in i for x in ['parking', 'garage']):
            feat += ['parking']
        elif 'park' in i:
            feat += ['park']
        elif any(x in i for x in ['fitness', 'gym', 'health', 'pool']):
            feat += ['gym']
        elif any(x in i for x in ['doorman', 'concierge', 'lobby']):
            feat += ['doorman']
        elif any(x in i for x in ['bike']):
            feat += ['bike']
        elif any(x in i for x in ['ac central', 'central a', 'air condition', 'a/c']) or (i == 'ac'):
            feat += ['ac']
        elif any(x in i for x in ['wifi', 'internet']):
            feat += ['wifi']
        elif any(x in i for x in ['valet']):
            feat += ['valet']
        elif any(x in i for x in ['storage', 'closet']):
            feat += ['storage']
        elif any(x in i for x in ['playroom']):
            feat += ['playroom']
        elif any(x in i for x in ['terrace', 'balcony', 'patio', 'deck']):
            feat += ['patio']
        elif any(x in i for x in ['kitchen']):
            feat += ['kitchen']
        elif any(x in i for x in ['pets']):
            feat += ['pets']
        elif any(x in i for x in ['super']):
            feat += ['pets']
        elif any(x in i for x in ['train', 'subway']):
            feat += ['train']
            #group fees?
        else:
            feat += [i]
    return list(set(feat))

In [61]:
#descriptions and features
train_test_words = train_test[['listing_id', 'description', 'features']]
train_test_words.loc[:, 'description'] = [w.lower() for w in train_test_words.loc[:, 'description']]
train_test_words['description'] = train_test_words['description'].apply(lambda x: x.replace('<p><a  website_redacted ', ''))
train_test_words['description'] = train_test_words['description'].apply(lambda x: x.replace('!<br /><br />', ''))
train_test_words['num_letters_desc'] = train_test_words['description'].apply(lambda x: len(x.strip()))
train_test_words['num_words_desc'] = train_test_words['description'].apply(lambda x: 0 if len(x.strip()) == 0 else len(x.split(' ')))
train_test_words.loc[:, 'features'] = [flatten([clean(w, '+') for w in line]) for line in train_test_words.loc[:, 'features']]
train_test_words['num_features'] = train_test_words['features'].str.len()
train_test_words.loc[:, 'features'] = [changefeatures(line) for line in train_test_words.loc[:, 'features']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [62]:
featsList = []
for i in train_test_words.features:
    featsList += i
featsList = list(set(featsList))
featsdict = {}
for i in featsList:
    featsdict[i] = [0]
for i in train_test_words.features:
    for j in i:
        if j in featsList:
            featsdict[j][0] +=1
z = pd.DataFrame(featsdict).transpose().sort(columns = 0, ascending = False).reset_index()
z = list(z[z[0]>50]['index'])



In [65]:
def changefeatures2(x):
    feat = []
    for i in x:
        if i in z:
            feat += [i]
        else:
            feat += ['other']
    return list(set(feat))

In [66]:
train_test_words.loc[:, 'features'] = [changefeatures2(line) for line in train_test_words.loc[:, 'features']]

In [67]:
d = train_test_words[['listing_id', 'features']].set_index(['listing_id'])
d = d['features'].str.join('@').str.get_dummies('@').reset_index()
train_test_words = pd.merge(train_test_words, d, how = 'left', on = 'listing_id')

In [68]:
train_test_words.head()

Unnamed: 0,listing_id,description,features,num_letters_desc,num_words_desc,num_features,ac,actual apt. photos,bike,brownstone,...,simplex,stainless steel appliances,storage,sublet,train,valet,view,wheelchair access,wheelchair ramp,wifi
0,7211212,a brand new 3 bedroom 1.5 bath apartmentenjoy ...,[],551,90,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7150865,,"[dogs allowed, gym, doorman, elevator, cats al...",0,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6887163,"top top west village location, beautiful pre-w...","[hardwood, dishwasher, laundry, pets]",667,91,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6888711,building amenities - garage - garden - fitness...,"[hardwood, no fee]",455,75,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6934781,beautifully renovated 3 bedroom flex 4 bedroom...,[prewar],478,68,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
train_words = pd.merge(train[['interest_level', 'listing_id']], \
                        train_test_words.drop(['description', 'features'], axis = 1), how = 'left', on = 'listing_id')
test_words = pd.merge(test[['listing_id']], train_test_words.drop(['description', 'features'], axis = 1), \
                        how = 'left', on = 'listing_id')
train_words.to_csv('train_words.csv', index = False)
test_words.to_csv('test_words.csv', index = False)

In [50]:
#combining data
train = pd.read_json('data/train.json')
test = pd.read_json('data/test.json')
train_photos = pd.read_csv('data/train_photos.csv').drop(['interest_level'], axis = 1)
test_photos = pd.read_csv('data/test_photos.csv')
train_location = pd.read_csv('data/train_location.csv').drop(['interest_level'], axis = 1)
test_location = pd.read_csv('data/test_location.csv')
train_ids = pd.read_csv('data/train_ids.csv').drop(['interest_level'], axis = 1)
test_ids = pd.read_csv('data/test_ids.csv')
train_created = pd.read_csv('data/train_created.csv').drop(['interest_level'], axis = 1)
test_created = pd.read_csv('data/test_created.csv')
train_basics = pd.read_csv('data/train_basics.csv').drop(['interest_level'], axis = 1)
test_basics = pd.read_csv('data/test_basics.csv')
train_words = pd.read_csv('data/train_words.csv').drop(['interest_level'], axis = 1)
test_words = pd.read_csv('data/test_words.csv')
train_dfs = [train[['interest_level', 'listing_id']], train_photos, train_location, train_ids, \
           train_created, train_basics, train_words]
test_dfs = [test[['listing_id']], test_photos, test_location, test_ids, \
           test_created, test_basics, test_words]

In [51]:
train_final = reduce(lambda left,right: pd.merge(left,right, how = 'left', on='listing_id'), train_dfs)
test_final = reduce(lambda left,right: pd.merge(left,right, how = 'left', on='listing_id'), test_dfs)

In [55]:
train_final.to_csv('train_final.csv')
test_final.to_csv('test_final.csv')