In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load ./train.json (path relative to the notebook's working directory) into a DataFrame.
train_df = pd.read_json("./train.json")

In [3]:
# Peek at the first row to see which columns the dataset provides.
train_df.head(1)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue


### Method 1: Logit Regression

#### Goal 1: Clean/Choose the right features for our logit regression.

In [4]:
def print_full(x):
    """Print every row of a pandas object without truncation.

    The ``display.max_rows`` limit is lifted only for the duration of the
    print. ``option_context`` restores whatever value was active before the
    call (not the library default), and does so even if printing raises.

    Parameters
    ----------
    x : object
        Anything printable; intended for a pandas Series or DataFrame.
    """
    # None means "no row limit", which also avoids the degenerate
    # max_rows == 0 setting that len(x) would give for an empty object.
    with pd.option_context('display.max_rows', None):
        print(x)
#print_full(train_df.created)
# A brief inspection reveals that all listings were created within three months: April (4), May (5), and June (6).

In [5]:
# Parse the 'created' strings into timestamps once, then build a new frame that
# carries the parsed column plus the abbreviated month name (e.g. 'Jun').
created_ts = pd.to_datetime(train_df["created"])
train_df1 = train_df.assign(
    created=created_ts,
    month=created_ts.dt.strftime('%b'),
)

In [6]:
# Check that the new 'month' column was added.
train_df1.head(1)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address,month
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,Jun


In [7]:
# Drop the two identifier columns in one step; drop() returns a new frame,
# so train_df1 is left untouched.
id_columns = ['building_id', 'manager_id']
train_df2 = train_df1.drop(id_columns, axis=1)
train_df2.head(2)

Unnamed: 0,bathrooms,bedrooms,created,description,display_address,features,interest_level,latitude,listing_id,longitude,photos,price,street_address,month
10,1.5,3,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,Jun
10000,1.0,2,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue,Jun


In [8]:
# Replace the per-listing feature list with its length, then discard the
# columns that are not used further (free text, timestamp, id, photo URLs).
unused_columns = ['features', 'description', 'created', 'listing_id', 'photos']
train_df3 = (
    train_df2
    .assign(features_number=list(map(len, train_df2['features'])))
    .drop(unused_columns, axis=1)
)
train_df3.head(3)

Unnamed: 0,bathrooms,bedrooms,display_address,interest_level,latitude,longitude,price,street_address,month,features_number
10,1.5,3,Metropolitan Avenue,medium,40.7145,-73.9425,3000,792 Metropolitan Avenue,Jun,0
10000,1.0,2,Columbus Avenue,low,40.7947,-73.9667,5465,808 Columbus Avenue,Jun,5
100004,1.0,1,W 13 Street,high,40.7388,-74.0018,2850,241 W 13 Street,Apr,4


In [94]:
# Set up reverse geocoding with geopy's Nominatim geocoder, using a 199-second timeout.
# NOTE(review): newer geopy releases reportedly require an explicit `user_agent`
# argument for Nominatim -- confirm against the installed geopy version.
from geopy.geocoders import Nominatim
geolocator = Nominatim(timeout = 199)
# Smoke test on one hard-coded "lat, lon" string (the coordinates shown for the
# first listing above); keep only the .address text of the result.
location = geolocator.reverse("40.7145, -73.9425").address

In [95]:
# Show the full reverse-geocoded address, then the component sixth from the end
# of the comma-separated string (the neighbourhood for this sample address).
# NOTE(review): the fixed index -6 assumes every address has the same number of
# comma-separated parts as this sample -- confirm before applying it to all rows.
print(location)
print(location.split(', ')[-6])
# Row count taken from train_df3: train_df4 is only created in a later cell
# (as a copy of train_df3 with columns changed), so referencing it here would
# fail on a fresh "Restart & Run All". The number of rows is the same.
print(len(train_df3))

792, Metropolitan Avenue, Williamsburg, Kings County, NYC, New York, 11211, United States of America
Williamsburg
49352


In [96]:
# Fuse latitude and longitude into one "lat, lon" string per row (the format
# passed to geolocator.reverse) and drop the two numeric columns.
coordinate_strings = [
    ', '.join((str(lat), str(lon)))
    for lat, lon in zip(train_df3['latitude'], train_df3['longitude'])
]
train_df4 = (
    train_df3
    .drop(['latitude', 'longitude'], axis=1)
    .assign(**{'location-ll': coordinate_strings})
)

In [176]:
# Number of rows that need an address lookup.
len(train_df4)

49352

#### Note from Peng: In the section below, the `suburb` list stores the address info we want. I used the Nominatim geocoder introduced three sections above to get the address data for the first 1,000 rows. For the remaining 49,352 - 1,000 = 48,352 rows, I will use the Google Maps API. Please DO NOT run this section again! It takes almost 10 minutes to finish. Instead, jump to the next section, which uses the Google Maps API and takes only about 5 minutes per 1,000 rows.

In [40]:
# One slot per listing; only the first 1000 slots are filled here, each with the
# address text returned by the reverse geocoder. The rest stay None.
first_batch = train_df4['location-ll'][:1000]
suburb = [None] * len(train_df4)
suburb[:1000] = [geolocator.reverse(point).address for point in first_batch]

#### Make a copy in case the original list gets overwritten or corrupted.

In [225]:
# Shallow copy: a new list object holding the same address entries as `suburb`.
suburb_1 = list(suburb)

#### Store the data in a NEW local file. The file is opened with 'w' (which truncates it) rather than 'r+' or 'a', because those modes keep the existing contents and could leave stale or duplicated lines if the cell is run more than once.

In [None]:
# Write one UTF-8 encoded address per line; slots that were never filled
# (still None) are skipped.
with open('address.txt', 'w') as out_file:
    out_file.writelines(
        address.encode('utf8') + '\n'
        for address in suburb_1
        if address is not None
    )

#### After this step, I renamed the file 'address.txt' to 'address0-1999.txt' on my laptop.

#### Note from Peng: Below is how I got the data for rows 1000 to 1999, using the Google Maps API. (The row range in the cell has to be changed for each new batch.)

In [None]:
import os

from geopy.geocoders import GoogleV3

# The API key is read from the environment instead of being committed in the
# notebook. Set GOOGLE_MAPS_API_KEY before running; a missing variable raises
# KeyError here rather than failing later inside the geocoder.
# NOTE(review): any key that was previously committed in this notebook should be
# treated as leaked and revoked.
geolocator = GoogleV3(api_key = os.environ['GOOGLE_MAPS_API_KEY'])

# Change the numbers here to specify the rows that we want to get address data for.
# NOTE(review): keep this range in sync with the file name used when saving `results`.
batch_start, batch_end = 11000, 12000
points = train_df4['location-ll'][batch_start:batch_end]

# One reverse-geocoding request per "lat, lon" string, in row order.
results = [geolocator.reverse(x, timeout = 10000) for x in points]

#### After getting the new address data, please store the data into the suburb list.

In [291]:
# Address text of the first candidate returned for the first point in the batch
# (each entry of `results` is indexed like a sequence of candidate locations).
results[0][0].address

u'30 W 62nd St, New York, NY 10023, USA'

In [292]:
# Show the listing at position 10000 to compare its street address with the
# geocoder's answer.
print(train_df4.iloc[10000])

bathrooms                            1
bedrooms                             1
display_address       West 63rd Street
interest_level                  medium
price                             3300
street_address     30 West 63rd Street
month                              Apr
features_number                     10
location-ll          40.7707, -73.9817
Name: 122563, dtype: object


In [293]:
# Write exactly one line per requested point so that line i of the file
# corresponds to row i of the batch.
# NOTE(review): the file name encodes the row range -- make sure it matches the
# range that was used when `results` was fetched.
with open('address10000-11000.txt', 'w') as f:
    for element in results:
        if element:
            # Keep the first candidate's address, UTF-8 encoded.
            f.write(element[0].address.encode('utf8') + '\n')
        else:
            # No result (None or an empty list): write a placeholder line.
            # The trailing newline keeps it from fusing with the next address.
            f.write('None\n')

#### Then go back to the get-data section above to get data for new rows.