In [22]:
from IPython.display import HTML

# Render a small script plus a submit button into the notebook output.
# The script defines code_toggle(), which hides or shows every `div.input`
# element (the notebook's code-input areas) and flips the `code_show` flag.
# code_toggle is also registered to run once on document-ready, so the raw
# code starts out hidden in the rendered page.
# NOTE(review): the script relies on `$` (jQuery) being available on the
# page that displays this output -- confirm for the export target.
HTML('''<script>
code_show=true;
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [1]:
%%html

<a href="index.html" target="_self">Back to home page</a>

### Table of Contents

In [3]:
%%html

- <a href = '#map'> Mapping latitude/longitude to regions </a>
<br>
- <a href = '#merge'> Merging in data about precincts and school districts </a>


#### To see more on data cleaning: <a href="data_cleaning.html" target="_self">Data Cleaning</a>

-------------

In [23]:
import math
import os
import string
import sys
import warnings

# Must run before the third-party imports below so they can be resolved.
sys.path.append("/anaconda/lib/python2.7/site-packages")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shapefile
from matplotlib.patches import Polygon
from mpl_toolkits.basemap import Basemap
from sklearn.neighbors import KNeighborsClassifier as KNN

%matplotlib inline
warnings.filterwarnings('ignore')

# Data collection

<a id='map'></a>
### Mapping latitude/longitude to regions

We found shapefiles for the neighborhoods within Manhattan, the school districts within Manhattan, and the precincts within Manhattan.  Our goal was to map each home in our dataset to a neighborhood, a school district, and a precinct, and to combine this information with data specific to each school district (test scores) and each precinct (crime rates), as we expected each of these things to significantly affect the price of homes.  The shapefiles contained polygons for each neighborhood, school district, and precinct, so we used the latitude/longitude values for each home in our dataset to find the polygon in which that latitude/longitude pair was contained.

In [2]:
########### Example use of this code:

# path_to_shape_file = '/Users/nbw/Dropbox/CS_109_Project/crime_UNFORMATTED/nypp_16b/nypd_pres'
# regions_dict = generate_regions_to_points_dict(path_to_shape_file, "Precinct")
# get_region_from_latlong_and_shapefile(40.768814, -73.931085, regions_dict)
# #	--> returns 114

#################################################################################################
####### Inputs:
#######     - path to shapefile
#######		- name of the attribute we want as our name of each region
####### Returns:
#######		- dict mapping region names to shape polygons 

def generate_regions_to_points_dict(shapefile_path, region_attribute_name):
    """Build a lookup from region identifier to that region's outline points.

    Inputs:
        - shapefile_path: path handed to ``shapefile.Reader``
        - region_attribute_name: name of the record attribute whose value
          becomes the dict key for each region

    Returns:
        - dict mapping each record's attribute value to the ``points`` list of
          the shape at the same position. If several records share the same
          attribute value, the last one wins.
        - None if the shapefile has no attribute with that name; in that case
          the available attribute names are printed first.
    """
    sf = shapefile.Reader(shapefile_path)
    shapes = sf.shapes()
    records = sf.records()

    # The first entry of `fields` is skipped and positions are counted from
    # the second entry on, so that they line up with the values in a record
    # (in pyshp the first entry is the DeletionFlag pseudo-field).
    attribute_names = [field[0] for field in sf.fields[1:]]

    # Make sure region_attribute_name exists in the shapefile.
    # No early exit: if the name appears more than once, the last match is used.
    attribute_key = None
    for position, name in enumerate(attribute_names):
        if name == region_attribute_name:
            attribute_key = position

    if attribute_key is None:
        # Parenthesised single-argument prints are valid on Python 2 and 3.
        print("Region attribute not found. Available attributes:")
        print("\t".join(str(name) for name in attribute_names))
        return None

    regions_to_points_dict = {}
    for record, shape in zip(records, shapes):
        regions_to_points_dict[record[attribute_key]] = shape.points
    return regions_to_points_dict
 



#################################################################################################
####### Code from: http://www.ariel.com.au/a/python-point-int-poly.html
####### Inputs:
####### 	- x value
#######		- y value
#######		- polygon: list of (x,y) pairs corresponding to the perimeter/corners of a polygon
####### Returns: true if x,y is in the specified polygon

def point_inside_polygon(x,y,poly):
    """Return True if the point (x, y) lies inside the polygon ``poly``.

    Inputs:
        - x, y: coordinates of the point to test
        - poly: sequence of (x, y) pairs describing the polygon's perimeter;
          the last vertex is joined back to the first automatically

    Returns:
        - True/False. An empty ``poly`` contains nothing, so the result is
          False (previously this raised IndexError).

    The function walks every edge and flips ``inside`` each time a horizontal
    ray from the point towards +x crosses that edge.
    """
    n = len(poly)
    if n == 0:
        # e.g. a null shape with no points: nothing can be inside it
        return False

    inside = False

    p1x, p1y = poly[0]
    for i in range(n + 1):
        # i % n wraps around so the closing edge (last -> first) is visited
        p2x, p2y = poly[i % n]
        # Half-open y-range test; it can only hold when p1y != p2y, so the
        # division below never sees a zero denominator.
        if min(p1y, p2y) < y <= max(p1y, p2y) and x <= max(p1x, p2x):
            # float() forces true division even for integer coordinates
            # under Python 2, where "/" would otherwise floor.
            xinters = (y - p1y) * (p2x - p1x) / float(p2y - p1y) + p1x
            if p1x == p2x or x <= xinters:
                inside = not inside
        p1x, p1y = p2x, p2y

    return inside




#################################################################################################
####### Inputs:
#######     - latitude
#######		- longitude 
#######     - regions_to_points_dict - keys are region names, values are lists of points outlining the region
#######								- see "generate_regions_to_points_dict" method to create one of these dicts
####### Returns:
#######		- region the latitude,longitude input falls inside 

def get_region_from_latlong_and_shapefile(latitude, longitude, regions_to_points_dict, print_errors = True):
    """Find the region whose outline contains the given coordinates.

    Inputs:
        - latitude, longitude: coordinates of the location to look up
        - regions_to_points_dict: keys are region names, values are lists of
          points outlining the region (see generate_regions_to_points_dict)
        - print_errors: when True, print a message if no region matches

    Returns:
        - the key of the first region (in dict iteration order) that contains
          the point, or None if no region contains it
    """
    for region, outline in regions_to_points_dict.items():
        # longitude is passed as the x value and latitude as the y value
        if point_inside_polygon(longitude, latitude, outline):
            return region
    if print_errors:
        # Parenthesised single-argument print: same output on Python 2,
        # and valid syntax on Python 3.
        print("region not found")
    return None



In [11]:
# shapefiles:
# Build one lookup dict per kind of region. Each call reads the shapefile at
# the given path (relative to the working directory) and keys the regions by
# the named attribute: 'NAME' for neighborhoods, 'SchoolDist' for school
# districts and 'Precinct' for precincts.
# NOTE(review): generate_regions_to_points_dict returns None when the
# attribute name is not found, which would only surface later when the dict
# is used -- check the printed output if a lookup below fails.

path_to_neighborhood_shape_file = 'ZillowNeighborhoods-NY/ZillowNeighborhoods-NY'
neighborhood_dict = generate_regions_to_points_dict(path_to_neighborhood_shape_file, 'NAME')

path_to_schooldistrict_shape_file = 'plswork/plswork'
school_district_dict = generate_regions_to_points_dict(path_to_schooldistrict_shape_file, 'SchoolDist')

path_to_precincts_shape_file = 'nypp_16b/nypd_pres'
precincts_dict = generate_regions_to_points_dict(path_to_precincts_shape_file, 'Precinct')


In [13]:
# Load the Zillow home listings.
df = pd.read_csv("mn_df.csv")

# Keep only rows where both coordinates are finite numbers
# (drops the rows without long/lat -- 5 rows).
df = df[np.isfinite(df['longitude']) & np.isfinite(df['latitude'])]

In [24]:
# Map every home to the neighborhood polygon that contains its coordinates.
# Homes that fall outside every polygon get None for now.
# (Built in one pass instead of the deprecated row-by-row DataFrame.set_value.)
neighborhood_labels = [
    get_region_from_latlong_and_shapefile(lat, lon, neighborhood_dict, print_errors=False)
    for lat, lon in zip(df["latitude"], df["longitude"])
]
# dtype=object keeps the None placeholders and the labels exactly as returned.
df["neighborhood"] = pd.Series(neighborhood_labels, index=df.index, dtype=object)

# fill missing values using sklearn's knn functions:
# each unmapped home takes the neighborhood of its single nearest mapped home.
is_mapped = df["neighborhood"].notnull()
known = df[is_mapped]
unknown = df[~is_mapped]

# Nothing to fill if every home was mapped; scikit-learn rejects an empty
# input array, so the fit/predict step is skipped in that case.
if len(unknown) > 0:
    train_x_matrix = known[["latitude", "longitude"]].values
    train_y_matrix = known["neighborhood"]
    test_x_matrix = unknown[["latitude", "longitude"]].values

    model = KNN(n_neighbors = 1)
    model.fit(train_x_matrix, train_y_matrix)
    preds = model.predict(test_x_matrix)

    # Write the predictions straight into df (not into the filtered copy).
    df.loc[~is_mapped, "neighborhood"] = preds

In [19]:
# Map every home to the school-district polygon that contains its coordinates.
# Homes that fall outside every polygon get None for now.
# (Built in one pass instead of the deprecated row-by-row DataFrame.set_value.)
district_labels = [
    get_region_from_latlong_and_shapefile(lat, lon, school_district_dict, print_errors=False)
    for lat, lon in zip(df["latitude"], df["longitude"])
]
# dtype=object keeps the None placeholders and the labels exactly as returned.
df["schooldistrict"] = pd.Series(district_labels, index=df.index, dtype=object)

# fill missing values using sklearn's knn functions
# NOTE: manhattan only has districts 1 through 6, so we're also re-mapping everything assigned to other districts
# A home counts as "known" only if it was mapped AND its district is below 7.
is_valid = df["schooldistrict"].notnull() & (df["schooldistrict"] < 7)
known = df[is_valid]
unknown = df[~is_valid]

# Nothing to fill if every home has a valid district; scikit-learn rejects an
# empty input array, so the fit/predict step is skipped in that case.
if len(unknown) > 0:
    train_x_matrix = known[["latitude", "longitude"]].values
    train_y_matrix = known["schooldistrict"].astype(int)
    test_x_matrix = unknown[["latitude", "longitude"]].values

    # 1-nearest-neighbor: copy the district of the closest validly mapped home
    model = KNN(n_neighbors = 1)
    model.fit(train_x_matrix, train_y_matrix)
    preds = model.predict(test_x_matrix)

    # Write the predictions straight into df (not into the filtered copy).
    df.loc[~is_valid, "schooldistrict"] = preds

In [25]:
# Map every home to the precinct polygon that contains its coordinates.
# Homes that fall outside every polygon get None; they are filled in by the
# nearest-neighbor step that follows.
# (Built in one pass instead of the deprecated row-by-row DataFrame.set_value.)
precinct_labels = [
    get_region_from_latlong_and_shapefile(lat, lon, precincts_dict, print_errors=False)
    for lat, lon in zip(df["latitude"], df["longitude"])
]
# dtype=object keeps the None placeholders and the labels exactly as returned.
df["precinct"] = pd.Series(precinct_labels, index=df.index, dtype=object)


In [26]:
# fill missing values using sklearn's knn functions
# NOTE: manhattan only has precincts 1-34, so we're also re-mapping everything assigned to other precincts
# A home counts as "known" only if it was mapped AND its precinct is below 35.
is_valid = df["precinct"].notnull() & (df["precinct"] < 35)
known = df[is_valid]
unknown = df[~is_valid]

# Nothing to fill if every home has a valid precinct; scikit-learn rejects an
# empty input array, so the fit/predict step is skipped in that case.
if len(unknown) > 0:
    train_x_matrix = known[["latitude", "longitude"]].values
    train_y_matrix = known["precinct"].astype(int)
    test_x_matrix = unknown[["latitude", "longitude"]].values

    # 1-nearest-neighbor: copy the precinct of the closest validly mapped home
    model = KNN(n_neighbors = 1)
    model.fit(train_x_matrix, train_y_matrix)
    preds = model.predict(test_x_matrix)

    # Write the predictions straight into df (not into the filtered copy).
    df.loc[~is_valid, "precinct"] = preds

##### Data

In [27]:
# Preview the first rows; the neighborhood, schooldistrict and precinct
# columns added above appear at the right-hand end of the frame.
df.head()

Unnamed: 0,tax_value,last_sold_price,property_size,zestimate_amount,bathrooms,zestimate_valuation_range_high,tax_year,zestimate_value_change,latitude,zestimate_percentile,...,graph_data_link,home_size,longitude,home_detail_link,home_type,map_this_home_link,Unnamed: 22,neighborhood,schooldistrict,precinct
0,,,,849427.0,1.0,1044795.0,,,40.724448,0.0,...,,400.0,-73.980284,http://www.zillow.com/homedetails/202-E-7th-St...,MultiFamily2To4,http://www.zillow.com/homes/2098748189_zpid/,,East Village,1,9
1,1642000.0,,2250.0,,2.0,,2015.0,,40.72442,0.0,...,http://www.zillow.com/homedetails/204-E-7th-St...,10800.0,-73.9802,http://www.zillow.com/homedetails/204-E-7th-St...,Apartment,http://www.zillow.com/homes/31495862_zpid/,,East Village,1,9
2,1674000.0,,2250.0,7332356.0,1.0,7992268.0,2015.0,,40.724385,0.0,...,http://www.zillow.com/homedetails/206-E-7th-St...,10800.0,-73.98012,http://www.zillow.com/homedetails/206-E-7th-St...,Cooperative,http://www.zillow.com/homes/31495863_zpid/,,East Village,1,9
3,,,,1991289.0,1.0,2708153.0,,,40.724332,0.0,...,,,-73.980007,http://www.zillow.com/homedetails/208-E-7th-St...,MultiFamily2To4,http://www.zillow.com/homes/2098632095_zpid/,,East Village,1,9
4,,,,,,,,,40.724845,0.0,...,,,-73.980804,http://www.zillow.com/homedetails/190-E-7th-St...,Unknown,http://www.zillow.com/homes/2131860614_zpid/,,East Village,1,9


In [28]:
#write data to file before we add demographic stuff
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column that shows up as "Unnamed: 0" when
# the file is read back -- confirm downstream readers expect that.
df.to_csv("mn_with_regions.csv")

In [29]:
# Reload the checkpoint file so the merge steps below can start from disk
# without re-running the region mapping.
df = pd.read_csv("mn_with_regions.csv")

<a id='merge'></a>
### Merge in data about precincts/school districts

Once each home was mapped (via its latitude/longitude coordinates) to a specific school district and precinct, we added statistics specific to each school district and precinct as additional predictors in our dataset.

#### Precincts:

We acquired data from the New York Police Department on crime rates, for a variety of different crimes, in each precinct.  We merged this data with our original data based on the precinct that each home had been mapped to, and wrote this new dataset to a CSV.

In [32]:
# Directory containing the crime CSV files that are merged into df below.
# The trailing slash matters: file names are appended to this string directly.
# NOTE(review): absolute, user-specific path -- must be edited to run this
# notebook on another machine.
path_to_crime_files = "/Users/nbw/Dropbox/CS_109_Project/crime_dfs/"

In [273]:
# List the entries of the crime-data directory.
# NOTE(review): `crime_files` is not referenced again in the visible cells;
# the merge loop below lists the directory itself.
crime_files = os.listdir(path_to_crime_files)

In [317]:
# Merge every crime CSV into df, one file at a time, keyed on precinct.
#
# Only entries ending in ".csv" are read, in sorted order. The previous
# version skipped the first directory entry by position ([1:]), which depends
# on the arbitrary order os.listdir returns.
# NOTE(review): presumably that skipped entry was a hidden non-CSV file such
# as .DS_Store -- confirm no CSV was being left out on purpose.
crime_filenames = sorted(
    name for name in os.listdir(path_to_crime_files) if name.endswith(".csv")
)

for filename in crime_filenames:
    # Crime name = file name up to the first "(" or the ".csv" suffix, trimmed.
    crime = filename.split("(")[0].split(".csv")[0].strip()
    crime_df = pd.read_csv(path_to_crime_files+filename)

    # Prefix the generic statistic columns with the crime name so columns from
    # different files stay distinguishable after merging.
    crime_df = crime_df.rename(columns={'last_3_years': crime+"_last_3_years", 'percent_change': crime+"_percent_change"})

    # Outer merge, then drop rows that came only from the crime file (they
    # have no precinct value on the homes side).
    merged = df.merge(crime_df, how="outer", left_on="precinct", right_on="PCT")
    merged = merged[merged["precinct"].notnull()]
    merged = merged.drop("PCT",axis=1)

    # The merge should neither add nor remove homes; warn if it did.
    if len(merged) != len(df):
        print("something weird! " + filename)
    df = merged

In [None]:
# Preview df after the crime merge loop above.
df.head()

In [281]:
# write a file containing just crime data
# NOTE(review): the recorded execution counts show this cell was last run
# before the merge loop above was last run; re-run the notebook top to bottom
# to be sure the file reflects the merged frame.
df.to_csv("mn_crimes_only.csv")

#### School Districts

We acquired data from the NYC Education Department on test scores for each school district.  As with our crime data, we merged this data with our original data based on the school district that each home had been mapped to, and wrote this new dataset to a CSV.

In [303]:
# Load the school test-score table.
# NOTE(review): absolute, user-specific path -- must be edited to run this
# notebook on another machine.
schools = pd.read_csv('/Users/nbw/Dropbox/CS_109_Project/test_scores/nyctestscores.csv')

In [304]:
# Remove the Grade, Year and Category columns from the test-score table.
# NOTE(review): `schools` is reassigned in place, so running this cell a second
# time without reloading the file fails because the columns are already gone.
schools = schools.drop(["Grade", "Year", "Category"],axis=1)

In [305]:
# Prefix every column of the test-score table with "schools_" so the columns
# can be told apart from the home and crime columns after merging.
# A single rename with a mapping built up front renames each column exactly
# once; renaming in place column by column could re-rename a column whose new
# name equals another, not-yet-visited column name.
schools = schools.rename(columns={colname: "schools_" + colname for colname in schools.columns})


In [315]:
# Attach the school test-score columns to each home via its school district.
# Outer merge, then drop rows that came only from the test-score table (they
# have no schooldistrict value on the homes side).
merged = df.merge(schools, how="outer", left_on="schooldistrict", right_on="schools_district")
merged = merged[merged["schooldistrict"].notnull()]
merged = merged.drop("schools_district",axis=1)

# The merge should neither add nor remove homes; warn if it did.
# The message names the test-score file explicitly: the previous version used
# `filename`, a variable left over from the crime loop, which is undefined
# when the "Precincts:" block has not been run.
if len(merged) != len(df):
    print("something weird! nyctestscores.csv")

df = merged

In [316]:
# if we want just schools, dont run "Precincts:" block
# write a file containing just school data
df.to_csv("mn_schools_only.csv")

In [318]:
# Write data
# Saves df as it stands after the merge cells above; the file contains both
# crime and school columns only if both merge blocks were run in this session.
df.to_csv("mn_crimes_and_schools.csv")