In [1]:
#CrossCompute

# Tool inputs for this notebook.
# NOTE(review): the "#CrossCompute" marker above presumably tells CrossCompute
# to expose the variables of this cell as tool arguments -- confirm before
# renaming or reformatting anything here.

# Folder that receives the generated files (results.csv is written there).
target_folder = '/tmp'

# Zipcode to analyze; 11419 is the default value. It is passed unchanged as
# the zipcode filter of both the complaint query and the tree census query.
ZipcodeInput = 11419

In [2]:
import subprocess
import sys

def install(package):
    """Install ``package`` with pip, using the interpreter running this code.

    Equivalent to ``pip install <package>``, but invoked as
    ``sys.executable -m pip`` so pip belongs to the same Python that executes
    the notebook.

    Parameters
    ----------
    package : str
        Requirement handed to pip as a single argument.

    Returns
    -------
    None
        The call stays best-effort (no exception is raised), but a non-zero
        pip exit status is reported on stderr instead of being dropped.
    """
    exit_code = subprocess.call([sys.executable, "-m", "pip", "install", package])
    if exit_code != 0:
        # Surface the failure here; otherwise the first visible symptom is an
        # ImportError in a later cell.
        print("WARNING: 'pip install %s' exited with status %s" % (package, exit_code),
              file=sys.stderr)

# Third-party packages installed on the fly:
#   sodapy - package for the NYC OpenData API
#   folium - package to generate the map
for package_name in ('sodapy', 'folium'):
    install(package_name)

In [3]:
import pandas as pd
from sodapy import Socrata # Used to access/ work with NYCOpenData API
import folium

In [4]:
#################################
# WORKING WITH CATCH BASIN DATA #
#################################


# Grabbing data from the API.
# Credentials are read from the environment instead of being committed with the
# notebook (the values that used to be hard-coded here should be rotated).
# All three variables are optional.
# NOTE(review): per the sodapy documentation a missing app token only means
# stricter throttling, and username/password must be given together -- confirm
# against the installed sodapy version.
import os

client = Socrata("data.cityofnewyork.us",
                 os.environ.get('SOCRATA_APP_TOKEN'),
                 username=os.environ.get('SOCRATA_USERNAME'),
                 password=os.environ.get('SOCRATA_PASSWORD'))

# Limit the data to clogged catch basin complaints in the requested zipcode
results = client.get("fhrw-4uyv",
                     incident_zip = ZipcodeInput,
                     complaint_type="Sewer",
                     descriptor = "Catch Basin Clogged/Flooding (Use Comments) (SC)",
                     limit=10000)

# Convert to pandas DataFrame
df_threeOneOneReq = pd.DataFrame.from_records(results)

# Keep only the location of these complaints
complaintLoc = df_threeOneOneReq[['latitude','longitude']]

In [5]:
#################################
# WORKING WITH TREE CENSUS DATA #
#################################


# Limit the data to trees that are ALIVE in the zipcode entered above
results = client.get("5rq2-4hqu",
                     zipcode = ZipcodeInput,
                     status = 'Alive',
                     limit=10000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

# Only keep the columns that are useful
results_df = results_df[['tree_dbh', 'health','status','latitude','longitude','spc_latin']]

# Replace the health words with numbers so that a 'grade' can be computed for each tree
results_df = results_df.replace(['Poor','Fair','Good'],[0,50,100])

# 'tree_dbh' arrives as an object column; make it numeric so it can be added to 'health'
results_df['tree_dbh'] = pd.to_numeric(results_df['tree_dbh'])

# Anywhere there is a NaN, make it a zero (this applies to every column,
# including a missing species name)
results_df = results_df.fillna(0)

# Lookup table listing each species and its type
df = pd.read_csv('Species_Types.csv')
df = df.set_index('Species')

# Decide whether each tree is deciduous, coniferous, etc.
# A dict-based map is used instead of df.loc[<list of labels>]: .loc raises
# KeyError as soon as one species is absent from the lookup table (and the
# zero-fill above guarantees an unknown label 0 for trees without a species),
# whereas map simply yields NaN for those rows.
results_df['Type'] = results_df['spc_latin'].map(df['Type'].to_dict())

# Replace the type words with numbers: 1 for deciduous, 0 for everything else
results_df = results_df.replace(['deciduous','coniferous','evergreen','both'],[1,0,0,0])

# Final grade = weight of each tree on the heat map
results_df['Final Grade'] = ((results_df.tree_dbh + results_df.health)/100)*results_df.Type

# Remove all the trees that don't lose leaves.
# NOTE(review): rows whose species was not found have Type NaN at this point,
# so they survive the filter and are zero-filled on the next line -- confirm
# whether such trees should be dropped instead.
results_df = results_df[results_df.Type != 0]
results_df = results_df.fillna(0)

In [6]:
# Location of every remaining tree, as an independent frame without rows that
# lack a latitude or a longitude
treesLoc = (
    results_df[['latitude', 'longitude']]
    .dropna(subset=['latitude', 'longitude'])
    .copy()
)

In [7]:
# Location of every complaint, as an independent frame without rows that lack
# a latitude or a longitude
df_threeOneOneReq_LOC = (
    df_threeOneOneReq[['latitude', 'longitude']]
    .dropna(subset=['latitude', 'longitude'])
    .copy()
)

In [9]:
####################################
#   GETTING COMPLAINT COUNTS       #                            
#   WITHIN A 100 METER RADIUS      #
#         OF EACH TREE             #
####################################

import numpy as np
from pysal.cg.kdtree import KDTree
from pysal.cg import RADIUS_EARTH_MILES

# Complaint coordinates as a float array, one (latitude, longitude) row per complaint.
# The builtin float replaces np.float, an alias that newer NumPy releases no
# longer provide; the resulting dtype (float64) is the same.
complaints_xys = df_threeOneOneReq_LOC[['latitude', 'longitude']].astype(float).values

# Spatial index over the complaints, using arc distances scaled by RADIUS_EARTH_MILES.
# NOTE(review): pysal's arc-distance helpers are documented for (lng, lat)
# points, while the rows here are (latitude, longitude) -- confirm. Any change
# of order must also be applied to every point passed to complaints_tree.query.
complaints_tree = KDTree(complaints_xys, distance_metric='Arc', radius=RADIUS_EARTH_MILES)

In [10]:
# Number of complaint points held by the tree; later passed as `k` so that a
# query can return every complaint
complaints_count = complaints_xys.shape[0]
complaints_count

617

In [11]:
# Spot check at one hand-picked coordinate: ask for up to every complaint
# (k=complaints_count) but stop at a distance of 0.5 -- presumably miles, as
# the tree was built with RADIUS_EARTH_MILES.
xy = (40.682460735128025, -73.8300148272251)
distances, indices = complaints_tree.query(
    xy,
    k=complaints_count,
    distance_upper_bound=0.5,
)

In [12]:
# Inspecting the probe result. Entries of `indices` that are NaN are filtered
# out (presumably the slots the query could not fill within the distance cap).
# Only the last expression is displayed: how many entries remain.
indices

indices[~np.isnan(indices)]

int((~np.isnan(indices)).sum())


314

In [13]:
# Search radius around each tree, in miles.
# NOTE(review): 0.0497097 miles is about 80 meters (0.0497097 * 1609.344),
# not the ~100 meters stated elsewhere in this notebook; 100 meters would be
# 0.0621371 miles -- confirm which radius is intended.
radius_in_miles = 0.0497097


def get_complaint_count(r):
    """Count the complaints found within ``radius_in_miles`` of one tree.

    Parameters
    ----------
    r : mapping or pandas.Series
        Row providing ``'latitude'`` and ``'longitude'``.

    Returns
    -------
    int
        Number of non-NaN indices returned by ``complaints_tree.query`` when
        asked for up to ``complaints_count`` neighbors inside the radius.
    """
    tree_position = (r['latitude'], r['longitude'])
    _, neighbor_indices = complaints_tree.query(
        tree_position,
        k=complaints_count,
        distance_upper_bound=radius_in_miles,
    )
    # NaN entries are discarded; whatever remains is counted
    found = ~np.isnan(neighbor_indices)
    return int(found.sum())

# Apply the function to each tree: make the coordinates numeric first, then
# run one query per row.
# NOTE(review): the column label says "0.5 miles", but the count uses
# radius_in_miles; the label is kept because a later cell reads it by name.
treesLoc = treesLoc.apply(pd.to_numeric)
treesLoc = treesLoc.assign(**{
    '# of Complaints within 0.5 miles': treesLoc.apply(get_complaint_count, axis=1),
})

In [14]:
# Copy the per-tree complaint counts into results_df (rows are matched on the index)
results_df = results_df.assign(
    complaints=treesLoc['# of Complaints within 0.5 miles'],
)

In [15]:
# Preview the first rows only instead of rendering the whole frame
results_df.head(10)

Unnamed: 0,tree_dbh,health,status,latitude,longitude,spc_latin,Type,Final Grade,complaints
0,13,100,Alive,40.69199129,-73.82114902,Metasequoia glyptostroboides,1,1.13,10
1,4,50,Alive,40.69161541,-73.82096547,Metasequoia glyptostroboides,1,0.54,7
2,3,100,Alive,40.69153616,-73.82092678,Ulmus americana,1,1.03,7
3,15,100,Alive,40.69210017,-73.82832551,Gleditsia triacanthos var. inermis,1,1.15,8
4,18,50,Alive,40.69166655,-73.829862,Gleditsia triacanthos var. inermis,1,0.68,11
5,12,100,Alive,40.6921299,-73.82821961,Gleditsia triacanthos var. inermis,1,1.12,8
6,24,100,Alive,40.69382431,-73.82753579,Quercus palustris,1,1.24,4
7,21,50,Alive,40.69385554,-73.82742485,Quercus palustris,1,0.71,2
8,4,100,Alive,40.69394083,-73.8271218,Quercus palustris,1,1.04,2
9,26,100,Alive,40.69379328,-73.82764605,Quercus palustris,1,1.26,4


In [16]:
# Writes the per-tree table to <target_folder>/results.csv and prints where it
# was saved, so the table can be shown in the final tool result.
# It will most likely not be needed, because the result is presented as a map.

from os.path import join
target_path = join(target_folder, 'results.csv')
results_df.to_csv(path_or_buf=target_path, index=False)
print('result_table_path = %s' % (target_path,))

result_table_path = /tmp/results.csv


In [17]:
#################################
#     Generating a Heatmap      #
#################################


from folium import plugins
from folium.plugins import HeatMap

# Center the map on the first tree in the frame.
# float() converts the coordinate at full double precision; the former
# float32 downcast was lossy.
starting_Lat = float(results_df.iloc[0]['latitude'])
starting_Long = float(results_df.iloc[0]['longitude'])

# Create the map centered at that point, black and white, zoomed in.
# NOTE(review): recent folium releases reportedly no longer bundle the Stamen
# tile sets -- confirm "Stamen Toner" still resolves with the installed version.
map_hooray = folium.Map(location=[starting_Lat, starting_Long],
                    tiles = "Stamen Toner",
                    zoom_start = 14.5)

# Float copies of the values handed to the heat map
results_df['Latitude'] = results_df['latitude'].astype(float)
results_df['Longitude'] = results_df['longitude'].astype(float)
results_df['Final_Grade'] = results_df['Final Grade'].astype(float)

results_df = results_df.fillna(0)

# What goes onto the map: [latitude, longitude, weight] for every tree.
# Built in one vectorized step from the float columns created above, instead
# of looping with iterrows over the unconverted 'Final Grade' column.
heat_data = results_df[['Latitude', 'Longitude', 'Final_Grade']].values.tolist()

# Plot it on the map
HeatMap(heat_data,
        min_opacity = 0.01,
        max_val = 1.5,
        blur = 20,
       ).add_to(map_hooray)

# Allow the map to go fullscreen
folium.plugins.Fullscreen(position='topright',
                          title='Full Screen',
                          title_cancel='Exit Full Screen',
                          force_separate_button=True
                         ).add_to(map_hooray)

# Display the map
map_hooray

In [18]:
#################################
#       Training a Model        #
#################################


In [19]:
# Features: tree_dbh, the numeric health score and the Type flag.
# Target: the complaint count attached to each tree.
x = results_df.loc[:, ['tree_dbh', 'health', 'Type']]
y = results_df.loc[:, 'complaints']

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Ordinary least squares fitted on all rows; fit() returns the estimator
# itself, so the fitted model is what the cell displays
model1 = LinearRegression().fit(x, y)
model1

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [21]:
# 3-fold cross-validation; each score is a negated mean absolute error
cross_val_score(
    estimator=model1,
    X=x,
    y=y,
    cv=3,
    scoring='neg_mean_absolute_error',
)

array([-6.66022948, -6.8027934 , -5.40649704])

In [22]:
# One hypothetical tree, in the feature order used for training:
# tree_dbh=19, health=50, Type=1
q = [19, 50, 1]
model1.predict(X=[q])

array([11.25407455])

In [23]:
from sklearn.linear_model import BayesianRidge
# Bayesian ridge regression fitted on the same data; fit() returns the
# estimator itself, so the fitted model is what the cell displays
model2 = BayesianRidge().fit(x, y)
model2

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)

In [24]:
# Mean of the 3-fold negated mean absolute error scores
cross_val_score(
    estimator=model2,
    X=x,
    y=y,
    cv=3,
    scoring='neg_mean_absolute_error',
).mean()

-6.287310061959086

In [25]:
# Prediction of the second model for the same hypothetical tree `q`
model2.predict(X=[q])

array([11.17812326])