In [145]:
import pandas as pd
import geopandas as gpd
import json
from shapely.geometry import Point
import shapely.ops as ops
from shapely.geometry import shape
import matplotlib.pyplot as plt
from difflib import get_close_matches
import pyproj
from functools import partial

In [177]:
def make_points(df):
    """
    Build a shapely ``Point`` for every row from its ``longitude`` and ``latitude`` columns.

    The points (x = longitude, y = latitude) are stored in a new ``geometry`` column that is added to ``df`` in
    place; the same dataframe object is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``longitude`` and ``latitude`` columns.

    Returns
    -------
    pandas.DataFrame
        The input ``df`` itself, now carrying a ``geometry`` column.

    Raises
    ------
    KeyError
        If either coordinate column is missing.
    """
    # Walk both coordinate columns in lockstep rather than building a full row Series with ``df.iloc[i]`` twice
    # per row; the resulting points are the same, in the same order.
    df['geometry'] = [Point(lon, lat) for lon, lat in zip(df["longitude"], df["latitude"])]
    return df
def cleaning_togeo(df):
    """
    Turn the raw listings dataframe into a GeoDataFrame of entire-home listings with numeric prices.

    Steps, in order:
      1. ``make_points`` adds a ``geometry`` column (this also modifies the caller's ``df``).
      2. The frame is wrapped in a ``GeoDataFrame``.
      3. Only rows whose ``room_type`` is exactly "Entire home/apt" are kept.
      4. "$" and "," are stripped from the ``price`` strings, which are then cast to float.
      5. ``monthly`` is added as ``price * 30``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``longitude``, ``latitude``, ``room_type`` and a string-typed ``price`` column.

    Returns
    -------
    geopandas.GeoDataFrame
        A new, filtered frame; apart from the ``geometry`` column added in step 1 the input is left untouched.
    """
    df = make_points(df)
    df = gpd.GeoDataFrame(df)
    # Copy the filtered rows so the column assignments below write to an independent frame instead of a slice
    # of the original (which triggers SettingWithCopyWarning).
    df = df[df.room_type == "Entire home/apt"].copy()
    # regex=False: "$" must be removed as a literal character, not interpreted as the end-of-string anchor.
    df['price'] = (
        df['price']
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )
    df['monthly'] = df['price'] * 30
    return df
def nhood_matching(nhood_geo, permits_geo,neighbourhood_column):
    """
    Build a lookup from the neighbourhood spellings used in ``permits_geo`` to the spellings used in ``nhood_geo``.

    For every name in ``nhood_geo['neighbourhood']`` the single closest string among the unique values of
    ``permits_geo[neighbourhood_column]`` is looked up with difflib's ``get_close_matches`` (default cutoff).
    When a match is found, an entry ``{matched permits name: nhood_geo name}`` is recorded.

    Parameters
    ----------
    nhood_geo : DataFrame
        Reference frame; must have a ``neighbourhood`` column.
    permits_geo : DataFrame
        Frame whose neighbourhood names should be translated.
    neighbourhood_column : str
        Name of the neighbourhood column in ``permits_geo``.

    Returns
    -------
    dict
        Keys are names as they appear in ``permits_geo``; values are names as they appear in ``nhood_geo``.
        If several reference names pick the same permits name, the one that comes last in ``nhood_geo`` wins.
        Names without a close match are absent from the dict.
    """
    # get_close_matches raises TypeError on NaN/None, so keep real strings only on both sides.
    capital_hoods = [
        name for name in permits_geo[neighbourhood_column].dropna().unique() if isinstance(name, str)
    ]
    nhood_dict = {}
    for reference_name in nhood_geo['neighbourhood'].dropna():
        if not isinstance(reference_name, str):
            continue
        closest_match = get_close_matches(reference_name, capital_hoods, n = 1)
        if closest_match:
            nhood_dict[closest_match[0]] = reference_name
    return nhood_dict

def area_in_km(geometry):
    """
    Return the area of a longitude/latitude (EPSG:4326) geometry in square kilometres.

    The geometry is re-projected to an Albers equal-area projection whose standard parallels are the geometry's
    own southern and northern bounds, so the projected area is in square metres; it is then divided by 1e6.

    Parameters
    ----------
    geometry : shapely geometry or GeoJSON-like mapping
        Anything accepted by ``shapely.geometry.shape``, with coordinates in (longitude, latitude) order.

    Returns
    -------
    float
        Area in square kilometres.
    """
    s = shape(geometry)
    # bounds is (minx, miny, maxx, maxy): indices 1 and 3 are the southern and northern latitudes.
    south, north = s.bounds[1], s.bounds[3]
    # The PROJ parameters for the standard parallels are spelled ``lat_1``/``lat_2``. The previous ``lat1``/``lat2``
    # spelling is not a recognised parameter, so the parallels were never actually applied.
    if hasattr(pyproj, 'Transformer'):
        # pyproj 2+: Transformer replaces the deprecated ``pyproj.transform`` / ``Proj(init=...)`` API.
        # always_xy=True keeps coordinates in (lon, lat) order, matching the data.
        project = pyproj.Transformer.from_crs(
            'EPSG:4326',
            '+proj=aea +lat_1={} +lat_2={} +datum=WGS84 +units=m +no_defs'.format(south, north),
            always_xy=True,
        ).transform
    else:
        # Legacy pyproj 1.x API.
        project = partial(
            pyproj.transform,
            pyproj.Proj(init='EPSG:4326'),
            pyproj.Proj(proj='aea', lat_1=south, lat_2=north),
        )
    area = ops.transform(project, s)
    sq_km = area.area / 1000000
    return sq_km

def low_risk_high_risk(zhvi):
    """
    Classify a neighbourhood as "Low" or "High" risk from its Zhvi value.

    Half of the Zhvi is treated as home equity; the label is "Low" when 75% of that equity is at least 150000,
    and "High" otherwise.
    """
    home_equity = zhvi / 2
    clears_threshold = home_equity * .75 >= 150000
    return "Low" if clears_threshold else "High"


In [176]:
# Load the four inputs: Airbnb listings, neighbourhood geo data, Zillow neighbourhood home-price data and the
# Portland permit data.

# Airbnb listings, trimmed to the columns kept for this notebook.
portland = pd.read_csv("portland_listings.csv.gz")
portland = portland[[
    'name',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'bedrooms',
    'bathrooms',
    'neighbourhood',
    'neighbourhood_cleansed',
    'price',
]]

# Neighbourhood polygons and the Zillow summary table are read from local files.
portland_hoods = gpd.read_file("portland_neighbourhoods.geojson")
zillow = pd.read_csv("Neighborhood_Zhvi_Summary_AllHomes.csv")

# The permit layer is fetched from the open-data URL, so this line needs network access.
link = "https://opendata.arcgis.com/datasets/797a38ddb60a471392ea9134448ad6c2_89.geojson"
portland_permits = gpd.read_file(link)

# Keep only ADU permits that have a neighbourhood, lower-case the permit neighbourhood names, and add a
# 'neighbourhood' column holding the matching portland_hoods spelling.

# NOTE(review): assumes IS_ADU is stored as the text 'True' rather than a boolean - confirm against the source data.
is_adu = portland_permits['IS_ADU'] == 'True'
has_nbrhood = portland_permits['NBRHOOD'].notnull()
# Filter once and copy, so the column assignments below write to an independent frame rather than to a slice
# of the full permit table (which raises SettingWithCopyWarning).
portland_permits = portland_permits[is_adu & has_nbrhood].copy()
portland_permits['NBRHOOD'] = portland_permits['NBRHOOD'].str.lower()
nhood_dict = nhood_matching(portland_hoods, portland_permits,'NBRHOOD')
# Permit names without a close match in portland_hoods map to NaN here.
portland_permits['neighbourhood'] = portland_permits['NBRHOOD'].map(nhood_dict)

# Take the Multnomah County rows of the Zillow table, translate their region names to the portland_hoods
# spelling, left-merge Zhvi / YoY onto the neighbourhood geodataframe and fill missing values with the median.

# Select rows and columns in one .loc call and rename on the result: this yields a fresh frame, so nothing is
# modified in place on a slice of `zillow` (the source of the SettingWithCopyWarning).
multonomah = (
    zillow.loc[zillow.County == 'Multnomah County', ['RegionName', 'Zhvi', 'YoY']]
    .rename(columns = {'RegionName':'neighbourhood'})
)
zillow_dict = nhood_matching(portland_hoods,multonomah,'neighbourhood')
# Zillow regions without a close match become NaN and therefore never join to a neighbourhood below.
multonomah['neighbourhood'] = multonomah['neighbourhood'].map(zillow_dict)
portland_hoods = portland_hoods.merge(multonomah, how='left', on='neighbourhood')
# Assign the filled column back: `df[col].fillna(..., inplace=True)` acts on a temporary Series and does not
# reliably update the frame (it is a no-op under pandas copy-on-write).
portland_hoods['Zhvi'] = portland_hoods['Zhvi'].fillna(portland_hoods['Zhvi'].median())
portland_hoods['YoY'] = portland_hoods['YoY'].fillna(portland_hoods['YoY'].median())

# Adds per-neighbourhood columns to the portland_hoods geodataframe from the Airbnb listings and the ADU permits:
# adu_count, airbnb_count, mean_rental_price, total_count, sq_km, per_sq_km, plus risk_level derived from Zhvi.
# Neighbourhoods with no value for a count / price column are filled with that column's median.

portland_geopd = cleaning_togeo(portland)
# NOTE(review): listings are grouped on their 'neighbourhood' column; confirm its spellings match
# portland_hoods['neighbourhood'], otherwise those neighbourhoods fall through to the median fill below.
portland_property = dict(portland_geopd.groupby('neighbourhood')['name'].count())
rental_price = dict(portland_geopd.groupby('neighbourhood')['monthly'].mean())
adu_count = dict(portland_permits.groupby('neighbourhood')['IS_ADU'].count())

# Each fill is assigned back to the column: `df[col].fillna(..., inplace=True)` acts on a temporary Series and
# does not reliably update the frame (it is a no-op under pandas copy-on-write).
portland_hoods['adu_count'] = portland_hoods['neighbourhood'].map(adu_count)
portland_hoods['adu_count'] = portland_hoods['adu_count'].fillna(portland_hoods['adu_count'].median())

portland_hoods['airbnb_count'] = portland_hoods['neighbourhood'].map(portland_property)
portland_hoods['airbnb_count'] = portland_hoods['airbnb_count'].fillna(portland_hoods['airbnb_count'].median())

portland_hoods['mean_rental_price'] = portland_hoods['neighbourhood'].map(rental_price)
portland_hoods['mean_rental_price'] = portland_hoods['mean_rental_price'].fillna(
    portland_hoods['mean_rental_price'].median()
)

# Competing properties (ADU permits + Airbnb listings) per square kilometre of neighbourhood area.
portland_hoods['total_count'] = portland_hoods['adu_count'] + portland_hoods['airbnb_count']
portland_hoods['sq_km'] = portland_hoods['geometry'].apply(area_in_km)
portland_hoods['per_sq_km'] = portland_hoods['total_count']/portland_hoods['sq_km']

portland_hoods['risk_level'] = portland_hoods['Zhvi'].apply(low_risk_high_risk)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [178]:
portland_hoods.head()

Unnamed: 0,neighbourhood,neighbourhood_group,geometry,Zhvi,YoY,adu_count,airbnb_count,mean_rental_price,total_count,sq_km,per_sq_km,risk_level
0,Sullivan's Gulch,,"(POLYGON Z ((-122.650474 45.535038 0, -122.641...",402300.0,-0.069396,12.0,23.0,3676.956522,35.0,0.710155,49.285036,Low
1,Hollywood,,"(POLYGON Z ((-122.624998 45.537777 0, -122.614...",324600.0,0.035077,5.0,6.0,3585.0,11.0,0.46434,23.689545,High
2,Hillside,,"(POLYGON Z ((-122.724325 45.526045 0, -122.724...",892200.0,-0.027575,6.0,20.0,4476.0,26.0,1.546665,16.81036,Low
3,Sylvan-Highlands,,"(POLYGON Z ((-122.722405 45.51469 0, -122.7224...",668500.0,-0.004023,2.0,6.0,6325.0,8.0,1.177183,6.795885,Low
4,Arbor Lodge,,"(POLYGON Z ((-122.678576 45.577207 0, -122.678...",437600.0,-0.025824,77.0,74.0,3735.0,151.0,2.247989,67.171154,Low


In [156]:
# Plots the neighborhoods axis and then, in this case, ADU permits and AirBnbs. Changed around to make all the plots.
%matplotlib qt 
base_pdx = portland_hoods.plot(column = 'risk_level',edgecolor = 'black',legend=True)
portland_geopd.plot(ax=base_pdx,marker = '.',color='red',markersize = 5, alpha = 0.5, label='AirBnBs')
portland_permits.plot(ax=base_pdx, marker = '.', color = 'black', markersize = 5, alpha = 0.5, label = "ADUs")
plt.legend()

In [179]:
# Recommended neighbourhoods: take the low-risk neighbourhoods, sort them by competing properties per sq km
# (fewest first) and keep the first 35. These are drawn in yellow over a blank outline map of all
# neighbourhoods. The ranked list of neighbourhoods is shown in the next cell.
first_recommendation = (
    portland_hoods.loc[portland_hoods['risk_level'] == 'Low']
    .sort_values('per_sq_km', ascending=True)
    .head(35)
)
first_rec = portland_hoods.plot(edgecolor='black', color='white', legend=True)
first_recommendation.plot(ax=first_rec, color='Yellow', legend=True)

<matplotlib.axes._subplots.AxesSubplot at 0x187dee10ba8>

In [180]:
# Ranked list of the recommended neighbourhoods with their competing-property density.
first_recommendation.loc[:, ['neighbourhood', 'per_sq_km']]

Unnamed: 0,neighbourhood,per_sq_km
78,Pleasant Valley,1.597752
15,Forest Park,2.683396
96,Arnold Creek,2.818522
97,Far Southwest,6.485368
3,Sylvan-Highlands,6.795885
19,Sunderland,6.904575
91,Marshall Park,6.954843
75,Bridlemile,7.827957
13,Linnton,7.916831
36,Parkrose Heights,8.722397
