In [1]:
################################################################################
# File name:    "soil_quality_matching.ipynb"
#
# Project title:    Boston Affordable Housing project (visiting scholar project)
#
# Description:    This file takes the soil quality data .shp file created by 
#                 Jordan and matches onto it our dataset of mapc warren group
#                 properties.
#
# Inputs:    ./Soil_Parcel_Data_Shape.shp
#            ./warren_MAPC_all_unique.dta
#
# Outputs:    soil_quality_matches.dta
#             soil_quality_matches_log.txt
#
# Created:    09/23/2022
# Updated:    09/23/2022
#
# Author:    Nicholas Chiumenti
################################################################################

In [2]:
# 1. load in soil quality shape file
# 2. load in warren property points
# 3. define both as a geopandas data frame with geometry variable
# 4. match a sample of 10,000
# 5. try to figure out if there are any duplicates

# https://pandas.pydata.org/docs/
# https://geopandas.org/en/stable/docs/user_guide.html

In [1]:
import datetime
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
## load the soil quality polygons and the warren group address points
# absolute locations of the two input files
shape_path = "/home/a1nfc04/Documents/boston_zoning_sdrive/data/shapefiles/soil_quality/Soil_Parcel_Data_Shape.shp"
data_path = "/home/a1nfc04/Documents/boston_zoning_sdrive/data/warren/warren_MAPC_all_unique.dta"

# read the soil quality polygons; the row count is pinned so that a changed
# input file fails here rather than further down
shape_gdf = gpd.read_file(shape_path)
assert len(shape_gdf) == 1871216 # confirm num of shapes
print(f"total shapes in shape file {len(shape_gdf)}")

## read the warren group address points, keeping only the id, cousub name
## and coordinate columns
point_cols = ["prop_id", "cousub_name", "warren_latitude", "warren_longitude"]
data_df = pd.read_stata(data_path)[point_cols]

# build one point per row (x = longitude, y = latitude) and attach it as the
# geometry of a geo dataframe
address_points = gpd.points_from_xy(data_df["warren_longitude"], data_df["warren_latitude"])
data_gdf = gpd.GeoDataFrame(data_df, geometry=address_points)

# declare the raw coordinates as EPSG:4269, then reproject them to EPSG:26986
data_gdf = data_gdf.set_crs("EPSG:4269").to_crs("EPSG:26986")

assert len(data_gdf) == 821237 # confirm num of address points
print(f"total address points in file {len(data_gdf)}")

# both layers must share a coordinate reference system before they are
# compared spatially
assert shape_gdf.crs == data_gdf.crs

total shapes in shape file 1871216
total address points in file 821237


In [5]:
## drop duplicate polygons
# Check that the number of distinct LOC_ID values equals the number of
# distinct (LOC_ID, soil attribute) groups, i.e. the soil attributes do not
# vary between rows that share a LOC_ID.
soil_cols = ["AVG_SLOPE", "SLOPE_15", "AVG_RESTRI", "AVG_SAND", "AVG_CLAY"]
n_loc_ids = len(shape_gdf["LOC_ID"].unique())
n_loc_soil_groups = len(shape_gdf.groupby(["LOC_ID"] + soil_cols))
assert n_loc_ids == n_loc_soil_groups

# keep the first row encountered for each LOC_ID
shape_gdf.drop_duplicates(subset="LOC_ID", inplace=True)

assert len(shape_gdf) == 1557088 # confirm num of observations

In [6]:
## spatial join: attach to each address point the soil polygon it falls within
# how="left" keeps every address point whether or not it matched a polygon.
# `predicate` replaces the `op` keyword, which geopandas deprecated and has
# since removed.
matches_gdf = gpd.sjoin(data_gdf, shape_gdf, how="left", predicate="within")

assert len(matches_gdf) == 821397 # check before duplicates drop

# A point that matched more than one polygon appears once per match. Keep one
# row per prop_id: the match with the largest total_area. Rows are sorted
# ascending with missing areas first, so keep="last" never prefers a missing
# total_area over a real one.
matches_gdf = (
    matches_gdf
    .sort_values("total_area", na_position="first")
    .drop_duplicates(subset="prop_id", keep="last")
)

assert len(matches_gdf) == 821237 # check after duplicates drop

In [7]:
## export the matches as a Stata .dta file and append a line to the run log
# set paths
save_path = "/home/a1nfc04/Documents/boston_zoning_sdrive/data/shapefiles/soil_quality/soil_quality_matches.dta"
log_path = "/home/a1nfc04/Documents/boston_zoning_sdrive/python_programs/soil_quality_data/soil_quality_matches_log.txt"

# columns to write, in output order (the geometry column is not exported)
export_cols = [
    "prop_id", "cousub_name", "warren_latitude", "warren_longitude",
    "LOC_ID", "AVG_SLOPE", "SLOPE_15", "AVG_RESTRI",
    "AVG_SAND", "AVG_CLAY", "total_area",
]

# the upper-case soil columns are written under lower-case names
lowercase_names = {
    "LOC_ID": "loc_id", "AVG_SLOPE": "avg_slope",
    "SLOPE_15": "slope_15", "AVG_RESTRI": "avg_restri",
    "AVG_SAND": "avg_sand", "AVG_CLAY": "avg_clay",
}

# trim down variables; both the column selection and rename return new
# frames, so matches_gdf itself is left untouched
save_gdf = matches_gdf[export_cols].rename(columns=lowercase_names)

# save .dta file to S drive location
save_gdf.to_stata(save_path, write_index=False)

# save a log .txt file to the S drive
# %m/%d/%y is the portable spelling of %D, which is not among the format
# codes Python documents and depends on the platform's C library
date = datetime.datetime.now().strftime('%m/%d/%y at %I:%M:%S %p')

# append, so earlier runs stay in the log
with open(log_path, 'a') as file:
    file.write(f"Finish running on {date}: {len(save_gdf):,} observations written to '{save_path}'.\n")

# Done! (report the frame that was actually written)
print(f"Done! {len(save_gdf):,} observations written")

Done! 821,237 observations written
