### Introduction

In [16]:
# This notebook represents the final tree DBSCAN clustering used for presentation
# Data was used in conjunction with NYC Housing Surveys to draw parallels with resident opinions on their housing and tree density/populations
# As the tree dataset is too large for Github, it can be found here:

## Tree data:
# tree_census_2015.csv = https://data.cityofnewyork.us/api/views/uvpi-gqnh/rows.csv?accessType=DOWNLOAD

## Tree Clustering

#### Import Data, convert coordinates

In [17]:
import pandas as pd
import geopandas as gpd

# cwd = os.getcwd()  # Get the current working directory (cwd)
# files = os.listdir(cwd)  # Get all the files in that directory
# print("Files in %r: %s" % (cwd, files))

# https://cityofnewyork.github.io/opendatatsm/citystandards.html states all geospatial data is in EPSG:3857.


## Tree data:
# tree_census_2015.csv = https://data.cityofnewyork.us/api/views/uvpi-gqnh/rows.csv?accessType=DOWNLOAD
# # save this file where the workbook is

df = pd.read_csv('C:/Users/jrilk/OneDrive/LHL/major-projects/Midterm/mid-term-project-II-main/data/tree_census_2015.csv', sep=',') # change to your directory where the csv is located
df_trees = df

gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df["longitude"], df["Latitude"], crs="EPSG:3857"))

df["spc_common"].value_counts(dropna=False).head(20).index.values

array(['London planetree', 'honeylocust', 'Callery pear', 'pin oak',
       'Norway maple', nan, 'littleleaf linden', 'cherry',
       'Japanese zelkova', 'ginkgo', 'Sophora', 'red maple', 'green ash',
       'American linden', 'silver maple', 'sweetgum', 'northern red oak',
       'silver linden', 'American elm', 'maple'], dtype=object)

In [18]:
gdf[["longitude", "Latitude", "geometry"]].loc[gdf["spc_common"] == "Norway maple"]

Unnamed: 0,longitude,Latitude,geometry
326,-73.893711,40.847658,POINT (-73.894 40.848)
328,-73.928171,40.832306,POINT (-73.928 40.832)
330,-73.904082,40.708712,POINT (-73.904 40.709)
331,-73.881538,40.875677,POINT (-73.882 40.876)
332,-73.978833,40.674917,POINT (-73.979 40.675)
...,...,...,...
680252,-73.979225,40.675069,POINT (-73.979 40.675)
680253,-73.995278,40.687196,POINT (-73.995 40.687)
680254,-73.887025,40.850751,POINT (-73.887 40.851)
680255,-73.740067,40.601315,POINT (-73.740 40.601)


In [19]:
# pulls in GeoJSON of subboroughs to be mapped into individual tree locations

 # change to your directory where the geojson is located
nyc = gpd.GeoDataFrame.from_file("C:/Users/jrilk/OneDrive/LHL/major-projects/Midterm/mid-term-project-II-main/data/old_PUMA_or_Subborough.geo.json") 
nyc = nyc.to_crs({'proj': 'cea'})
nyc_area = nyc.area.sum()/10**6
nyc_area  # https://www.google.com/search?q=nyc+area agrees to publized area approx.

780.9194543794008

#### DBSCAN Clustering for trees

In [20]:
# https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS

from __future__ import annotations

import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np


def cluster_DBSCAN(
    radius_km: float,
    min_samples_pct: float,
    plot: bool,
    species: str | None = None,
) -> pd.DataFrame:
    trees_df = gdf.loc[:, ["longitude", "Latitude", "geometry"]]

    if species:
        trees_df = trees_df.loc[gdf["spc_common"] == species]

    # All distance calculations are done in radians because a degree represents a different physical distance depending on where on earth you are.
    coords = np.radians(trees_df[["longitude", "Latitude"]].values)
    kms_per_radian = 6371.0088  # physical constant
    epsilon = radius_km / kms_per_radian
    min_samples = int(len(trees_df) * min_samples_pct)

    cluster = DBSCAN(
        eps=epsilon,
        metric="haversine",
        min_samples=min_samples,
        n_jobs=-1,
    ).fit(coords)

    cluster_labels = cluster.labels_
    print(f"min_samples: {min_samples}")
    print(f"clusters: {len(set(cluster_labels))}")
    print(f"trees: {len(cluster_labels)}")

    trees_df["cluster"] = cluster_labels

    if plot:
        fig, ax = plt.subplots(1, figsize=(20, 15))
        trees_df.plot(
            categorical=True,
            legend=True,
            column="cluster",
            ax=ax,
            markersize=0.1,
            alpha=0.8,
            cmap="turbo",
        )
        ax.axis("off")
        ax.set_title(f"{species} clusters", fontsize=20)
        plt.show()
    
    return trees_df

# function for OPTICS clustering but was not used.
def cluster_OPTICS(species: str, plot: bool, min_samples: float):
    trees_df = gdf[["longitude", "Latitude", "geometry"]].loc[
        gdf["spc_common"] == species
    ]
    coords = trees_df[["longitude", "Latitude"]].values
    cluster = OPTICS(
        metric="haversine",
        min_cluster_size=min_samples,
        n_jobs=-1,
    ).fit(np.radians(coords))
    cluster_labels = cluster.labels_
    trees_df["cluster"] = cluster_labels

    print(pd.Series(cluster_labels).value_counts())
    print(len(set(cluster_labels)))
    print(len(cluster_labels))

    if plot:
        fig, ax = plt.subplots(1, figsize=(20, 15))
        trees_df.plot(
            categorical=True,
            legend=True,
            column="cluster",
            ax=ax,
            markersize=0.1,
            alpha=0.8,
            cmap="turbo",
        )
        ax.axis("off")
        ax.set_title(f"{species} clusters", fontsize=20)
        plt.show()

#### Display total clusters and trees in each

In [21]:
treeclusters = pd.DataFrame()

In [22]:
# Produces DBSCAN clusters for the top 20 most common tree species

for col in df["spc_common"].value_counts().head(20).index.values:
    clusterlabels = cluster_DBSCAN(
        species=col, radius_km=0.5, min_samples_pct=0.01, plot=False
    )  # For reference a Manhattan city block is approx 80m x 274m.
    treeclusters = pd.concat([treeclusters, clusterlabels])

print(treeclusters)

min_samples: 870
clusters: 9
trees: 87014
min_samples: 642
clusters: 4
trees: 64264
min_samples: 589
clusters: 5
trees: 58931
min_samples: 531
clusters: 4
trees: 53185
min_samples: 341
clusters: 10
trees: 34189
min_samples: 297
clusters: 6
trees: 29742
min_samples: 292
clusters: 5
trees: 29279
min_samples: 292
clusters: 5
trees: 29258
min_samples: 210
clusters: 6
trees: 21024
min_samples: 193
clusters: 10
trees: 19338
min_samples: 172
clusters: 7
trees: 17246
min_samples: 162
clusters: 11
trees: 16251
min_samples: 135
clusters: 8
trees: 13530
min_samples: 122
clusters: 7
trees: 12277
min_samples: 106
clusters: 10
trees: 10657
min_samples: 84
clusters: 13
trees: 8400
min_samples: 79
clusters: 9
trees: 7995
min_samples: 79
clusters: 11
trees: 7975
min_samples: 70
clusters: 13
trees: 7080
min_samples: 68
clusters: 12
trees: 6879
        longitude   Latitude                geometry  cluster
9      -73.969744  40.586357  POINT (-73.970 40.586)        0
10     -73.911171  40.782428  POINT (-

#### Merging clusters with source data

In [23]:
df_cluster = pd.merge(df_trees, treeclusters, left_index=True, right_index=True).dropna()

#### Merging GeoJSON Data with clustered set

In [25]:
from shapely import wkt

# change to your directory where the geojson is located
geojson = gpd.read_file('C:/Users/jrilk/OneDrive/LHL/major-projects/Midterm/mid-term-project-II-main/data/old_PUMA_or_Subborough.geo.json')

df_new = df_cluster.rename(columns={'the_geom':'geometry'})
coords = pd.DataFrame(df_new['geometry'])

trees_series = coords['geometry'].apply(wkt.loads)
trees_series_gdf = gpd.GeoDataFrame(trees_series, crs=4326)

new_df = trees_series_gdf.sjoin(geojson, how='left')
df_merged = pd.merge(new_df, df_cluster, how='left', left_index=True, right_index=True)

#### Creation of final dataset

In [26]:
df_trees_final = df_merged[['spc_common', 'id', 'tree_dbh', 'cluster', 'Latitude_x', 'longitude_x']]

In [27]:
df_trees_final.to_csv('submission_trees.csv', sep=',')