In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from umap import UMAP
from sklearn.decomposition import PCA

In [13]:

# -------------------------------------------------------
# 1. LOAD DATASET
# -------------------------------------------------------
df = pd.read_csv("trees_cleaned_data_1.csv")

# Keep only the columns we actually use (optional but clean)
df = df.copy()

# -------------------------------------------------------
# 2. TOP 20 SPECIES
# -------------------------------------------------------
top20_species = df['spc_common'].value_counts().head(20).index.tolist()

def map_species(s):
    if s in top20_species:
        return s
    else:
        return "Other"

df["species_reduced"] = df["spc_common"].apply(map_species)

# -------------------------------------------------------
# 3. EXTRACT MONTH FROM created_at
# -------------------------------------------------------
df["created_at"] = pd.to_datetime(df["created_at"])
df["month"] = df["created_at"].dt.month

# -------------------------------------------------------
# 4. NUMERIC FEATURES (scaled)
# -------------------------------------------------------
numeric_cols = ["tree_dbh", "stump_diam", "x_sp", "y_sp"]

scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(df[numeric_cols].fillna(0))

numeric_df = pd.DataFrame(
    numeric_scaled,
    columns=[f"{col}_scaled" for col in numeric_cols]
)

# -------------------------------------------------------
# 5. CATEGORICAL FEATURES → ONE-HOT ENCODING
# -------------------------------------------------------
cat_cols = [
    "health",          # Good/Fair/Poor
    "status",          # Alive/Dead/Stump
    "borough",        # Manhattan, Bronx, etc.
    "curb_loc",        # OnCurb/OffCurb
    "species_reduced", # Top 20 species + Other
    "month"            # month 1–12
]

# Convert to string to avoid issues
cat_df = df[cat_cols].astype(str)

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = ohe.fit_transform(cat_df)
cat_feature_names = ohe.get_feature_names_out(cat_cols)

cat_encoded_df = pd.DataFrame(cat_encoded, columns=cat_feature_names)

# -------------------------------------------------------
# 6. COMBINE ALL FEATURES INTO ONE EMBEDDINGS TABLE
# -------------------------------------------------------
embeddings = pd.concat([
    df[['tree_id']],   # keep ID for linking
    numeric_df,
    cat_encoded_df
], axis=1)

# -------------------------------------------------------
# 7. SAVE FINAL EMBEDDINGS
# -------------------------------------------------------
embeddings.to_csv("embeddings.csv", index=False)

print("DONE! embeddings.csv created successfully.")
print("Shape:", embeddings.shape)
embeddings.head()


DONE! embeddings.csv created successfully.
Shape: (683788, 52)


Unnamed: 0,tree_id,tree_dbh_scaled,stump_diam_scaled,x_sp_scaled,y_sp_scaled,health_Fair,health_Good,health_Poor,health_nan,status_Alive,...,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9
0,180683,-0.949186,-0.131438,0.646092,0.24188,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,200540,1.114315,-0.131438,0.850979,1.028703,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,204026,-0.949186,-0.131438,-0.100832,0.179881,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,204337,-0.146713,-0.131438,-0.083404,0.135123,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,189565,1.114315,-0.131438,-0.419019,-0.382833,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:



# -------------------------------------------------------
# 1. LOAD embeddings.csv
# -------------------------------------------------------
emb = pd.read_csv("embeddings.csv")

# Keep the tree_id column for linking later
tree_ids = emb["tree_id"]

# Remove tree_id to get pure numeric embeddings
X = emb.drop(columns=["tree_id"])

# -------------------------------------------------------
# 2. RUN PCA (2D projection)
# -------------------------------------------------------
pca_model = PCA(n_components=2, random_state=42)
X_2d = pca_model.fit_transform(X)

# -------------------------------------------------------
# 3. Create the final 2D projection table
# -------------------------------------------------------
proj = pd.DataFrame({
    "tree_id": tree_ids,
    "x": X_2d[:, 0],
    "y": X_2d[:, 1]
})

# -------------------------------------------------------
# 4. SAVE the resulting CSV
# -------------------------------------------------------
proj.to_csv("embeddings_2d.csv", index=False)

print("PCA 2D projection created successfully.")
print("Saved as embeddings_2d.csv")
print(proj.head())


PCA 2D projection created successfully.
Saved as embeddings_2d.csv
   tree_id         x         y
0   180683  0.695818  0.589733
1   200540  1.500417 -0.789242
2   204026 -0.113610  0.489631
3   204337 -0.079283 -0.053452
4   189565 -0.575779 -0.921499


In [15]:
import pandas as pd

# Load PCA output
emb2d = pd.read_csv("embeddings_2d.csv")

# Load full embeddings (with one-hot columns)
emb = pd.read_csv("embeddings.csv")

# All species dummy columns
species_cols = [c for c in emb.columns if c.startswith("species_reduced_")]

# Function to get species name
def get_species(row):
    for col in species_cols:
        if row[col] == 1:
            return col.replace("species_reduced_", "")
    return "Unknown"

# Create single species column
emb["species_reduced"] = emb.apply(get_species, axis=1)

# Merge with PCA file
merged = emb2d.merge(
    emb[["tree_id", "species_reduced"]],
    on="tree_id",
    how="left"
)

# Save updated CSV
merged.to_csv("embeddings_2d.csv", index=False)

print("✔ embeddings_2d.csv updated with species_reduced column!")


✔ embeddings_2d.csv updated with species_reduced column!


In [16]:
import pandas as pd

df = pd.read_csv("embeddings_2d.csv")

order = df["species_reduced"].value_counts().index.tolist()
print(order)


['Other', 'London planetree', 'honeylocust', 'Callery pear', 'pin oak', 'Norway maple', 'littleleaf linden', 'cherry', 'Japanese zelkova', 'ginkgo', 'Sophora', 'red maple', 'green ash', 'American linden', 'silver maple', 'sweetgum', 'northern red oak', 'silver linden', 'American elm', 'maple', 'purple-leaf plum']


In [21]:
import pandas as pd

# Load both datasets
emb = pd.read_csv("embeddings_2d.csv")
orig = pd.read_csv("trees_cleaned_data_1.csv")

# Ensure tree_id is the same type in both
emb["tree_id"] = emb["tree_id"].astype(int)
orig["tree_id"] = orig["tree_id"].astype(int)

# Select useful columns from original data
cols_to_keep = [
    "tree_id",
    "tree_dbh",
    "borough",
    "health",
    "status",
    "curb_loc",
    "spc_common",
    "x_sp",
    "y_sp"
]

orig_small = orig[cols_to_keep]

# Merge on tree_id
merged = emb.merge(orig_small, on="tree_id", how="left")

# Save final file
merged.to_csv("embeddings_full.csv", index=False)

merged.head()


Unnamed: 0,tree_id,x,y,species_reduced,tree_dbh,borough,health,status,curb_loc,spc_common,x_sp,y_sp
0,180683,0.695818,0.589733,red maple,3,Queens,Fair,Alive,OnCurb,red maple,1027431.148,202756.7687
1,200540,1.500417,-0.789242,pin oak,21,Queens,Fair,Alive,OnCurb,pin oak,1034455.701,228644.8374
2,204026,-0.11361,0.489631,honeylocust,3,Brooklyn,Good,Alive,OnCurb,honeylocust,1001822.831,200716.8913
3,204337,-0.079283,-0.053452,honeylocust,10,Brooklyn,Good,Alive,OnCurb,honeylocust,1002420.358,199244.2531
4,189565,-0.575779,-0.921499,American linden,21,Brooklyn,Good,Alive,OnCurb,American linden,990913.775,182202.426


In [22]:
import geopandas as gpd

# Load your current borough.geo.json
gdf = gpd.read_file("borough.geo.json")

# Convert to EPSG:2263 (NY State Plane Long Island)
gdf = gdf.to_crs("EPSG:2263")

# Save converted version
gdf.to_file("boroughs_2263.geojson", driver="GeoJSON")

print("Converted and saved as boroughs_2263.geojson")


Converted and saved as boroughs_2263.geojson


drjmhgnhfbgdf

In [23]:
import pandas as pd

df = pd.read_csv("embeddings_full.csv")


In [24]:
import geopandas as gpd
trees = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x_sp, df.y_sp))


In [26]:
boroughs = gpd.read_file("borough.geo.json")


In [27]:
df = gpd.sjoin(trees, boroughs, predicate="within")
df = df.drop(columns=["index_right"])


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  df = gpd.sjoin(trees, boroughs, predicate="within")


In [28]:
df_clean = pd.DataFrame(df.drop(columns="geometry"))


In [29]:
df_clean.to_csv("embeddings_full_clean.csv", index=False)


In [32]:
import pandas as pd

# Load dataset
df = pd.read_csv("embeddings_full.csv")

# 1. Remove "Other" BEFORE counting
df_no_other = df[df["species_reduced"] != "Other"]

# 2. Get top 20 REAL species
top20_real = df_no_other["species_reduced"].value_counts().nlargest(20).index

print("Top 20 species (excluding Other):")
print(top20_real.tolist())

# 3. Filter original DF to only those 20 species
df_top20 = df[df["species_reduced"].isin(top20_real)]

# 4. Save cleaned dataset
df_top20.to_csv("embeddings_full_top20.csv", index=False)

print("Saved embeddings_full_top20.csv with shape:", df_top20.shape)


Top 20 species (excluding Other):
['London planetree', 'honeylocust', 'Callery pear', 'pin oak', 'Norway maple', 'littleleaf linden', 'cherry', 'Japanese zelkova', 'ginkgo', 'Sophora', 'red maple', 'green ash', 'American linden', 'silver maple', 'sweetgum', 'northern red oak', 'silver linden', 'American elm', 'maple', 'purple-leaf plum']
Saved embeddings_full_top20.csv with shape: (534514, 12)


In [34]:
import geopandas as gpd

boroughs = gpd.read_file("borough.geo.json")
print(boroughs.crs)


EPSG:4326
