### 2015 NYC Tree Census
An urban tree health classification model based on the NYC Tree Census dataset from 2015.

In [2]:
import pandas as pd
import numpy as np

trees = pd.read_csv('../data/raw/new_york_tree_census_2015.csv')
trees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683788 entries, 0 to 683787
Data columns (total 41 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   tree_id     683788 non-null  int64  
 1   block_id    683788 non-null  int64  
 2   created_at  683788 non-null  object 
 3   tree_dbh    683788 non-null  int64  
 4   stump_diam  683788 non-null  int64  
 5   curb_loc    683788 non-null  object 
 6   status      683788 non-null  object 
 7   health      652172 non-null  object 
 8   spc_latin   652169 non-null  object 
 9   spc_common  652169 non-null  object 
 10  steward     164350 non-null  object 
 11  guards      79866 non-null   object 
 12  sidewalk    652172 non-null  object 
 13  user_type   683788 non-null  object 
 14  problems    225844 non-null  object 
 15  root_stone  683788 non-null  object 
 16  root_grate  683788 non-null  object 
 17  root_other  683788 non-null  object 
 18  trunk_wire  683788 non-null  object 
 19  tr

In [None]:
for col in trees.columns:
    na_prop = np.round(np.sum(trees[col].isna())/len(trees),4)
    print(f'{col}    \t{trees[col].dtype}\t'
          f'{na_prop}')


tree_id    	int64	0.0
block_id    	int64	0.0
created_at    	object	0.0
tree_dbh    	int64	0.0
stump_diam    	int64	0.0
curb_loc    	object	0.0
status    	object	0.0
health    	object	0.0462
spc_latin    	object	0.0462
spc_common    	object	0.0462
steward    	object	0.7596
guards    	object	0.8832
sidewalk    	object	0.0462
user_type    	object	0.0
problems    	object	0.6697
root_stone    	object	0.0
root_grate    	object	0.0
root_other    	object	0.0
trunk_wire    	object	0.0
trnk_light    	object	0.0
trnk_other    	object	0.0
brch_light    	object	0.0
brch_shoe    	object	0.0
brch_other    	object	0.0
address    	object	0.0
zipcode    	int64	0.0
zip_city    	object	0.0
cb_num    	int64	0.0
borocode    	int64	0.0
boroname    	object	0.0
cncldist    	int64	0.0
st_assem    	int64	0.0
st_senate    	int64	0.0
nta    	object	0.0
nta_name    	object	0.0
boro_ct    	int64	0.0
state    	object	0.0
latitude    	float64	0.0
longitude    	float64	0.0
x_sp    	float64	0.0
y_sp    	float64	0.0


In [None]:
na_cols = []
for col in trees.columns:
    na_prop = np.round(np.sum(trees[col].isna())/len(trees),4)
    if na_prop == 0.0:
        continue
    print(f'{col}    \t{trees[col].dtype}\t'
          f'{na_prop}')
    na_cols.append(col)

health    	object	0.0462
spc_latin    	object	0.0462
spc_common    	object	0.0462
steward    	object	0.7596
guards    	object	0.8832
sidewalk    	object	0.0462
problems    	object	0.6697


In [2]:
for col in na_cols:
    print(f'\x1b[42m{col.lower()}\x1b[0m')
    print(trees[col].value_counts())
    print()

NameError: name 'na_cols' is not defined

In [79]:
import re
probs_list = list(trees['problems'].dropna().astype(str).unique())

def camel_words(text: str) -> list[str]:
    return re.findall(r"[A-Z][a-z]*", text)

def filter_two_words(strings: list[str]) -> list[str]:
    return [s for s in strings if len(camel_words(s)) <= 2]

print(filter_two_words(probs_list))

['Stones', 'BranchLights', 'TrunkOther', 'RootOther', 'BranchOther', 'WiresRope', 'MetalGrates', 'Sneakers', 'TrunkLights', 'StonesSneakers']


In [109]:
import json
# 'problems' has 231 unique values, but these are encoded in different variables
print(trees.columns[14:24])

# lets check if these values correspond to one another
# str_format = lambda txt: "".join(word.capitalize() for word in txt.split("_"))
# problems_list = [str_format(name) for name in trees.columns[15:24]]

# calculated above, these are the unique codes which correspond to the columns
problems_list = ['Stones', 'BranchLights', 'TrunkOther', 'RootOther', \
                 'BranchOther', 'WiresRope', 'MetalGrates', 'Sneakers', \
                'TrunkLights']
problems_cols = ['root_stone','brch_light','trnk_other','root_other',\
                 'brch_other','trunk_wire','root_grate','brch_shoe',\
                 'trnk_light']

col_mismatch = {}
str_mismatch = {}

for code, col in zip(problems_list, problems_cols):
    appears = trees['problems'].fillna("").str.contains(code, regex=False)
    expected = trees[col] == True
    str_mismatch[col] = int((~expected & appears).sum())
    col_mismatch[col] = int((expected & ~appears).sum())

print('\nbinary variable is 1 | `problems` does not contain the code')
print(json.dumps(str_mismatch, indent=4))
print('\nbinary variable is 0 | `problems` does contain the code')
print(json.dumps(col_mismatch, indent=4))


Index(['problems', 'root_stone', 'root_grate', 'root_other', 'trunk_wire',
       'trnk_light', 'trnk_other', 'brch_light', 'brch_shoe', 'brch_other'],
      dtype='object')

binary variable is 1 | `problems` does not contain the code
{
    "root_stone": 139999,
    "brch_light": 62365,
    "trnk_other": 32573,
    "root_other": 30322,
    "brch_other": 24355,
    "trunk_wire": 13274,
    "root_grate": 3536,
    "brch_shoe": 411,
    "trnk_light": 1031
}

binary variable is 0 | `problems` does contain the code
{
    "root_stone": 0,
    "brch_light": 0,
    "trnk_other": 0,
    "root_other": 0,
    "brch_other": 0,
    "trunk_wire": 0,
    "root_grate": 0,
    "brch_shoe": 0,
    "trnk_light": 0
}


From _above_ we can tell that the `problems` column is unreliable and might miss information stored in the binary variables. The binary variables are always accurate. We will drop `problems` and use the other variables instead.

In [None]:
print(trees.columns)
# drop columns: tree_id, block_id, created_at (date),
#               state, city, lat/long(choose x_sp,y_sp instead),
#               address,
#               cncldist', 'st_assem', 'st_senate', 'nta'
# keep only one of: (zipcode, zip_city), (borocode, boroname), (spc_latin, spc_common)
# try a model with either: zip, boro, or both

Index(['tree_id', 'block_id', 'created_at', 'tree_dbh', 'stump_diam',
       'curb_loc', 'status', 'health', 'spc_latin', 'spc_common', 'steward',
       'guards', 'sidewalk', 'user_type', 'problems', 'root_stone',
       'root_grate', 'root_other', 'trunk_wire', 'trnk_light', 'trnk_other',
       'brch_light', 'brch_shoe', 'brch_other', 'address', 'zipcode',
       'zip_city', 'cb_num', 'borocode', 'boroname', 'cncldist', 'st_assem',
       'st_senate', 'nta', 'nta_name', 'boro_ct', 'state', 'latitude',
       'longitude', 'x_sp', 'y_sp'],
      dtype='object')


In [167]:
print(trees['guards'].value_counts(dropna=False))
# print(trees['zip_city'].value_counts())

guards
NaN        603922
Helpful     51866
Harmful     20252
Unsure       7748
Name: count, dtype: int64


In [None]:
drop_cols = ['tree_id', 'block_id', 'created_at', 'stump_diam','status','problems','address',\
             'zip_city','cb_num', 'borocode', 'cncldist', 'st_assem', 'st_senate', 'nta', 'boro_ct', 'state',\
             'latitude', 'longitude']

group = ['nta_name']
target = ['health']
spatial_geo = ['x_sp','y_sp']
spatial_fine = ['zipcode']
spatial_coarse = ['boroname']

print(set(trees.columns).difference(set(drop_cols)))
# drop rows where `health` is missing or `status != 'Alive`
# --> drop 'stump_diam'

# maybe keep cb_num but ONLY for feature set G (upper bound) for XGBoost

{'user_type', 'brch_other', 'root_stone', 'boroname', 'root_grate', 'spc_latin', 'spc_common', 'trnk_light', 'trunk_wire', 'brch_light', 'tree_dbh', 'zipcode', 'root_other', 'curb_loc', 'sidewalk', 'y_sp', 'health', 'x_sp', 'steward', 'brch_shoe', 'nta_name', 'guards', 'trnk_other'}


In [5]:
print(len(trees.columns))

41
