In [None]:
import pandas as pd
# The conventional `plt` alias must point at the pyplot module; aliasing the bare
# `matplotlib` package would make calls such as plt.figure() / plt.show() fail.
import matplotlib.pyplot as plt

# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

In [None]:
# Read the CSV export from the working directory and preview the resulting frame.
tree_census = pd.read_csv(filepath_or_buffer='Tree_Data.csv')
tree_census

In [None]:
# List the column labels available in the raw frame (used to pick the subset below).
tree_census.columns

In [None]:
# Create a subset of the data that holds only the columns needed for the analysis.
# .copy() makes the subset an independent frame: later cells assign into it
# (.loc[...] = ..., fillna), which on a plain column selection raises
# SettingWithCopyWarning and is not guaranteed to behave consistently.
tree_subset = tree_census[['tree_id', 'tree_dbh', 'stump_diam',
       'curb_loc', 'status', 'health', 'spc_latin', 'steward', 'sidewalk', 'problems', 'root_stone',
       'root_grate', 'root_other', 'trunk_wire', 'trnk_light', 'trnk_other',
       'brch_light', 'brch_shoe', 'brch_other']].copy()
tree_subset

In [None]:
# Number of missing entries in each column of the subset.
tree_subset.isnull().sum()

In [None]:
# Preview the first rows whose 'problems' entry is missing.
tree_subset.loc[tree_subset['problems'].isnull()].head(5)

In [None]:
# Preview the first rows whose 'health' entry is missing.
tree_subset.loc[tree_subset['health'].isnull()].head(5)

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
tree_subset.describe().T

In [None]:
# Data type of every column in the subset.
tree_subset.dtypes

In [None]:
# Draw one histogram per column (60 bins each) to inspect the value distributions.
tree_subset.hist(
    figsize=(20, 10),
    bins=60,
)

In [None]:
# Two working selections used by the scatterplots that follow:
#   non_zero    - rows with a stump_diam greater than 0
#   large_trees - rows with a tree_dbh greater than 50
non_zero = tree_subset[tree_subset['stump_diam'] > 0]
large_trees = tree_subset[tree_subset['tree_dbh'] > 50]

# A bare expression is rendered only when it is the last statement of a cell,
# so the preview of large_trees has to sit at the end to be displayed.
large_trees

In [None]:
# Scatterplot of tree_dbh ('breast height diameter', described above) for the larger trees.
large_trees[['tree_id', 'tree_dbh']].plot.scatter(
    x='tree_id',
    y='tree_dbh',
    figsize=(15, 10),
)

In [None]:
# Scatterplot of stump_diam (described above) for the rows where it is greater than 0.
non_zero[['tree_id', 'stump_diam']].plot.scatter(
    x='tree_id',
    y='stump_diam',
    figsize=(20, 10),
)

In [None]:
# How often each spc_latin value occurs in the raw data, most frequent first.
tree_census.loc[:, 'spc_latin'].value_counts()

In [None]:
# Same counts as above, presented as a one-column DataFrame.
tree_census['spc_latin'].value_counts().to_frame()

In [None]:
# Bar chart of the counts for every spc_latin value.
tree_census['spc_latin'].value_counts().to_frame().plot.bar()

In [None]:
# Keep only the 50 most frequent spc_latin values for a more readable chart.
top_names = tree_census['spc_latin'].value_counts().iloc[:50]

In [None]:
# Bar chart of the top-50 counts on a larger canvas.
top_names.plot.bar(figsize=(20, 10))

In [None]:
# Inspect the distinct values of individual columns, starting with 'steward'.
tree_subset.loc[:, 'steward'].value_counts()

In [None]:
# Number of rows with no 'steward' value.
tree_subset['steward'].isna().sum()

In [None]:
# Frequency of each 'sidewalk' value.
tree_subset.loc[:, 'sidewalk'].value_counts()

In [None]:
# Number of rows with no 'sidewalk' value.
tree_subset['sidewalk'].isna().sum()

In [None]:
# All rows whose 'health' entry is missing.
tree_subset.loc[tree_subset['health'].isnull()]

In [None]:
# Rows recorded with status 'Stump'.
stumps = tree_subset.loc[tree_subset['status'].eq('Stump')]
stumps

In [None]:
# Frequency of each 'health' value.
tree_subset.loc[:, 'health'].value_counts()

In [None]:
# Rows recorded with status 'Dead'.
dead_trees = tree_subset.loc[tree_subset['status'].eq("Dead")]
dead_trees

In [None]:
# Column labels of the subset, for reference when selecting the problem columns.
tree_subset.columns

In [None]:
# Narrow the subset down to the root / trunk / branch problem indicator columns.
problem_columns = [
    'root_stone', 'root_grate', 'root_other',
    'trunk_wire', 'trnk_light', 'trnk_other',
    'brch_light', 'brch_shoe', 'brch_other',
]
tree_problems = tree_subset[problem_columns]
tree_problems

In [None]:
# Count each distinct value per problem column.
# Author's observation from the output: root stone has/causes the most problems.
tree_problems.apply(lambda column: column.value_counts())

In [None]:
#FURTHER DATA CLEANING AND MAKING DATA UNIFORM

In [None]:
# Boolean row selector: True where the status is either 'Stump' or 'Dead'.
mask = tree_subset['status'].isin(['Stump', 'Dead'])

In [None]:
# For the rows selected by `mask`, replace every missing entry with 'Not Applicable'
# and write the filled rows back into the same positions.
masked_rows_filled = tree_subset.loc[mask].fillna('Not Applicable')
tree_subset.loc[mask] = masked_rows_filled

In [None]:
# Spot-check the first five 'Stump' rows after the fill.
tree_subset.loc[tree_subset['status'].eq('Stump')].head()

In [None]:
# Re-count the missing entries per column to see what is still unfilled.
tree_subset.isnull().sum()

In [None]:
# Rows that still lack a 'problems' value, inspected before the remaining gaps are filled.
tree_subset.loc[tree_subset['problems'].isnull()]

In [None]:
# Fill the remaining gaps with an explicit default per column.
# Calling fillna(inplace=True) on tree_subset['col'] is chained assignment: it acts
# on an intermediate Series, emits a FutureWarning on pandas 2.x and leaves the frame
# untouched under copy-on-write. A single dict-based fillna assigned back to the name
# applies the same per-column defaults reliably.
fill_defaults = {
    'problems': "None",
    'health': "Good",
    'sidewalk': "No Damage",
    'spc_latin': "No Observation",
    'steward': "Unknown",
}
tree_subset = tree_subset.fillna(value=fill_defaults)

In [None]:
# Final check of the missing-entry count per column after filling.
tree_subset.isnull().sum()

In [None]:
# Outliers for huge trees: rows where either diameter exceeds 60
# (author's note: only 245 results / 600,000).
dbh_over_limit = tree_subset['tree_dbh'].gt(60)
stump_over_limit = tree_subset['stump_diam'].gt(60)
large_trees = tree_subset[dbh_over_limit | stump_over_limit]
large_trees

In [None]:
# Drop the size outliers: keep only rows where NEITHER diameter exceeds 60.
# The previous filter OR-ed the two "<= 60" tests, so a row with tree_dbh > 60 but a
# small stump_diam (or vice versa) still passed and the outliers were never removed.
# Negating the outlier condition keeps exactly the complement of the rows selected by
# (tree_dbh > 60) | (stump_diam > 60).
is_size_outlier = (tree_subset['tree_dbh'] > 60) | (tree_subset['stump_diam'] > 60)
tree_subset = tree_subset[~is_size_outlier]

In [None]:
# Split the cleaned subset by status: 'Alive' rows versus 'Dead' or 'Stump' rows.
tree_subset_alive = tree_subset.loc[tree_subset['status'].eq('Alive')]
tree_subset_deadorstump = tree_subset.loc[tree_subset['status'].isin(['Dead', 'Stump'])]

In [None]:
# Descriptive statistics of tree_dbh for the 'Alive' rows, one row per spc_latin value.
(
    tree_subset_alive
    .groupby(by='spc_latin')['tree_dbh']
    .describe()
)