In [1]:
import pandas as pd

# Path of the dataset CSV under /kaggle/input.
SPRUCE = '/kaggle/input/spruce-tree-type-detection/Spruce.csv'

# Read the whole table into memory, then show its first five rows as the cell output.
df = pd.read_csv(SPRUCE)
df.head(n=5)

Unnamed: 0,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Horizontal_Distance_To_Fire_Points,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,...,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Tree_Type
0,2596,3,258,0,510,6279,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Other
1,2590,2,212,-6,390,6225,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Other
2,2804,9,268,65,3180,6121,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Other
3,2785,18,242,118,3090,6211,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Other
4,2595,2,153,-1,391,6172,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Other


In [2]:
import warnings
from plotly import express
# Suppress FutureWarning messages from here on.
warnings.filterwarnings('ignore', category=FutureWarning)

# Pie chart of how the rows are split across the Tree_Type classes;
# left as the last expression so the notebook renders the figure.
express.pie(df, names='Tree_Type', color='Tree_Type')

Our classes are imbalanced, so we may be in for a long haul.

In [3]:
# Summarise the frame: number of entries plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 45 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Elevation                           15120 non-null  int64 
 1   Slope                               15120 non-null  int64 
 2   Horizontal_Distance_To_Hydrology    15120 non-null  int64 
 3   Vertical_Distance_To_Hydrology      15120 non-null  int64 
 4   Horizontal_Distance_To_Roadways     15120 non-null  int64 
 5   Horizontal_Distance_To_Fire_Points  15120 non-null  int64 
 6   Soil_Type1                          15120 non-null  int64 
 7   Soil_Type2                          15120 non-null  int64 
 8   Soil_Type3                          15120 non-null  int64 
 9   Soil_Type4                          15120 non-null  int64 
 10  Soil_Type5                          15120 non-null  int64 
 11  Soil_Type6                          15120 non-null  in

In [4]:
from umap import UMAP
from arrow import now

# Name of the label column; every other column is used as a feature.
target = 'Tree_Type'
columns = [column for column in df.columns if column != target]

time_start = now()
# running UMAP over all our data takes forever, and we should be able to get a sense
# of how strong a signal is in our data from a sample
sample_df = df.sample(n=5000, random_state=2024).copy()
reducer = UMAP(n_components=2, random_state=2024, transform_seed=2024, verbose=True, n_jobs=1, n_epochs=100)

# fit_transform returns one 2-D coordinate per sampled row, in row order.
embedding = reducer.fit_transform(X=sample_df[columns])
# Assigning a DataFrame to columns aligns on index labels, and sample_df keeps the
# labels of the rows it was sampled from. Build the embedding frame on that same
# index so every row receives its own coordinates; a default 0..n-1 index would
# leave most rows NaN and pair the rest with some other row's embedding.
sample_df[['x', 'y']] = pd.DataFrame(data=embedding, index=sample_df.index, columns=['x', 'y'])

# Scatter the 2-D embedding, coloured by tree type.
express.scatter(data_frame=sample_df, x='x', y='y', color=target, height=800, ).show()
print('UMAP done in {}'.format(now() - time_start))

2024-03-14 18:06:37.611517: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 18:06:37.611686: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 18:06:37.805158: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_epochs=100, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Thu Mar 14 18:06:54 2024 Construct fuzzy simplicial set
Thu Mar 14 18:06:54 2024 Finding Nearest Neighbors
Thu Mar 14 18:06:54 2024 Building RP forest with 9 trees
Thu Mar 14 18:07:00 2024 NN descent for 12 iterations
	 1  /  12
	 2  /  12
	Stopping threshold met -- exiting after 2 iterations
Thu Mar 14 18:07:20 2024 Finished Nearest Neighbor Search
Thu Mar 14 18:07:25 2024 Construct embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
Thu Mar 14 18:07:28 2024 Finished embedding


UMAP done in 0:00:34.975662


This is not encouraging; dimension reduction with UMAP definitely clusters our trees, but the spruce trees are scattered everywhere.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from arrow import now

time_start = now()

# Hold out 20% of the rows for testing, stratified on the target so that both
# splits keep the same class proportions.
features, labels = df[columns], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=2024,
    stratify=labels,
)

# Fit a logistic regression on the training split.
regression = LogisticRegression(max_iter=1000, tol=1e-6)
regression.fit(X=X_train, y=y_train)

# Report how many solver iterations were used and the accuracy on the held-out split.
iterations_used = regression.n_iter_[0]
test_accuracy = regression.score(X=X_test, y=y_test)
print('fit complete after {} iterations.'.format(iterations_used))
print('accuracy: {:5.4f}'.format(test_accuracy))

# Plot the fitted coefficients against the feature names.
express.histogram(x=columns, y=regression.coef_[0]).show()
print('done in {}'.format(now() - time_start))

fit complete after 284 iterations.
accuracy: 0.8767


done in 0:00:00.940600


At first glance this looks good, but a model that always predicts Other would already score 2592/3024 ≈ 0.857 on this test split; let's look at our classification report.

In [6]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the logistic regression on the test split.
regression_predictions = regression.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=regression_predictions))

              precision    recall  f1-score   support

       Other       0.90      0.97      0.93      2592
      Spruce       0.63      0.33      0.44       432

    accuracy                           0.88      3024
   macro avg       0.76      0.65      0.68      3024
weighted avg       0.86      0.88      0.86      3024



This is discouraging; our model is finding spruces only about a third of the time.

In [7]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the logistic regression on the test split
# (rows: true class, columns: predicted class).
regression_predictions = regression.predict(X=X_test)
print(confusion_matrix(y_true=y_test, y_pred=regression_predictions))

[[2507   85]
 [ 288  144]]


Let's try KNN and see if we do any better.

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbours classifier on the same training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train, y=y_train)

# KNN does its work at prediction time, so predict the test split once and reuse
# the result for all three reports instead of re-running the neighbour search
# for each one. accuracy_score on these predictions is what knn.score computes.
knn_predictions = knn.predict(X=X_test)
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=knn_predictions)))
print(classification_report(y_true=y_test, y_pred=knn_predictions))
print(confusion_matrix(y_true=y_test, y_pred=knn_predictions))

accuracy: 0.9193
              precision    recall  f1-score   support

       Other       0.94      0.97      0.95      2592
      Spruce       0.76      0.63      0.69       432

    accuracy                           0.92      3024
   macro avg       0.85      0.80      0.82      3024
weighted avg       0.91      0.92      0.92      3024

[[2507   85]
 [ 159  273]]


Here at least we are finding more than half of the spruces; our 0.92 accuracy is hiding the fact that we still find only about 63% of spruces.