In [1]:
import pandas as pd

# Load the tree data indexed (and ordered) by the 'No' column.
# Event is the prediction target, so rows without an Event value are dropped
# before the column is cast to int.
df = (
    pd.read_csv('/kaggle/input/tree-survival-prediction/Tree_Data.csv', index_col=['No'])
    .sort_index()
    .dropna(subset='Event')
    .astype({'Event': int})
)
df.head()

Unnamed: 0_level_0,Plot,Subplot,Species,Light_ISF,Light_Cat,Core,Soil,Adult,Sterile,Conspecific,...,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event,Harvest,Alive
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1,A,Prunus serotina,0.108,Med,2017,Prunus serotina,H,Non-Sterile,Conspecific,...,23.31,,0.2,10.42,13.92,12,42.0,1,,
11,1,C,Quercus alba,0.106,Med,2017,Quercus rubra,970,Non-Sterile,Heterospecific,...,15.82,31.07,5.19,20.52,19.29,33,115.5,0,,X
12,1,C,Quercus rubra,0.106,Med,2017,Prunus serotina,J,Non-Sterile,Heterospecific,...,24.45,28.19,3.36,24.74,15.01,18,63.0,1,,
14,1,C,Prunus serotina,0.106,Med,2017,Prunus serotina,G,Non-Sterile,Conspecific,...,35.6,,0.37,10.22,13.87,16,56.0,1,,
18,1,C,Prunus serotina,0.106,Med,2016,Acer rubrum,1332,Non-Sterile,Heterospecific,...,35.29,,0.3,10.8,13.79,7,24.5,1,,


In [2]:
# (rows, columns) of the frame after rows without an Event value were dropped.
df.shape

(2782, 23)

In [3]:
# Per-column dtype and non-null count; a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2782 entries, 3 to 7772
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Plot         2782 non-null   int64  
 1   Subplot      2782 non-null   object 
 2   Species      2782 non-null   object 
 3   Light_ISF    2782 non-null   float64
 4   Light_Cat    2782 non-null   object 
 5   Core         2782 non-null   int64  
 6   Soil         2782 non-null   object 
 7   Adult        2782 non-null   object 
 8   Sterile      2782 non-null   object 
 9   Conspecific  2782 non-null   object 
 10  Myco         2782 non-null   object 
 11  SoilMyco     2782 non-null   object 
 12  PlantDate    2782 non-null   object 
 13  AMF          2782 non-null   float64
 14  EMF          1282 non-null   float64
 15  Phenolics    2782 non-null   float64
 16  Lignin       2782 non-null   float64
 17  NSC          2782 non-null   float64
 18  Census       2782 non-null   int64  
 19  Time       

In [4]:
# Number of distinct values in each column.
df.nunique()

Plot             18
Subplot           5
Species           4
Light_ISF        53
Light_Cat         3
Core              2
Soil              7
Adult            36
Sterile           2
Conspecific       3
Myco              2
SoilMyco          3
PlantDate        19
AMF             923
EMF             681
Phenolics       494
Lignin         1094
NSC             998
Census           22
Time             22
Event             2
Harvest           1
Alive             1
dtype: int64

In [5]:
from plotly.express import bar

# Columns to break down by Event outcome, one bar chart per column.
count_columns = [
    'Plot', 'Subplot', 'Species', 'Light_ISF', 'Light_Cat', 'Core', 'Soil',
    'Adult', 'Sterile', 'Conspecific', 'Myco', 'SoilMyco', 'PlantDate',
    'Census', 'Time',
]
for column in count_columns:
    # Number of rows for every (column value, Event) pair. Naming the size
    # column gives the y-axis a readable label instead of the default
    # integer column name 0.
    counts = df.groupby(by=[column, 'Event']).size().reset_index(name='count')
    bar(
        data_frame=counts,
        x=column,
        y='count',
        color='Event',
        color_continuous_scale='bluered',
        title=f'Row counts by {column} and Event',
    ).show()

In [6]:
from plotly.express import histogram

# One histogram per chemical measurement, colored by Event.
chemical_columns = ['AMF', 'EMF', 'Phenolics', 'Lignin', 'NSC']
for column in chemical_columns:
    figure = histogram(data_frame=df, x=column, color='Event')
    figure.show()

Two of these chemical measurements, Lignin and Phenolics, show distinctly multi-modal distributions.

In [7]:
from plotly.express import scatter

# Plot the two measurements against each other, colored by Event.
scatter(
    data_frame=df,
    x='Lignin',
    y='Phenolics',
    color='Event',
    color_continuous_scale='bluered',
)

Clearly, once we know the Lignin and Phenolics content, we know most of what we need to know.

In [8]:
from sklearn.metrics import f1_score

# Baseline rule: predict Event = 1 whenever Phenolics is below the cut-off.
# NOTE(review): 1.2 is presumably read off the plots above - confirm.
phenolics_cutoff = 1.2
# Vectorized comparison over the whole column instead of a per-value Python loop.
y_pred = (df['Phenolics'].values < phenolics_cutoff).astype(int)
f1_score(y_true=df['Event'].values, y_pred=y_pred)

0.8245614035087718

In [9]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the rule "predict Event = 1 when Phenolics < 1.2".
# The prediction is built with one vectorized comparison over the column
# instead of a per-value Python loop.
y_pred = (df['Phenolics'].values < 1.2).astype(int)
confusion_matrix(y_true=df['Event'].values, y_pred=y_pred)

array([[ 973,  222],
       [ 318, 1269]])

In [10]:
from sklearn.manifold import TSNE

# Embed three hand-picked features into two dimensions with t-SNE (fixed seed),
# store the coordinates on df as t0/t1, then plot them colored by Event.
columns = ['Phenolics', 'Lignin', 'Time']
tsne = TSNE(n_components=2, random_state=2023, verbose=1)
embedding = tsne.fit_transform(X=df[columns])
df['t0'] = embedding[:, 0]
df['t1'] = embedding[:, 1]
scatter(
    data_frame=df,
    x='t0',
    y='t1',
    color='Event',
    color_continuous_scale='bluered',
    hover_name=df.index,
)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2782 samples in 0.002s...
[t-SNE] Computed neighbors for 2782 samples in 0.062s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2782
[t-SNE] Computed conditional probabilities for sample 2000 / 2782
[t-SNE] Computed conditional probabilities for sample 2782 / 2782
[t-SNE] Mean sigma: 0.374585
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.882046
[t-SNE] KL divergence after 1000 iterations: 0.250920


If we apply a little dimensionality reduction and choose our variables carefully, we can see four distinct subgroups, three of which are easily classified.