In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics import classification_report, confusion_matrix

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [2]:
# Load the seedling dataset into a DataFrame.
# NOTE(review): this is a relative path, so the CSV is read from the current
# working directory rather than from the /kaggle/input tree listed in the
# previous cell — confirm the file is actually there.
df = pd.read_csv('Tree_Data.csv')

## About dataset

Tree seedling functional traits mediate plant-soil feedback survival responses across a gradient of light availability.

#### Methodology:

(the following information provided by the authors of the experiment)

We conducted a factorial blocked design field experiment, consisting of four tree species, seven soil sources (sterilized conspecific, live conspecific, and five heterospecific), and a gradient of forest understory light levels (low, medium, and high), for a total of 3,024 seedlings. We monitored seedling survival twice per week over one growing season, and we randomly selected subsets of seedlings to measure mycorrhizal colonization and phenolics, lignin, and NSC measurements at three weeks. We used Cox proportional hazards survival models to evaluate survival and linear mixed effects models to test how light availability and soil source influence traits. 


#### Detailed information about each column follows:

No: Seedling unique ID number.  
Plot: Number of the field plot the seedling was planted in. (1-18)  
Subplot: Subplot within the main plot the seedling was planted in. Broken into 5 subplots (1 per corner, plus 1 in the middle). (A-E)  
Species: Includes Acer saccharum, Prunus serotina, Quercus alba, and Quercus rubra  
Light_ISF: Light level quantified with HemiView software. Represents the amount of light reaching each subplot at a height of 1m.  
Light_Cat: Categorical light level created by splitting the range of Light_ISF values into three bins (low, med, high).  
Core: Year the soil core was removed from the field.  
Soil: Species from which the soil core was taken. Includes all species, plus Acer rubrum, Populus grandidentata, and a sterilized conspecific for each species.  
Adult: Individual tree that soil was taken from. Up to 6 adults per species. Used as a random effect in analyses.  
Sterile: Whether the soil was sterilized or not.  
Conspecific: Whether the soil was conspecific, heterospecific, or sterilized conspecific.  
Myco: Mycorrhizal type of the seedling species (AMF or EMF).  
SoilMyco: Mycorrhizal type of the species culturing the soil (AMF or EMF).  
PlantDate: The date that seedlings were planted in the field pots.  
AMF: Percent arbuscular mycorrhizal fungi colonization on the fine roots of harvested seedlings.  
EMF: Percent ectomycorrhizal fungi colonization on the root tips of harvested seedlings.  
Phenolics: Calculated as nmol Gallic acid equivalents per mg dry extract (see manuscript for detailed methods)  
NSC: Calculated as percent dry mass nonstructural carbohydrates (see manuscript for detailed methods)  
Lignin: Calculated as percent dry mass lignin (see manuscript for detailed methods)  
Census: The census number at which time the seedling died or was harvested.  
Time: The number of days at which time the seedling died or was harvested.  
<span style="color:green">
Event: Used for survival analysis to indicate status of each individual seedling at a given time (above)  
    0 = harvested or experiment ended  
    1 = dead  
</span>
Harvest: Indicates whether the seedling was harvested for trait measurement.  
Alive: Indicates if the seedling was alive at the end of the second growing season. "X" in this field indicates alive status.
#### Missing data is coded as NA.

# 1. Scoping

The scope of this analysis is to predict tree survival on the basis of 3 factors (as chosen in the experiment): tree species, soil sources, and light levels.  
      
A second goal is to practise logistic regression. There might be other better options and approaches to predict survival,
but since the scope is to learn, the above-mentioned algorithm has been chosen.

# 2. EDA - Exploratory Data Analysis

In [3]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

Unnamed: 0,No,Plot,Subplot,Species,Light_ISF,Light_Cat,Core,Soil,Adult,Sterile,...,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event,Harvest,Alive
0,126,1,C,Acer saccharum,0.106,Med,2017,Prunus serotina,I,Non-Sterile,...,22.0,,-0.56,13.86,12.15,4,14.0,1.0,,
1,11,1,C,Quercus alba,0.106,Med,2017,Quercus rubra,970,Non-Sterile,...,15.82,31.07,5.19,20.52,19.29,33,115.5,0.0,,X
2,12,1,C,Quercus rubra,0.106,Med,2017,Prunus serotina,J,Non-Sterile,...,24.45,28.19,3.36,24.74,15.01,18,63.0,1.0,,
3,2823,7,D,Acer saccharum,0.08,Med,2016,Prunus serotina,J,Non-Sterile,...,22.23,,-0.71,14.29,12.36,4,14.0,1.0,,
4,5679,14,A,Acer saccharum,0.06,Low,2017,Prunus serotina,689,Non-Sterile,...,21.15,,-0.58,10.85,11.2,4,14.0,1.0,,


In [4]:
# Column dtypes and non-null counts, used to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2783 entries, 0 to 2782
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   No           2783 non-null   int64  
 1   Plot         2783 non-null   int64  
 2   Subplot      2783 non-null   object 
 3   Species      2783 non-null   object 
 4   Light_ISF    2783 non-null   float64
 5   Light_Cat    2783 non-null   object 
 6   Core         2783 non-null   int64  
 7   Soil         2783 non-null   object 
 8   Adult        2783 non-null   object 
 9   Sterile      2783 non-null   object 
 10  Conspecific  2783 non-null   object 
 11  Myco         2783 non-null   object 
 12  SoilMyco     2783 non-null   object 
 13  PlantDate    2783 non-null   object 
 14  AMF          2783 non-null   float64
 15  EMF          1283 non-null   float64
 16  Phenolics    2783 non-null   float64
 17  Lignin       2783 non-null   float64
 18  NSC          2783 non-null   float64
 19  Census

##### A sample of 2783 entries is large enough for this analysis. One row has a missing value in Event and can be dropped.

In [5]:
# Drop rows whose outcome (Event) is missing: a row without a target cannot be
# used to train or evaluate the classifier.
# Reassign rather than using inplace=True: in-place mutation offers no benefit
# and prevents method chaining.
df = df.dropna(subset=['Event'])

In [6]:
# Confirm how many rows and columns remain after dropping missing Event values.
df.shape

(2782, 24)

In [7]:
# Summary statistics of the numeric columns, used to look for implausible values.
df.describe()

Unnamed: 0,No,Plot,Light_ISF,Core,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event
count,2782.0,2782.0,2782.0,2782.0,2782.0,1282.0,2782.0,2782.0,2782.0,2782.0,2782.0,2782.0
mean,3913.127606,9.558591,0.08568,2016.648814,20.554349,26.479844,1.932261,15.75849,14.217074,15.275701,53.464953,0.570453
std,2252.733097,5.202133,0.025603,0.477427,12.311615,16.643013,1.969693,6.780478,4.29691,9.162042,32.067148,0.4951
min,3.0,1.0,0.032,2016.0,0.0,0.0,-1.35,2.23,4.3,4.0,14.0,0.0
25%,1970.5,5.0,0.066,2016.0,13.4,13.78,0.17,10.3525,11.6025,7.0,24.5,0.0
50%,3931.5,10.0,0.082,2017.0,18.0,27.73,0.75,14.04,12.655,13.0,45.5,1.0
75%,5877.75,14.0,0.1,2017.0,24.4475,35.71,3.78,21.1175,17.27,18.0,63.0,1.0
max,7772.0,18.0,0.161,2017.0,100.0,87.5,6.1,32.77,29.45,33.0,115.5,1.0


##### There seem to be no unrealistic outliers.

In [8]:
# Number of seedlings per species.
species_counts = df['Species'].value_counts()
print(species_counts)

Species
Acer saccharum     751
Prunus serotina    749
Quercus alba       672
Quercus rubra      610
Name: count, dtype: int64


<div style="display: flex; justify-content: space-around;">
  <div style="text-align: center; margin: 0 10px;">
    <div style="width: 200px; height: 150px; overflow: hidden;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/a/a1/Acer_saccharum_1-jgreenlee_%285098070608%29.jpg/800px-Acer_saccharum_1-jgreenlee_%285098070608%29.jpg?20131123060807"
 alt="Image 1" style="width: 200px; height: auto;" />
    </div>
    <p>Acer saccharum (sugar maple)</p>
  </div>
  <div style="text-align: center; margin: 0 10px;">
    <div style="width: 200px; height: 150px; overflow: hidden;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/8e/Quercus_alba_2-acorn_branch.jpg/777px-Quercus_alba_2-acorn_branch.jpg?20161009181747"
 alt="Image 2" style="width: 200px; height: auto;" />
    </div>
    <p>Quercus alba (white oak)</p>
  </div>
  <div style="text-align: center; margin: 0 10px;">
    <div style="width: 200px; height: 150px; overflow: hidden;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/Quercus_rubra_1.jpg/220px-Quercus_rubra_1.jpg"
 alt="Image 3" style="width: 200px; height: auto;" />
    </div>
    <p>Quercus rubra (northern red oak)</p>
  </div>
  <div style="text-align: center; margin: 0 10px;">
    <div style="width: 200px; height: 150px; overflow: hidden;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Amerikaanse_vogelkers_Prunus_serotina_closeup.jpg/800px-Amerikaanse_vogelkers_Prunus_serotina_closeup.jpg" alt="Image 4" style="width: 200px; height: auto;" />
    </div>
    <p>Prunus serotina (wild black cherry)</p>
  </div>
</div>

In [9]:
# Number of seedlings per soil source.
soil_counts = df['Soil'].value_counts()
print(soil_counts)

Soil
Sterile                  422
Prunus serotina          413
Quercus rubra            402
Acer saccharum           397
Populus grandidentata    391
Quercus alba             381
Acer rubrum              376
Name: count, dtype: int64


In [10]:
# Number of seedlings per light category.
light_counts = df['Light_Cat'].value_counts()
print(light_counts)

Light_Cat
Med     1474
Low     1005
High     303
Name: count, dtype: int64


# 3. Logistic regression

### 3.1 Pre-processing

In [11]:
# Scoping the df
# Restrict the data to the three experimental factors plus the outcome column.
scoped_columns = ['Species', 'Soil', 'Light_Cat', 'Event']
scoped_df = df[scoped_columns]

In [12]:
# Quick look at the reduced frame: three predictors and the Event target.
scoped_df.head(3)

Unnamed: 0,Species,Soil,Light_Cat,Event
0,Acer saccharum,Prunus serotina,Med,1.0
1,Quercus alba,Quercus rubra,Med,0.0
2,Quercus rubra,Prunus serotina,Med,1.0


In [13]:
def encode_one_hot(data=None,
                   categorical_columns=('Species', 'Soil', 'Light_Cat'),
                   target_column='Event'):
    """One-hot encode the categorical predictors and append the target column.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing the predictor and target columns. When omitted, the
        notebook-level ``scoped_df`` is used, so existing zero-argument calls
        keep working.
    categorical_columns : sequence of str, optional
        Names of the columns to one-hot encode.
    target_column : str, optional
        Name of the column copied unchanged into the result.

    Returns
    -------
    pandas.DataFrame
        One column per (feature, category) pair, named
        ``"<feature>_<category>"``, followed by ``target_column``. Rows are in
        the same order as ``data`` but carry a fresh default index.
    """
    if data is None:
        data = scoped_df
    categorical_columns = list(categorical_columns)

    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(data[categorical_columns]).toarray()

    # encoder.categories_ holds one array of category labels per input column,
    # in the same order as categorical_columns.
    encoded_columns = [
        f"{column}_{category}"
        for column, categories in zip(categorical_columns, encoder.categories_)
        for category in categories
    ]

    encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)
    # Assign positionally via .values: encoded_df has a fresh default index,
    # while `data` may carry a non-contiguous one (e.g. after dropna), so an
    # index-aligned assignment could misplace values.
    encoded_df[target_column] = data[target_column].values
    return encoded_df

In [14]:
# Build the model-ready frame: one-hot encoded predictors plus the Event target.
encoded_df = encode_one_hot()

### 3.2 Model

In [15]:
# Separate the one-hot predictors from the target (Event).
X = encoded_df.drop(columns='Event')
y = encoded_df['Event']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [17]:
# Fit the classifier on the training split only; the test split stays unseen.
model.fit(X_train, y_train)

### 3.3 Performance Evaluation

In [18]:
# Predict on the held-out rows and summarise precision/recall/F1 per class.
y_pred = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.76      0.85      0.80       234
         1.0       0.88      0.80      0.84       323

    accuracy                           0.82       557
   macro avg       0.82      0.83      0.82       557
weighted avg       0.83      0.82      0.83       557



In [19]:
# Rows are the true classes and columns the predicted classes (0 first, then 1).
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[200  34]
 [ 64 259]]


Taking Event = 1 (dead) as the positive class:  
200 - true negatives  
64 - false negatives  
34 - false positives  
259 - true positives  

### 3.4 Testing

In [20]:
# We want to use a df in the same format as the data we submit to .predict()
columns = X_test.columns
print(columns)

Index(['Species_Acer saccharum', 'Species_Prunus serotina',
       'Species_Quercus alba', 'Species_Quercus rubra', 'Soil_Acer rubrum',
       'Soil_Acer saccharum', 'Soil_Populus grandidentata',
       'Soil_Prunus serotina', 'Soil_Quercus alba', 'Soil_Quercus rubra',
       'Soil_Sterile', 'Light_Cat_High', 'Light_Cat_Low', 'Light_Cat_Med'],
      dtype='object')


In [21]:
# Describe the hypothetical seedling by its one-hot column names rather than by
# a positional 0/1 vector: a positional vector would silently encode the wrong
# case if the column order or the set of categories ever changed.
active_features = ['Species_Acer saccharum', 'Soil_Prunus serotina', 'Light_Cat_High']
missing_features = [name for name in active_features if name not in columns]
assert not missing_features, f"unknown one-hot columns: {missing_features}"

# Start from an all-zero row, then switch on the chosen categories.
test_df = pd.DataFrame(0.0, index=[0], columns=columns)
test_df.loc[0, active_features] = 1.0

test_df.head(3)

Unnamed: 0,Species_Acer saccharum,Species_Prunus serotina,Species_Quercus alba,Species_Quercus rubra,Soil_Acer rubrum,Soil_Acer saccharum,Soil_Populus grandidentata,Soil_Prunus serotina,Soil_Quercus alba,Soil_Quercus rubra,Soil_Sterile,Light_Cat_High,Light_Cat_Low,Light_Cat_Med
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [22]:
# Predict the outcome for the hypothetical seedling. Per the dataset notes,
# Event 1 = dead and 0 = harvested or experiment ended.
model.predict(test_df)

array([1.])

The model predicts Event = 1 (dead) for this case: an Acer saccharum (sugar maple) seedling, grown in soil taken from Prunus serotina (black cherry) and under high light conditions, is predicted not to survive.

Thanks!