# Import Required Libraries

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import preprocessing
import pickle as p
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Read and Sort CSV File

In [50]:
# Read CSV file into DataFrame, set 'No' as index, and sort by index
#df = pd.read_csv(filepath_or_buffer='Tree_Data.csv', index_col=['No']).sort_index()

# # Read CSV file into DataFrame ONLY
df = pd.read_csv('Tree_Data.csv')

# Display the DataFrame
df

Unnamed: 0,No,Plot,Subplot,Species,Light_ISF,Light_Cat,Core,Soil,Adult,Sterile,...,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event,Harvest,Alive
0,126,1,C,Acer saccharum,0.106,Med,2017,Prunus serotina,I,Non-Sterile,...,22.00,,-0.56,13.86,12.15,4,14.0,1.0,,
1,11,1,C,Quercus alba,0.106,Med,2017,Quercus rubra,970,Non-Sterile,...,15.82,31.07,5.19,20.52,19.29,33,115.5,0.0,,X
2,12,1,C,Quercus rubra,0.106,Med,2017,Prunus serotina,J,Non-Sterile,...,24.45,28.19,3.36,24.74,15.01,18,63.0,1.0,,
3,2823,7,D,Acer saccharum,0.080,Med,2016,Prunus serotina,J,Non-Sterile,...,22.23,,-0.71,14.29,12.36,4,14.0,1.0,,
4,5679,14,A,Acer saccharum,0.060,Low,2017,Prunus serotina,689,Non-Sterile,...,21.15,,-0.58,10.85,11.20,4,14.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2778,7165,17,B,Prunus serotina,0.111,Med,2017,Populus grandidentata,891,Non-Sterile,...,40.89,,0.83,9.15,11.88,16,56.0,1.0,,
2779,7217,17,D,Quercus alba,0.118,Med,2017,Acer rubrum,1468,Non-Sterile,...,15.47,32.82,4.88,19.01,23.50,16,56.0,1.0,,
2780,7306,17,D,Quercus alba,0.118,Med,2017,Quercus rubra,1454,Non-Sterile,...,11.96,37.67,5.51,21.13,19.10,16,56.0,1.0,,
2781,7771,18,D,Quercus alba,0.161,High,2017,Sterile,1297,Sterile,...,16.99,22.51,4.28,19.38,21.36,33,115.5,,,


## About dataset

Tree seedling functional traits mediate plant-soil feedback survival responses across a gradient of light availability.

#### Methodology:

(the following information provided by the authors of the experiment)

We conducted a factorial blocked design field experiment, consisting of four tree species, seven soil sources (sterilized conspecific, live conspecific, and five heterospecific), and a gradient of forest understory light levels (low, medium, and high), for a total of 3,024 seedlings. We monitored seedling survival twice per week over one growing season, and we randomly selected subsets of seedlings to measure mycorrhizal colonization and phenolics, lignin, and NSC measurements at three weeks. We used Cox proportional hazards survival models to evaluate survival and linear mixed effects models to test how light availability and soil source influence traits. 


#### Detailed information about each column follows:

No: Seedling unique ID number.  
Plot: Number of the field plot the seedling was planted in. (1-18)  
Subplot: Subplot within the main plot the seedling was planted in. Broken into 5 subplots (1 per corner, plus 1 in the middle). (A-E 
Species: Includes Acer saccharum, Prunus serotina, Quercus alba, and Quercus rubra  
Light ISF: Light level quantified with HemiView software. Represents the amount of light reaching each subplot at a height of 1m.  
Light Cat: Categorical light level created by splitting the range of Light_ISF values into three bins (low, med, high).  
Core: Year the soil core was removed from the field.  
Soil: Species from which the soil core was taken. Includes all species, plus Acer rubrum, Populus grandidentata, and a sterilized conspecific for each species.  
Adult: Individual tree that soil was taken from. Up to 6 adults per species. Used as a random effect in analyses.  
Sterile: Whether the soil was sterilized or not.  
Conspecific: Whether the soil was conspecific, heterospecific, or sterilized conspecific.  
Myco: Mycorrhizal type of the seedling species (AMF or EMF).  
SoilMyco: Mycorrhizal type of the species culturing the soil (AMF or EMF).  
PlantDate: The date that seedlings were planted in the field pots.  
AMF: Percent arbuscular mycorrhizal fungi colonization on the fine roots of harvested seedlings.  
EMF: Percent ectomycorrhizal fungi colonization on the root tips of harvested seedlings.  
Phenolics: Calculated as nmol Gallic acid equivalents per mg dry extract (see manuscript for detailed methods)  
NSC: Calculated as percent dry mass nonstructural carbohydrates (see manuscript for detailed methods)  
Lignin: Calculated as percent dry mass lignin (see manuscript for detailed methods)  
Census: The census number at which time the seedling died or was harvested.  
Time: The number of days at which time the seedling died or was harvested.  
<span style="color:green">
Event: Used for survival analysis to indicate status of each individual seedling at a given time (above)  
    0 = harvested or experiment ended  
    1 = dead  
</span>
Harvest: Indicates whether the seedling was harvested for trait measurement.  
Alive: Indicates if the seedling was alive at the end of the second growing season. "X" in this field indicates alive status.
#### Missing data is coded as NA.

# 1. Scoping

The scope of this analysis is to predict tree survival on the basis of 3 factors (as chosen in the experiment): tree species, soil sources, and light levels.  
      
A second goal is to practise logistic regression. There might be other better options and approaches to predict survival,
but since the scope is to learn, the above-mentioned algorithm has been chosen.

# 3. EDA - Exploratory Data Analysis

In [51]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,No,Plot,Subplot,Species,Light_ISF,Light_Cat,Core,Soil,Adult,Sterile,...,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event,Harvest,Alive
0,126,1,C,Acer saccharum,0.106,Med,2017,Prunus serotina,I,Non-Sterile,...,22.0,,-0.56,13.86,12.15,4,14.0,1.0,,
1,11,1,C,Quercus alba,0.106,Med,2017,Quercus rubra,970,Non-Sterile,...,15.82,31.07,5.19,20.52,19.29,33,115.5,0.0,,X
2,12,1,C,Quercus rubra,0.106,Med,2017,Prunus serotina,J,Non-Sterile,...,24.45,28.19,3.36,24.74,15.01,18,63.0,1.0,,
3,2823,7,D,Acer saccharum,0.08,Med,2016,Prunus serotina,J,Non-Sterile,...,22.23,,-0.71,14.29,12.36,4,14.0,1.0,,
4,5679,14,A,Acer saccharum,0.06,Low,2017,Prunus serotina,689,Non-Sterile,...,21.15,,-0.58,10.85,11.2,4,14.0,1.0,,


In [52]:
# Display information about the DataFrame, including data types and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2783 entries, 0 to 2782
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   No           2783 non-null   int64  
 1   Plot         2783 non-null   int64  
 2   Subplot      2783 non-null   object 
 3   Species      2783 non-null   object 
 4   Light_ISF    2783 non-null   float64
 5   Light_Cat    2783 non-null   object 
 6   Core         2783 non-null   int64  
 7   Soil         2783 non-null   object 
 8   Adult        2783 non-null   object 
 9   Sterile      2783 non-null   object 
 10  Conspecific  2783 non-null   object 
 11  Myco         2783 non-null   object 
 12  SoilMyco     2783 non-null   object 
 13  PlantDate    2783 non-null   object 
 14  AMF          2783 non-null   float64
 15  EMF          1283 non-null   float64
 16  Phenolics    2783 non-null   float64
 17  Lignin       2783 non-null   float64
 18  NSC          2783 non-null   float64
 19  Census

In [53]:
# Display summary statistics for numerical columns in the DataFrame
df.describe()

Unnamed: 0,No,Plot,Light_ISF,Core,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event
count,2783.0,2783.0,2783.0,2783.0,2783.0,1283.0,2783.0,2783.0,2783.0,2783.0,2783.0,2782.0
mean,3914.513834,9.561624,0.085707,2016.64894,20.553069,26.47675,1.933105,15.759792,14.219641,15.28207,53.487244,0.570453
std,2253.515063,5.203659,0.025638,0.477387,12.309587,16.63689,1.969842,6.779607,4.298271,9.166555,32.082942,0.4951
min,3.0,1.0,0.032,2016.0,0.0,0.0,-1.35,2.23,4.3,4.0,14.0,0.0
25%,1971.0,5.0,0.066,2016.0,13.4,13.78,0.17,10.355,11.605,7.0,24.5,0.0
50%,3932.0,10.0,0.082,2017.0,18.0,27.72,0.75,14.04,12.66,13.0,45.5,1.0
75%,5879.0,14.0,0.1,2017.0,24.445,35.71,3.78,21.115,17.275,18.0,63.0,1.0
max,7772.0,18.0,0.161,2017.0,100.0,87.5,6.1,32.77,29.45,33.0,115.5,1.0


In [54]:
# Display the data types of each column in a DataFrame
df.dtypes

No               int64
Plot             int64
Subplot         object
Species         object
Light_ISF      float64
Light_Cat       object
Core             int64
Soil            object
Adult           object
Sterile         object
Conspecific     object
Myco            object
SoilMyco        object
PlantDate       object
AMF            float64
EMF            float64
Phenolics      float64
Lignin         float64
NSC            float64
Census           int64
Time           float64
Event          float64
Harvest         object
Alive           object
dtype: object

In [55]:
# Count the number of unique values in each column of a DataFrame
df.nunique()

No             2783
Plot             18
Subplot           5
Species           4
Light_ISF        53
Light_Cat         3
Core              2
Soil              7
Adult            36
Sterile           2
Conspecific       3
Myco              2
SoilMyco          3
PlantDate        19
AMF             924
EMF             682
Phenolics       494
Lignin         1095
NSC             998
Census           22
Time             22
Event             2
Harvest           1
Alive             1
dtype: int64

# 4. Handling Missing Values

In [56]:
# Count the number of missing (or NaN)
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 No                0
Plot              0
Subplot           0
Species           0
Light_ISF         0
Light_Cat         0
Core              0
Soil              0
Adult             0
Sterile           0
Conspecific       0
Myco              0
SoilMyco          0
PlantDate         0
AMF               0
EMF            1500
Phenolics         0
Lignin            0
NSC               0
Census            0
Time              0
Event             1
Harvest        2079
Alive          2292
dtype: int64


##### The size of 2783 entries is robust for this analysis. There is a missing value that can be filled in or dropped in Event.

#### Option 1: Fill any missing values in the DataFrame with the value 0

In [57]:
# Fills any missing values in the DataFrame with the value 0
# df = df.fillna(0)
# df

### OR

#### Option 2: Drop rows where the 'Event' column has missing values

In [58]:
### We predict "Event", so there's no point in keeping the few rows where we don't have Event data

# Drop rows where the 'Event' column has missing values
df.dropna(subset=['Event'], inplace=True)

# Convert 'Event' to integers
#df['Event'] = df['Event'].astype(int)

### Data Exploration after Handling Missing Values

In [59]:
# Display the first few rows of the DataFrame after dropping missing values and converting 'Event' to integers
df.head()

Unnamed: 0,No,Plot,Subplot,Species,Light_ISF,Light_Cat,Core,Soil,Adult,Sterile,...,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event,Harvest,Alive
0,126,1,C,Acer saccharum,0.106,Med,2017,Prunus serotina,I,Non-Sterile,...,22.0,,-0.56,13.86,12.15,4,14.0,1.0,,
1,11,1,C,Quercus alba,0.106,Med,2017,Quercus rubra,970,Non-Sterile,...,15.82,31.07,5.19,20.52,19.29,33,115.5,0.0,,X
2,12,1,C,Quercus rubra,0.106,Med,2017,Prunus serotina,J,Non-Sterile,...,24.45,28.19,3.36,24.74,15.01,18,63.0,1.0,,
3,2823,7,D,Acer saccharum,0.08,Med,2016,Prunus serotina,J,Non-Sterile,...,22.23,,-0.71,14.29,12.36,4,14.0,1.0,,
4,5679,14,A,Acer saccharum,0.06,Low,2017,Prunus serotina,689,Non-Sterile,...,21.15,,-0.58,10.85,11.2,4,14.0,1.0,,


In [60]:
# Display the shape (number of rows and columns) of the DataFrame
df.shape

(2782, 24)

In [61]:
# Count the number of missing (or NaN)
df.any().isnull()

No             False
Plot           False
Subplot        False
Species        False
Light_ISF      False
Light_Cat      False
Core           False
Soil           False
Adult          False
Sterile        False
Conspecific    False
Myco           False
SoilMyco       False
PlantDate      False
AMF            False
EMF            False
Phenolics      False
Lignin         False
NSC            False
Census         False
Time           False
Event          False
Harvest        False
Alive          False
dtype: bool

##### Missing values in the dataset have been filled in or dropped.

In [62]:
# Display summary statistics for numerical columns in the DataFrame
df.describe()

Unnamed: 0,No,Plot,Light_ISF,Core,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event
count,2782.0,2782.0,2782.0,2782.0,2782.0,1282.0,2782.0,2782.0,2782.0,2782.0,2782.0,2782.0
mean,3913.127606,9.558591,0.08568,2016.648814,20.554349,26.479844,1.932261,15.75849,14.217074,15.275701,53.464953,0.570453
std,2252.733097,5.202133,0.025603,0.477427,12.311615,16.643013,1.969693,6.780478,4.29691,9.162042,32.067148,0.4951
min,3.0,1.0,0.032,2016.0,0.0,0.0,-1.35,2.23,4.3,4.0,14.0,0.0
25%,1970.5,5.0,0.066,2016.0,13.4,13.78,0.17,10.3525,11.6025,7.0,24.5,0.0
50%,3931.5,10.0,0.082,2017.0,18.0,27.73,0.75,14.04,12.655,13.0,45.5,1.0
75%,5877.75,14.0,0.1,2017.0,24.4475,35.71,3.78,21.1175,17.27,18.0,63.0,1.0
max,7772.0,18.0,0.161,2017.0,100.0,87.5,6.1,32.77,29.45,33.0,115.5,1.0


##### There seem to be no unrealistic outliers.

In [63]:
# Print the counts of unique values in the 'Species' column
print(df['Species'].value_counts())

Species
Acer saccharum     751
Prunus serotina    749
Quercus alba       672
Quercus rubra      610
Name: count, dtype: int64


<div style="display: flex; justify-content: space-around;">
  <div style="text-align: center; margin: 0 10px;">
    <div style="width: 200px; height: 150px; overflow: hidden;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/a/a1/Acer_saccharum_1-jgreenlee_%285098070608%29.jpg/800px-Acer_saccharum_1-jgreenlee_%285098070608%29.jpg?20131123060807"
 alt="Image 1" style="width: 200px; height: auto;" />
    </div>
    <p>Acer saccharum (sugar maple)</p>
  </div>
  <div style="text-align: center; margin: 0 10px;">
    <div style="width: 200px; height: 150px; overflow: hidden;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/8e/Quercus_alba_2-acorn_branch.jpg/777px-Quercus_alba_2-acorn_branch.jpg?20161009181747"
 alt="Image 2" style="width: 200px; height: auto;" />
    </div>
    <p>Quercus alba (white oak)</p>
  </div>
  <div style="text-align: center; margin: 0 10px;">
    <div style="width: 200px; height: 150px; overflow: hidden;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/f/fd/Quercus_rubra_1.jpg/220px-Quercus_rubra_1.jpg"
 alt="Image 3" style="width: 200px; height: auto;" />
    </div>
    <p>Quercus rubra (northern red oak)</p>
  </div>
  <div style="text-align: center; margin: 0 10px;">
    <div style="width: 200px; height: 150px; overflow: hidden;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Amerikaanse_vogelkers_Prunus_serotina_closeup.jpg/800px-Amerikaanse_vogelkers_Prunus_serotina_closeup.jpg" alt="Image 4" style="width: 200px; height: auto;" />
    </div>
    <p>Prunus serotina (wild black cherry)</p>
  </div>
</div>

In [64]:
# Print the counts of unique values in the 'Soil' column
print(df['Soil'].value_counts())

Soil
Sterile                  422
Prunus serotina          413
Quercus rubra            402
Acer saccharum           397
Populus grandidentata    391
Quercus alba             381
Acer rubrum              376
Name: count, dtype: int64


In [65]:
# Print the counts of unique values in the 'Light_Cat' column
print(df['Light_Cat'].value_counts())

Light_Cat
Med     1474
Low     1005
High     303
Name: count, dtype: int64


# 5. Feature Engineering

In [66]:
# df['Alive'] = df['Alive'].replace('X', 1)
# df['Harvest'] = df['Harvest'].replace('X', 1)

# label = preprocessing.LabelEncoder()
# for column in df.columns:
#     df[column] = label.fit_transform(df[column])
    
# df

# 6. Data Visualization

In [67]:
from plotly.express import bar
for column in ['Plot', 'Subplot', 'Species', 'Light_ISF', 'Light_Cat', 'Core', 'Soil',
       'Adult', 'Sterile', 'Conspecific', 'Myco', 'SoilMyco', 'PlantDate', 
               'Census', 'Time', ]:
    bar(data_frame=df[[column, 'Event']].groupby(by=[column, 'Event']).size().reset_index(), x=column, y=0, color='Event',
     color_continuous_scale='bluered').show()

In [68]:
from plotly.express import histogram
for column in ['AMF', 'EMF', 'Phenolics', 'Lignin', 'NSC',]:
    histogram(data_frame=df, x=column, color='Event').show()

We really see modal behavior with two chemicals: Lignin and Phenolics.

In [69]:
from plotly.express import scatter
scatter(data_frame=df, x='Lignin', y='Phenolics', color='Event', color_continuous_scale='bluered')

Clearly once we know the Lignin and Phenolics content we know most of what need to know.

# 7. Modeling

In [70]:
# Import the F1 score metric from sklearn
from sklearn.metrics import f1_score

# Calculate the F1 score
f1_score_result = f1_score(
    y_true=df['Event'].values,  # True labels
    y_pred=[int(value < 1.2) for value in df['Phenolics'].values]  # Predicted labels based on a threshold
)

# Print the calculated F1 score
print("F1 Score:", f1_score_result)

F1 Score: 0.8245614035087718


In [71]:
# Import the confusion matrix metric from sklearn
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
conf_matrix = confusion_matrix(
    y_true=df['Event'].values,  # True labels
    y_pred=[int(value < 1.2) for value in df['Phenolics'].values]  # Predicted labels based on a threshold
)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[ 973  222]
 [ 318 1269]]


In [72]:
# Import t-SNE (t-distributed Stochastic Neighbor Embedding) from sklearn
from sklearn.manifold import TSNE

# Initialize t-SNE with specific parameters
tsne = TSNE(random_state=2023, verbose=1, n_components=2)

# Specify columns to be used for t-SNE
columns = ['Phenolics', 'Lignin', 'Time']

# Apply t-SNE and add the resulting components to the DataFrame
df[['t0', 't1']] = tsne.fit_transform(X=df[columns])

# Import the scatter function from plotly.express
from plotly.express import scatter

# Create a scatter plot with t-SNE components, color-coded by 'Event'
scatter(data_frame=df, x='t0', y='t1', color='Event', color_continuous_scale='bluered', hover_name=df.index)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2782 samples in 0.002s...
[t-SNE] Computed neighbors for 2782 samples in 0.072s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2782
[t-SNE] Computed conditional probabilities for sample 2000 / 2782
[t-SNE] Computed conditional probabilities for sample 2782 / 2782
[t-SNE] Mean sigma: 0.374585
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.891567
[t-SNE] KL divergence after 1000 iterations: 0.251987


If we do a little dimension reduction and we choose our variables carefully we can see that we have four distinct subgroups, three of which are easily classified.

# 8. Logistic Regression

### 8.1 Pre-processing

In [73]:
# Scoping the df
scoped_df = df[['Species', 'Soil', 'Light_Cat', 'Event']]

In [74]:
scoped_df.head(3)

Unnamed: 0,Species,Soil,Light_Cat,Event
0,Acer saccharum,Prunus serotina,Med,1.0
1,Quercus alba,Quercus rubra,Med,0.0
2,Quercus rubra,Prunus serotina,Med,1.0


In [75]:
def encode_one_hot():
    # One hot encoding
    encoder = OneHotEncoder()
    categorical_columns = ['Species', 'Soil', 'Light_Cat']
    encoded_data = encoder.fit_transform(scoped_df[categorical_columns]).toarray()
    # Create column names
    encoded_columns = []
    for i, category in enumerate(encoder.categories_):
        encoded_columns.extend([f"{categorical_columns[i]}_{cat}" for cat in category])
    # print(encoded_columns)

    encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)
    encoded_df['Event'] = scoped_df['Event'].values
    return encoded_df

In [76]:
encoded_df = encode_one_hot()

### 8.2 Model

In [77]:
y = encoded_df['Event']
X = encoded_df.drop('Event', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print(X)

In [78]:
model = LogisticRegression()

In [79]:
model.fit(X_train, y_train)

### 8.3 Performance Evaluation

In [80]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.76      0.85      0.80       234
         1.0       0.88      0.80      0.84       323

    accuracy                           0.82       557
   macro avg       0.82      0.83      0.82       557
weighted avg       0.83      0.82      0.83       557



In [81]:
print(confusion_matrix(y_test, y_pred))

[[200  34]
 [ 64 259]]


200 - true negatives  
64 - false negatives  
34 - false positives  
259 - true positives  

### 8.4 Testing

In [82]:
# We want to use a df in the same format as the data we submit to .predict()
columns = X_test.columns
print(columns)

Index(['Species_Acer saccharum', 'Species_Prunus serotina',
       'Species_Quercus alba', 'Species_Quercus rubra', 'Soil_Acer rubrum',
       'Soil_Acer saccharum', 'Soil_Populus grandidentata',
       'Soil_Prunus serotina', 'Soil_Quercus alba', 'Soil_Quercus rubra',
       'Soil_Sterile', 'Light_Cat_High', 'Light_Cat_Low', 'Light_Cat_Med'],
      dtype='object')


In [83]:
test_df = pd.DataFrame([[1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0]], 
                       index=[0], columns=columns)

test_df.head(3)

Unnamed: 0,Species_Acer saccharum,Species_Prunus serotina,Species_Quercus alba,Species_Quercus rubra,Soil_Acer rubrum,Soil_Acer saccharum,Soil_Populus grandidentata,Soil_Prunus serotina,Soil_Quercus alba,Soil_Quercus rubra,Soil_Sterile,Light_Cat_High,Light_Cat_Low,Light_Cat_Med
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [84]:
model.predict(test_df)

array([1.])

The model predicts that Acer Saccharum (sugar maple), growing in soil conditions typically altered by Prunus Serotina (black cherry), and under high light conditions, will not survive.

# 9. Random Forest

In [85]:
# Display the first few rows of the DataFrame after dropping missing values and converting 'Event' to integers
df.head()

Unnamed: 0,No,Plot,Subplot,Species,Light_ISF,Light_Cat,Core,Soil,Adult,Sterile,...,Phenolics,Lignin,NSC,Census,Time,Event,Harvest,Alive,t0,t1
0,126,1,C,Acer saccharum,0.106,Med,2017,Prunus serotina,I,Non-Sterile,...,-0.56,13.86,12.15,4,14.0,1.0,,,-60.385208,2.421222
1,11,1,C,Quercus alba,0.106,Med,2017,Quercus rubra,970,Non-Sterile,...,5.19,20.52,19.29,33,115.5,0.0,,X,49.527573,6.706789
2,12,1,C,Quercus rubra,0.106,Med,2017,Prunus serotina,J,Non-Sterile,...,3.36,24.74,15.01,18,63.0,1.0,,,18.951324,37.580952
3,2823,7,D,Acer saccharum,0.08,Med,2016,Prunus serotina,J,Non-Sterile,...,-0.71,14.29,12.36,4,14.0,1.0,,,-60.387596,2.426194
4,5679,14,A,Acer saccharum,0.06,Low,2017,Prunus serotina,689,Non-Sterile,...,-0.58,10.85,11.2,4,14.0,1.0,,,-60.412868,2.339119


### 9.1 Replacing X with Int=1

In [86]:
df['Alive'] = df['Alive'].replace('X', 1)
df['Harvest'] = df['Harvest'].replace('X', 1)

label = preprocessing.LabelEncoder()
for column in df.columns:
    df[column] = label.fit_transform(df[column])
    
df

Unnamed: 0,No,Plot,Subplot,Species,Light_ISF,Light_Cat,Core,Soil,Adult,Sterile,...,Phenolics,Lignin,NSC,Census,Time,Event,Harvest,Alive,t0,t1
0,50,0,2,0,40,2,1,3,34,0,...,27,433,306,0,0,1,1,1,2,1136
1,1,0,2,2,40,2,1,5,30,0,...,457,680,776,21,21,0,1,0,2110,1228
2,2,0,2,3,40,2,1,3,35,0,...,283,898,522,12,12,1,1,1,1634,1960
3,1002,6,3,0,24,2,0,3,35,0,...,13,467,326,0,0,1,1,1,1,1137
4,2022,13,0,0,9,1,1,3,26,0,...,25,309,216,0,0,1,1,1,0,1134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2777,2522,16,2,2,46,0,1,5,15,0,...,478,740,755,10,10,1,1,1,1518,2107
2778,2549,16,1,1,44,2,1,2,27,0,...,156,192,281,10,10,1,1,1,1847,188
2779,2566,16,3,2,45,2,1,0,13,0,...,429,577,959,10,10,1,1,1,1508,2108
2780,2603,16,3,2,45,2,1,5,12,0,...,477,721,765,10,10,1,1,1,1516,2106


### 9.2 Modeling

In [87]:
the_first_target = df['Harvest']
the_second_target = df['Alive']

df = df.drop(columns=['Harvest','Alive'])

X_train_first, x_val_first, y_train_first, y_val_first = train_test_split(df, the_first_target, test_size=0.2, random_state=64)
print('Shape of train', X_train_first.shape)
print('Shape of Validation ', x_val_first.shape)

Shape of train (2225, 24)
Shape of Validation  (557, 24)


In [88]:
X_train_second, x_val_second, y_train_second, y_val_second = train_test_split(df, the_first_target, test_size=0.2, random_state=64)
print('Shape of train', X_train_second.shape)
print('Shape of Validation ', y_train_second.shape)

Shape of train (2225, 24)
Shape of Validation  (2225,)


In [89]:
the_first_rfc = RandomForestClassifier(criterion='entropy', n_estimators=20000)

In [90]:
the_first_rfc.fit(X_train_first, y_train_first)

In [91]:
filemodelname = 'the_first_rfc'

with open(filemodelname, 'wb') as handle:
          p.dump(the_first_rfc, handle, protocol=p.HIGHEST_PROTOCOL)

In [92]:
the_second_rfc = RandomForestClassifier(criterion='entropy', n_estimators=20000)

In [93]:
the_second_rfc.fit(X_train_second, y_train_first)

In [94]:
filemodelname = 'the_second_rfc'

with open(filemodelname, 'wb') as handle:
          p.dump(the_first_rfc, handle, protocol=p.HIGHEST_PROTOCOL)

### 9.3 Result's Wathing

In [95]:
def predict(inpt, model):
    try:
        with open(model, 'rb') as handle:
            model = p.load(handle)
        inn = []            
        val_pred = model.predict(inpt)
        return val_pred
    
    except:
        return 'The exeption is in RandomForest.predict'

In [96]:
y_val_first = y_val_first.to_list()
y_val_second = y_val_second.to_list()

In [97]:
inpt = x_val_first
the_first_res = predict(inpt, 'the_first_rfc')
inpt = x_val_second
the_second_res = predict(inpt, 'the_second_rfc')

In [98]:
accuracy_el = []
for i in range(0, len(y_val_first)):
    accuracy_el.append(y_val_first[i] - the_first_res[i])
accuracy_first = accuracy_el.count(0) / len(y_val_first)
accuracy_first

1.0

In [99]:
accuracy_el = []
for i in range(0, len(y_val_first)):
    accuracy_el.append(y_val_second[i] - the_second_res[i])
accuracy_second = accuracy_el.count(0) / len(y_val_second)
accuracy_second

1.0