In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


This notebook is used to get LDA data of daily imputed data from both ViT and Thresholding (side and top), after deleting the dead samples.



In [24]:
slow_growers = Path("../data/slow_grower.txt").read_text().strip().split(",")

In [25]:
df_vit = pd.read_csv('../result/side_top_features_vit_linearimputed_day.csv')
df_vit = df_vit[~df_vit['Plant Info'].isin(slow_growers)]


df_threshold = pd.read_csv('../result/side_top_features_thresholding_linearimputed_day.csv')
df_threshold = df_threshold[~df_threshold['Plant Info'].isin(slow_growers)]

In [26]:
print(df_vit.shape, df_threshold.shape)

(15588, 104) (15588, 96)


In [5]:
print(df_vit.columns, df_threshold.columns)

Index(['Plant Info', 'Date', 'side_area', 'side_convex_hull_area',
       'side_solidity', 'side_perimeter', 'side_width', 'side_height',
       'side_longest_path', 'side_center_of_mass_x',
       ...
       'top_hue_max', 'top_saturation_mean', 'top_saturation_var',
       'top_saturation_max', 'top_value_mean', 'top_value_var',
       'top_value_min', 'top_value_max', 'Genotype', 'Replicate'],
      dtype='object', length=104) Index(['Plant Info', 'Date', 'side_area', 'side_convex_hull_area',
       'side_solidity', 'side_perimeter', 'side_width', 'side_height',
       'side_longest_path', 'side_center_of_mass_x', 'side_center_of_mass_y',
       'side_convex_hull_vertices', 'side_ellipse_center_x',
       'side_ellipse_center_y', 'side_ellipse_major_axis',
       'side_ellipse_minor_axis', 'side_ellipse_angle',
       'side_ellipse_eccentricity', 'side_blue_mean', 'side_blue_var',
       'side_blue_min', 'side_blue_max', 'side_green_mean', 'side_green_var',
       'side_green_min', 

In [27]:
metric_cols_vit = [col for col in df_vit.columns if col not in ['Plant Info','Date', 'Genotype','Replicate']]
metric_cols_threshold = [col for col in df_threshold.columns if col not in ['Plant Info','Date', 'Genotype','Replicate']]

### ViT

In [28]:
y_vit = df_vit['Genotype']

In [29]:
metric_cols_vit_side = [col for col in metric_cols_vit if 'side' in col]
metric_cols_vit_top = [col for col in metric_cols_vit if 'top' in col]
print(len(metric_cols_vit_side), len(metric_cols_vit_top),len(metric_cols_vit))

50 50 100


In [32]:
# Scale features
X_vit = df_vit[metric_cols_vit]

scaler = StandardScaler()
X_vit_scaled = scaler.fit_transform(X_vit)


# Run LDA
lda = LinearDiscriminantAnalysis()
X_lda_vit = lda.fit_transform(X_vit_scaled, y_vit)



In [47]:
X_lda_vit

array([[ 0.11114098, -1.27642939,  0.94964383, ...,  0.68943922,
        -0.77998201, -0.14942455],
       [ 0.11114098, -1.27642939,  0.94964383, ...,  0.68943922,
        -0.77998201, -0.14942455],
       [ 0.15682442, -1.24801157,  0.84524372, ...,  0.49728282,
        -0.82696591, -0.00945259],
       ...,
       [ 1.0032638 ,  0.6248521 ,  0.26817769, ...,  1.46593005,
         0.37290832, -1.72060214],
       [ 1.0032638 ,  0.6248521 ,  0.26817769, ...,  1.46593005,
         0.37290832, -1.72060214],
       [ 1.0032638 ,  0.6248521 ,  0.26817769, ...,  1.46593005,
         0.37290832, -1.72060214]])

In [38]:
X_vit_scaled.shape

(15588, 100)

In [None]:
n_components = lda.scalings_.shape[1]
ld_names = [f"LD{i+1}" for i in range(n_components)]

loadings = pd.DataFrame(
    lda.scalings_,
    index=metric_cols_vit,
    columns=ld_names
)



(100, 59)


In [46]:
print(loadings.head)

<bound method NDFrame.head of                                 LD1           LD2           LD3           LD4  \
side_area              1.516137e+00  1.353104e-01 -1.555387e-01 -3.988216e-01   
side_convex_hull_area  2.954174e-01 -7.524865e-01  2.001419e-01  1.603881e+00   
side_solidity          3.896261e-01  3.456757e-01 -4.296772e-01 -2.540946e-01   
side_perimeter        -2.291959e-01 -4.723562e-01  9.424469e-02 -6.264710e-01   
side_width             2.700787e-01  9.014515e-02 -7.079006e-02 -1.811999e-01   
...                             ...           ...           ...           ...   
top_saturation_max    -9.390496e-15  7.316686e-15 -1.195439e-14 -9.534590e-16   
top_value_mean         1.691958e+00  1.345686e+00  1.826420e+00 -6.276589e-01   
top_value_var          8.639795e-03 -5.390625e-02  2.886505e-02  3.240206e-02   
top_value_min         -1.023184e-01  7.637017e-02 -1.972447e-02  4.732046e-02   
top_value_max         -3.336747e-01 -6.440465e-01 -1.073885e+00 -6.262377e-01  

In [44]:
loadings.loc['top_blue_min']

LD1     5.496523e-14
LD2     1.387508e-13
LD3    -9.500614e-14
LD4     1.208720e-13
LD5    -2.053345e-13
LD6    -1.035488e-13
LD7     4.257183e-13
LD8     5.528912e-13
LD9     6.250290e-13
LD10   -2.050546e-13
LD11    4.509047e-13
LD12    1.098471e-12
LD13   -5.712412e-13
LD14   -3.346091e-13
LD15    6.010263e-13
LD16   -5.119395e-14
LD17   -8.543848e-13
LD18   -4.753688e-13
LD19    2.407131e-13
LD20    1.906018e-13
LD21   -4.973235e-14
LD22   -8.343391e-14
LD23   -4.783458e-13
LD24   -3.677798e-13
LD25   -4.081869e-13
LD26   -1.170514e-12
LD27    3.027611e-13
LD28    8.400502e-14
LD29   -3.606406e-13
LD30   -6.515215e-13
LD31    4.222615e-13
LD32   -5.302224e-13
LD33   -2.649848e-13
LD34    6.188125e-13
LD35    1.447053e-13
LD36   -4.867004e-14
LD37   -1.744448e-13
LD38    1.127288e-14
LD39    3.409503e-13
LD40    2.297048e-13
LD41    1.069563e-13
LD42   -2.049787e-13
LD43    5.735244e-13
LD44    3.202596e-13
LD45   -2.189586e-13
LD46   -4.057244e-14
LD47   -2.408956e-13
LD48   -3.708

In [45]:
loadings.loc['top_saturation_max']

LD1    -9.390496e-15
LD2     7.316686e-15
LD3    -1.195439e-14
LD4    -9.534590e-16
LD5    -2.626775e-15
LD6     2.302327e-14
LD7    -4.837961e-14
LD8    -4.759883e-14
LD9    -6.672312e-14
LD10    2.421781e-14
LD11   -4.179643e-14
LD12   -7.159649e-14
LD13    2.647955e-14
LD14    2.983620e-14
LD15   -4.734986e-14
LD16    2.068549e-14
LD17    4.925493e-14
LD18    2.278738e-14
LD19    1.654703e-14
LD20   -1.508907e-14
LD21   -4.071043e-14
LD22    1.650468e-14
LD23    1.606805e-14
LD24    2.773335e-14
LD25    3.968096e-14
LD26    9.055766e-14
LD27   -3.196450e-14
LD28   -7.207461e-15
LD29    3.385992e-14
LD30    3.318550e-14
LD31   -3.950663e-14
LD32    3.925494e-14
LD33    2.003324e-14
LD34   -4.480440e-14
LD35   -2.600465e-14
LD36    6.208300e-15
LD37    6.867085e-16
LD38   -8.124083e-15
LD39   -3.655754e-14
LD40   -2.282342e-14
LD41    6.925427e-15
LD42    1.175781e-15
LD43   -2.956298e-14
LD44   -3.711427e-14
LD45    2.447350e-14
LD46    5.209710e-15
LD47    2.876437e-14
LD48    2.505

In [34]:
# variance of each trait column
var_series = X_vit.var()

zero_var_traits = var_series[var_series == 0].index.tolist()
print("Zero-variance traits:", zero_var_traits)


Zero-variance traits: ['top_blue_min', 'top_saturation_max']


In [36]:
var_series[var_series == 0]

top_blue_min          0.0
top_saturation_max    0.0
dtype: float64

In [21]:
# variance of each trait column
var_series = X_scaled_vit.var()

zero_var_traits = var_series[var_series == 0].index.tolist()
print("Zero-variance traits:", zero_var_traits)


AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [22]:
X_sub_lda.shape

(15588, 2)

In [None]:

df_lda_sub = pd.DataFrame({
    "LD1": X_sub_lda[:,0],
    "LD2": X_sub_lda[:,1],
    "Genotype": y,
    "Day": features_df['Day'],
    "Replicate": features_df['Replicate']
})