In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Awesome package for non-linear correlation analysis
# should be really useful for feature selection
# https://www.sciencedirect.com/science/article/pii/S2352711021000315#fig2
# https://github.com/ElsevierSoftwareX/SOFTX-D-20-00028
from ennemi import pairwise_mi, pairwise_corr

# Datadir for location of in/out vars
datadir = '/n/home02/pbb/scripts/SelenkayDiversity/data/in'

figd = '/n/home02/pbb/scripts/SelenkayDiversity/figs/mango'

# toggle radius of inquiry
radius = 30 

# Load input data (cleaned in R)
XY_scale = pd.read_csv(f'{datadir}/XY_scaled_{radius}mRadius.csv')
XY = pd.read_csv(f'{datadir}/XY.csv')

# Drop factor and character arrays from X df
X = XY_scale.drop(['Unnamed: 0', 'Richness',
                   'Abundance', 'Evenness', 'shannonH',
                   'Spot', 'Soil_f',
                   'Treatment', 'Treatment_f', 'Transect_f',
                   'nlayers_plot_f',
                   'Simpson'], axis=1)
X = X.dropna(axis=1)

# Make y df
y = XY_scale.filter(['Spot', 'Abundance', 'Richness', 'shannonH', 'Evenness', 'Simpson', 'Soil', 'Treatment'])
y.head()

Unnamed: 0,Spot,Abundance,Richness,shannonH,Evenness,Simpson,Soil,Treatment
0,1A,253,55,3.476387,0.867506,21.690613,Black,Inside
1,1B,152,40,3.404402,0.922882,23.867769,Black,Inside
2,1C,153,40,3.353972,0.909212,21.816403,Black,Inside
3,1D,207,50,3.61848,0.924964,26.433683,Black,Inside
4,1E,158,44,3.488376,0.921829,26.112971,Black,Inside


In [10]:
# Filter column and make dataframe
# cols = [c for c in X.columns if not (('iqr' in c) | ('max_' in c) | ('median' in c ) | ('Spot' in c))]
# cols = [c for c in X.columns if not (('iqr' in c) | ('max_' in c) | ('Spot' in c) | ('Spot' in c)| ('Soil' in c))]
# cols = [c for c in X.columns if (('mean' in c) | ('std' in c))]
cols = [c for c in X.columns if not (('max_' in c) | ('Spot' in c))]
# XYcorr = XYcorr.drop(columns = ['PAIsum_0m', 'PAIsum_0mto1p5m', 'PAImean_0mto1p5m'])
XYcorr = pd.concat([X[cols],
                    y[['Abundance', 'Richness',
                       'shannonH', 'Simpson',
                       'Evenness']]],
                   axis=1)
len(cols)


146

In [5]:
# Compute MI correlation
corr = pairwise_corr(XYcorr.drop('Soil', axis=1), drop_nan=True)

In [7]:
# Turn it back into a dataframe
corr = pd.DataFrame(corr[['Abundance', 'Richness',
                          'shannonH', 'Evenness',
                          'Simpson']].iloc[0:-5])

shancorr = pd.DataFrame(corr.shannonH.sort_values(ascending=False))

simpcorr = pd.DataFrame(corr.Simpson.sort_values(ascending=False))

richcorr = pd.DataFrame(corr.Richness.sort_values(ascending=False))

abuncorr = pd.DataFrame(corr.Abundance.sort_values(ascending=False))

In [8]:
# Top 10 vars 
shancorr.iloc[0:10], abuncorr.iloc[0:10], richcorr.iloc[0:10], simpcorr.iloc[0:10]

(                      shannonH
 cv_CD_AboveG          0.721042
 sd_CD_AboveG          0.692585
 sd_sdH_vegtype_grass  0.666100
 sd_CD_G               0.652806
 sd_nlayers            0.650367
 mean_CD_AboveG        0.649473
 mean_cvH              0.645154
 sd_cscore             0.638323
 FHD_plot              0.632821
 Cover0p5m_plot        0.628313,
                       Abundance
 horzcover_grass        0.592427
 X                      0.566541
 sd_cvH_vegtype_grass   0.540966
 FHD_plot               0.524338
 mean_sdH               0.522702
 sd_FHD                 0.485800
 sd_PAI_AboveG          0.483594
 mean_sdHgrasslayer     0.469406
 mean_maxHgrasslayer    0.459184
 mean_maxH              0.458546,
                           Richness
 sd_sdHgrasslayer          0.706925
 sd_maxHgrasslayer         0.690158
 mean_cscore               0.637144
 sd_herbh                  0.627520
 mean_PAI_AboveG           0.599016
 sd_CD_AboveG              0.595896
 mean_CD_AboveGgrasslayer  0.58

### Soil Type

In [None]:
XYcorr_black = XYcorr.loc[XYcorr.Soil == 'Black']
XYcorr_black = XYcorr_black.drop('Soil', axis=1)

corr_black = pairwise_corr(XYcorr_black,
                           drop_nan=True)

In [9]:
# Top 10 vars 
# Shannon
print(corr_black['shannonH'].sort_values(ascending=False)[:12])

sd_CD_AboveG      0.822093
sd_nlayers        0.805459
sd_cscore         0.764159
mean_stdpeakh     0.761102
mean_maxpeakh     0.748249
sd_stdpeakh       0.745507
sd_gapsize        0.739555
sd_maxpeakh       0.732250
Cover1p5m_plot    0.721031
sd_maxH           0.712362
mean_CD_AboveG    0.709129
cv_nlayers        0.697994
Name: shannonH, dtype: float64


In [10]:
XYcorr_red = XYcorr.loc[XYcorr.Soil == 'Red']
XYcorr_red= XYcorr_red.drop('Soil', axis=1)

corr_red = pairwise_corr(XYcorr_red, drop_nan=True)

In [11]:
# Top 10 vars 
# Shannon
print(corr_red['shannonH'].sort_values(ascending=False)[:12])

horzcover_grass           0.824078
Richness                  0.759139
mean_ptoh                 0.748097
cv_CD_Ggrasslayer         0.746930
sd_cvH_vegtype_woody      0.724775
X                         0.699406
mean_cvH_vegtype_woody    0.682244
sd_PAI_G                  0.672571
sd_PAI_AboveG             0.667695
X50thPerc_plot            0.656653
cv_maxHgrasslayer         0.653384
sd_sdH_vegtype_grass      0.651516
Name: shannonH, dtype: float64
