In [1]:
import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

In [2]:
import statsmodels.api as sm
import pandas as pd
from statsmodels.formula.api import ols

In [3]:
a_classified = pd.read_excel('../input/quotient_deviation_df_A_CLASSIFIED.xlsx')

## ANOVA

In [23]:
names = []
sum_sq = []
mean_sq = []
Fs = []
PRs = []
for j in range(15):
    lm = ols(f'D{j+1} ~ Class_4',data=a_classified.rename({'Class 4':'Class_4'}, axis=1)).fit()
    table = sm.stats.anova_lm(lm)
    names += [f"D{j+1}"]
    sum_sq += [table['sum_sq']['Class_4']]
    mean_sq += [table['mean_sq']['Class_4']]
    Fs += [table['F']['Class_4']]
    PRs += [table['PR(>F)']['Class_4']]

for j in range(15):
    lm = ols(f'CWT{j+1} ~ Class_4',data=a_classified.rename({'Class 4':'Class_4'}, axis=1)).fit()
    table = sm.stats.anova_lm(lm)
    names += [f"CWT{j+1}"]
    sum_sq += [table['sum_sq']['Class_4']]
    mean_sq += [table['mean_sq']['Class_4']]
    Fs += [table['F']['Class_4']]
    PRs += [table['PR(>F)']['Class_4']]

trach_anova = pd.DataFrame({
    'Feature': names,
    'sum_sq': sum_sq,
    'mean_sq': mean_sq,
    'F':Fs,
    'PR(>F)':PRs
})

In [25]:
trach_anova

Unnamed: 0,Feature,sum_sq,mean_sq,F,PR(>F)
0,D1,1.60386,1.60386,249.928035,4.629524e-37
1,D2,1.833837,1.833837,314.054995,7.220491e-43
2,D3,1.910133,1.910133,344.113012,2.416706e-45
3,D4,2.171801,2.171801,410.061271,2.522064e-50
4,D5,2.401825,2.401825,439.071456,2.3959230000000003e-52
5,D6,2.796036,2.796036,496.526304,4.293045e-56
6,D7,3.383801,3.383801,572.352169,1.369849e-60
7,D8,3.949869,3.949869,581.345766,4.295595e-61
8,D9,4.441544,4.441544,526.538108,6.2682069999999995e-58
9,D10,4.357845,4.357845,414.310005,1.257831e-50


In [None]:
trach_anova.to_excel('../output/tracheides_ANOVA.xlsx', index=False)

## Kruskal–Wallis test

In [19]:
from scipy.stats import mstats

def get_class_values(df, cl, col):
    return list(df[df['Class 4']==cl][col])

d_col = []
d_stats = []
d_pvalue = []

df_diam = a_classified.iloc[:,list(range(0,16))+[33]]
for column in df_diam.columns:

    if column == 'Year' or column == 'Class 4':
        continue
    s, p = mstats.kruskalwallis(get_class_values(df_diam, 1, column),
                                get_class_values(df_diam, 2, column),
                                get_class_values(df_diam, 3, column),
                                get_class_values(df_diam, 4, column))
    d_col += [column]
    d_stats += [s]
    d_pvalue += [p]
d_df = pd.DataFrame({
    'Feature': d_col,
    'Statistic': d_stats,
    'P-value': d_pvalue
})
cwt_col = []
cwt_stats = []
cwt_pvalue = []

df_cwt=a_classified.iloc[:,[0, 33] + list(range(16,31))]
for column in df_cwt.columns:

    if column == 'Year' or column == 'Class 4':
        continue
    s, p = mstats.kruskalwallis(get_class_values(df_cwt, 1, column),
                                get_class_values(df_cwt, 2, column),
                                get_class_values(df_cwt, 3, column),
                                get_class_values(df_cwt, 4, column))
    cwt_col += [column]
    cwt_stats += [s]
    cwt_pvalue += [p]

cwt_df = pd.DataFrame({
    'Feature': cwt_col,
    'Statistic': cwt_stats,
    'P-value': cwt_pvalue
})

In [29]:
d_df

Unnamed: 0,Feature,Statistic,P-value
0,D1,74.855297,3.891526e-16
1,D2,81.401479,1.535995e-17
2,D3,83.840611,4.60257e-18
3,D4,96.176684,1.031384e-20
4,D5,101.50631,7.372053e-22
5,D6,111.120138,6.29886e-24
6,D7,120.524413,5.949678e-26
7,D8,120.363253,6.444704e-26
8,D9,116.407641,4.5815940000000005e-25
9,D10,112.372778,3.385684e-24


In [30]:
cwt_df

Unnamed: 0,Feature,Statistic,P-value
0,CWT1,25.216458,1.391221e-05
1,CWT2,19.49583,0.0002158818
2,CWT3,15.294168,0.001581761
3,CWT4,14.427983,0.002376861
4,CWT5,20.102615,0.00016163
5,CWT6,27.727381,4.143453e-06
6,CWT7,38.007346,2.816372e-08
7,CWT8,45.673314,6.655055e-10
8,CWT9,60.733989,4.096564e-13
9,CWT10,75.164893,3.340145e-16


In [27]:
d_df.to_excel('../output/tracheides_kruskalwallis_diam.xlsx', index=False)
cwt_df.to_excel('../output/tracheides_kruskalwallis_cwt.xlsx', index=False)