# ERBI Image Analysis - baseline exploratory data analysis

## Package loading

In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sns
import imageio
from collections import Counter
import statsmodels.stats.power as pwr
from tqdm import tqdm

## Loading metadata

In [2]:
print(os.getcwd())
files_ = os.listdir()
meta = pd.read_csv(files_[1], sep = "\t")
print(meta.shape)
meta.head()

D:\MCB\Arpan
(5157, 10)


Unnamed: 0,SampleID,date,genotype,tissue_type,dop,parameter,batch,replicate,plant_no,z_axis
1,20190606_gfph_cot_dop-8_param1_b1_r1_1_z00,20190606,gfph,cot,dop-8,param1,b1,r1,1,z00
2,20190606_gfph_cot_dop-8_param1_b1_r1_1_z01,20190606,gfph,cot,dop-8,param1,b1,r1,1,z01
3,20190606_gfph_cot_dop-8_param1_b1_r1_1_z02,20190606,gfph,cot,dop-8,param1,b1,r1,1,z02
4,20190606_gfph_cot_dop-8_param1_b1_r1_1_z03,20190606,gfph,cot,dop-8,param1,b1,r1,1,z03
5,20190606_gfph_cot_dop-8_param1_b1_r1_1_z04,20190606,gfph,cot,dop-8,param1,b1,r1,1,z04


## Checking the base meta parameters

In [3]:
parameters = Counter(meta.parameter)
print(parameters)

Counter({'param1': 2791, 'param2': 2366})


In [4]:
z_levels = Counter(meta.z_axis)
print(z_levels)


Counter({'z04': 235, 'z05': 235, 'z06': 235, 'z07': 235, 'z08': 235, 'z09': 235, 'z10': 235, 'z11': 235, 'z12': 234, 'z13': 233, 'z14': 231, 'z15': 226, 'z16': 222, 'z17': 212, 'z18': 203, 'z20': 189, 'z19': 185, 'z21': 143, 'z22': 124, 'z00': 118, 'z01': 118, 'z02': 118, 'z03': 118, 'z23': 107, 'z24': 89, 'z25': 81, 'z26': 66, 'z27': 55, 'z28': 43, 'z29': 31, 'z30': 24, 'z31': 20, 'z32': 16, 'z33': 12, 'z34': 9, 'z35': 7, 'z36': 5, 'z37': 5, 'z38': 5, 'z39': 5, 'z40': 5, 'z41': 2, 'z42': 1, 'z43': 1, 'z44': 1, 'z45': 1, 'z46': 1, 'z47': 1, 'z48': 1, 'z49': 1, 'z50': 1, 'z51': 1, 'z52': 1, 'z53': 1, 'z54': 1, 'z55': 1, 'z56': 1, 'z57': 1})


In [5]:
tissue = Counter(meta.tissue_type)
print(tissue)


Counter({'cot': 5157})


In [6]:
genotype = Counter(meta.genotype)
print(genotype)

Counter({'gfph': 2902, 'nai1-gfph': 2255})


In [7]:
No_plants = Counter(meta.plant_no)
print(No_plants)

Counter({1: 1077, 3: 1003, 4: 1000, 2: 964, 5: 943, 8: 47, 7: 42, 6: 41, 10: 20, 9: 20})


In [8]:
batch = Counter(meta.batch)
print(batch)

Counter({'b3': 1047, 'b7': 663, 'b8': 557, 'b6': 496, 'b2': 487, 'b9': 450, 'b1': 448, 'b5': 373, 'b4': 331, 'b10': 305})


In [9]:
replicate = Counter(meta.replicate)
print(replicate)

Counter({'r1': 2357, 'r2': 1704, 'r3': 692, 'r4': 218, 'r5': 186})


## Summary of basic counting

The current best possible scenario right now:

- 2366 merged channel photos
- 11 z-stacks per sample
-  ~215 3D samples in total in training dataset !

## POWER in current best possible scenario - Study Design


### Finding the aligned param1 & param2 photographs

In [10]:
meta.shape[0]

5157

In [13]:
photos_list = []
for i in tqdm(range(meta.shape[0])):
    photo = str(meta.iloc[i,1]) + str(meta.iloc[i,2]) + str(meta.iloc[i,3]) + str(meta.iloc[i,4]) + str(meta.iloc[i,5]) + str(meta.iloc[i,6]) + str(meta.iloc[i,7]) + str(meta.iloc[i,8])
    photos_list.append(photo)

100%|███████████████████████████████████████████████████████████████████████████| 5157/5157 [00:00<00:00, 17351.72it/s]


In [30]:
photo_count = dict(Counter(photos_list))
proper_photos_list = []
for k,v in photo_count.items():
    if v >= 11:
        proper_photos_list.append(k)
print("Estimated amount of aligned complete samples in dataset = " + str(len(proper_photos_list)//2))
    

Estimated amount of aligned complete samples in dataset = 116


### Finding the proporion of classes

In [15]:
positive = 0
negative = 0
for item in proper_photos_list:
    if 'nai1-gfph' in item:
        negative += 1
    else:
        positive +=1
print("Number of samples in positive class = " + str(positive//2))
print("Number of samples in negative class = " + str(negative//2))
    

Number of samples in positive class = 59
Number of samples in negative class = 56


### Sample size computation

In [33]:
effect_size = positive/(positive + negative)
alpha = 0.05
power = 0.8
power_analysis = pwr.FTestPower()

In [34]:
sample_size = power_analysis.solve_power(effect_size = effect_size,
                                        df_denom = ((positive + negative)//2)-1 ,
                                        power = power,
                                        alpha = alpha)
print('Required sample size: {0:.2f}'.format(sample_size))

Required sample size: 129.22


# CONCLUSION

- For single parameter logistic regression on such data, sample size must be equal to 130 per class
- Number for 3D Convolutional Neural Network is highly insufficient
- The best way for obtaining baseline model for binary classification is to multiply number of samples at least by 10e2 (best 10e3 OR 10e4)