### How to use this notebook
 - Enter the details requested in part 1
 - In part 2 each cell will execute a different analysis. Feel free to use and modify based on your own needs.
 
 ##### Assumptions made by this notebook
  - Experimental data are a timeseries of images of duckweed growing in 24 plat wells. Image files are stored in the same directory according to the format "exptname_plate_wellID_yyyymmdd.jpg.
  - Experiment metadata are stored in a .json file, with information on which samples and media are in which wells. 

In [1]:
import random
import pandas as pd
import DuckbotExptSetupUtils as exp
import os
import json
import datetime
import pathlib
import re
import numpy as np
import cv2
from PIL import Image, ExifTags
# from plantcv import plantcv as pcv
import matplotlib

In [None]:

def file_to_datetime(filename):
    no_file_ending = filename.split(".")
    split_name = no_file_ending[0].split("_")
    date = split_name[3].split("-")
    year = int(date[0])
    month = int(date[1])
    day = int(date[2])
    return year, month, day

def calculate_dpi(y, m, d):
    split_start_date = startdate.split("-")
    y0 = int(split_start_date[0])
    m0 = int(split_start_date[1])
    d0 = int(split_start_date[2])
    start = datetime.datetime(y0, m0, d0)
    end = datetime.datetime(y, m, d)
    diff = (end-start).days
    return(diff)

def find_plate_well_id(str):
    well_id = re.search('well(.+?)_', str).group(1)
    plate_id = re.search('plate(.+?)_well', str).group(1)
    return(f'Plate_{plate_id}_Well_{well_id}')

def find_metadata(input_value, shared_column_name, search_df, desired_column):
    df_subset = search_df[search_df[shared_column_name] == input_value][desired_column] #Find the desired column from the matching part of the search_df
    df_as_list = list(df_subset) #Convert to list to wipe clean the index carried over from search dataframe. 
    desired_value = df_as_list[0] #Specify that we want what should be a single value, rather than a whole list with one entry
    return desired_value


### Part 1 - Define filenames and paths

In [2]:
#Where is the Json file with the experiment metadata and what is it called?
metadata_dir = '/Users/Orlando/Documents/Github_clones/duckbot/test_data/TestExptDir/'
metadata_filename = 'TestFile.json'
metadata_path = metadata_dir + metadata_filename

#Where are the image files that you want to analyze and what is the experiment ID (should be first part of each image filename)
img_data_dir = "/Users/Orlando/Documents/Github_clones/duckbot/test_data/odl_dataanalaysistest1"
expt_prefix = "TestData"

#Where do you want the figures produced by this script to end up?
output_figure_dir = '/Users/Orlando/Documents/Github_clones/duckbot/test_data/'

#What was day 0 for this experiment in format (yyyy-mm-dd)
startdate = "2022-07-01"

### Part 2 - Building a dataframe with percentage duckweed coverage in each image

In [25]:

p = pathlib.Path(img_data_dir)   # current directory, insert your directory here
data_filenames = [x for x in p.glob('*.jpg')]

data_df = pd.DataFrame({"filename": data_filenames})
data_df['plate_well_id'] = data_df.apply(lambda row: find_plate_well_id(row.filename), axis=1)
data_df['date'] = data_df.apply(lambda row: row.filename[-10:], axis=1)

TypeError: 'PosixPath' object is not iterable

In [4]:
#Processing images
with open(metadata_path) as metadata:
    expt_data = json.load(metadata)

plate_set_up = expt_data["sample_info"]
md_df = pd.DataFrame(plate_set_up)

  genotype media  condition_replicate    plate_well_id    Plate Well
0   Sp7498  Mock                    1  Plate_1_Well_A1  Plate_1   A1
1   Sp7498  Salt                    2  Plate_1_Well_A2  Plate_1   A2
2   Sp7498  Salt                    1  Plate_1_Well_A3  Plate_1   A3
3   Sp7498  Mock                    2  Plate_1_Well_A4  Plate_1   A4


In [5]:
data_df['media'] = data_df.apply(lambda row: find_metadata(row.plate_well_id, 'plate_well_id', md_df, 'media'), axis=1)
data_df['genotype'] = data_df.apply(lambda row: find_metadata(row.plate_well_id, 'plate_well_id', md_df, 'genotype'), axis=1)
data_df['replicate'] = data_df.apply(lambda row: find_metadata(row.plate_well_id, 'plate_well_id', md_df, 'condition_replicate'), axis=1)

print(data_df.head())                


                            filename    plate_well_id        date media  \
0  TestData_plate1_wellA1_2022-07-05  Plate_1_Well_A1  2022-07-05  Mock   
1  TestData_plate1_wellA1_2022-07-03  Plate_1_Well_A1  2022-07-03  Mock   
2  TestData_plate1_wellA1_2022-07-01  Plate_1_Well_A1  2022-07-01  Mock   
3  TestData_plate1_wellA4_2022-07-03  Plate_1_Well_A4  2022-07-03  Mock   
4  TestData_plate1_wellA2_2022-07-05  Plate_1_Well_A2  2022-07-05  Salt   

  genotype  replicate  
0   Sp7498          1  
1   Sp7498          1  
2   Sp7498          1  
3   Sp7498          2  
4   Sp7498          2  


In [19]:
#Remove non-green pixels from images
filtered_img_dir = img_data_dir + "/green_filtered"
if not os.path.exists(filtered_img_dir):
    os.mkdir(filtered_img_dir)

amount_green = []

for file in os.listdir(img_data_dir):
    if ".jpg" in file:
        y, m, d = file_to_datetime(file)
        days_post_initiation = calculate_dpi(y, m, d)
        im = cv2.imread(os.path.join(img_data_dir, file))
        dpi= days_post_initiation
        green_pixels = 0   
        for i in range(im.shape[0]): #im.shape[0] is the y-axis
            for j in range(im.shape[1]): #im.shape[1] is the x returns X and Y axes of the image
                if im[i, j][0] > 0.9*im[i, j][1]:  #If more blue than green in pixel than set to black.         
                    im[i, j] = [0, 0, 0]
                if np.array([x < 90 for x in im[i, j]]).all():
                    im[i, j] = [0, 0, 0]
                else:
                    green_pixels += 1
        amount_green.append({"filename": file, "dpi": dpi, "green_pixels": green_pixels})
        im_onlygreen = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(im)
        img.save(filtered_img_dir + "/" + "GREEN" + file)

#Some notes to make sense of the above code
#im[i,j] refers to all pixels in the image, each having an x (j) and y(i) coordinate. 
#if np.array([x < 90 for x in im[i, j]]).all() is looking for any pixels with saturation values under 90 in any channel
#Each pixel has a blue [0], green [1] and red [2] value. 

In [23]:
pixel_pd = pd.DataFrame(amount_green)
print(pixel_pd)
data_df['dpi'] = data_df.apply(lambda row: find_metadata(row.filename, 'filename', pixel_pd, 'dpi'), axis=1)
# data_df['green_pixels'] = data_df.apply(lambda row: find_metadata(row.filename, 'filename', pixel_pd, 'green_pixels'), axis=1)
print(data_df)

##### FIX THIS (There's a discrepency around .jpg)

                                 filename  dpi  green_pixels
0   TestData_plate1_wellA1_2022-07-05.jpg    4         12320
1   TestData_plate1_wellA1_2022-07-03.jpg    2         10210
2   TestData_plate1_wellA1_2022-07-01.jpg    0         13053
3   TestData_plate1_wellA4_2022-07-03.jpg    2         10054
4   TestData_plate1_wellA2_2022-07-05.jpg    4          7552
5   TestData_plate1_wellA3_2022-07-01.jpg    0          7713
6   TestData_plate1_wellA3_2022-07-03.jpg    2          8987
7   TestData_plate1_wellA4_2022-07-01.jpg    0         15720
8   TestData_plate1_wellA4_2022-07-05.jpg    4          4667
9   TestData_plate1_wellA2_2022-07-03.jpg    2          3311
10  TestData_plate1_wellA3_2022-07-05.jpg    4          7006
11  TestData_plate1_wellA2_2022-07-01.jpg    0          8273
                             filename    plate_well_id        date media  \
0   TestData_plate1_wellA1_2022-07-05  Plate_1_Well_A1  2022-07-05  Mock   
1   TestData_plate1_wellA1_2022-07-03  Plate_1_Well_A1 

### Part 3 - Produce figures

In [2]:
#Building a master dataframe

#### Part 2


In [None]:
#