In [2]:
import pandas as pd
import numpy as np
import csv
import re
import string
from itertools import product

In [3]:
# Folder holding the example csv files. Every path below is built from this single value,
# so pointing the notebook at another data folder only requires changing this line.
DATA_DIR = "C:\\Users\\Bartek\\OneDrive\\Documents\\Programming\\Python\\Test data"

# One example file per list format (A, B, C) plus variants with plate repeats.
list_A = DATA_DIR + "\\listA.csv"
list_A_repeat = DATA_DIR + "\\listA_repeat.csv"
list_B = DATA_DIR + "\\listB.csv"
list_B_repeat_end = DATA_DIR + "\\listB_repeat _end.csv"   # the space in the file name is part of the name
list_B_repeat_96 = DATA_DIR + "\\listB_repeat _96.csv"
list_C = DATA_DIR + "\\listC.csv"

In [4]:
def read_in_list(csv_file):

    """ Finds the metadata table and the raw data table inside the csv file and loads each into a pandas data frame.

    The file is scanned row by row. A single-cell row containing 'Plate information' marks the metadata table:
    its header is on the following line and the table runs up to the next blank row. The header row of the raw
    data table also determines the data format: 'PlateNumber', 'PlateRepeat' (list A), 'Plate', 'Barcode' (list B)
    or 'Plate', 'Well' (list C). The data table runs up to the first blank row after its header or, if there is
    no such blank row, up to the end of the file.

    :param csv_file: Raw data file in csv format.
    :param type: str
    :return: A tuple with the raw data df, the metadata df and a string with the data format ('listA', 'listB' or 'listC').
    :raises ValueError: If the header of the data table or the 'Plate information' marker cannot be found."""

    # (pattern for the first cell, pattern for the second cell, number of lines 'skiprows' sits above the header row, data format)
    # NOTE(review): for list A 'skiprows' is deliberately kept one line above the header row, exactly as before;
    # presumably that line is blank in list A files - confirm against a real file.
    header_signatures = (
        ("PlateNumber", "PlateRepeat", 1, 'listA'),
        ("Plate", "Barcode", 0, 'listB'),
        ("Plate", "Well", 0, 'listC'),
    )

    def found_once(pattern, text):
        # True only if the pattern occurs exactly once in the text
        return re.findall(pattern, text) == [pattern]

    # newline='' is what the csv module expects; undecodable bytes are replaced rather than raising because
    # only the plain-text table headers matter for this scan (pandas does the actual parsing below)
    with open(csv_file, newline='', encoding='utf-8', errors='replace') as file:
        all_data_lines = list(csv.reader(file, delimiter=','))   # read the csv file and cast it into a list

    total_lines = len(all_data_lines)
    blank_indices = [index for index, item in enumerate(all_data_lines) if item == []]   # indices of all blank rows (ascending)

    # 'skiprows' starts greater than the total number of lines so that the end-of-table check below
    # cannot fire before the header of the data table has been found
    skiprows = total_lines + 1
    end_of_data = total_lines   # used when no blank row follows the data table, i.e. the table runs to the end of the file
    skiprows_meta = None
    nrows_meta = None
    data_format = None

    for index, item in enumerate(all_data_lines):
        if item == []:
            # the first blank row after the header of the data table marks the end of the data table
            if index > skiprows:
                end_of_data = index
                break
            continue

        if len(item) == 1 and found_once("Plate information", item[0]):
            skiprows_meta = index + 1
            # the metadata ends at the first blank row after its beginning (or at the end of the file)
            end_of_metadata = next((blank for blank in blank_indices if blank > skiprows_meta), total_lines)
            nrows_meta = end_of_metadata - skiprows_meta - 1   # length of the metadata table

        if len(item) >= 2:
            # every signature is tested so that, as before, the last matching one wins
            for first_cell, second_cell, offset, label in header_signatures:
                if found_once(first_cell, item[0]) and found_once(second_cell, item[1]):
                    skiprows = index - offset
                    data_format = label

    if data_format is None:
        raise ValueError(f"Could not find the header of a list A, B or C data table in {csv_file!r}")
    if skiprows_meta is None:
        raise ValueError(f"Could not find the 'Plate information' metadata table in {csv_file!r}")

    nrows = end_of_data - skiprows - 1   # length of the data table

    raw_data = pd.read_csv(csv_file, sep=',', engine='python', skiprows=skiprows, nrows=nrows, encoding='utf-8')
    metadata = pd.read_csv(csv_file, sep=',', engine='python', skiprows=skiprows_meta, nrows=nrows_meta, encoding='utf-8')

    return raw_data, metadata, data_format

In [5]:
# read every example file in turn; each name ends up holding the tuple returned by read_in_list for that file
a, ar, b, br, br96, c = (
    read_in_list(path)
    for path in (list_A, list_A_repeat, list_B, list_B_repeat_end, list_B_repeat_96, list_C)
)

In [8]:
def process_list(csv_file, raw_data):

    """Extracts the data for each channel and repeat from the raw data table and adds to a dictionary.

    The input data frames are left untouched: the well names are normalised on a copy of the raw data
    table, so the function can be called repeatedly with the same 'raw_data' tuple.

    :param csv_file: Raw data file in csv format. Not used by this function; kept so that existing calls keep working.
    :param type: str
    :param raw_data: A tuple containing data frames for raw data, metadata and a string representing type of list ('listA', 'listB' or 'listC').
    :param type: tuple
    :return: A dictionary with one entry per repeat ('repeat_1', 'repeat_2', ...), each a dictionary holding the
        metadata df under 'metadata' and, under 'data', a dictionary with the 'p' and 's' channel dfs indexed by well.
    :raises ValueError: If the type of list is not 'listA', 'listB' or 'listC'."""

    def channel_frame(frame, column, label):
        # keep the well and signal columns only, use the well as row index and name the signal column after the channel
        channel = frame[['Well', column]].set_index('Well')
        channel.columns = [label]
        return channel

    metadata = raw_data[1]
    data_format = raw_data[2]

    # remove the zero padding from the number in the well names (i.e. convert A01 to A1, leave A10 as it is);
    # done on a copy and on the 'Well' column only so that the caller's data frame is not modified
    wells = raw_data[0].copy()
    wells['Well'] = wells['Well'].str.replace(r'^([A-Za-z]+)0+(\d)', r'\1\2', regex=True)

    data_frames = {}   # dictionary to store data frames
    repeats = list(metadata['Repeat'].to_numpy())   # generate a list with repeats based on the metadata table

    for index, repeat in enumerate(repeats):   # iterate over the number of repeats

        if data_format == 'listA':
            # in each iteration get data only for the current repeat
            groupped_data = wells.groupby('PlateRepeat').get_group(repeat)

            # p channel is each third row starting from the first row, s channel each third row starting from the second row
            p_raw = channel_frame(groupped_data.iloc[::3, :], 'Signal', 'p')
            s_raw = channel_frame(groupped_data.iloc[1::3, :], 'Signal', 's')

        elif data_format == 'listB' or data_format == 'listC':
            # the column naming is different for the first repeat, i.e. just 'Signal', then it's 'Signal.1', 'Signal.2', etc.
            if repeat == 1:
                p_column = 'Signal'
                s_column = f'Signal.{repeat}'
            else:
                # NOTE(review): this formula presumably assumes repeats numbered consecutively from 1 - confirm
                p_column = f'Signal.{repeat + index - 1}'
                s_column = f'Signal.{repeat + index}'

            p_raw = channel_frame(wells, p_column, 'p')
            s_raw = channel_frame(wells, s_column, 's')

        else:
            raise ValueError(f"Unknown type of list: {data_format!r} (expected 'listA', 'listB' or 'listC')")

        # extract the row with metadata relevant for each repeat and convert date and time into a datetime object
        meta = metadata.iloc[[repeat - 1]].astype({'Measurement date': 'datetime64[ns]'})
        data_frames[f'repeat_{repeat}'] = {'metadata': meta, 'data': {'p': p_raw, 's': s_raw}}   # add data frames to the dictionary

    return data_frames

In [9]:
# split the list C example into per-repeat entries; `c` is the tuple returned by read_in_list(list_C)
data = process_list(list_C, c)

In [10]:
# p channel data frame stored for repeat 1 (rows indexed by well)
data['repeat_1']['data']['p']

Unnamed: 0_level_0,p
Well,Unnamed: 1_level_1
A1,18890869
A2,27377968
A3,15983855
A4,21051671
A5,22837285
...,...
K20,24948851
K21,10411102
K22,8137464
K23,13595684


In [11]:
# metadata row stored for repeat 1
data['repeat_1']['metadata']

Unnamed: 0,Plate,Repeat,Barcode,Measured height,Chamber temperature at start,Chamber temperature at end,Humidity at start,Humidity at end,Ambient temperature at start,Ambient temperature at end,Measurement date,Unnamed: 11
0,1,1,,14.4,18.98,19,61.5,61.1,18.98,19.39,2020-11-17 13:30:28,


In [12]:
# s channel data frame stored for repeat 1 (rows indexed by well)
data['repeat_1']['data']['s']

Unnamed: 0_level_0,s
Well,Unnamed: 1_level_1
A1,20668058
A2,29442193
A3,18411616
A4,23222549
A5,25009835
...,...
K20,26243862
K21,11340261
K22,8952581
K23,14268154
