In [24]:
from pathlib import Path
import os
from os.path import join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import re
from gelgenie.segmentation.helper_functions.general_functions import create_dir_if_empty, index_converter
from collections import defaultdict
import pickle
import math
from scipy.stats import linregress

In [5]:
# Update your paths here
# NOTE(review): these are absolute paths on one machine; every data-loading cell below reads from them,
# so they must be edited before the notebook can run anywhere else.
# gg_path: folder of per-gel CSV files (read by load_gg_csv_files_to_dict and identify_ladder_names_from_files)
gg_path = Path("/Users/matt/Documents/PhD/research_output/Automatic_Gel_Analyzer/quantitative_results/qupath_data/james_data_v3_fixed_global/Data_with_norm_and_corrections")
# ga_path: parent folder whose sub-folders are scanned for "collated_data_with_band_quality.csv"
ga_path = Path("/Users/matt/Documents/PhD/research_output/Automatic_Gel_Analyzer/quantitative_results/gelanalyzer")
# reference_path: CSV holding the reference ladder values (one column per ladder)
reference_path = Path("/Users/matt/Documents/PhD/research_output/Automatic_Gel_Analyzer/quantitative_results/reference_ladder_masses.csv")

In [6]:
def identify_ladder_names_from_files(folder_path):
    """
    Build a lookup of gel number -> ladder name from the CSV filenames in a folder.

    Each filename is split on "_": the first piece is parsed as the integer gel number
    and the second piece is used verbatim as the ladder name. Entries that do not end
    in ".csv" are ignored.
    """
    csv_names = [entry for entry in os.listdir(folder_path) if entry.endswith(".csv")]
    return {int(name.split("_")[0]): name.split("_")[1] for name in csv_names}

def load_gg_csv_files_to_dict(folder_path, prefix="prefix_"):
    """
    Read every CSV in a folder into a DataFrame, keyed by prefix + leading filename token.

    Parameters:
    - folder_path (str): Folder to scan; only entries ending in ".csv" are read.
    - prefix (str): Text prepended to each key (default "prefix_").

    Returns:
    - dict: Maps f"{prefix}{token}" to the DataFrame read from that file, where `token`
      is the part of the filename before the first "_".
    """
    frames = {}
    for entry in os.listdir(folder_path):
        if not entry.endswith(".csv"):
            continue
        # Everything before the first underscore identifies the gel
        gel_token = entry.partition("_")[0]
        table = pd.read_csv(os.path.join(folder_path, entry))
        frames[f"{prefix}{gel_token}"] = table
    return frames
def load_ga_csv_files_from_folders(parent_folder, prefix="ga_"):
    """
    Collect "collated_data_with_band_quality.csv" from each sub-folder of a parent folder.

    Parameters:
    - parent_folder (str): Folder whose immediate sub-folders are scanned.
    - prefix (str): Text prepended to each sub-folder name to form the key (default "ga_").

    Returns:
    - dict: Maps f"{prefix}{subfolder_name}" to the DataFrame read from that sub-folder's
      CSV, with 'Raw Volume' renamed to 'GA-Raw-Vol' and 'Background Corrected Volume'
      renamed to 'GA-BC-Vol'. Sub-folders without the CSV, and plain files, are skipped.
    """
    target_name = "collated_data_with_band_quality.csv"
    column_aliases = {'Raw Volume':'GA-Raw-Vol', 'Background Corrected Volume':'GA-BC-Vol'}

    collected = {}
    for entry in os.listdir(parent_folder):
        subfolder = os.path.join(parent_folder, entry)
        candidate = os.path.join(subfolder, target_name)
        # Only directories that actually contain the collated CSV contribute a table
        if not (os.path.isdir(subfolder) and os.path.isfile(candidate)):
            continue
        collected[f"{prefix}{entry}"] = pd.read_csv(candidate).rename(columns=column_aliases)

    return collected

In [7]:
def natural_sort_key(label):
    """Sort key that orders embedded digit runs numerically (so 'gg_2' sorts before 'gg_10')."""
    return [int(piece) if piece.isdigit() else piece.lower() for piece in re.split(r'(\d+)', label)]


def min_max_scale(values):
    """Rescale a Series linearly so its minimum maps to 0 and its maximum maps to 1."""
    return (values - values.min()) / (values.max() - values.min())


# GG tables: one DataFrame per gel, re-ordered by gel ID
gg_dfs = load_gg_csv_files_to_dict(gg_path, "gg_")
gg_dfs = {name: gg_dfs[name] for name in sorted(gg_dfs.keys(), key=natural_sort_key)}

# GA tables: one DataFrame per gel, re-ordered by gel ID
ga_dfs = load_ga_csv_files_from_folders(ga_path)
ga_dfs = {name: ga_dfs[name] for name in sorted(ga_dfs.keys(), key=natural_sort_key)}

# gel number -> ladder name, taken from the GG filenames
ladder_dict = identify_ladder_names_from_files(gg_path)

# Reference ladder values: rename the two ladder columns, number the bands from 1,
# reshape to long form (one row per band per ladder) and min-max scale within each ladder
reference_df = pd.read_csv(reference_path).rename(
    columns={"NEB ladder": "NEB", " ThermoFisher ladder": "Thermo"})
reference_df["Band ID"] = range(1, len(reference_df) + 1)
reference_df = pd.melt(
    reference_df,
    id_vars=["Band ID"],
    value_vars=["NEB", "Thermo"],
    var_name="Ladder",
    value_name="Intensity",
)
reference_df['Normalized_Intensity'] = reference_df.groupby("Ladder")["Intensity"].transform(min_max_scale)

In [8]:
#1 removes bands from both gg and ga that have the low quality mark from ga
#2 removes bands that are not present in both ga and gg
#3 gels 28 and 30 have been completely excluded from analysis as bands cannot be identified properly (done prior to this notebook)
#4 combines all data into a single dataframe for each gel
# gels 25,26,27 are also screwing up the analysis as their lower bands are more bleached than the top ones, removing for now
EXCLUDED_GELS = [25, 26, 27]

origin_data = {} # this contains the combined GA and GG data, keyed by integer gel ID
for key, df in ga_dfs.items():
    # `gel_id` rather than `id`, so the builtin id() is not shadowed for the rest of the session
    gel_id = int(key.split('_')[-1])
    if gel_id in EXCLUDED_GELS:
        continue
    dfgg = gg_dfs['gg_%s' % gel_id]
    # Inner merge keeps only the bands that are present in both tools' tables
    merged_df = pd.merge(df, dfgg, on=['Lane ID', 'Band ID'])
    # .copy() gives each gel an independent frame: later cells add columns to these frames in place,
    # which on a boolean-mask slice triggers SettingWithCopyWarning and may not stick
    origin_data[gel_id] = merged_df[merged_df['Reliable Band'] == 1].copy()

In [9]:
# For every gel, derive per-band shape/intensity/background measures and bin each of them.
# 'Rectangularity' is pixel count over bounding-box area; the 'Rel.' intensity columns are scaled by the
# image's full-scale value; 'Background Level' is the fraction of raw volume removed by the local correction.
# Each 'Cat_' column holds the integer bin index (0 .. bin_count-1) of the matching quantity, binned per gel.
# TODO: check that everything is worked properly as intended here, and see if any hardcoded values can be removed
bin_count = 15
for key, val in origin_data.items():
    # gels 29 and 30 are 16-bit images, everything else is treated as 8-bit
    full_scale = 65535 if key in [29,30] else 255

    val['Rectangularity'] = val['Pixel Count']/(val['Width']*val['Height'])
    val['Cat_Rectangularity'] = pd.cut(val['Rectangularity'], bins=bin_count, labels=False)

    val['Rel.A. Intensity'] = val['Average Intensity']/full_scale
    val['Rel.S. Intensity'] = val['Intensity SD']/full_scale

    val['Background Level'] = (val['Raw Volume'] - val['Local Corrected Volume'])/(val['Raw Volume'])

    # binned versions of the three continuous measures above
    for cat_col, src_col in (('Cat_Intensity', 'Rel.A. Intensity'),
                             ('Cat_Range', 'Rel.S. Intensity'),
                             ('Cat_Background', 'Background Level')):
        val[cat_col] = pd.cut(val[src_col], bins=bin_count, labels=False)

    # print(key, val['Rel.A. Intensity'].max(), val['Rel.S. Intensity'].max(), val['Background Level'].max())


In [10]:
# Spot check: display the merged GA + GG table (with the derived and binned columns) for gel 0
origin_data[0]

In [3]:
# functions for adaptive normalisation
def norm_by_lane(df, norm_col, lane_col = 'Lane ID', max_only=False):
    """
    Normalise one column within each lane.

    Rows are grouped by `lane_col`. With max_only=False the column is min-max scaled
    inside each group; with max_only=True each value is simply divided by its group's
    maximum. Returns a Series aligned with `df`'s index.
    """
    per_lane = df.groupby(lane_col)[norm_col]
    lane_min = per_lane.transform('min')
    lane_max = per_lane.transform('max')

    if not max_only:
        return (df[norm_col] - lane_min)/(lane_max - lane_min)
    return df[norm_col]/lane_max
    
def adaptive_normalisation(data_dict, ladder_dict, columns_to_extract, max_only=False):
    """
    Attach a per-lane re-normalised reference ladder value to every band of one gel.

    Since some lanes have bands missing, the ladder normalisation is re-calculated per lane
    using only the reference bands whose 'Band ID' is present in that lane, and the result
    is written into a 'Reference Value' column next to the band data.

    Parameters:
    - data_dict (DataFrame): band table for one gel; must contain 'Lane ID' and 'Band ID'.
    - ladder_dict (DataFrame): reference ladder with columns 'Band ID' and 'Reference Value'.
    - columns_to_extract (list): columns of `data_dict` to carry into the output; must
      include 'Lane ID' and 'Band ID'.
    - max_only (bool): if True divide reference values by the lane-specific maximum,
      otherwise min-max scale them.

    Returns:
    - DataFrame with a fresh 0..n-1 index, 'Lane ID' and 'Band ID' as the first two columns,
      the remaining extracted columns, and 'Reference Value'. Bands with no matching ladder
      entry (or whose normalised value is NaN, e.g. a single-band lane when min-max scaling)
      keep the initial value 0.0.
    """
    key_cols = ['Lane ID', 'Band ID']

    # Fix: start from the frame that was passed in. The previous version read the notebook
    # globals `origin_data[gelid]` here and silently ignored `data_dict` for this step.
    combined = data_dict[columns_to_extract].copy()
    combined['Reference Value'] = 0.0
    # Index once up front instead of setting/resetting the index on every lane iteration
    combined = combined.set_index(key_cols)

    for lane in data_dict['Lane ID'].unique():
        available_bands = data_dict.loc[data_dict['Lane ID'] == lane, 'Band ID'].unique()
        # only retain the reference bands that are available in this lane
        ladder_crop = ladder_dict.loc[ladder_dict['Band ID'].isin(available_bands),
                                      ['Band ID', 'Reference Value']].copy()

        max_value = ladder_crop['Reference Value'].max()
        if max_only:
            ladder_crop['Reference Value'] = ladder_crop['Reference Value'] / max_value
        else:
            min_value = ladder_crop['Reference Value'].min()
            ladder_crop['Reference Value'] = (ladder_crop['Reference Value'] - min_value) / (max_value - min_value)

        # Tag with the lane so the values align on the (Lane ID, Band ID) index;
        # DataFrame.update only overwrites where the incoming value is not NaN
        ladder_crop['Lane ID'] = lane
        combined.update(ladder_crop.set_index(key_cols)['Reference Value'])

    # Turn the index levels back into regular columns
    return combined.reset_index()


In [94]:
# Display the long-form reference ladder table (Band ID, Ladder, Intensity, Normalized_Intensity)
reference_df

In [95]:
# per gel boxplots, categorised by one of the binned values generated earlier
# For each selected gel: build the per-lane reference values, normalise every quantity per lane,
# store the absolute difference to the reference in 'E-<quantity>' columns and box-plot those by bin.
# TODO: how are there so many high-error values?  Need to investigate the adaptive normalisation
selected_gels = [13]
target_x = 'Cat_Rectangularity'
quantities = ['GA-Raw-Vol', 'GA-BC-Vol','Raw Volume', 'Rolling Ball Corrected Volume', 'Global Corrected Volume', 'Local Corrected Volume']

# NOTE(review): 15 mirrors bin_count from the binning cell; the two must be kept in sync by hand
all_bins = range(15)
max_only = True
for gelid in selected_gels:
    ref_vals = reference_df[reference_df['Ladder'] == ladder_dict[gelid]] # extract the ladder related to the selected gel
    ref_vals = ref_vals.drop(columns=['Ladder', 'Normalized_Intensity'])
    ref_vals.rename(columns={'Intensity':'Reference Value'},inplace=True)
    columns_to_extract = ['Lane ID', 'Band ID', 'Rectangularity', target_x]
    temp_combo_df = adaptive_normalisation(origin_data[gelid], ref_vals, columns_to_extract, max_only=max_only) # run adaptive norm on ladder

    for index, quantity in enumerate(quantities): # calculate error quantities
        
        # reset_index(drop=True) so the normalised values line up positionally with temp_combo_df's 0..n-1 index
        quant_data = norm_by_lane(origin_data[gelid], quantity, max_only=max_only).reset_index(drop=True)
        temp_combo_df[quantity] = quant_data
        temp_combo_df['E-%s' % quantity] = np.abs(temp_combo_df['Reference Value']-quant_data)

    plt.figure(figsize=(15, 6))
    # melts data for plotting
    # NOTE(review): only four of the six 'E-' columns computed above are plotted here — confirm this is intended
    df_melted = pd.melt(temp_combo_df, id_vars=[target_x], 
                        value_vars=['E-GA-Raw-Vol', 'E-GA-BC-Vol', 'E-Raw Volume', 'E-Rolling Ball Corrected Volume'], 
                        var_name='Values')
    
    ax = sns.boxplot(x=target_x, y='value', hue='Values', data=df_melted, width=0.5,order=all_bins)
    x_label = []
    # Adds text annotations for the number of instances in each boxplot
    for category in all_bins:
        num_instances = len(temp_combo_df[temp_combo_df[target_x] == category])
        x_label.append('%s (%s)' % (category, num_instances))
    # re-setting the ticks to their current positions before relabelling them
    ax.set_xticks(ax.get_xticks());
    ax.set_xticklabels(x_label)
    plt.title('Gel %s' % gelid)
    plt.show()
    plt.close()
    

In [98]:
# Inspect the rows of lane 2 in whichever temp_combo_df was built last
temp_combo_df[temp_combo_df['Lane ID'] == 2]

In [87]:
# Scatter of the per-band error against a chosen band property, one colour per quantification method.
# NOTE(review): target_quant is 'Background Level', which the per-gel boxplot cell above does not put into
# temp_combo_df (its columns_to_extract omits it); the full-dataset cell below does. This cell therefore
# appears to rely on the full-dataset cell having been run first — confirm the intended run order.
# linregress is already imported in the notebook's first cell; this re-import is redundant.
from scipy.stats import linregress

plt.figure(figsize=(10, 6))

scatter_size = 1.5
target_quant = 'Background Level'

# one colour per quantity; only the first two are used while the lines below stay commented out
colorwheel = ['blue', 'red', 'green', 'orange', 'purple', 'black']


# Plot each line
plt.scatter(temp_combo_df[target_quant], temp_combo_df['E-GA-Raw-Vol'], label='GA-Raw',s=scatter_size, c=colorwheel[0])
plt.scatter(temp_combo_df[target_quant], temp_combo_df['E-GA-BC-Vol'], label='GA-BG',s=scatter_size, c=colorwheel[1])
# plt.scatter(temp_combo_df[target_quant], temp_combo_df['E-Raw Volume'], label='GG-Raw',s=scatter_size, c=colorwheel[2])
# plt.scatter(temp_combo_df[target_quant], temp_combo_df['E-Rolling Ball Corrected Volume'], label='GG-RB',s=scatter_size, c=colorwheel[3])
# plt.scatter(temp_combo_df[target_quant], temp_combo_df['E-Global Corrected Volume'], label='GG-GB',s=scatter_size, c=colorwheel[4])
# plt.scatter(temp_combo_df[target_quant], temp_combo_df['E-Local Corrected Volume'], label='GG-LB',s=scatter_size, c=colorwheel[5])


# Fit lines
# for index, column in enumerate(['E-GA-Raw-Vol', 'E-GA-BC-Vol', 'E-Raw Volume', 'E-Rolling Ball Corrected Volume', 'E-Global Corrected Volume', 'E-Local Corrected Volume']):
#     slope, intercept, rval, _, _ = linregress(temp_combo_df[target_quant], temp_combo_df[column])
#     print(f'Rval for {column} is {rval}')
#     plt.plot(temp_combo_df[target_quant], slope * temp_combo_df[target_quant] + intercept, linestyle='--', color=colorwheel[index])

# Set labels and title
plt.xlabel(target_quant)
plt.ylabel('Error')
# NOTE(review): the title says "Line Graph" but only scatter points are drawn while the fit is commented out
plt.title('Line Graph with Multiple Columns')

# Add legend
plt.legend()

# Show plot
plt.show()

In [93]:
# full dataset boxplots
# Builds one table covering every gel in origin_data (per-lane reference values, per-lane normalised
# quantities and their absolute errors), then box-plots the errors overall and per binned band property.
# TODO: Again, why are there so many high error values?  Something must be going wrong with the adaptive normalisation
selected_gels = origin_data.keys()
quantities = ['GA-Raw-Vol', 'GA-BC-Vol','Raw Volume', 'Rolling Ball Corrected Volume', 'Global Corrected Volume', 'Local Corrected Volume']
all_bins = range(15)

target_x = ['Cat_Range', 'Cat_Intensity', 'Cat_Rectangularity', 'Cat_Background']
max_only = True

# the extracted columns do not depend on the gel, so build the list once
columns_to_extract = ['Lane ID', 'Band ID', 'Rectangularity', 'Background Level']
columns_to_extract.extend(target_x)
error_columns = ['E-%s' % x for x in quantities]

# One frame per gel, concatenated once at the end. This replaces the duplicated
# "first gel / every other gel" branches and the repeated pd.concat inside the loop.
per_gel_frames = []
# keep the loop variable named `gelid` (the name the rest of the notebook uses for the current gel)
for gelid in selected_gels:
    ref_vals = (reference_df[reference_df['Ladder'] == ladder_dict[gelid]]
                .drop(columns=['Ladder', 'Normalized_Intensity'])
                .rename(columns={'Intensity':'Reference Value'}))

    gel_df = adaptive_normalisation(origin_data[gelid].reset_index(), ref_vals, columns_to_extract, max_only=max_only)
    gel_df['Gel ID'] = gelid
    for quantity in quantities:
        # reset_index(drop=True) so the values line up positionally with gel_df's 0..n-1 index
        quant_data = norm_by_lane(origin_data[gelid], quantity, max_only=max_only).reset_index(drop=True)
        gel_df[quantity] = quant_data
        gel_df['E-%s' % quantity] = np.abs(gel_df['Reference Value']-quant_data)
    per_gel_frames.append(gel_df)

# combines all data into a single big dataframe here
# (raises if no gel was selected, instead of silently plotting a stale temp_combo_df)
temp_combo_df = pd.concat(per_gel_frames, ignore_index=True)

plt.figure(figsize=(15, 6))
ax = sns.boxplot(data=temp_combo_df[error_columns])
plt.title('Entire Dataset (%s bands)' % len(temp_combo_df))
plt.tight_layout()
#plt.savefig('/Users/matt/Desktop/full_dataset_plot.png', dpi=300)
plt.show()
plt.close()

print(temp_combo_df[error_columns].mean())

for tx in target_x:
    plt.figure(figsize=(15, 6))
    # melts data for plotting
    df_melted = pd.melt(temp_combo_df, id_vars=tx, value_vars=error_columns, var_name='Values')
    ax = sns.boxplot(x=tx, y='value', hue='Values', data=df_melted, width=0.5,order=all_bins)

    x_label = []
    # Adds text annotations for the number of instances in each boxplot
    for category in all_bins:
        num_instances = len(temp_combo_df[temp_combo_df[tx] == category])
        x_label.append('%s (%s)' % (category, num_instances))
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(x_label)
    plt.tight_layout()
    # plt.savefig('/Users/matt/Desktop/%s_gel_plot.png' % tx, dpi=300)
    plt.show()
    plt.close()

## Testing new Analysis Types

In [42]:
# Load the pre-computed per-gel tables used by the analyses below.
# The `with` block closes the file handle (the previous one-liner left it open).
# NOTE: unpickling can execute arbitrary code — only load pickle files you created yourself.
with open('/Users/matt/Desktop/full_dataset.pkl', 'rb') as pickle_file:
    correl_df = pickle.load(pickle_file)

In [39]:
# One panel per lane of the selected gel: every quantity column is scattered against the reference
# value ('Ref.') together with its least-squares line; the legend reports R^2 per quantity.
data_name = '29_NEB'
sel_df = correl_df[data_name]

figs_per_row = 3
lane_ids = np.unique(sel_df['Lane ID'])
# The old expression had `+ 1` inside len(...), where it was added to the array elements and never
# changed the count; this is the same value written plainly.
rows = math.ceil(len(lane_ids) / figs_per_row)
# tells index_converter whether `ax` is indexed by one or by two positions
double_indexing = rows != 1

fig, ax = plt.subplots(rows, figs_per_row, figsize=(18, 15))

all_corr_coeff = {}
# 'lilac' is not a matplotlib colour name (it only exists as 'xkcd:lilac') and 'yellow' was listed twice,
# so two quantities could share a colour or the scatter call could fail on the eighth quantity.
color_wheel = ['b', 'g', 'r', 'k', 'yellow', 'purple', 'orange', 'xkcd:lilac', 'brown', 'steelblue']
plot_col_index = 0
for column in sel_df.columns:
    if column in ('Lane ID', 'Band ID', 'Ref.'):
        continue
    # wrap around rather than raising IndexError when there are more quantities than colours
    colour = color_wheel[plot_col_index % len(color_wheel)]
    for lane in lane_ids:
        lane_df = sel_df[sel_df['Lane ID'] == lane]
        ref = lane_df['Ref.']
        target = lane_df[column]
        slope, intercept, r_value, p_value, std_err = linregress(ref, target)

        # panel position is derived from the lane number (lane - 1)
        lane_ax = ax[index_converter(lane-1, figs_per_row, double_indexing)]
        lane_ax.scatter(ref, target, label=f'{column}, R2: {r_value**2:.3f}', c=colour)

        ref_plot = np.linspace(np.min(ref), np.max(ref), num=10)
        lane_ax.plot(ref_plot, slope * ref_plot + intercept, color=colour, linestyle='dotted')
        lane_ax.legend()
        lane_ax.set_title(f'Lane {lane}')
        lane_ax.set_yscale('log')

    plot_col_index += 1
plt.suptitle(data_name)
plt.tight_layout()

In [85]:


def hidden_linreg(target, ref, num_hide=3, num_reps=10, rng=None):
    """
    Hold-out check of a linear fit between two paired sequences.

    Repeatedly hides `num_hide` points, fits ref = slope * target + intercept on the
    remaining points, predicts the hidden points and records the mean absolute error.

    Parameters:
    - target, ref: equal-length sequences (list, array or pandas Series). Values are
      used positionally, so a Series with a non-default index is handled correctly.
    - num_hide (int): number of points hidden per repetition; at least 2 points must remain.
    - num_reps (int): number of distinct hidden sets to evaluate. Capped at the number of
      possible combinations, so the sampling loop always terminates.
    - rng: optional numpy Generator/RandomState providing `.choice`; when None the global
      `np.random` state is used (so `np.random.seed` still controls the draw).

    Returns:
    - list of floats: one mean absolute prediction error per hidden set, in draw order.

    Raises:
    - ValueError: if the inputs differ in length, or num_hide leaves fewer than 2 points to fit.
    """
    target_arr = np.asarray(target, dtype=float)
    ref_arr = np.asarray(ref, dtype=float)
    n_points = len(target_arr)
    if len(ref_arr) != n_points:
        raise ValueError('target and ref must have the same length')
    if num_hide < 1 or n_points - num_hide < 2:
        raise ValueError('num_hide must be >= 1 and leave at least 2 points for the fit')

    # Number of distinct hidden sets, C(n_points, num_hide). Asking for more than this used to
    # spin forever in the rejection loop below.
    max_sets = math.factorial(n_points) // (math.factorial(num_hide) * math.factorial(n_points - num_hide))
    n_sets = min(num_reps, max_sets)

    sampler = np.random if rng is None else rng
    hidden_sets = []
    seen = set()
    while len(hidden_sets) < n_sets:
        # sort so that the same points drawn in a different order count as the same set
        combo = tuple(sorted(int(i) for i in sampler.choice(n_points, size=num_hide, replace=False)))
        if combo not in seen:
            seen.add(combo)
            hidden_sets.append(combo)

    errors = []
    for combo in hidden_sets:
        hidden = np.array(combo)
        keep = np.ones(n_points, dtype=bool)
        keep[hidden] = False
        fit = linregress(target_arr[keep], ref_arr[keep])
        predicted = fit.slope * target_arr[hidden] + fit.intercept
        errors.append(np.mean(np.abs(ref_arr[hidden] - predicted)))
    return errors
    

In [89]:
# Quick manual check of hidden_linreg on a small hand-written series: hide 3 of the 7 points, 10 repetitions
hidden_linreg([1,2,3,4,5,6,7], [1,2,3,4,9,6,8], 3, 10)

In [None]:
# Collect, for every gel in correl_df and every quantity column, the per-lane R^2 of a linear fit
# against the reference value ('Ref.'); the result is one column of R^2 values per quantity.
# NOTE(review): this cell has no execution count and its code is identical to the cell that follows it
# in this notebook — it looks like a leftover duplicate.
figs_per_row = 3
# the `+ 1` is applied to the array inside len(...), so it does not change the count.
# rows / double_indexing / all_corr_coeff are set but not used in this cell.
rows = math.ceil((len(np.unique(sel_df['Lane ID']) + 1) / figs_per_row))
if rows == 1:
    double_indexing = False
else:
    double_indexing = True

all_corr_coeff = {}
# quantity column name -> list of R^2 values (one per lane per gel)
correl_dict = defaultdict(list)

for big_key in correl_df.keys():
    sel_df = correl_df[big_key]
    for col_index, column in enumerate(sel_df.columns):
        # skip the identifier columns and the reference itself
        if column == 'Lane ID' or column == 'Band ID' or column == 'Ref.':
            continue
        for lane in np.unique(sel_df['Lane ID']):
            ref = sel_df[sel_df['Lane ID'] == lane]['Ref.']
            target = sel_df[sel_df['Lane ID'] == lane][column]
            slope, intercept, r_value, p_value, std_err = linregress(ref, target)
            correl_dict[column].append(r_value**2)
        
# requires every quantity to have collected the same number of R^2 values
rsquared_df = pd.DataFrame.from_dict(correl_dict)

In [43]:
# Per-lane R^2 of each quantity against the reference value, pooled over every gel in correl_df.
figs_per_row = 3
rows = math.ceil(len(np.unique(sel_df['Lane ID'])) / figs_per_row)
double_indexing = rows != 1

all_corr_coeff = {}
# quantity column name -> list of R^2 values (one per lane per gel)
correl_dict = defaultdict(list)
id_columns = ('Lane ID', 'Band ID', 'Ref.')

for big_key in correl_df.keys():
    sel_df = correl_df[big_key]
    lane_ids = np.unique(sel_df['Lane ID'])
    quantity_columns = [name for name in sel_df.columns if name not in id_columns]
    for column in quantity_columns:
        for lane in lane_ids:
            in_lane = sel_df['Lane ID'] == lane
            ref = sel_df[in_lane]['Ref.']
            target = sel_df[in_lane][column]
            r_value = linregress(ref, target).rvalue
            correl_dict[column].append(r_value**2)

# one column of R^2 values per quantity
rsquared_df = pd.DataFrame.from_dict(correl_dict)

In [44]:
# Box plot of the R^2 distribution of each quantity; x tick labels are rotated so the column names stay readable
ax = sns.boxplot(data=rsquared_df, width=0.5)
ax.tick_params(axis='x', rotation=90)

In [52]:
# Same per-lane R^2 analysis as for the whole dataset, but pooled per group of gels: one box plot per group.
figs_per_row = 3
# rows / double_indexing / all_corr_coeff are not used by this cell; kept so kernel state is unchanged
rows = math.ceil(len(np.unique(sel_df['Lane ID'])) / figs_per_row)
double_indexing = rows != 1

all_corr_coeff = {}

# keys of correl_df, grouped into the sets that are analysed together
gel_sets = [['0_Thermo','1_Thermo','2_Thermo','3_Thermo','4_Thermo'],
           ['5_Thermo','6_Thermo','7_Thermo','8_Thermo'],
           ['9_Thermo','10_Thermo','11_Thermo','12_Thermo'],
           ['14_NEB', '17_NEB', '16_NEB'],
           ['13_NEB', '15_NEB'],
           ['18_NEB', '19_NEB', '20_NEB'],
           ['21_NEB', '22_NEB', '23_NEB', '24_NEB'],
           ['29_NEB'],
           ['31_NEB'],
           ['32_Thermo'],
           ['33_NEB'],
           ['34_Thermo']]

for gel_set in gel_sets:
    # quantity column name -> R^2 values of every lane of every gel in this set
    correl_dict = defaultdict(list)
    for big_key in gel_set:
        sel_df = correl_df[big_key]
        for column in sel_df.columns:
            if column in ('Lane ID', 'Band ID', 'Ref.'):
                continue
            for lane in np.unique(sel_df['Lane ID']):
                lane_df = sel_df[sel_df['Lane ID'] == lane]
                slope, intercept, r_value, p_value, std_err = linregress(lane_df['Ref.'], lane_df[column])
                correl_dict[column].append(r_value**2)

    rsquared_df = pd.DataFrame.from_dict(correl_dict)
    plt.figure()
    ax = sns.boxplot(data=rsquared_df, width=0.5)
    ax.tick_params(axis='x', rotation=90)
    # label each figure with the gels it pools, otherwise the twelve plots are indistinguishable
    plt.title('R2 per quantity: %s' % ', '.join(gel_set))
    # Fix: show the figure before closing it. Closing without show() discarded every figure,
    # so the cell rendered nothing (the other plotting loops in this notebook use show() then close()).
    plt.show()
    plt.close()

In [51]:
# Per-gel version of the R^2 box plots: one figure for each gel of the third gel set.
figs_per_row = 3
# the `+ 1` is applied to the array inside len(...), so it does not change the count.
# rows / double_indexing / all_corr_coeff are set but not used in this cell.
rows = math.ceil((len(np.unique(sel_df['Lane ID']) + 1) / figs_per_row))
if rows == 1:
    double_indexing = False
else:
    double_indexing = True

all_corr_coeff = {}

# NOTE(review): this rebinds gel_sets to only the three Thermo groups, replacing the longer
# list defined in the per-gel-set cell of this notebook
gel_sets = [['0_Thermo','1_Thermo','2_Thermo','3_Thermo','4_Thermo'],
           ['5_Thermo','6_Thermo','7_Thermo','8_Thermo'],
           ['9_Thermo','10_Thermo','11_Thermo','12_Thermo']]

for big_key in gel_sets[2]:
    sel_df = correl_df[big_key]
    # reset per gel, so each figure shows the lanes of a single gel only
    correl_dict = defaultdict(list)
    for col_index, column in enumerate(sel_df.columns):
        # skip the identifier columns and the reference itself
        if column == 'Lane ID' or column == 'Band ID' or column == 'Ref.':
            continue
        for lane in np.unique(sel_df['Lane ID']):
            ref = sel_df[sel_df['Lane ID'] == lane]['Ref.']
            target = sel_df[sel_df['Lane ID'] == lane][column]
            slope, intercept, r_value, p_value, std_err = linregress(ref, target)
            correl_dict[column].append(r_value**2)
    
    rsquared_df = pd.DataFrame.from_dict(correl_dict)
    # a new figure per gel; the figures are left open and carry no title identifying the gel
    plt.figure()
    ax = sns.boxplot(data=rsquared_df, width=0.5)
    ax.tick_params(axis='x', rotation=90)

### JUST TESTING

In [37]:
# Scratch comparison for a single gel using the fixed (non-adaptive) normalised ladder values:
# per-lane min-max normalise each quantity, take the absolute difference to 'Normalized_Intensity'
# and box-plot the errors per band.
selected_gels = [6]
ladder = 'Thermo'
ref_vals = reference_df[reference_df['Ladder'] == ladder]
ref_vals = ref_vals.drop(columns=['Ladder', 'Intensity'])
ref_vals.rename(columns={'Normalized_Intensity':'Reference Value'},inplace=True)
for gelid in selected_gels:
    # the raw per-tool tables (not the merged/filtered origin_data)
    dfi = {'GG':gg_dfs['gg_%d' % gelid], 
           'GA': ga_dfs['ga_%d' % gelid]}
    
    # quantities[i] is looked up in the table named by abbrvs[i]
    # NOTE(review): load_ga_csv_files_from_folders in this notebook renames 'Raw Volume' and
    # 'Background Corrected Volume' to 'GA-Raw-Vol' / 'GA-BC-Vol', so the two GA lookups below look
    # like they would raise KeyError on freshly loaded ga_dfs — confirm against the current loader.
    quantities = ['Raw Volume', 'Raw Volume', 'Background Corrected Volume', 'Rolling Ball Corrected Volume', 'Global Corrected Volume', 'Local Corrected Volume']
    abbrvs = ['GG', 'GA', 'GA', 'GG', 'GG', 'GG']
    columns_to_extract = ['Lane ID', 'Band ID']
    plt.figure(figsize=(15, 6))
    temp_combo_df = gg_dfs['gg_%d' % gelid][columns_to_extract].copy()
    # left merge: every GG band gets the reference value of its Band ID (NaN where the ladder has none)
    temp_combo_df = pd.merge(temp_combo_df, ref_vals, on=['Band ID'], how='left')
    
    for index, (quantity, abbrv) in enumerate(zip(quantities, abbrvs)):
        # NOTE(review): the assignment below aligns on row index, not on (Lane ID, Band ID); this
        # presumably assumes the GA and GG tables list the same bands in the same row order — verify.
        quant_data = norm_by_lane(dfi[abbrv], quantity)
        temp_combo_df['%s %s' % (abbrv, quantity)] = quant_data
        temp_combo_df['E-%s %s' % (abbrv, quantity)] = np.abs(temp_combo_df['Reference Value']-quant_data)

    # keep only the first `lane_chop` lanes
    lane_chop = 6
    temp_combo_df = temp_combo_df[temp_combo_df['Lane ID'] <= lane_chop]
    
    df_melted = pd.melt(temp_combo_df, id_vars=['Band ID'], value_vars=['E-GA Raw Volume', 'E-GA Background Corrected Volume', 'E-GG Raw Volume', 'E-GG Rolling Ball Corrected Volume'], var_name='Values')
    
    sns.boxplot(x='Band ID', y='value', hue='Values', data=df_melted, width=0.5)
    # sns.boxplot(temp_combo_df, x='Band ID', y='GG Raw Volume')

    # plt.scatter(reference_df[reference_df['Ladder'] == 'NEB']['Band ID']-1, reference_df[reference_df['Ladder'] == 'NEB']['Normalized_Intensity'],c='red')
    # column means of everything in the table (identifier columns included)
    print(temp_combo_df.mean())

In [124]:
# Errors of band 15 across lanes for three quantification variants.
# NOTE(review): lane_count is hard-coded; plt.scatter needs range(lane_count) to have as many entries
# as sel_df has rows, so this presumably assumes band 15 appears in exactly 4 lanes — confirm.
lane_count = 4
sel_df = temp_combo_df[temp_combo_df['Band ID'] == 15]
plt.scatter(range(lane_count), sel_df['E-GG Rolling Ball Corrected Volume'], label='GG Rolling Ball')
plt.scatter(range(lane_count), sel_df['E-GA Background Corrected Volume'],c='red', label='GA')
plt.scatter(range(lane_count), sel_df['E-GG Raw Volume'],c='green',label='raw GG')
plt.legend()

In [6]:
# Sanity check: per gel, print the row counts of the GA table, the GG table and the merged/filtered table.
for key, ga_df in ga_dfs.items():
    # `gel_id` rather than `id`, so the builtin id() is not shadowed
    gel_id = int(key.split('_')[-1])
    # Gels skipped when origin_data was built (25, 26, 27) have no entry there; report that
    # instead of raising KeyError part-way through the listing.
    merged_count = len(origin_data[gel_id]) if gel_id in origin_data else 'excluded'
    print(gel_id, len(ga_df), len(gg_dfs['gg_%s' % gel_id]), merged_count)

In [110]:
# Display the long-form reference ladder table once more for inspection
reference_df