## Gel Lane Trace Analysis

This notebook reads in gel lane data generated via the accompanying gel analysis code.  It formats the data, applies background correction and assembles publication-ready plots.

### Imports and File Prep

This code reads the gel lanes and bands that were pre-selected using the manual band/lane selector.  It automatically removes the background and prints out graphs for the selected input file.  Some images also require image-specific formatting, which is tweaked on a per-image basis in the code below.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from PIL import Image
from numpy.linalg import lstsq
import copy
import json
from scipy.signal import find_peaks
import scipy.signal as signal
from scipy import sparse
from scipy.sparse.linalg import spsolve
from scipy.optimize import curve_fit

In [None]:
# Input file selection and loading (re-configure for your own system)
%matplotlib inline

base_path = r'SET INPUT FOLDER'
base_output_loc = 'SET OUTPUT LOCATION'

analysis_type = 'aldos'

if analysis_type == 'rna':
    df = pd.read_csv(os.path.join(base_loc, 'lane_file_rna.csv'), header=0, index_col=0)
elif analysis_type == 'dna':
    df = pd.read_csv(os.path.join(base_loc, 'lane_file_dna.csv'), header=0, index_col=0)
elif analysis_type == 'dna_24':
    df = pd.read_csv(os.path.join(base_loc, 'lane_file_dna_24.csv'), header=0, index_col=0)
elif analysis_type == 'dna_tae_24_better':
    df = pd.read_csv(os.path.join(base_loc, 'lane_file_dna_tae_24_better.csv'), header=0, index_col=0)
elif analysis_type == 'aldos':
    df = pd.read_csv(os.path.join(base_loc, 'lane_file_aldos.csv'), header=0, index_col=0)
elif analysis_type == 'mutations':
    df = pd.read_csv(os.path.join(base_loc, 'lane_file_mutations.csv'), header=0, index_col=0)
elif analysis_type == 'mutations_tbe':
    df = pd.read_csv(os.path.join(base_loc, 'lane_file_mutations_tbe.csv'), header=0, index_col=0)
elif analysis_type == 'aldos_fbs':
    df = pd.read_csv(os.path.join(base_loc, 'lane_file_aldos_fbs.csv'), header=0, index_col=0)
      
df['trace'] = df['trace'].apply(json.loads)  # converting sequence to array
df['band'] = df['band'].apply(json.loads)
profiles = df['trace']
bands = df['band'].to_dict()
profiles = profiles.to_dict()

### Function Prep

In [None]:
# Notebook-wide switches read by the analysis cells
aldos_trendline = False  # True: fit and display the aldosterone linear trendline
linear_only = True  # True: only linear trendlines are allowed to be printed
full_plots = False  # True: print a graph for each individual gel band present in the image

plt.rcParams['font.sans-serif'] = 'Helvetica'  # font formatting for paper figures

def exp_func(x, a, b, c):
    """Exponential model ``a * exp(-b * x) + c`` (used as the curve_fit target)."""
    decay = np.exp(-b * x)
    return a * decay + c


def baseline_als(y, lam=1000, p=0.001, niter=10):
    """Estimate the baseline of a 1-D signal by asymmetric least squares smoothing.

    Adapted from "Baseline Correction with Asymmetric Least Squares Smoothing":
    https://stackoverflow.com/questions/29156532/python-baseline-correction-library
    The notebook calls this with the default hyperparameters for all input images.

    Parameters:
        y: 1-D sequence (list or array) of signal values.
        lam: multiplier applied to the second-difference penalty matrix.
        p: weight given to samples that lie above the current baseline
            estimate; samples below it get weight ``1 - p``.
        niter: number of reweighting iterations; must be at least 1.

    Returns:
        numpy array with the baseline from the final iteration (same length as y).

    Raises:
        ValueError: if ``niter`` is smaller than 1 (no baseline would be computed).
    """
    if niter < 1:
        raise ValueError('niter must be at least 1')

    y = np.asarray(y)
    n = len(y)
    # Second-difference operator; the penalty term does not depend on the
    # weights, so it is built once instead of once per iteration.
    D = sparse.diags([1, -2, 1], [0, -1, -2], shape=(n, n - 2))
    penalty = lam * D.dot(D.transpose())

    w = np.ones(n)
    for _ in range(niter):
        W = sparse.spdiags(w, 0, n, n)
        z = spsolve(W + penalty, w * y)
        # asymmetric reweighting: samples above the estimate are down-weighted to p
        w = p * (y > z) + (1 - p) * (y < z)
    return z

### Band Selection, Background Removal and Output Plotting (non-FBS plots)

In [None]:
# SIGNAL EXTRACTION
# Builds `vols`: lane name -> summed baseline-corrected signal inside the lane's selected band.
vols = {}
for name, raw_trace in profiles.items():  # one pass per available lane

    background = baseline_als(raw_trace)  # estimated baseline of this lane
    corrected = np.array(raw_trace) - background  # raw trace with the baseline removed

    # first two entries of the band record are used as the band's start/end pixel positions
    band_start = bands[name][0]
    band_end = bands[name][1]

    if full_plots:  # optional diagnostic figure: raw trace, baseline and corrected trace
        plt.figure(figsize=(10, 6))
        curves = (
            (raw_trace, 'Raw Profile'),
            (background, 'Baseline Background'),
            (corrected, 'Corrected Signal'),
        )
        for curve, curve_label in curves:
            plt.plot(curve, label=curve_label)

        for edge in (band_start, band_end):  # mark the band limits
            plt.axvline(edge, color='r')

        plt.title(name, fontsize=22)
        plt.xlabel('Pixel Position', fontsize=22)
        plt.ylabel('Average Intensity (a.u.)', fontsize=22)
        plt.legend(fontsize=18)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)

    # volume = sum of the corrected signal within the pre-selected band
    vols[name] = sum(corrected[band_start:band_end])


# Turns the per-lane volumes in `vols` into either a bar chart ('mutations*' analyses)
# or a concentration-vs-intensity scatter plot with an optional fitted trendline.
# Both figures are saved into `base_output_loc`.
xs = []
ys = []
if 'mutations' in analysis_type:  # BAR GRAPH ANALYSIS (for mutation specificity)

    ys = list(vols.values())
    if analysis_type == 'mutations':
        xs = ['Target', 'Bulk\n Mutation', 'Toehold\n Mutation', 'Double\n Mutation', 'Multiple\n Mutations']
    elif analysis_type == 'mutations_tbe':
        xs = ['Target', 'Double\n Mutation', 'Bulk\n Mutation', 'Toehold\n Mutation', 'Multiple\n Mutations']
    else:
        xs = list(vols)  # fall back to the lane names as bar labels
    fig, ax = plt.subplots(figsize=(8,6))

    x_pos = list(range(len(xs)))

    plt.bar(x_pos, ys, color='blue')
    # optional x-axis label, disabled by the original author:
    # plt.xlabel("DNA strand present (111nM)", fontsize=25)
    plt.ylabel("Total Band Intensity (a.u.)", fontsize=22)
    plt.xticks(x_pos, xs, fontsize=18)

    plt.ylim((0,3500))
    current_ticks = [int(v) for v in ax.get_yticks().tolist()]
    current_ticks[-2] = ''  # blank the two topmost tick labels
    current_ticks[-1] = ''
    plt.yticks(fontsize=18)
    ax.set_yticklabels(current_ticks)
    plt.savefig(os.path.join(base_output_loc, 'mutation_bar_chart.png'), bbox_inches='tight', dpi=300)

else:  # TRENDLINE PLOT ANALYSIS (LOD QUANTIFICATION)
    dilution = 9  # dilution factor (with respect to labels provided in input data)
    for key, val in vols.items():
        if (key == '500n' and 'dna' in analysis_type) or key == '3m' or key == '1n':  # specific changes for specific files
            if key == '3m':
                ys.append(int(val))
                xs.append(1500)  # not actual value, just a placeholder for manual editing later
            continue
        if 'no_target' in key:  # control
            conc = 0
        elif 'm' in key:  # conversion from milli to nano
            conc = (int(key[:-1]) / dilution) * 1000000
        else:
            conc = int(key[:-1]) / dilution
        xs.append(conc)  # final concentration/intensity data
        ys.append(int(val))

    fig, ax = plt.subplots(figsize=(8,6))
    plt.scatter(xs, ys, s=60)  # initial data scatter plot

    # REGRESSION ANALYSIS
    if linear_only and analysis_type != 'dna_24' and analysis_type != 'dna_tae_24_better':
        value_range = 6  # number of trailing points used for the linear fit
        # NOTE(review): the enclosing condition already excludes 'dna_24', so `start`
        # is always 0 here; the branch is kept as written pending confirmation of intent.
        if analysis_type == 'dna_24':  # selecting data for linear plot
            start = 3
        else:
            start = 0
        # clamp at 0: a negative start index would wrap around and select the wrong
        # points when fewer than `value_range` data points are available
        st_point = max(len(xs) - value_range - start, 0)
        end_point = len(xs) - start

        x_linear = xs[st_point:end_point]
        y_linear = ys[st_point:end_point]

        z = np.polyfit(x_linear, y_linear, 1)  # fitting library call
        p = np.poly1d(z)
        # Evaluate the fit on the sorted x values. Sorting x and the fitted y
        # independently would draw a mirrored line whenever the slope is negative.
        x_sorted = sorted(x_linear)
        trend = p(x_sorted)

        print ("Trendline Equation: y=%.3fx+(%.3f)"%(z[0],z[1]))
        plt.plot(x_sorted, trend, "r--", label='Linear Fit')

    elif analysis_type != 'dna_24':  # non-linear fitting
        if analysis_type == 'dna':
            popt, pcov = curve_fit(exp_func, xs, ys)
            x_sorted = np.array(sorted(xs))
            trend = exp_func(x_sorted, *popt)  # fitted curve evaluated in plotting order
            print ("Trendline Equation: y = %.3f * exp(-%.3f * x) + %.3f"%(popt[0],popt[1], popt[2]))
            plt.plot(x_sorted, trend, "r--", label='Exponential Fit')

        # NOTE(review): the 'dna_24' half of this test cannot be true inside this elif
        elif analysis_type == 'rna' or analysis_type == 'dna_24':
            z = np.polyfit(xs, ys, 1)
            p = np.poly1d(z)
            x_sorted = sorted(xs)
            trend = p(x_sorted)

            print ("Trendline Equation: y=%.3fx+(%.3f)"%(z[0],z[1]))
            plt.plot(x_sorted, trend, "r--", label='Linear Fit')

        elif analysis_type == 'aldos' and aldos_trendline:
            curated_xs = [x for index, x in enumerate(xs) if index != 1]  # cutting out 3m value
            curated_ys = [y for index, y in enumerate(ys) if index != 1]
            z = np.polyfit(curated_xs, curated_ys, 1)
            p = np.poly1d(z)
            x_sorted = sorted(curated_xs)
            trend = p(x_sorted)

            print ("Trendline Equation: y=%.3fx+(%.3f)"%(z[0],z[1]))
            plt.plot(x_sorted, trend, "r--", label='Linear Fit')

    # Plot labels, min/max limits etc
    if analysis_type == 'aldos':
        minplot = 0
    elif analysis_type == 'aldos_2':
        minplot = 200
    elif analysis_type == 'dna_24':
        minplot = 0
    else:
        minplot = -100

    if 'dna' in analysis_type:
        biomarker = 'BRCA1'
    elif 'rna' in analysis_type:
        biomarker = 'miR-141'
    else:
        biomarker = 'Aldosterone'

    plt.xlabel('%s Concentration (nM)' % biomarker, fontsize=22)
    plt.ylabel('Total Band Intensity (a.u.)', fontsize=22)
    plt.xticks(fontsize=18)

    if analysis_type == 'aldos':
        plt.ylim((minplot, max(ys) + 1))
        plt.yticks(np.arange(minplot, max(ys) + 500, 400))
    elif analysis_type == 'dna_24':
        plt.ylim((minplot, max(ys) + 100))
    elif analysis_type == 'dna_tae_24':
        pass  # keep matplotlib's automatic y-limits
    else:
        plt.ylim((minplot, max(ys) + 500))

    current_ticks = [int(v) for v in ax.get_yticks().tolist()]

    if analysis_type == 'rna':  # covering ticks to allow for easier manual editing later
        current_ticks[-2] = ''
        current_ticks[-1] = ''
    elif analysis_type == 'dna':
        for i in [-3, -2, -1]:
            current_ticks[i] = ''
    elif analysis_type == 'aldos_2':
        for i in [-2, -1]:
            current_ticks[i] = ''
    elif analysis_type == 'aldos':
        for i in [-1]:
            current_ticks[i] = ''
        current_x_ticks = [int(v) for v in ax.get_xticks().tolist()]
        for i in [-2, -1]:
            current_x_ticks[i] = ''
        ax.set_xticklabels(current_x_ticks)

    ax.set_yticklabels(current_ticks)
    plt.yticks(fontsize=18)

    if analysis_type == 'aldos_2' or (analysis_type == 'aldos' and not aldos_trendline):  # cutoff point plotting (no regression)
        current_ticks = [int(v) for v in ax.get_xticks().tolist()]
        for i in [-2, -1]:
            current_ticks[i] = ''
        ax.set_xticklabels(current_ticks)
        if not linear_only:
            if analysis_type == 'aldos':
                plt.axhline(1000, color='darkgray', linestyle='--')
            else:
                plt.axhline(600, color='darkgray', linestyle='--')

    # optional legend, disabled by the original author:
#     plt.legend(fontsize=18)
    # save next to the other outputs instead of a machine-specific desktop path
    plt.savefig(os.path.join(base_output_loc, 'concentration_plot.png'), bbox_inches='tight', dpi=300)

## FBS Plots

For FBS plots, first calculate the volumes using the cell below, then run the subsequent cells according to the data available.

In [None]:
# Recompute `vols` (lane name -> summed corrected signal within the lane's band) for the FBS cells.
vols = {}
for name, lane_trace in profiles.items():  # iterate over every available lane

    lane_baseline = baseline_als(lane_trace)  # background estimate for this lane
    lane_signal = np.array(lane_trace) - lane_baseline  # background-subtracted trace

    if full_plots:  # per-lane figure showing raw, baseline and corrected traces
        plt.figure(figsize=(10, 6))
        plt.plot(lane_trace, label='Raw Profile')
        plt.plot(lane_baseline, label='Baseline Background')
        plt.plot(lane_signal, label='Corrected Signal')

        # red vertical lines at the two stored band limits
        for limit_index in (0, 1):
            plt.axvline(bands[name][limit_index], color='r')

        plt.title(name, fontsize=22)
        plt.xlabel('Pixel Position', fontsize=22)
        plt.ylabel('Average Intensity (a.u.)', fontsize=22)
        plt.legend(fontsize=18)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)

    # volume is the sum of the corrected signal between the band limits
    band_slice = slice(bands[name][0], bands[name][1])
    vols[name] = sum(lane_signal[band_slice])

## Aldos FBS

In [None]:
# Scatter plot of the lanes whose name contains 'fbs'; the plotted points are
# kept in fbs_xs / fbs_ys for the combined plot.
dilution = 9  # dilution factor (with respect to labels provided in input data)
xs = []
ys = []
for key, val in vols.items():
    if 'fbs' not in key:
        continue
    # The '500n' / '3m' / '1n' special cases used by the non-FBS analysis compare the
    # whole key, so they can never match a key containing 'fbs' and are omitted here.
    if 'no_target' in key:  # control
        conc = 0
    else:
        # strip the '_fbs...' suffix before parsing, then drop the trailing unit letter
        label = key.split('_fbs')[0]
        if 'm' in label:  # conversion from milli to nano
            conc = (int(label[:-1]) / dilution) * 1000000
        else:
            conc = int(label[:-1]) / dilution
    if conc > 4000:  # concentrations above 4000 nM are left out of this plot
        continue
    xs.append(conc)  # final concentration/intensity data
    ys.append(int(val))

fig, ax = plt.subplots(figsize=(8,6))
plt.scatter(xs, ys, s=60)  # initial data scatter plot
biomarker = 'Aldosterone'
plt.xlabel('%s Concentration (nM)' % biomarker, fontsize=22)
plt.ylabel('Total Band Intensity (a.u.)', fontsize=22)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# save next to the other outputs instead of a machine-specific desktop path
plt.savefig(os.path.join(base_output_loc, 'aldos_fbs_plot.png'), bbox_inches='tight', dpi=300)

fbs_xs = xs
fbs_ys = ys

In [None]:
# Scatter plot of the non-FBS lanes (plus the 'no_target_top_eth' control); the
# plotted points are kept in standard_xs / standard_ys for the combined plot.
dilution = 9  # dilution factor (with respect to labels provided in input data)
xs = []
ys = []
for key, val in vols.items():
    is_standard_lane = 'fbs' not in key and 'no_target' not in key
    if not (is_standard_lane or key == 'no_target_top_eth'):
        continue
    if (key == '500n' and 'dna' in analysis_type) or key == '3m' or key == '1n':  # specific changes for specific files
        if key == '3m':
            ys.append(int(val))
            xs.append(1500)  # not actual value, just a placeholder for manual editing later
        continue
    if 'no_target' in key:  # control
        conc = 0
    elif 'm' in key:  # conversion from milli to nano
        conc = (int(key[:-1]) / dilution) * 1000000
    else:
        # keys reaching this branch never contain 'fbs', so no suffix needs stripping
        conc = int(key[:-1]) / dilution
    if conc > 4000:  # concentrations above 4000 nM are left out of this plot
        continue
    xs.append(conc)  # final concentration/intensity data
    ys.append(int(val))

fig, ax = plt.subplots(figsize=(8,6))
plt.scatter(xs, ys, s=60)  # initial data scatter plot
biomarker = 'Aldosterone'
plt.xlabel('%s Concentration (nM)' % biomarker, fontsize=22)
plt.ylabel('Total Band Intensity (a.u.)', fontsize=22)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# save next to the other outputs instead of a machine-specific desktop path
plt.savefig(os.path.join(base_output_loc, 'aldos_plot.png'), bbox_inches='tight', dpi=300)

standard_xs = xs
standard_ys = ys

## Combined Aldos Plot

In [None]:
# Overlays the FBS and standard aldosterone series after subtracting a fixed
# offset from each, and prints the raw (concentration, intensity) pairs.
print('Raw Data:')
print('FBS:\n', list(zip(fbs_xs, fbs_ys)))
print('Standard:\n', list(zip(standard_xs, standard_ys)))

# NOTE(review): fixed offsets, presumably the zero-concentration intensity of each
# series for one specific gel image; confirm and update when using other data.
fbs_zero_reference = 164
standard_zero_reference = 506
fo_ys = [y - fbs_zero_reference for y in fbs_ys]
fn_ys = [y - standard_zero_reference for y in standard_ys]


fig, ax = plt.subplots(figsize=(8,6))

# scatter-only alternative, disabled by the original author:
# plt.scatter(fbs_xs,fo_ys, color='b', s=60)
# plt.scatter(standard_xs,fn_ys, color='r', s=60)

plt.plot(fbs_xs, fo_ys, '-.', color='b', label='20% FBS', marker='^', markersize=10)
plt.plot(standard_xs, fn_ys, '--', color='r', label='no FBS', marker='o', markersize=7)

biomarker = 'Aldosterone'
plt.xlabel('%s Concentration (nM)' % biomarker, fontsize=22)
plt.ylabel('Zero-Reference Band Intensity (a.u.)', fontsize=22)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.legend(fontsize=15)
# save next to the other outputs instead of a machine-specific desktop path
plt.savefig(os.path.join(base_output_loc, 'combined_aldos_fbs_plot.png'), bbox_inches='tight', dpi=300)