# Analysis Template
Updated 5/3/24 DFM

In [None]:
# Enable automatic reloading of edited modules
%load_ext autoreload
%autoreload 2

from copy import deepcopy

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

from pathlib import Path 
from htbam_db_api.htbam_db_api import LocalHtbamDBAPI
from htbam_analysis.analysis.experiment import HTBAMExperiment
from scipy.optimize import curve_fit

#Import Kinetics Package for line fitting:
import kinetics

#Configuration settings for pandas and seaborn
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_theme(style='ticks', context='paper', font_scale=1.2, rc={"lines.linewidth": 1.2})

#enable inline plotting of matplotlib figures
%matplotlib inline

#set the figure format to SVG
%config InlineBackend.figure_format = 'svg'


## 1. Connect DB Api

In [107]:
### PARAMETERS:
# Slope of the eGFP calibration curve, in RFU per EGFP_SLOPE_CONC_UNITS.
EGFP_SLOPE = 91900.03
EGFP_SLOPE_CONC_UNITS = 'nM'

# NOTE(review): absolute, user-specific path; the notebook will not run on another machine
# without editing this line.
root = '/Users/duncanmuir/Desktop/20240501_nick_inhibition_fix/En-Flowon/'

# Build the local DB wrapper from the standard-series CSV and the titration (kinetic) CSV.
db_conn = LocalHtbamDBAPI(
    standard_curve_data_path=root + 'd3_2_StandardSeries_Analysis.csv',
    standard_name="NADPH std curve",
    standard_type="NADPH",
    standard_units="uM",
    kinetic_data_path=root + 'd3_TitrationSeries_Analysis-noinhib.csv',
    kinetic_name="ADP kinetics curve",
    kinetic_type="ADP",
    kinetic_units="uM",
)

# Experiment object that drives every analysis step below.
htbam_experiment = HTBAMExperiment(db_conn)

# TODO: it would be nice to print out what is contained in the database for this experiment for the user to see




## 2. Standards

In [108]:
# Fit the standard curve for run 'standard_0'. The log output reports the number of
# time points, chambers and concentrations found, followed by a per-chamber progress bar.
htbam_experiment.fit_standard_curve('standard_0')

Existing run data not found. Fetching from database.
Standard curve data found for run "standard_0" with:
	-- 1 time points.
	-- 1792 chambers.
	-- 6 concentrations.

Fitting standard curve...


100%|██████████| 1792/1792 [00:00<00:00, 32972.84it/s]


In [109]:
# Plot the standard-curve results for run 'standard_0' across the chip.
# NOTE(review): this call also prints a very large {chamber_id: value} dict; consider
# clearing that output before sharing the notebook.
htbam_experiment.plot_standard_curve_chip('standard_0')

{'1,1': 3787414.9806231027, '1,2': 4116612.9246228244, '1,3': 4165371.8937463355, '1,4': 4147746.909738952, '1,5': 3911450.455293237, '1,6': 3999379.8168772897, '1,7': 3622925.965152931, '1,8': 3796233.0744716055, '1,9': 3938714.167478391, '1,10': 3829434.992801871, '1,11': 4187000.5657322644, '1,12': 4336055.694329152, '1,13': 4325515.373867311, '1,14': 4108066.946907997, '1,15': 4105105.912519519, '1,16': 4057226.00905571, '1,17': 4073676.397888162, '1,18': 4333367.070199168, '1,19': 4084709.777751978, '1,20': 3988631.0255301357, '1,21': 4030020.0386899407, '1,22': 3930131.0198529027, '1,23': 4393207.362228401, '1,24': 4461699.061729757, '1,25': 4155221.178124656, '1,26': 4182641.949259, '1,27': 4434067.718112, '1,28': 4556400.848282027, '1,29': 4132461.1698584175, '1,30': 4130167.5438418277, '1,31': 4087151.287750992, '1,32': 4065387.9240423297, '1,33': 4061750.962029872, '1,34': 3905188.7477288167, '1,35': 4034361.7467651963, '1,36': 4426364.110775715, '1,37': 4216691.772974511, '1

## 3. Fit Initial Rates

In [110]:
# Fit initial rates for run 'kinetics_0'. Per the method's log output, the standard curve
# from run 'standard_0' is used to convert luminance data to concentration data.
# NOTE(review): the keyword meanings below are inferred from their names only; confirm
# against HTBAMExperiment.fit_initial_rates:
#   max_rxn_perc             - presumably the maximum percent conversion included in the fit
#   starting_timepoint_index - presumably the first time-point index used
#   max_rxn_time             - presumably the latest time included (units not shown here)
#   substrate_conc           - presumably the substrate concentration (units not shown here)
htbam_experiment.fit_initial_rates('kinetics_0', 
                                   'standard_0',
                                   max_rxn_perc=10,
                                   starting_timepoint_index=1,
                                   max_rxn_time=300,
                                   substrate_conc=60)

# Leftover alternative (per-concentration array form), kept for reference:
# substrate_conc = np.array([60 for _ in range(conc_dim)])

Existing run data not found. Fetching from database.
Activity data found for run "kinetics_0" with:
	-- 20 time points.
	-- 1792 chambers.
	-- 8 concentrations.
Using standard curve data from run "standard_0" to convert luminance data to concentration data.


100%|██████████| 1792/1792 [00:04<00:00, 411.01it/s]

1 reactions had less than 2 points for fitting





In [111]:
# Plot the fitted initial rates for run 'kinetics_0' across the chip.
htbam_experiment.plot_initial_rates_chip('kinetics_0')

## 4. Filter initial rates

In [112]:
# Compute per-chamber enzyme concentration for run 'kinetics_0' from the eGFP slope
# (EGFP_SLOPE is in RFU per nM, see the parameters cell).
# NOTE(review): the return value is discarded here, yet later cells read a global named
# `enzyme_concentration` that no visible cell assigns; confirm where it should come from.
htbam_experiment.compute_enzyme_concentration('kinetics_0', EGFP_SLOPE)

In [127]:
# Apply quality filters to the initial rates of run 'kinetics_0' (standard curve: 'standard_0').
# The log output reports three filters: standard-curve Pearson r^2, enzyme expression,
# and initial-rate R^2.
# With initial_rate_R2_threshold = 0.0 the initial-rate R^2 filter passes every chamber
# (1792/1792 in the recorded output); positive_initial_slope_filter is also switched off.
htbam_experiment.filter_initial_rates('kinetics_0',
                                      'standard_0',
                                      standard_curve_r2_cutoff = 0.98,
                                      expression_threshold = 1.0,
                                      initial_rate_R2_threshold = 0.0, 
                                      positive_initial_slope_filter = False)

Pearson r^2 filter: 1679/1792 chambers pass
Enzyme expression filter: 1782/1792 chambers pass
Initial Rate R^2 filter: 1792/1792 chambers pass with 10 or more slopes.


In [122]:
### Known issue (original author): some filtered-out chambers still show slopes here,
# although they would be expected to be all-NaN.
# NOTE(review): on a fresh kernel this cell stops with
# "NameError: name 'chamber_idxs' is not defined". `chamber_idxs` and
# `filtered_initial_slopes` are not assigned by any visible cell.

# Value shown per chamber: True when the chamber kept at least one non-NaN filtered slope.
# Keys are chamber ids (e.g. '1,1').
filtered_initial_rates_to_plot = {
    chamber: np.any(~np.isnan(filtered_initial_slopes[row, :]))
    for row, chamber in enumerate(chamber_idxs)
}

# Chamber names are supplied as a {chamber_id: sample_name} dict.

# Per-chamber subplots are drawn by a callback that receives (chamber_id, ax), draws the raw
# data plus the regression lines, and returns ax. The callback must NOT call plt.show().

def plot_chamber_filtered_initial_rates(chamber_id, ax):
    """Draw one chamber's early progress curves together with their fitted initial-rate lines.

    Reads the notebook-level arrays ``chamber_idxs``, ``time_data``,
    ``product_concentration``, ``filtered_initial_slopes``,
    ``initial_slopes_intercepts`` and ``reg_idx_arr``.

    Parameters
    ----------
    chamber_id
        Chamber identifier as stored in ``chamber_idxs`` (e.g. '1,1').
    ax
        Matplotlib axes to draw on. ``plt.show()`` is intentionally not called here.

    Returns
    -------
    The same ``ax``, after plotting.

    Raises
    ------
    ValueError
        If ``chamber_id`` is not present in ``chamber_idxs``.
    """
    # Known issue (original author): occasionally the scatter and line colours do not match.

    # Fraction of the total time course that is shown.
    fraction_of_time_to_plot = 0.2

    # Convert the 'x,y' chamber id into its row index in the data arrays.
    data_index = list(chamber_idxs).index(chamber_id)
    x_data = time_data[:,0]
    # After the transpose each row of y_data is one concentration.
    y_data = product_concentration[:,data_index,:].T

    # Keep only the time points that fall in the first fraction of the run.
    time_cutoff = np.nanmax(x_data) * fraction_of_time_to_plot
    time_idxs_to_plot = x_data < time_cutoff
    x_data = x_data[time_idxs_to_plot]
    y_data = y_data[:, time_idxs_to_plot]

    # Slopes and intercepts come straight from the earlier regression. The previous
    # "force the line through the first point" intercept was dead code (immediately
    # overwritten) and could raise IndexError when no time point was selected.
    current_chamber_slopes = filtered_initial_slopes[data_index,:]
    current_chamber_intercepts = initial_slopes_intercepts[data_index,:]
    # Mask of the points used in each regression, trimmed to the plotted window.
    current_chamber_reg_mask = reg_idx_arr[data_index,:][:,:len(x_data)]

    colors = sns.color_palette('husl', n_colors=y_data.shape[0])

    for i in range(y_data.shape[0]): # one pass per concentration
        # Faint markers: every plotted point. Solid, larger markers: points used in the fit.
        ax.scatter(x_data, y_data[i,:], color=colors[i], alpha=0.3)
        ax.scatter(x_data[current_chamber_reg_mask[i]], y_data[i, current_chamber_reg_mask[i]], color=colors[i], alpha=1, s=50)

        m = current_chamber_slopes[i]
        b = current_chamber_intercepts[i]
        # Filtered-out fits are NaN; draw a line only when both parameters exist.
        if not (np.isnan(m) or np.isnan(b)):
            ax.plot(x_data, m*np.array(x_data) + b, color=colors[i])
    return ax

    

### PLOT THE CHIP: one subplot per chamber, coloured by whether the chamber passed the filters.
plot_chip(filtered_initial_rates_to_plot, chamber_names_dict, graphing_function=plot_chamber_filtered_initial_rates, title='Kinetics: Filtered Initial Rates (Max)')

# Report the pass count against the actual number of chambers instead of a hard-coded 1792.
n_passing_chambers = int(np.sum(list(filtered_initial_rates_to_plot.values())))
print('{}/{} wells pass our filters.'.format(n_passing_chambers, len(filtered_initial_rates_to_plot)))

NameError: name 'chamber_idxs' is not defined

## 9. Fit Inhibition Constant:

In [None]:
# Convert enzyme concentrations from the eGFP standard-curve units into the substrate units.
substrate_conc_unit = db_conn._json_dict['runs']['kinetics_0']['conc_unit']
if  substrate_conc_unit != EGFP_SLOPE_CONC_UNITS: print('Substrate concentration units do not match EGFP standard curve units! \n{} != {}'.format(substrate_conc_unit, EGFP_SLOPE_CONC_UNITS))

# Power-of-ten exponent, relative to molar, of each supported concentration unit.
molar_exponent_by_unit = {'M': 0, 'mM': -3, 'uM': -6, 'nM': -9, 'pM': -12}
for concentration_unit in (EGFP_SLOPE_CONC_UNITS, substrate_conc_unit):
    if concentration_unit not in molar_exponent_by_unit:
        raise ValueError('Unsupported concentration unit {!r}; expected one of {}'.format(
            concentration_unit, sorted(molar_exponent_by_unit)))

# Factor that converts FROM eGFP units TO substrate units (nM -> uM gives 0.001).
# It is derived from the two unit strings so it stays correct if either unit changes;
# the previous hard-coded 0.001 was applied even when the units already matched.
unit_converstion = 10.0 ** (molar_exponent_by_unit[EGFP_SLOPE_CONC_UNITS] - molar_exponent_by_unit[substrate_conc_unit])
enzyme_concentration_converted_units = enzyme_concentration * unit_converstion

#Double check!
print('Conversion:')
print('{} {} = {} {}  ?'.format(enzyme_concentration[0], EGFP_SLOPE_CONC_UNITS, enzyme_concentration_converted_units[0], substrate_conc_unit))


In [None]:
# Fit an inhibition constant (K_i) for each chamber from its max-normalised initial rates.
# Results: K_i_array / K_i_error_array, one entry per chamber (NaN where no fit was possible).

def inhibition_model(x, Ki, r_max):
    """Normalised rate at concentration ``x``: 1 - r_max * x / (x + Ki)."""
    return 1 - (r_max * (x / (x + Ki)))

# Collect into Python lists and convert once; np.append inside the loop copies the
# whole array on every iteration.
K_i_values = []
K_i_errors = []

for i in range(len(chamber_idxs)):
    current_slopes = filtered_initial_slopes[i, :]

    if np.all(np.isnan(current_slopes)):
        print('Chamber {} has no slopes!'.format(chamber_idxs[i]))
        K_i_values.append(np.nan)
        K_i_errors.append(np.nan)
        continue

    # Keep only the concentrations that still have a slope after filtering.
    non_nan_idxs = np.where(~np.isnan(current_slopes))[0]

    current_slopes = current_slopes[non_nan_idxs]
    current_concs = conc_data[non_nan_idxs]

    if len(current_slopes) < 3:
        print('Chamber {} has fewer than 3 slopes!'.format(chamber_idxs[i]))
        K_i_values.append(np.nan)
        K_i_errors.append(np.nan)
        continue

    # Normalise to the chamber's largest rate so the model's uninhibited value is 1.
    max_normed_slopes = current_slopes / np.nanmax(current_slopes)

    try:
        p_opt, p_cov = curve_fit(inhibition_model, current_concs, max_normed_slopes)
    except (RuntimeError, ValueError) as fit_error:
        # curve_fit raises RuntimeError when the optimisation does not converge and
        # ValueError on non-finite input (e.g. after dividing by a zero maximum).
        # One bad chamber must not abort the fit of every other chamber.
        print('Chamber {}: K_i fit failed ({})'.format(chamber_idxs[i], fit_error))
        K_i_values.append(np.nan)
        K_i_errors.append(np.nan)
        continue

    K_i = p_opt[0]
    # One-sigma parameter errors from the covariance diagonal; index 0 belongs to Ki.
    # NOTE(review): the fitted r_max (p_opt[1]) is not stored anywhere.
    std_err = np.sqrt(np.diag(p_cov))

    K_i_values.append(K_i)
    K_i_errors.append(std_err[0])

K_i_array = np.array(K_i_values, dtype=float)
K_i_error_array = np.array(K_i_errors, dtype=float)
    


In [None]:
# Inputs: chamber_idxs, K_i_array, K_i_error_array (one K_i entry per chamber).

# Make sure the run has an 'analyses' container, then store the raw per-chamber fits in it.
db_conn._json_dict['runs']['kinetics_0'].setdefault('analyses', {})

# One record per chamber id, holding the fitted K_i and its standard error.
db_conn._json_dict['runs']['kinetics_0']['analyses']['K_i_raw'] = {
    'chambers': {
        chamber_idx: {
            'K_i': K_i_array[i],
            'K_i_error': K_i_error_array[i],
        }
        for i, chamber_idx in enumerate(chamber_idxs)
    }
}
   

In [None]:
# Group chamber ids by the sample name ('id') recorded in the chamber metadata:
# {sample_name: [chamber_id, ...]}.
chamber_name_to_idx = {}
for chamber_idx, subdict in db_conn._json_dict['chamber_metadata'].items():
    name = subdict['id']
    chamber_name_to_idx.setdefault(name, []).append(chamber_idx)

# Report how many replicate chambers each sample has on average.
replicate_counts = [len(replicate_ids) for replicate_ids in chamber_name_to_idx.values()]
n_replicates = np.mean(replicate_counts)
print('Average number of replicates per sample: {}'.format(np.round(n_replicates, 1)))

In [None]:
# Outlier cut-offs, in standard deviations from the per-sample mean.
# Replicates whose |z-score| of K_i reaches this value are dropped:
z_score_threshhold_MM = 1.5
# Replicates whose |z-score| of enzyme concentration exceeds this value are dropped:
z_score_threshhold_expression = 1.5

In [None]:
# Aggregate the per-chamber K_i values into one mean / standard deviation per sample,
# dropping outlier replicates by z-score (K_i and enzyme expression).
# Outputs: sample_names, sample_K_i, sample_K_i_error (arrays, one entry per kept sample),
# sample_K_i_replicates (list of chamber-id arrays) and 'kifile.csv'.

# Per-sample results are collected in lists and converted once after the loop.
sample_name_list = []
sample_K_i_list = []
sample_K_i_error_list = []
sample_K_i_replicates = []

# Placeholders kept for compatibility; this cell never fills them.
K_i_zscores = np.array([])
enzyme_concentration_zscores = np.array([])
export_list1=[]

# Convert once instead of once per replicate lookup.
all_chamber_ids = list(chamber_idxs)

for name, ids in chamber_name_to_idx.items():

    ### GATHER K_i OF THE REPLICATES FOR THIS SAMPLE: ###
    # Row indices of this sample's chambers in the per-chamber arrays.
    idxs = [all_chamber_ids.index(x) for x in ids]

    # Indexing with a list makes a copy, so the edits below do not touch K_i_array.
    K_i = K_i_array[idxs]

    # Chamber ids, filtered in parallel with K_i so we know which wells were kept.
    K_i_replicates = np.array(ids)

    # Nothing to aggregate when every replicate was pre-filtered.
    if np.all(np.isnan(K_i)):
        print('No values from sample {}, all pre-filtered.'.format(name))
        continue

    ### FILTER OUT OUTLIERS: ###
    # z-score of each replicate's K_i. With zero spread every z-score is defined as 0;
    # dividing by a zero std would yield NaN and silently discard all replicates.
    K_i_spread = np.nanstd(K_i)
    if K_i_spread > 0:
        K_i_zscore = (K_i - np.nanmean(K_i))/K_i_spread
    else:
        K_i_zscore = np.zeros(len(idxs))

    # z-score of enzyme expression for each well (in units of 'substrate_conc_unit').
    sample_enzyme = enzyme_concentration_converted_units[idxs]
    enzyme_spread = np.nanstd(sample_enzyme)
    if enzyme_spread > 0:
        enzyme_concentration_zscore = (sample_enzyme - np.nanmean(sample_enzyme))/enzyme_spread
    else:
        enzyme_concentration_zscore = np.zeros(len(idxs))

    # Expression outliers are marked NaN so the final NaN filter removes them.
    K_i[np.abs(enzyme_concentration_zscore) > z_score_threshhold_expression] = np.nan

    # Keep only replicates whose K_i z-score is inside the threshold (ids filtered alike).
    within_threshold = np.abs(K_i_zscore) < z_score_threshhold_MM
    K_i = K_i[within_threshold]
    K_i_replicates = K_i_replicates[within_threshold]

    # Drop NaNs (missing fits and expression outliers).
    has_value = ~np.isnan(K_i)
    K_i_replicates = K_i_replicates[has_value]
    K_i = K_i[has_value]

    if len(K_i) < 3:
        print('Not enough replicates for sample {}. Skipping.'.format(name))
        continue

    # Per-sample summary statistics (np.std is the population standard deviation).
    sample_name_list.append(name)
    sample_K_i_list.append(np.mean(K_i))
    sample_K_i_error_list.append(np.std(K_i))

    # Remember which chambers contributed.
    sample_K_i_replicates.append(K_i_replicates)

    # One export row per sample: the name followed by each retained K_i.
    export_list1.append([name] + list(K_i))

sample_names = np.array(sample_name_list)
sample_K_i = np.array(sample_K_i_list, dtype=float)
sample_K_i_error = np.array(sample_K_i_error_list, dtype=float)

df2 = pd.DataFrame(export_list1)
df2.to_csv('kifile.csv')


In [None]:
# Store the per-sample (outlier-filtered) K_i summary under the run's 'analyses' entry.
db_conn._json_dict['runs']['kinetics_0'].setdefault('analyses', {})

# One record per sample: mean K_i, its spread, and the chamber ids that contributed.
# NOTE(review): 'K_i_replicates' holds whatever sample_K_i_replicates contains; if those are
# numpy arrays they may need converting before this dict is serialised. Confirm.
db_conn._json_dict['runs']['kinetics_0']['analyses']['K_i_filtered'] = {
    'samples': {
        sample_name: {
            'K_i': sample_K_i[i],
            'K_i_error': sample_K_i_error[i],
            'K_i_replicates': sample_K_i_replicates[i],
        }
        for i, sample_name in enumerate(sample_names)
    }
}

In [None]:
# Visualise: value plotted per chamber is the filtered mean K_i of the chamber's sample.
# Start with NaN for every chamber id (e.g. '1,1') ...
K_i_to_plot = dict.fromkeys(chamber_idxs, np.nan)
# ... then overwrite the chambers that belong to a sample with a filtered K_i.
for kept_sample, kept_sample_K_i in zip(sample_names, sample_K_i):
    for chamber_idx in chamber_name_to_idx[kept_sample]:
        K_i_to_plot[chamber_idx] = kept_sample_K_i

# The per-chamber subplot is drawn by the plotting callback defined next.

#plotting function: We'll generate an MM subplot for each chamber.
def plot_chamber_K_i(chamber_id, ax):
    """Plot one chamber's normalised initial rates against concentration.

    Draws (1) the mean +/- std of the max-normalised rates over the sample's retained
    replicates, (2) this chamber's own normalised rates, and (3) the inhibition curve
    for this chamber's stored K_i. Returns ``ax`` untouched when the chamber's sample
    has no entry under 'K_i_filtered'.

    Reads the notebook-level names ``conc_data``, ``chamber_names_dict``, ``chamber_idxs``,
    ``filtered_initial_slopes`` and ``db_conn``.

    Parameters
    ----------
    chamber_id
        Chamber identifier (e.g. '1,1').
    ax
        Matplotlib axes to draw on.

    Returns
    -------
    The same ``ax``.
    """
    # Concentrations paired with each initial rate.
    substrate_concs = conc_data

    ### MEAN OVER THE SAMPLE'S REPLICATES ###
    chamber_name = chamber_names_dict[chamber_id]
    filtered_samples = db_conn._json_dict['runs']['kinetics_0']['analyses']['K_i_filtered']['samples']
    # Samples that did not survive filtering have nothing to draw.
    if chamber_name not in filtered_samples:
        return ax

    # Translate the retained replicate ids into row indices of the slope array.
    all_chamber_ids = list(chamber_idxs)
    replicate_rows = [all_chamber_ids.index(replicate_id)
                      for replicate_id in filtered_samples[chamber_name]['K_i_replicates']]

    # Normalise every replicate by its own maximum rate, then average across replicates.
    replicate_slopes = filtered_initial_slopes[replicate_rows,:]
    replicate_max = np.nanmax(replicate_slopes, axis=1)
    replicate_normed = replicate_slopes / replicate_max[:, np.newaxis]
    mean_normed = np.nanmean(replicate_normed, axis=0)
    std_normed = np.nanstd(replicate_normed, axis=0)

    ax.errorbar(substrate_concs, mean_normed, yerr=std_normed,  fmt='o', label="Average")

    ### THIS CHAMBER'S OWN RATES AND K_i CURVE ###
    own_slopes = filtered_initial_slopes[all_chamber_ids.index(chamber_id), :]
    own_normed = own_slopes / np.nanmax(own_slopes)

    chamber_Ki = db_conn._json_dict['runs']['kinetics_0']['analyses']['K_i_raw']['chambers'][chamber_id]['K_i']

    def inhibition_model(x, Ki, v_0):
        return 1 - (v_0 * (x / (x + Ki)))

    ax.scatter(substrate_concs, own_normed, color='green', s=100, label='Chamber')

    # Log-spaced x values for the curve; the first concentration is left out of the
    # lower bound (presumably because it is zero and has no logarithm; confirm).
    lower_exponent = np.log10(np.nanmin(substrate_concs[1:]))
    upper_exponent = np.log10(np.nanmax(substrate_concs))
    x_logspace = np.logspace(lower_exponent, upper_exponent, 100)

    # NOTE(review): the curve is drawn with an amplitude fixed at 1; only K_i is read from
    # the stored results. Confirm this matches how K_i was fitted.
    ax.plot(x_logspace, inhibition_model(x_logspace, chamber_Ki, 1), color='green', label='Chamber Fit')

    ax.set_xscale('log')
    ax.legend()

    return ax

# Sample name shown for each chamber: {chamber_id: sample 'id' from the chamber metadata}.
chamber_names_dict = {}
for chamber_idx, subdict in db_conn._json_dict['chamber_metadata'].items():
    chamber_names_dict[chamber_idx] = subdict['id']

### PLOT THE CHIP: one K_i subplot per chamber.
plot_chip(K_i_to_plot, chamber_names_dict, graphing_function=plot_chamber_K_i, title='Filtered K_i')

In [None]:
# Collect, per filtered sample, the raw K_i of every retained replicate chamber.
# sample_name[k] is the label for the list of values in kis[k].
run_analyses = db_conn._json_dict['runs']['kinetics_0']['analyses']
sample_name = []
kis = []
for sample, dic in run_analyses['K_i_filtered']['samples'].items():
    replicate_K_i = [run_analyses['K_i_raw']['chambers'][i]['K_i'] for i in dic['K_i_replicates']]
    kis.append(replicate_K_i)
    sample_name.append(sample)


In [None]:
# Violin plot of replicate K_i values, one violin per sample, labelled by sample name.
fig, ax = plt.subplots()
ax.violinplot(kis)
# Violins are drawn at positions 1..N, so the tick positions start at 1.
violin_positions = np.arange(1, len(sample_name) + 1)
ax.set_xticks(violin_positions)
ax.set_xticklabels(sample_name, rotation=45)
ax.set_ylabel('$K_i$ (uM)')


## 10. Export to CSV in format people like

In [None]:
# Summary CSV: one row per SAMPLE that survived filtering.
output_csv_name = 'inhibition_summary'

import csv

# Convert once so replicate chamber ids can be mapped to rows of the per-chamber arrays.
all_chamber_ids = list(chamber_idxs)

with open(output_csv_name+'_short.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    #write header:
    writer.writerow(['id', 
                     'substrate_name', 
                     'assay_type', 
                     'replicates', 
                     'Ki_mean_filtered', 
                     'Ki_stdev_filtered', 
                     'enzyme'])
    #write data:
    for i, sample_name in enumerate(sample_names):
        # enzyme_concentration_converted_units is indexed per CHAMBER, so indexing it with
        # the sample index (as before) reported an unrelated chamber's value. Report the
        # mean over the chambers that actually contributed to this sample instead.
        replicate_rows = [all_chamber_ids.index(chamber_id) for chamber_id in sample_K_i_replicates[i]]
        sample_enzyme_concentration = np.nanmean(enzyme_concentration_converted_units[replicate_rows])

        row = [sample_name,
               sample_name,  # NOTE(review): 'substrate_name' repeats the sample id; confirm intent
               db_conn._json_dict['runs']['kinetics_0']['type'], 
               len(sample_K_i_replicates[i]), 
               sample_K_i[i], 
               sample_K_i_error[i], 
               sample_enzyme_concentration,
               ]
        writer.writerow(row)

In [None]:
# Full CSV: one row per CHAMBER, with the chamber's own K_i plus its sample's filtered summary.
import csv
output_csv_name = 'inhibition'

with open(output_csv_name+'.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # Header row.
    writer.writerow(['id', 
                     'x,y',
                     'substrate_name', 
                     'assay_type', 
                     'replicates', 
                     'Ki', 
                     'Ki_mean_filtered', 
                     'Ki_stdev_filtered', 
                     'enzyme',])
    # One data row per chamber.
    for i, chamber_idx in enumerate(chamber_idxs):
        sample_name = chamber_names_dict[chamber_idx]

        if sample_name in sample_names:
            # The sample survived filtering: report its replicate count and summary statistics.
            sample_idx = list(sample_names).index(sample_name)
            replicate_count = len(sample_K_i_replicates[sample_idx])
            K_i_mean_filtered = sample_K_i[sample_idx]
            K_i_stdev_filtered = sample_K_i_error[sample_idx]
        else:
            # No filtered summary for this sample: write literal 'NaN' placeholders.
            replicate_count = 'NaN'
            K_i_mean_filtered = 'NaN'
            K_i_stdev_filtered = 'NaN'

        row = [sample_name, #id
               chamber_idx, #x,y
               sample_name, #substrate_name
               db_conn._json_dict['runs']['kinetics_0']['type'], #assay_type
               replicate_count, #replicates
               db_conn._json_dict['runs']['kinetics_0']['analyses']['K_i_raw']['chambers'][chamber_idx]['K_i'], #Ki of this chamber
               K_i_mean_filtered, #Ki_mean_filtered
               K_i_stdev_filtered, #Ki_stdev_filtered
               enzyme_concentration_converted_units[i], #enzyme (per chamber)
               ]

        writer.writerow(row)