In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import copy

# Global matplotlib styling: 7 pt text, Helvetica everywhere
plt.rcParams.update({'font.size': 7})
plt.rcParams['font.family'] = 'Helvetica'

# FutureWarnings are suppressed for the whole notebook
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Output locations used throughout the notebook
plot_folder = '../plots/1_process_count_data/'
data_output_folder = '../measured_data/2_normalized_log10/'

# make sure both output folders exist before anything is written to them
for output_folder in (plot_folder, data_output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

# Cell lines, in the order used for every table and figure below
cell_lines_subset = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549"]
cell_lines_rest = ["HaCaT", "JEG3", "Tera1", "PC3"]
cell_lines = cell_lines_subset + cell_lines_rest

# Names of the measurement columns, one per cell line
cell_line_labels = [cell_line + "_3UTR" for cell_line in cell_lines]

# Hyphenated cell line spellings found in the design files -> spelling used here
label_rename = {
    "HUH-7": "HUH7",
    "JEG-3": "JEG3",
    "Tera-1": "Tera1",
    "SK-N-SH": "SKNSH",
    "PC-3": "PC3",
}

### This notebook processes raw stability data into normalized stability data.

# 1.1 - Interreplicate correlation

In [2]:
# Load the library 2 count / log2 fold-change table; the first CSV column becomes the row index
count_df = pd.read_csv('../measured_data/1_count_data/library2_count_and_log2fc_data.csv', index_col=0)

In [3]:
# count columns, and their split into the two replicates (matched by substring)
count_cols = [name for name in count_df.columns if 'count' in name]
r1_count_cols = [name for name in count_cols if 'r1' in name]
r2_count_cols = [name for name in count_cols if 'r2' in name]

# rows to keep: the library controls first, then every miRNA design (either spelling)
control_idx = [label for label in count_df.index if '0_lib2_control' in label]
mirna_idx = [label for label in count_df.index
             if any(tag in label for tag in ('miRNA', 'mirna'))]

# everything that is neither a control nor a miRNA design is discarded
count_df = count_df.loc[control_idx + mirna_idx]

In [4]:
# Keep the count columns only, strip the "count_" and "3UTR_" parts from their
# names, and move to log10 scale.
# NOTE(review): no pseudocount is added here, whereas the scatter plots further
# down use log10(count + 1); a zero count would become -inf in this table.
count_df_filter = np.log10(
    count_df[count_cols].rename(
        columns=lambda name: name.replace('count_', '').replace('3UTR_', '')
    )
)

# squared pairwise correlation between all samples (pandas default method)
corr_df = count_df_filter.corr().pow(2)

In [None]:
# create the folder for this section's figures (no-op if it already exists)
os.makedirs(os.path.join(plot_folder, '1.1_interreplicate_correlation'), exist_ok=True)

plt.figure(figsize=(3, 3))
sns.set(font_scale=0.7)
ax = sns.heatmap(corr_df, annot=False, fmt=".2f", cmap='viridis', square=True, cbar_kws={'label': r'R$^2$', 'shrink': 0.8})

# One tick per sample at positions 1, 3, 5, ..., i.e. on the boundary between
# two adjacent heatmap columns.
# NOTE(review): this assumes the columns of corr_df are ordered as DNA followed
# by the cell lines in `cell_lines`, with exactly two replicates each - confirm.
ax.set_xticks([2*i+1 for i in range(len(cell_lines)+1)])
ax.set_xticklabels(["DNA"] + cell_lines, rotation=90)
ax.set_yticks([2*i+1 for i in range(len(cell_lines)+1)])
ax.set_yticklabels(["DNA"] + cell_lines, rotation=0)

plt.title('Interreplicate correlation within library 2', fontsize=8)

# The loop variable is deliberately not called `format`: in a notebook it would
# stay in the global namespace and shadow the builtin for every later cell.
for file_format in ['svg', 'png']:
    plt.savefig(os.path.join(plot_folder, '1.1_interreplicate_correlation/library2_3UTR_correlation.' + file_format), bbox_inches='tight', dpi=300)

In [7]:
# delete NaNs in the count df
count_df = count_df.dropna()

# seaborn changed the global style in the heatmap cell: restore the matplotlib
# defaults, then re-apply the notebook-wide font settings
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'font.size': 7})
plt.rcParams['font.family'] = 'Helvetica'

# Near-square grid with one panel per replicate pair
n = len(r2_count_cols)
rows = int(np.ceil(np.sqrt(n)))
cols = int(np.ceil(n / rows))

fig, axs = plt.subplots(rows, cols, figsize=(1.4*cols, 1.2*rows))

# plt.subplots returns a bare Axes, a 1-D array or a 2-D array depending on the
# grid shape; normalise to a flat array so axs[i] works in every case
axs = np.atleast_1d(axs).ravel()

for i in range(n):
    # replicate columns are paired by position in the two lists
    r1_col = r1_count_cols[i]
    r2_col = r2_count_cols[i]
    # +1 pseudocount keeps zero counts finite on the log scale
    r1_vals = np.log10(count_df[r1_col] + 1)
    r2_vals = np.log10(count_df[r2_col] + 1)

    ax = axs[i]
    ax.scatter(r1_vals, r2_vals, s=10, alpha=0.5, linewidths=0, color='grey', rasterized=True)

    # squared Pearson correlation between the two replicates
    r2 = stats.pearsonr(r1_vals, r2_vals)[0]**2

    # x label on every panel that has no panel below it (the bottom row plus
    # panels sitting above an unused grid cell); y label on the first column
    if i + cols >= n:
        ax.set_xlabel(r"log$_{10}$(counts rep1)", fontsize=7)
    if i % cols == 0:
        ax.set_ylabel("log$_{10}$(counts rep2)", fontsize=7)

    ax.set_xlim(0.5, 4.5)
    ax.set_ylim(0.5, 4.5)
    ax.set_xticks([1,2,3,4])
    ax.set_yticks([1,2,3,4])
    # set the font size of the ticks
    ax.xaxis.set_tick_params(labelsize=7)
    ax.yaxis.set_tick_params(labelsize=7)

    # the second "_"-separated token of the column name is used as panel title
    ax.set_title(r1_col.split('_')[1], fontsize=7)
    ax.text(0.25, 0.8, r'$r^2$ =' + f'{r2:.3f}', transform=ax.transAxes, ha='center')

# grid cells without data would otherwise be drawn as empty frames
for unused_ax in axs[n:]:
    unused_ax.set_visible(False)

plt.tight_layout()

# Save as one combined figure
for form in ['png', 'svg']:
    plt.savefig(f'{plot_folder}/1.1_interreplicate_correlation/1_combined_count_figure.{form}', dpi=300)

# 1.2 - Start data processing

In [8]:
# get all columns that contain "log2FoldChange"
log2fc_cols = [col for col in count_df.columns if 'log2FoldChange' in col]

# get all columns that contain "lfcSE"
lfcSE_cols = [col for col in count_df.columns if 'lfcSE' in col]

log2_df = count_df[log2fc_cols]
std_df = count_df[lfcSE_cols]

# convert log2 to log10: log10(2**x) == x * log10(2).
# The scaling is applied directly to the whole table, which avoids the
# deprecated element-wise DataFrame.applymap and cannot overflow in 2**x.
# The same linear factor applies to the standard errors.
log2_to_log10 = np.log10(2)
log10_df = log2_df * log2_to_log10
std_df = std_df * log2_to_log10

# for all column names, split by "_", drop the last element
# and join the remaining parts with "_" again
log10_df.columns = ['_'.join(col.split('_')[:-1]) for col in log10_df.columns]
std_df.columns = ['_'.join(col.split('_')[:-1]) for col in std_df.columns]

## divide the data into dictionaries based on the design name

In [9]:
# List the design files. os.listdir returns names in arbitrary order, so the
# list is sorted to make the order of reference_dfs reproducible across runs
# and machines.
reference_files = sorted(os.listdir("../design_files/"))

reference_dfs = {}
for file in reference_files:
    # if it's not a csv file, skip
    if not file.endswith('.csv'):
        continue
    # the dictionary key is the file name up to its first "."
    name = file.split('.')[0]
    design_df = pd.read_csv("../design_files/" + file, index_col=0)

    # harmonise hyphenated cell line names, then mark columns that are named
    # exactly like a cell line with a "predicted_" prefix
    design_df = design_df.rename(columns=label_rename)
    design_df = design_df.rename(
        columns={col: 'predicted_' + col for col in design_df.columns if col in cell_lines}
    )
    reference_dfs[name] = design_df

# Shallow copy of the dictionary: it keeps pointing at the design tables as
# loaded, because the loop below stores new joined frames in reference_dfs.
reference_df_original = reference_dfs.copy()

# attach the log10 fold changes by index and drop every row that has a NaN in
# any column
for key in reference_dfs:
    reference_dfs[key] = reference_dfs[key].join(log10_df, how='left').dropna()

In [None]:
# report, per design file, its name and the number of rows left after the join
for design_name, design_df in reference_dfs.items():
    print(design_name, len(design_df), sep="\n")

In [None]:
# How many designs of each reference file were actually measured?
sum_original = 0
sum_after = 0

for key, df in reference_df_original.items():
    # number of design indices that also occur in the measured table
    n_measured = df.index.isin(log10_df.index).sum()
    print(f"Percentage of {key} contained in results:" + str(100* n_measured / len(df.index)))
    sum_original += len(df)
    sum_after += n_measured

print(f"Total number of rows in reference files: {sum_original}")
print(f"Total number of rows in reference files after filtering: {sum_after}")

In [None]:
# Collect, across all design files, the indices that have no measurement,
# as one flat list
entries = [
    missing_index
    for df in reference_df_original.values()
    for missing_index in df.index[~df.index.isin(log10_df.index)].to_list()
]

# previously filtered designs:
# contains AATAAA and was filtered during the design process
# 1_mirna_full_single_high_conf_94
# 1_mirna_full_single_high_conf_307
# 1_mirna_full_single_high_conf_638
# 1_mirna_full_single_high_conf_639
# 2_mirna_full_single_low_conf_mirgenedb_165
# 5.14_miRNA_let-7a-5p_smut_swob_omut11.15.19_owob

entries

In [None]:
# get a list of polyA-containing designs
# (read from an Excel sheet with its first column as the index; the bare name on
# the last line displays the table as the cell output)
polyA_designs = pd.read_excel('../design_info/polyA_signals.xlsx', index_col=0)
polyA_designs

# 94, 307, 638, and 639 were filtered beforehand (during library generation)
# this means that only 3 designs are genuinely missing

# 1.3 - Normalize data

Here, we use microRNA data to normalize the expression for constructs that are not expected to be knocked down to 1.

In [14]:
# miRNA expression table; the first CSV column becomes the index. Below it is
# looked up by miRNA name (row) and cell line (column).
mirna_expression_df = pd.read_csv("../microrna_data/3_output/Alles_Keller_completely_unfiltered_merge.csv", index_col=0)

In [16]:
# ADD EXPRESSION DATA TO THE DATAFRAMES
# Only designs whose key contains "single_high_conf" or "full_repeat" are used.
single_dfs = {}
for key, design_df in reference_dfs.items():
    if "single_high_conf" not in key and "full_repeat" not in key:
        continue

    annotated_df = design_df.copy()
    for cell_line in cell_lines:
        # look up the expression of the miRNA named in "miRNA1" in this cell line;
        # miRNAs missing from the expression table map to NaN ...
        annotated_df[f"{cell_line}_expression"] = annotated_df["miRNA1"].map(mirna_expression_df[cell_line])
        # ... and rows with any NaN are removed straight away
        annotated_df = annotated_df.dropna()

    single_dfs[key] = annotated_df

In [17]:
%%capture output
# create the plot folder if it doesn't exist
os.makedirs(f'{plot_folder}/1.3_normalization', exist_ok=True)

# x positions of the dashed reference line (identical for every panel)
x_range = np.arange(0, 5.5, 0.01)

# plot the data before normalization: one figure per design file and cell line
for key, df in single_dfs.items():
    for cell_line in cell_lines:
        knock_df = df[f"{cell_line}_3UTR"].sort_values(ascending=False)
        expression = df.loc[knock_df.index, f"{cell_line}_expression"]

        fig = plt.figure(figsize=(2.2, 1.7))

        # knock_df is on log10 scale, so 10**knock_df is the linear ratio and the
        # dashed line at 1 marks "no change".
        # NOTE(review): the y-axis label says log10(RNA/DNA) and the legend entry
        # says "y=0", although linear values are drawn - confirm the intended labels.
        plt.scatter(expression, 10**knock_df, s=5, color="dodgerblue")
        plt.plot(x_range, np.ones_like(x_range), color="black", linestyle="dashed", label="y=0")

        # Spearman rank correlation between expression and stability
        # NOTE(review): the x values are plotted as stored but log10-transformed
        # here; presumably the table is already on log10 scale - confirm.
        r, p = stats.spearmanr(np.log10(expression), knock_df)

        plt.xlabel(r"log$_{10}$"+f"({cell_line} expression)")
        plt.ylabel(r"log$_{10}$(RNA/DNA)")

        plt.xlim(0, 5.5)
        plt.legend(loc="lower left", frameon=False)
        plt.title(f"{cell_line}_{key}, " + r"$\rho^2$ = " + str(round(r**2, 2)), fontsize=8)
        plt.savefig(f"{plot_folder}/1.3_normalization/{key}_{cell_line}_before.png", dpi=300, bbox_inches='tight')

        # Release the figure once it is on disk. The previous plt.clf() at the top
        # of the loop only wiped the previous figure and left it open, so open
        # figures piled up across the loop.
        plt.close(fig)

In [18]:
# One violin plot per design file: distribution of the log10 values in every
# cell line before any normalization
tick_positions = range(len(cell_lines))
for key in reference_dfs:
    df = reference_dfs[key]
    fig = plt.figure(figsize=(2.2, 1.7))
    ax = sns.violinplot(data=df[cell_line_labels], scale='width', inner='quartile', linewidth=0.5, palette='viridis')
    ax.axhline(0, color='black', linestyle='dashed', linewidth=0.5)

    # label the violins with the plain cell line names
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(cell_lines, rotation=90)
    ax.set_ylabel("log10(RNA/DNA)")

    fig.savefig(f"{plot_folder}/1.3_normalization/violin_{key}_before.png", dpi=300, bbox_inches='tight')

## Apply normalization

#### First, we normalize to the expected value of single microRNA sites. We take the top 300 miRNAs as reference given that most miRNAs are not expressed in any given cell line.

In [19]:
# get the dfs for single microRNA target sites
key = "1_mirna_full_single_high_conf"
df_ctx1 = single_dfs[key]
df_ctx1.dropna(inplace=True)

df_results_norm = log10_df.copy()
norm_factors = {}
# number of most stable single-site designs used as the reference
n_norm = 300

for cell_line in cell_lines:
    column = f"{cell_line}_3UTR"
    knock_df_ctx1 = df_ctx1[column].sort_values(ascending=False)

    # median of the n_norm highest values = reference level for this cell line
    norm_ctx1 = knock_df_ctx1.head(n_norm).median()
    norm_factors[cell_line] = norm_ctx1

    # subtract the median from all values to normalize them
    # we subtract rather than divide because the values are log10 transformed
    # (done on the whole column at once instead of row by row with apply)
    df_results_norm[column] = df_results_norm[column] - norm_ctx1

#### Then, we split by designs

In [20]:
# List the design files. os.listdir returns names in arbitrary order; sorting
# fixes the order of reference_dfs, and with it the row order of everything
# that is later concatenated from this dictionary and saved.
reference_files = sorted(os.listdir("../design_files/"))

reference_dfs = {}
for file in reference_files:
    # only csv files are design files
    if not file.endswith(".csv"):
        continue
    # the dictionary key is the file name up to its first "."
    name = file.split('.')[0]
    design_df = pd.read_csv("../design_files/" + file, index_col=0)

    # harmonise hyphenated cell line names, then mark columns that are named
    # exactly like a cell line with a "predicted_" prefix
    design_df = design_df.rename(columns=label_rename)
    design_df = design_df.rename(
        columns={col: 'predicted_' + col for col in design_df.columns if col in cell_lines}
    )
    reference_dfs[name] = design_df

# attach the values from the first normalization step by index and drop every
# row that has a NaN in any column
for key in reference_dfs:
    reference_dfs[key] = reference_dfs[key].join(df_results_norm, how='left').dropna()

### Plot the designs again after this initial normalization

In [21]:
# ADD EXPRESSION DATA TO THE DATAFRAMES
# Same selection as before the normalization: keys containing
# "single_high_conf" or "full_repeat".
selected_keys = [key for key in reference_dfs.keys()
                 if "single_high_conf" in key or "full_repeat" in key]

single_dfs = {}
for key in selected_keys:
    annotated_df = reference_dfs[key].copy()
    for cell_line in cell_lines:
        # expression of the miRNA named in "miRNA1" in this cell line
        # (NaN when the miRNA is not in the expression table)
        annotated_df[f"{cell_line}_expression"] = annotated_df["miRNA1"].map(mirna_expression_df[cell_line])
        # rows with any NaN are removed after every added column
        annotated_df = annotated_df.dropna()
    single_dfs[key] = annotated_df

In [22]:
%%capture output
# create the plot folder if it doesn't exist
os.makedirs(f'{plot_folder}/1.3_normalization', exist_ok=True)

# x positions of the dashed reference line (identical for every panel)
x_range = np.arange(0, 5.5, 0.01)

# plot the data after the first normalization step
for key, df in single_dfs.items():
    for cell_line in cell_lines:
        knock_df = df[f"{cell_line}_3UTR"].sort_values(ascending=False)
        expression = df.loc[knock_df.index, f"{cell_line}_expression"]

        fig = plt.figure(figsize=(2.2, 1.7))

        # linear ratio (10**log10 value) against expression; dashed line at 1.
        # NOTE(review): axis label and legend text refer to log10 / "y=0" while
        # linear values are drawn - confirm the intended labels.
        plt.scatter(expression, 10**knock_df, s=5, color="dodgerblue")
        plt.plot(x_range, np.ones_like(x_range), color="black", linestyle="dashed", label="y=0")

        # Spearman rank correlation between expression and stability
        r, p = stats.spearmanr(np.log10(expression), knock_df)

        plt.xlabel(r"log$_{10}$"+f"({cell_line} expression)")
        plt.ylabel(r"log$_{10}$(RNA/DNA)")

        plt.xlim(0, 5.5)
        plt.legend(loc="lower left", frameon=False)
        plt.title(f"{cell_line}_{key}, " + r"$\rho^2$ = " + str(round(r**2, 2)), fontsize=8)
        plt.savefig(f"{plot_folder}/1.3_normalization/{key}_{cell_line}_after1.png", dpi=300, bbox_inches='tight')

        # Close the figure after saving: plt.clf() only emptied the previous
        # figure and kept it open, so figures accumulated over the loop.
        plt.close(fig)

In [23]:
# Violin plots per design file after the first normalization step
for key, df in reference_dfs.items():
    fig = plt.figure(figsize=(2.2, 1.7))
    ax = sns.violinplot(
        data=df[cell_line_labels],
        scale='width',
        inner='quartile',
        linewidth=0.5,
        palette='viridis',
    )
    # dashed reference line at log10 value 0
    ax.axhline(0, color='black', linestyle='dashed', linewidth=0.5)

    # make cell_lines the xticklabels
    ax.set_xticks(range(len(cell_lines)))
    ax.set_xticklabels(cell_lines, rotation=90)
    ax.set_ylabel("log10(RNA/DNA)")

    fig.savefig(f"{plot_folder}/1.3_normalization/violin_{key}_norm_step1.png", dpi=300, bbox_inches='tight')

# Second normalization step

## Plot the distribution of values in each design type as a violin plot

In [24]:
# Independent deep copy of the tables as they are after the first normalization
# step, taken before the second step replaces the entries of reference_dfs.
# (reference_dfs_old is not read again in the cells visible here.)
reference_dfs_old = copy.deepcopy(reference_dfs)

In [25]:
# Group the design files by their number of miRNA columns, i.e. columns whose
# name contains "miRNA" but is not exactly "miRNA" (0 to 6 are expected).
mirna_numbers = {n_sites: [] for n_sites in range(7)}
for key, design_df in reference_dfs.items():
    site_columns = [column for column in design_df.columns
                    if "miRNA" in column and column != "miRNA"]
    mirna_numbers[len(site_columns)].append(key)

In [26]:
# Per miRNA-column count, pick the design files that serve as normalization
# reference, i.e. those where a significant number of sequences is expected to
# sit at baseline stability:
#   - one miRNA column: files whose key starts with "1_", "2_" or "3_"
#   - otherwise:        files with "full_repeat" in the key
#     ("mut_repeat" files are deliberately not included)
single_site_prefixes = ("1_", "2_", "3_")

normalization_by_number = {n_sites: [] for n_sites in range(7)}
for key, design_df in reference_dfs.items():
    mirna_number = sum(1 for column in design_df.columns
                       if "miRNA" in column and column != "miRNA")
    if mirna_number == 1:
        is_reference = key.startswith(single_site_prefixes)
    else:
        is_reference = "full_repeat" in key

    if is_reference:
        normalization_by_number[mirna_number].append(key)

In [27]:
# get the microRNA expression data from notebook 3 - we use an unfiltered geometric mean of the two main datasets
# Reads the same CSV file that was loaded earlier as mirna_expression_df, under
# a second name; the first CSV column becomes the index.
mirna_expression = pd.read_csv("../microrna_data/3_output/Alles_Keller_completely_unfiltered_merge.csv", index_col=0)

In [28]:
def _pool_normalization_designs(design_keys):
    """Stack the current reference_dfs tables of `design_keys` into one frame.

    Keys containing "4_miRNA_full_single_context" are left out (different
    context sequences, examined separately later). Each table is indexed by
    "orig_mi" when that column exists, otherwise by "miRNA1", and reduced to the
    rows whose index value also occurs in the index of `mirna_expression`.
    The stacked result gets a fresh 0..n-1 index.
    """
    pooled = []
    for design_key in design_keys:
        if "4_miRNA_full_single_context" in design_key:
            continue
        design_df = reference_dfs[design_key].copy()
        mirna_column = "orig_mi" if "orig_mi" in design_df.columns else "miRNA1"
        design_df = design_df.set_index(mirna_column)
        shared_index = design_df.index.intersection(mirna_expression.index)
        pooled.append(design_df.loc[shared_index])
    return pd.concat(pooled).reset_index(drop=True)


def _top_15_percent(values):
    """Return the highest 15% of the non-NaN entries of `values`, sorted descending."""
    ranked = values.copy().dropna().sort_values(ascending=False)
    return ranked.head(int(0.15*len(ranked)))


median_dicts_before = {}
median_dicts_after = {}

for mirna_number in mirna_numbers.keys():
    # don't normalize the control sequences
    if mirna_number == 0:
        continue

    # ----------------- BEFORE NORMALIZATION -----------------
    # per cell line: median of the top 15% of the pooled reference designs,
    # ignoring values of 2 or more
    curr_dfs = _pool_normalization_designs(normalization_by_number[mirna_number])
    median_dict = {}
    for cell_line in cell_line_labels:
        top_values = _top_15_percent(curr_dfs[cell_line])
        median_dict[cell_line] = top_values[top_values < 2].median()
    curr_median_dict_before = dict(median_dict)

    # common target level for all cell lines
    median_of_medians = np.median(list(median_dict.values()))

    # shift every design file with this miRNA-column count so that each cell
    # line's reference median moves onto the common level
    for key in mirna_numbers[mirna_number]:
        shifted_df = reference_dfs[key].copy()
        for cell_line in cell_line_labels:
            shifted_df[cell_line] = shifted_df[cell_line] - median_dict[cell_line] + median_of_medians
        reference_dfs[key] = shifted_df

    # ----------------- AFTER NORMALIZATION -----------------
    # Recompute the reference medians on the shifted data as a check.
    # NOTE(review): unlike the "before" medians, values >= 2 are not dropped here.
    curr_dfs = _pool_normalization_designs(normalization_by_number[mirna_number])
    curr_median_dict_after = {
        cell_line: _top_15_percent(curr_dfs[cell_line]).median()
        for cell_line in cell_line_labels
    }

    median_dicts_before[mirna_number] = curr_median_dict_before
    median_dicts_after[mirna_number] = curr_median_dict_after

## Note designs with very high stability

In [None]:
# Collect, per design file, the designs whose linear stability exceeds 1.5 in
# at least one cell line. Files whose key starts with "4_" or "0_" are skipped.
total_sum = 0
high_stability_designs = {}
for key, design_df in reference_dfs.items():
    if key.startswith(("4_", "0_")):
        continue

    df = design_df.copy()

    # largest linear value (10**log10 value) across all "3UTR" columns
    cell_lines_UTR = [col for col in df.columns if "3UTR" in col]
    curr_max = (10**df[cell_lines_UTR]).max(axis=1)

    # flag the designs above the threshold and report their number and share
    mask = curr_max > 1.5
    n_high = mask.sum()
    print(f"{key}, total designs: {n_high}, percentage: {100*n_high/len(mask)}")
    total_sum += n_high

    high_stability_designs[key] = df[mask]

# one table with the flagged designs of all files
high_stability_designs = pd.concat(high_stability_designs.values())

# write it to its own sub-folder of the data output folder
high_stability_folder = os.path.join(data_output_folder, "high_stability")
if not os.path.exists(high_stability_folder):
    os.makedirs(high_stability_folder)

high_stability_designs.to_csv(os.path.join(high_stability_folder, "high_stability.csv"))

## Save the normalized data for further use

In [30]:
# write every normalized design table to "<key>.csv" in the data output folder
for design_name in reference_dfs:
    output_path = os.path.join(data_output_folder, f"{design_name}.csv")
    reference_dfs[design_name].to_csv(output_path)

In [37]:
# Save the normalized data as a single table on linear scale.
# The measurement columns are taken from the control table's "3UTR" columns.
threeUTRcolumns = [col for col in reference_dfs["0_lib2_controls"].columns if "3UTR" in col]

# stack the measurement columns of all design tables and drop the "_3UTR" suffix
all_dfs = pd.concat([design_df[threeUTRcolumns] for design_df in reference_dfs.values()])
all_dfs = all_dfs.rename(columns=lambda col: col.replace("_3UTR", ""))

# back from log10 to linear values
all_dfs = 10**all_dfs
all_dfs.to_csv(os.path.join(data_output_folder, "library2_normalized_stability_data.csv"))

## Plot the normalized data

### individual plots

In [31]:
# ADD EXPRESSION DATA TO THE DATAFRAMES
# Designs with "single_high_conf" or "full_repeat" in their key, now taken from
# the tables after the second normalization step.
single_dfs = {}
for key, design_df in reference_dfs.items():
    if not ("single_high_conf" in key or "full_repeat" in key):
        continue

    annotated_df = design_df.copy()
    for cell_line in cell_lines:
        # expression of the miRNA named in "miRNA1" in this cell line;
        # miRNAs absent from the expression table give NaN
        annotated_df[f"{cell_line}_expression"] = annotated_df["miRNA1"].map(mirna_expression[cell_line])
        # drop rows with any NaN after every added column
        annotated_df = annotated_df.dropna()

    single_dfs[key] = annotated_df

In [32]:
%%capture output
# create the plot folder if it doesn't exist
os.makedirs(f'{plot_folder}/1.3_normalization', exist_ok=True)

# x positions of the dashed reference line (identical for every panel)
x_range = np.arange(0, 5.5, 0.01)

# plot the data after the second normalization step
for key, df in single_dfs.items():
    for cell_line in cell_lines:
        knock_df = df[f"{cell_line}_3UTR"].sort_values(ascending=False)
        expression = df.loc[knock_df.index, f"{cell_line}_expression"]

        fig = plt.figure(figsize=(2.2, 1.7))

        # linear ratio (10**log10 value) against expression; dashed line at 1.
        # NOTE(review): axis label and legend text refer to log10 / "y=0" while
        # linear values are drawn - confirm the intended labels.
        plt.scatter(expression, 10**knock_df, s=5, color="dodgerblue")
        plt.plot(x_range, np.ones_like(x_range), color="black", linestyle="dashed", label="y=0")

        # Spearman rank correlation between expression and stability
        r, p = stats.spearmanr(np.log10(expression), knock_df)

        plt.xlabel(r"log$_{10}$"+f"({cell_line} expression)")
        plt.ylabel(r"log$_{10}$(RNA/DNA)")

        plt.xlim(0, 5.5)
        plt.legend(loc="lower left", frameon=False)
        plt.title(f"{cell_line}_{key}, " + r"$\rho^2$ = " + str(round(r**2, 2)), fontsize=8)

        # `file_format` rather than `format`, so the builtin is not shadowed in
        # the notebook's global namespace
        for file_format in ['svg', 'png']:
            plt.savefig(f"{plot_folder}/1.3_normalization/{key}_{cell_line}_after2.{file_format}", dpi=300, bbox_inches='tight')

        # Close the figure after saving: plt.clf() only emptied the previous
        # figure and kept it open, so figures accumulated over the loop.
        plt.close(fig)

In [33]:
# Violin plots per design file after the second normalization step
for key, df in reference_dfs.items():
    fig = plt.figure(figsize=(2.2, 1.7))
    sns.violinplot(data=df[cell_line_labels], scale='width', inner='quartile', linewidth=0.5, palette='viridis')
    # plot a line at 0
    plt.axhline(0, color='black', linestyle='dashed', linewidth=0.5)

    # make cell_lines the xticklabels
    plt.xticks(range(len(cell_lines)), cell_lines, rotation=90)
    plt.ylabel("log10(RNA/DNA)")

    # `file_format` rather than `format`: a loop variable named `format` stays in
    # the notebook's global namespace and shadows the builtin for later cells.
    # The figures are intentionally left open so they are still shown inline.
    for file_format in ['svg', 'png']:
        plt.savefig(f"{plot_folder}/1.3_normalization/violin_{key}_norm_step2.{file_format}", dpi=300, bbox_inches='tight')

# 1.4 - Analyze change after median normalization

In [34]:
# create the plot folder if it doesn't exist
os.makedirs(f'{plot_folder}/1.4_median_analysis', exist_ok=True)

# One bar chart per miRNA-column count: reference median after the second
# normalization minus the per-cell-line median before it.
for key, val in median_dicts_after.items():
    # the "after" median of the first cell line is used as the common level
    # NOTE(review): presumably all cell lines share this value after
    # normalization; the "after" medians are computed without the "< 2" filter,
    # so they may differ slightly - confirm.
    val_after = val[list(val.keys())[0]]

    # cell line names = column labels up to the first "_"
    index = list(median_dicts_before[key].keys())
    index = [entry.split('_')[0] for entry in index]
    values = list(median_dicts_before[key].values())
    vals_before = pd.Series(values, index=index)
    vals = val_after - vals_before

    # No plt.clf() here: it would clear the figure of the previous iteration,
    # which is still open and would then be displayed empty below the cell.
    plt.figure(figsize=(2.2, 2))
    plt.bar(vals.index, vals)

    # rotate xticks 90 degrees
    plt.xticks(rotation=90)
    plt.ylabel("median after - median before")
    plt.ylim(-0.4, 0.3)
    plt.tight_layout()
    plt.savefig(f"{plot_folder}/1.4_median_analysis/{key}.png", dpi=300, bbox_inches='tight')

A low value implies that the median before was higher, which in turn implies that constructs were relatively more stable than the baseline stability for constructs with a single target site for a non-expressed microRNA.

The differences are small for values below AND4. This implies that the reference stability for some cell lines changes when sites 4 to 6 are introduced.
Because we are interested in the stability relative to a case without microRNA knockdown (as opposed to the stability relative to the original context sequence without microRNA sequences), we divide this out.

In [49]:
# Per miRNA-column count: change of the reference median of every cell line
# caused by the second normalization step (after - before).
change_vals = {}
for key in median_dicts_after.keys():
    # cell line names = column labels up to the first "_"; taken from this
    # key's own dictionary instead of a `key` left over from an earlier cell
    index = [entry.split('_')[0] for entry in median_dicts_before[key].keys()]

    vals_after = pd.Series(list(median_dicts_after[key].values()), index=index)
    vals_before = pd.Series(list(median_dicts_before[key].values()), index=index)

    # use this key's own "after" medians; the scalar `val_after` left behind by
    # the plotting cell belongs to the last key plotted there
    change_vals[key] = vals_after - vals_before

In [None]:
# display the per-cell-line change for the designs with six miRNA columns
change_vals[6]

In [57]:
# Average the change values over the selected miRNA-column counts
# (currently only 4), separately for every cell line.
relevant_keys = [4]
average_change_vals = {
    cell_line: sum(change_vals[key][cell_line] for key in relevant_keys) / len(relevant_keys)
    for cell_line in cell_lines
}