In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from lib.NA_sequence_utilities import *

Here, we choose which miRNAs we wish to measure using flow cytometry. The idea is to span the whole range of repression in the chosen cell lines. We therefore take the fitted model, apply it to the expression data, then check the expected pattern across cell lines.

In [2]:
# miRNA expression tables; the first CSV column becomes the index.
# NOTE(review): judging by the file names, `_all` is the table without the
# crosstalk filter and the other one has it applied. Values are exponentiated
# with 10**x further down, so they are presumably log10 — confirm.
mirna_expression_all = pd.read_csv('../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_no_crosstalk_filter.csv', index_col=0)
mirna_expression = pd.read_csv('../input_data/miRNA_expression_data/1_output/1.10_alles_quantile_crosstalk_filter.csv', index_col=0)

In [3]:
# Directory that holds the measurement tables.
data_dir_input = "../input_data/measurements_lib1"

# Every entry of that directory; anything that is not a CSV is skipped below.
reference_files = os.listdir(data_dir_input)

# Load each CSV into a dictionary keyed by the file name up to its first dot.
reference_dict = {}
for reference_file in reference_files:
    if not reference_file.endswith(".csv"):
        continue
    table_key = reference_file.split('.')[0]
    table_path = os.path.join(data_dir_input, reference_file)
    reference_dict[table_key] = pd.read_csv(table_path, index_col=0)

# Independent copies of all tables whose key contains "single".
single_dfs = {
    table_name: table.copy()
    for table_name, table in reference_dict.items()
    if "single" in table_name
}

In [4]:
def transfer_function(x, c1=3.5, c2=10):
    """Evaluate ``(1 / (1 + x / c1)) * (1 + x / c2)``.

    Assumptions carried over from the original notes: the expression is
    normalized to one, the microRNA data ``x`` is on a linear scale, and the
    returned value is on a linear scale as well.

    Parameters
    ----------
    x : scalar or array-like supporting arithmetic
        miRNA expression.
    c1, c2 : float
        Constants of the two factors.
    """
    first_factor = 1 / (1 + x / c1)
    second_factor = 1 + x / c2
    return first_factor * second_factor

def add_mirna_expression(mirna_expr, construct_df):
    """Sum, per construct, the expression rows of the miRNAs it lists.

    The miRNA names of each construct come from ``get_combinations``. For the
    i-th construct the matching rows of ``mirna_expr`` are selected with
    ``.loc`` and summed column-wise.

    Returns a DataFrame with the index of ``construct_df`` and the columns of
    ``mirna_expr``. It is created without data and filled row by row.
    """
    summed = pd.DataFrame(index=construct_df.index, columns=mirna_expr.columns)

    for row_pos, mirna_names in enumerate(get_combinations(construct_df)):
        summed.iloc[row_pos, :] = mirna_expr.loc[mirna_names, :].sum(axis=0)

    return summed

def get_combinations(df):
    """Return one tuple per row with the values of every "miRNA" column.

    Columns are picked by a substring test on the column name and keep their
    order in ``df``. The result is an empty list when no column matches.
    """
    mirna_series = [df[name] for name in df.columns if "miRNA" in name]
    return list(zip(*mirna_series))

def tsi(x):
    """Row-wise specificity index of a 2-D table.

    Every row is first divided by its own maximum (this normalization is
    essential and is always applied), then the index is
    ``sum(1 - x_norm) / (n_columns - 1)`` per row.

    Parameters
    ----------
    x : array-like of shape (n_rows, n_columns)
        NumPy array, pandas DataFrame or nested list. The input is converted
        with ``np.asarray`` so that ``keepdims`` is available for all of them.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
    """
    values = np.asarray(x, dtype=float)

    # keepdims keeps the row maxima as a column so the division broadcasts per row
    normalized = values / values.max(axis=1, keepdims=True)

    return np.sum(1 - normalized, axis=1) / (values.shape[1] - 1)

def calculate_quality(row):
    """Multiply the row's "tsi" entry by the entry named in its "cell" field."""
    specificity = row["tsi"]
    cell_column = row["cell"]
    return specificity * row[cell_column]

# Fitted parameters: the first line of the file is skipped, then the first two
# tab-separated fields of the second line are parsed as floats.
filename = "../output/1_output/1.10_fit_parameters_with_scales.txt"
with open(filename, "r") as file:
    file.readline()
    line = file.readline().split("\t")
c1 = float(line[0])
c2 = float(line[1])

# The parsed values and both expression tables are used as base-10 exponents
# to obtain their linear-scale counterparts.
popt_filter = 10 ** np.array([c1, c2])
mirna_expression_lin = 10 ** mirna_expression
mirna_expression_lin_all = 10 ** mirna_expression_all

# Model output for every column of the crosstalk-filtered table.
expression = mirna_expression_lin.apply(
    lambda column: transfer_function(column, *popt_filter)
)

In [5]:
# get data for two repeats
# set_index returns a new frame, so the table stored in single_dfs is not
# modified; drop=False keeps "miRNA1" available as a regular column as well.
df_2x = single_dfs["2_full_singleX2_context1"].set_index("miRNA1", drop=False)

# keep the columns matching both "log10" and "3UTR", then strip the suffix
# from their names; the miRNA1 index is inherited from df_2x
sub_df = df_2x.filter(regex="log10").filter(regex="3UTR")
sub_df.columns = sub_df.columns.str.replace("_3UTR_log10", "", regex=False)

# filter to mirnas in the expression data
common_index = mirna_expression_lin_all.index.intersection(sub_df.index)
df_2x = df_2x.loc[common_index, :]
sub_df = sub_df.loc[common_index, :]

In [7]:
# Summed linear miRNA expression per construct.
added_mirnas = add_mirna_expression(mirna_expression_lin_all, df_2x)

# Apply the transfer function column by column, cast to float and take log10.
model_df = np.log10(
    added_mirnas
    .apply(lambda column: transfer_function(column, *popt_filter))
    .astype(float)
)

In [None]:
# Cell lines to compare model and measurement for.
cell_lines_flow = ["HEK293T", "HeLa"]

# One scatter series per cell line: model prediction (x) vs measurement (y).
for cell_line in cell_lines_flow:
    plt.scatter(model_df[cell_line], sub_df[cell_line], label=f"{cell_line}", s=15)

plt.ylabel("log10(Measurement)")
plt.xlabel("log10(Model)")
# The series carry labels, so draw the legend; otherwise they cannot be told apart.
plt.legend()
plt.show()

In [None]:
# choose miRNAs that span the range of the data
mirnas = [
    "hsa-miR-21-5p",
    "hsa-miR-141-3p",
    "hsa-let-7a-5p",
    "hsa-miR-20a-5p",
    "hsa-miR-122-5p",
    "hsa-miR-10a-5p",
    "hsa-miR-18a-5p",
    "hsa-miR-31-5p",
    "hsa-miR-7-5p",
    "hsa-miR-25-3p",
]

cell_line = "HeLa"
# x values: entries of mirna_expression_all (the table that is exponentiated
# with 10**x elsewhere in this notebook); y values: log10 model prediction.
filter_df = mirna_expression_all.loc[mirnas, cell_line]
model_filter = model_df.loc[mirnas, cell_line]
# measured values for the same miRNAs (not plotted here; kept for a
# model-vs-measurement comparison)
sub_filter = sub_df.loc[mirnas, cell_line]

plt.scatter(filter_df, model_filter, label=f"{cell_line}", s=15, marker="x", color="black")
# Series.iteritems() was removed in pandas 2.0; items() is the supported spelling.
for mirna_name, expr_value in filter_df.items():
    # annotate with the name minus its first two dash-separated parts,
    # e.g. "hsa-miR-21-5p" -> "21-5p"
    short_name = "-".join(mirna_name.split("-")[2:])
    plt.text(expr_value, model_filter.loc[mirna_name], short_name, fontsize=9)

# x is the miRNA expression and y the model prediction
plt.xlabel("log10(miRNA expression)")
plt.ylabel("log10(Model)")
plt.show()

## Export the flow sequences needed for cloning

In [10]:
# Rows of the selected miRNAs, indexed by miRNA name, with only the target
# and oligo sequence columns. Built as an explicit copy without inplace
# operations so that columns can be added later without a
# SettingWithCopyWarning.
output_df = (
    df_2x.loc[df_2x["miRNA1"].isin(mirnas)]
    .set_index("miRNA1")[["target1", "oligoseq"]]
    .copy()
)

In [11]:
# This is what needs to be added for GoldenGate cloning
# The flanks delimit the insert inside each library oligo: the insert is the
# sequence after left_flank and before right_flank.
left_flank = "TTGGAGCCAAGGGTTCAGAGACTCAG"
right_flank = "ACTGTGTACAGAATTCTGCTACCTCACT"
# left_overhang equals the last four bases of left_flank,
# right_overhang equals the first four bases of right_flank.
left_overhang = "TCAG"
right_overhang = "ACTG"

In [12]:
# Independent copy of the oligo sequences of the selected miRNAs.
oligoseqs = output_df["oligoseq"].copy()

In [13]:
def _extract_insert(oligoseq, left_flank, right_flank):
    """Return the part of ``oligoseq`` between ``left_flank`` and ``right_flank``.

    Raises
    ------
    ValueError
        If either flank is missing. ``str.find`` returns -1 in that case,
        which would otherwise be used as a slice bound and silently produce
        a wrong insert.
    """
    left_pos = oligoseq.find(left_flank)
    if left_pos == -1:
        raise ValueError(f"left flank not found in oligo: {oligoseq}")
    insert_start = left_pos + len(left_flank)

    # look for the right flank only behind the left one
    insert_end = oligoseq.find(right_flank, insert_start)
    if insert_end == -1:
        raise ValueError(f"right flank not found after left flank in oligo: {oligoseq}")

    return oligoseq[insert_start:insert_end]


# Insert between the flanks of every selected oligo.
inserts_seqs = [_extract_insert(oligoseq, left_flank, right_flank) for oligoseq in oligoseqs]
# Forward oligo: left overhang followed by the insert.
forward_oligos = [left_overhang + insert_seq for insert_seq in inserts_seqs]
# Reverse oligo: reverse complement of the insert followed by the right overhang.
reverse_oligos = [reverse_complement(insert_seq + right_overhang, alph="DNA") for insert_seq in inserts_seqs]

In [14]:
# Attach the cloning sequences as new columns. assign() returns a new frame,
# which avoids setting values on a frame that was derived from a slice.
output_df = output_df.assign(
    insert_seq=inserts_seqs,
    forward_oligo=forward_oligos,
    reverse_oligo=reverse_oligos,
)

In [16]:
# create the output folder if it does not exist
os.makedirs("../output/1a_output", exist_ok=True)
# write the selected sequences (index included) to an Excel workbook
output_df.to_excel("../output/1a_output/1a_flow_control_sequences.xlsx")