In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import itertools
import random
import time
import re
import os
from lib.context_insertion import distance_to_start_codon, determine_generic_ins_positions, insert_miRNA_sites

cell_lines_main = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH-7", "A549"]
cell_lines_other = ["HaCaT", "JEG-3", "Tera-1", "PC-3"]
cell_lines_measured = ["HEK293T", "HeLa", "SKNSH", "MCF7"]
cell_lines = cell_lines_main + cell_lines_other

output_folder = "../designs/inserted/"

## Get the target sites

In [25]:
mirbase_df = pd.read_csv("../data/miRNA_data/mirbase_with_targets.csv", index_col=0)
mirbase_df = mirbase_df[["target"]]

In [26]:
mutation_df = pd.read_csv("../designs/5_miRNA_single_mut.csv", index_col=0)
mutation_df = mutation_df[["target"]]
# replace Us by Ts in the mutation df
mutation_df["target"] = mutation_df["target"].apply(lambda x: x.replace("U", "T"))

In [27]:
target_df = pd.concat([mirbase_df, mutation_df])

# get the position of the first ATG
target_df["ATG_pos"] = target_df["target"].apply(lambda x: [match.start() for match in re.finditer('ATG', x)])
# get the position of the first ATG modulo 3
target_df["ATG_pos_mod3"] = target_df["ATG_pos"].apply(lambda x: [entry%3 for entry in x])
# fill the empty lists with 0
target_df['ATG_pos_mod3'] = target_df['ATG_pos_mod3'].apply(lambda x: x if x else [0])
# get the ATG count
target_df["ATG_count"] = target_df["target"].apply(lambda x: x.count("ATG"))

# Insert targets into the context sequence

In [28]:
def add_target_sites(df, n_mirnas, target_df):
    """This code adds target sites to the dataframe df. The target sites are taken from the target_df dataframe."""
    for i in range(n_mirnas):
        df.loc[:,f"target{i+1}"] = df[f"miRNA{i+1}"].map(target_df["target"])
    return df

In [29]:
# load the context data
with open("../designs/context/universal_lib2_context.txt", "r") as f:
    context_lib2 = f.readline().strip()
    print(context_lib2)
    print(len(context_lib2))


AAAGAGCAGAAGCAGCAGCATCTCTGTACATTTTGGAGCCAAGGGTTCAGAGACTCAGGGCCCCAGCACTTAAGCAGTGGACACCAGGAGTCCCTGGTAATCAGTACTGTGTACAGAATTCTGCTACCTCACTACAAGCAGAAGGAAACATTGAACTCAGAGCC
164


In [30]:
# some of the designs (0_lib2_controls and 4_miRNA_full_single_context_controls) have already been generated. We need to skip them.
prefixes = [f"{i}_mi" for i in range(1, 50) if i != 4]
print(prefixes)

['1_mi', '2_mi', '3_mi', '5_mi', '6_mi', '7_mi', '8_mi', '9_mi', '10_mi', '11_mi', '12_mi', '13_mi', '14_mi', '15_mi', '16_mi', '17_mi', '18_mi', '19_mi', '20_mi', '21_mi', '22_mi', '23_mi', '24_mi', '25_mi', '26_mi', '27_mi', '28_mi', '29_mi', '30_mi', '31_mi', '32_mi', '33_mi', '34_mi', '35_mi', '36_mi', '37_mi', '38_mi', '39_mi', '40_mi', '41_mi', '42_mi', '43_mi', '44_mi', '45_mi', '46_mi', '47_mi', '48_mi', '49_mi']


In [31]:
# get the names of all csv files in the design folder using glob
csv_files = os.listdir("../designs/")
# filter to those that contain the prefixes
csv_files = [file for file in csv_files if any([prefix in file for prefix in prefixes])]
print(csv_files)

['10_miRNA_full_repeat_x6.csv', '11_miRNA_full_combination_probe_x2.csv', '12_miRNA_full_combination_probe_x3.csv', '13_miRNA_full_combination_probe_x4.csv', '14_miRNA_full_combination_probe_x5.csv', '15_miRNA_full_combination_probe_x6.csv', '16_miRNA_previous_AND5.csv', '17_miRNA_mut_combination_probe_x2.csv', '18_miRNA_mut_combination_probe_x3.csv', '19_miRNA_mut_combination_probe_x4.csv', '1_mirna_full_single_high_conf.csv', '20_miRNA_mut_combination_probe_x5.csv', '21_miRNA_mut_combination_probe_x6.csv', '22_miRNA_full_combination_shuffle_x5.csv', '23_miRNA_mut_combination_shuffle_x5.csv', '24_miRNA_full_subset_quality_AND4.csv', '25_miRNA_full_subset_quality_AND5.csv', '26_miRNA_full_subset_quality_AND6.csv', '27_miRNA_full_quality_AND4.csv', '28_miRNA_full_quality_AND5.csv', '29_miRNA_full_quality_AND6.csv', '2_mirna_full_single_low_conf_mirgenedb.csv', '30_miRNA_AND4_subset_mse_designs.csv', '31_miRNA_AND5_subset_mse_designs.csv', '32_miRNA_AND6_subset_mse_designs.csv', '33_miRN

In [32]:
for csv_file in csv_files:
    df = pd.read_csv(f"../designs/{csv_file}", index_col=0)
    print(csv_file, len(df))

10_miRNA_full_repeat_x6.csv 100
11_miRNA_full_combination_probe_x2.csv 100
12_miRNA_full_combination_probe_x3.csv 100
13_miRNA_full_combination_probe_x4.csv 100
14_miRNA_full_combination_probe_x5.csv 100
15_miRNA_full_combination_probe_x6.csv 100
16_miRNA_previous_AND5.csv 20
17_miRNA_mut_combination_probe_x2.csv 115
18_miRNA_mut_combination_probe_x3.csv 115
19_miRNA_mut_combination_probe_x4.csv 115
1_mirna_full_single_high_conf.csv 861
20_miRNA_mut_combination_probe_x5.csv 115
21_miRNA_mut_combination_probe_x6.csv 115
22_miRNA_full_combination_shuffle_x5.csv 450
23_miRNA_mut_combination_shuffle_x5.csv 600
24_miRNA_full_subset_quality_AND4.csv 54
25_miRNA_full_subset_quality_AND5.csv 54
26_miRNA_full_subset_quality_AND6.csv 54
27_miRNA_full_quality_AND4.csv 90
28_miRNA_full_quality_AND5.csv 90
29_miRNA_full_quality_AND6.csv 90
2_mirna_full_single_low_conf_mirgenedb.csv 166
30_miRNA_AND4_subset_mse_designs.csv 540
31_miRNA_AND5_subset_mse_designs.csv 540
32_miRNA_AND6_subset_mse_designs

In [33]:
var_region_len = len(context_lib2)
three_p_seq_len = 61
target_len = 21
dist_between = 6

for csv_file in csv_files:
    df = pd.read_csv(f"../designs/{csv_file}", index_col=0)
    mirna_columns = [col for col in df.columns if "miRNA" in col]
    n_mirnas = len(mirna_columns)
    df = add_target_sites(df, n_mirnas, target_df)
    df.loc[:, "context"] = "context_lib2"

    # get the insertion position
    insertion_positions = determine_generic_ins_positions(
                                                    var_region_len=var_region_len,
                                                    three_p_seq_len=three_p_seq_len,
                                                    target_len=target_len,
                                                    dist_between=dist_between,
                                                    no_of_inserts=n_mirnas)

    for index, row in df.iterrows():
        mirnas = [row[col] for col in mirna_columns]
        df.loc[index, "seq"] = insert_miRNA_sites(context_lib2, insertion_positions, target_df, mirnas)

    df.to_csv(f"{output_folder}{csv_file.split('.')[0]}_inserted.csv")

# Generate the final sequences by adding the adapters

In [34]:
# get a list of all files in the output folder
files = os.listdir(output_folder)

# read each file into a dataframe and append to a list
dfs = []
for file in files:
    df = pd.read_csv(os.path.join(output_folder, file), index_col=0)
    # only keep the sequence
    df = df[["seq"]]
    dfs.append(df)

# concatenate all dataframes in the list
all_designs = pd.concat(dfs, axis=0)

# print statistics on the length of the sequences, i.e., the length of each entry in the seq column
print(all_designs["seq"].str.len().describe())

count    9375.0
mean      164.0
std         0.0
min       164.0
25%       164.0
50%       164.0
75%       164.0
max       164.0
Name: seq, dtype: float64


In [35]:
# load the context and control sequences
df = pd.read_csv(f"../designs/0_lib2_controls.csv", index_col=0)
df = df[["seq"]]
all_designs = pd.concat([all_designs, df], axis=0)

df = pd.read_csv(f"../designs/4_miRNA_full_single_context_controls.csv", index_col=0)
df = df[["seq"]]
all_designs = pd.concat([all_designs, df], axis=0)

In [36]:
# print statistics on the length of the sequences, i.e., the length of each entry in the seq column
print(all_designs["seq"].str.len().describe())

count    10027.0
mean       164.0
std          0.0
min        164.0
25%        164.0
50%        164.0
75%        164.0
max        164.0
Name: seq, dtype: float64


## Drop duplicates

In [37]:
# are there any duplicate entries?
# get all duplicated sequences, and make sure to group them by the sequence
duplicated = all_designs[all_designs.duplicated(subset="seq", keep=False)].sort_values("seq")

# Group and list duplicates
grouped_duplicates = duplicated.groupby("seq").apply(lambda x: x.index.tolist())
grouped_duplicates.to_excel("../designs/additional_info/duplicated_sequences.xlsx")

# Drop duplicates
all_designs = all_designs.drop_duplicates(subset="seq", keep="first")

# print the new length
print(all_designs["seq"].str.len().describe())

count    10007.0
mean       164.0
std          0.0
min        164.0
25%        164.0
50%        164.0
75%        164.0
max        164.0
Name: seq, dtype: float64


## Add the adapters

In [38]:
left_handle = "ACGACGCTCTTCCGATCT"
right_handle = "CTCTGGATTTGCAACCGA"
all_designs["seq"] = left_handle + all_designs["seq"] + right_handle

In [39]:
print(all_designs["seq"].str.len().describe())

count    10007.0
mean       200.0
std          0.0
min        200.0
25%        200.0
50%        200.0
75%        200.0
max        200.0
Name: seq, dtype: float64


# Check the generated sequences

I should check for
- restriction sites (BsaI: GAGACC, GGTCTC) [ended up using Gibson cloning]
- start codons (ATG) and their distance to the start site
- polyA signals (AATAAA|ATTAAA|AGTAAA|TATAAA|ACTAAA)
- homopolymers (AAAA|TTTT|CCCC|GGGG)

I should also look at
- GC content
- length

In [40]:
# look at restriction sites
restriction_sites = ["GAGACC", "GGTCTC"]
# print how many of the sequences contain the restriction sites
print(all_designs["seq"].str.contains("|".join(restriction_sites)).sum())
all_designs[all_designs["seq"].str.contains("|".join(restriction_sites)) == True].to_excel("../designs/additional_info/restriction_sites.xlsx")

20


In [44]:
# look at polyA signals
polyA_signals = ["AATAAA", "ATTAAA", "AGTAAA", "TATAAA", "ACTAAA"]
# print how many of the sequences contain the polyA signals
print(all_designs["seq"].str.contains("|".join(polyA_signals)).sum())
all_designs[all_designs["seq"].str.contains("|".join(polyA_signals)) == True].to_excel("../designs/additional_info/polyA_signals.xlsx")

# delete the sequences that contain "AATAAA" or "ATTAAA"
print(all_designs[all_designs["seq"].str.contains("AATAAA") == True].index)

22
Index(['1_mirna_full_single_high_conf_94', '1_mirna_full_single_high_conf_307',
       '1_mirna_full_single_high_conf_638',
       '1_mirna_full_single_high_conf_639',
       '2_mirna_full_single_low_conf_mirgenedb_165',
       '5.14_miRNA_let-7a-5p_smut_swob_omut11.15.19_owob'],
      dtype='object')


In [45]:
# delete the sequences that contain "AATAAA"
all_designs = all_designs[all_designs["seq"].str.contains("AATAAA") == False]

In [46]:
print(len(all_designs))

10001


In [47]:
# look at start codons
start_codons = ["ATG"]
# print how many of the sequences contain the start codons
print(all_designs["seq"].str.contains("|".join(start_codons)).sum())
# print the statistics of their position in the sequence modulo 3
print(all_designs["seq"].str.find("ATG").mod(3).value_counts())

6231
0    6225
2    3772
1       4
Name: seq, dtype: int64


In [48]:
# how many sequences do I have?
print(all_designs["seq"].str.len().describe())

count    10001.0
mean       200.0
std          0.0
min        200.0
25%        200.0
50%        200.0
75%        200.0
max        200.0
Name: seq, dtype: float64


# Final check

In [75]:
# print statistics on the length of the sequences, i.e., the length of each entry in the seq column
print(all_designs["seq"].str.len().describe())

count    9999.0
mean      200.0
std         0.0
min       200.0
25%       200.0
50%       200.0
75%       200.0
max       200.0
Name: seq, dtype: float64


In [76]:
# for each oligoseq, check if the prefix and suffix are present
print("prefix present:", all_designs['seq'].str.contains(left_handle).value_counts())
print("suffix present:", all_designs['seq'].str.contains(right_handle).value_counts())

# get value counts of the positions of the prefix and suffix
print("prefix position:", all_designs['seq'].apply(lambda x: x.find(left_handle)).value_counts())
print("suffix position:", all_designs['seq'].apply(lambda x: x.find(right_handle)).value_counts())

prefix present: True    9999
Name: seq, dtype: int64
suffix present: True    9999
Name: seq, dtype: int64
prefix position: 0    9999
Name: seq, dtype: int64
suffix position: 182    9999
Name: seq, dtype: int64


In [77]:
# check if all indices are unique
print(all_designs.index.is_unique)
# check if all oligoseqs are unique
print(all_designs['seq'].is_unique)

True
True


In [113]:
# write designs to a csv file
all_designs.to_csv("../designs/all_designs/all_designs.csv")