In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from tqdm.notebook import tqdm
import re
import os

In [17]:
# Translation table (code point -> code point) that swaps each DNA base for its
# complement: A<->T and G<->C. Characters outside 'ATGC' are left untouched by
# str.translate because they have no entry in the table.
trans = {ord(base): ord(complement) for base, complement in zip('ATGC', 'TACG')}

In [18]:
from isotonic_barcode_calling import *
import parallel_process_files

In [42]:
# minimum score (out of 100) on fuzzy protospacer matching;
# process_sample drops protospacer alignments scoring below this value
MATCHING_THRESHOLD = 80
# minimum alignment score for index-1 reads matched against the index map;
# process_sample drops index alignments scoring below this value
# (presumably on the same 0-100 scale as MATCHING_THRESHOLD -- confirm)
IDX_THRESHOLD = 75
# number of workers for distance computation in calling
# (passed as num_workers to chunked_call_alignments_new)
NUM_WORKERS = 8

In [20]:
# Root folder holding one sub-folder per sample. process_sample reads each
# sample's FASTQ files from "{DIRECTORY}/{sample}/{sample}_<read>.fastq.gz".
DIRECTORY = "/data1/normantm/angel/sequencing/original_single_cell_24wells_output/Samples/DefaultProject"

In [22]:
def get_folder_names(directory):
    """Return the names of the immediate sub-directories of `directory`, sorted.

    Only entries for which os.path.isdir() is true are kept; regular files are
    ignored. The returned names are bare entry names, not full paths.
    """
    folders = []
    for name in os.listdir(directory):
        # Keep the entry only if it is a directory.
        if os.path.isdir(os.path.join(directory, name)):
            folders.append(name)

    folders.sort()
    return folders

# Sorted names of the sample sub-folders found under DIRECTORY.
# NOTE(review): the processing cells below use hard-coded sample lists and do
# not read SAMPLE_NAMES -- confirm whether it is still needed.
SAMPLE_NAMES = get_folder_names(DIRECTORY)

In [23]:
# Reference barcodes: characters 30-54 (25 characters) of column 1 of the
# header-less CSV, as an array. process_sample keeps only decoded barcodes that
# appear in this array.
barcodes = pd.read_csv("10k_barcodes.csv", header=None)[1].str[30:55].values

In [24]:
# Load the two header-less library tables (first CSV column becomes the index)
# and derive a 20-character guide column from the string stored in column 1.
K562_separate_sgRNA1 = pd.read_csv(
    "twist_library_separate_sgRNA1.csv", header=None, index_col=0
).assign(
    # characters 34-53, complemented with `trans` and then reversed
    sgRNA1=lambda library: library[1].str[34:54].str.translate(trans).str[::-1]
)
K562_separate_sgRNA2 = pd.read_csv(
    "twist_library_separate_sgRNA2.csv", header=None, index_col=0
).assign(
    # characters 70-89, taken as-is
    sgRNA2=lambda library: library[1].str[70:90]
)

In [2]:
# Table read from 96well_RT_oligo_info.csv, indexed by its first column. The
# processing cells select rows by index label and pass the "index" column to
# process_sample as i1_map.
well_index_df = pd.read_csv("96well_RT_oligo_info.csv", index_col=0)

In [40]:
def process_sample(sample, separate_sgRNA1, separate_sgRNA2, i1_map, run_name, output_dir):
    """Extract, map and filter the reads of one sample and write them to CSV.

    Steps:
      1. Read the UMI, guide 2, barcode, guide 1 and index-1 sequences from the
         sample's FASTQ files under ``DIRECTORY``.
      2. Write the unique barcode reads (those without "N") to a FASTQ file,
         decode them with the external ``freebarcodes`` command, and keep the
         decoded barcodes that are present in the global ``barcodes``.
      3. Align the unique guide 1 / guide 2 sequences of barcode-valid reads
         against ``separate_sgRNA1.sgRNA1`` and the unique index-1 sequences
         against ``i1_map``; keep alignments with score >= ``MATCHING_THRESHOLD``
         and >= ``IDX_THRESHOLD`` respectively.
      4. Keep reads whose guide 1, guide 2, barcode and index-1 all passed and
         whose UMI contains no "N", and write one CSV row per kept read.

    Parameters
    ----------
    sample : str
        Sample name; used as the sub-folder and file prefix under ``DIRECTORY``
        and in the names of the intermediate and output files.
    separate_sgRNA1 :
        Object whose ``sgRNA1`` attribute is the reference passed to
        ``chunked_call_alignments_new`` for protospacer alignment.
    separate_sgRNA2 :
        Currently unused (see the NOTE in the body).
    i1_map :
        Reference passed to ``chunked_call_alignments_new`` for index-1 reads.
    run_name : str
        Prefix of the intermediate freebarcodes FASTQ / decoded file names.
    output_dir : str
        Directory receiving ``{sample}_guide_alignments.csv`` and
        ``new_processed_filtered_reads_{sample}.csv``.

    Returns
    -------
    None

    Raises
    ------
    subprocess.CalledProcessError
        If the ``freebarcodes decode`` command exits with a non-zero status.

    Notes
    -----
    Reads the globals ``DIRECTORY``, ``barcodes``, ``MATCHING_THRESHOLD``,
    ``IDX_THRESHOLD`` and ``NUM_WORKERS``. The filtered-reads CSV is
    overwritten on every call, so re-running a sample does not leave a
    duplicated header or duplicated rows in the file.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import subprocess

    freebarcodes_dir = "/data1/normantm/angel/freebarcodes"
    sample_prefix = f"{DIRECTORY}/{sample}/{sample}"

    # NOTE(review): the extractors are assumed to return equal-length numpy
    # arrays of strings (they are boolean-masked together below) -- confirm.
    print("reading UMI")
    umi = parallel_process_files.fast_fastq_sequences(f"{sample_prefix}_UMI_I1.fastq.gz", pigz_threads=4)
    print("reading guide 2")
    g2 = parallel_process_files.parallel_extract_protospacer2(f"{sample_prefix}_R2.fastq.gz",
                                                              workers=12, chunksize=1000000, pigz_threads=4)
    print("reading barcode")
    b = parallel_process_files.parallel_barcode_extractor(f"{sample_prefix}_R2.fastq.gz",
                                                          workers=12, chunksize=1000000, pigz_threads=4)
    print("reading guide 1")
    g1 = parallel_process_files.parallel_protospacer_extractor(f"{sample_prefix}_R1.fastq.gz",
                                                               five_bp="TAAAC", workers=12, chunksize=1000000,
                                                               pigz_threads=4)
    print("reading index 1")
    i1 = parallel_process_files.fast_fastq_sequences(f"{sample_prefix}_I1.fastq.gz", pigz_threads=4)

    unique_b = pd.unique(b)
    unique_i1 = pd.unique(i1)

    print("mapping barcodes")
    unique_b_fastq = f"{freebarcodes_dir}/{run_name}_unique_b_{sample}.fastq"
    with open(unique_b_fastq, "w") as f:
        for i, ba in enumerate(unique_b):
            if "N" in ba:
                continue
            # Dummy quality string; its length must equal the sequence length
            # for the record to be valid FASTQ.
            f.write(f"@{i}\n{ba}\n+\n{'I' * len(ba)}\n")

    # check=True: fail loudly here instead of reading a missing or stale
    # decoded file further down when freebarcodes itself failed.
    subprocess.run(
        ["freebarcodes", "decode",
         f"{freebarcodes_dir}/10k_barcodes15-2.txt,{freebarcodes_dir}/10k_barcodes10-1.txt",
         unique_b_fastq,
         "--output-dir", f"{freebarcodes_dir}/"],
        check=True,
    )

    decoded_unique_b = pd.read_csv(f"{freebarcodes_dir}/{run_name}_unique_b_{sample}_decoded.txt",
                                   sep="\t", header=None, index_col=0)
    decoded_unique_b.columns = ["b15", "b10", "seq"]
    # The full barcode is the concatenation of the two decoded parts.
    decoded_unique_b["barcode_mapped"] = decoded_unique_b["b15"] + decoded_unique_b["b10"]
    decoded_unique_b.drop_duplicates(inplace=True)
    decoded_unique_b.query("barcode_mapped in @barcodes", inplace=True)
    decoded_unique_b.set_index("seq", inplace=True)

    print("aligning protospacers")
    idx_b = pd.Index(b).isin(decoded_unique_b.index)

    # Only protospacers seen on reads with a valid barcode are aligned.
    filtered_g1 = g1[idx_b]
    filtered_g2 = g2[idx_b]
    unique_p = pd.unique(np.concatenate([pd.unique(filtered_g1), pd.unique(filtered_g2)]))

    # NOTE(review): separate_sgRNA2 is accepted but never read; guide 2 reads
    # are aligned against separate_sgRNA1.sgRNA1 together with guide 1 --
    # confirm this is intended.
    alignments_p = chunked_call_alignments_new(unique_p, separate_sgRNA1.sgRNA1,
                                               num_chunks=int(len(unique_p)/1000)+1, num_workers=NUM_WORKERS)
    # The unfiltered alignments are saved before the score cut-off is applied.
    alignments_p.to_csv(f"{output_dir}/{sample}_guide_alignments.csv")
    alignments_p.query("score >= @MATCHING_THRESHOLD", inplace=True)

    print("mapping indices")
    alignments_i1 = chunked_call_alignments_new(unique_i1, i1_map,
                                                num_chunks=int(len(unique_i1)/1000)+1, num_workers=NUM_WORKERS)
    alignments_i1.query("score >= @IDX_THRESHOLD", inplace=True)

    print("filtering reads")
    idx_p1 = pd.Index(g1).isin(alignments_p.index)
    idx_p2 = pd.Index(g2).isin(alignments_p.index)
    idx_i1 = pd.Index(i1).isin(alignments_i1.index)
    idx_umi = np.char.find(umi, "N") == -1

    filtered_idx = idx_p1 & idx_p2 & idx_b & idx_i1 & idx_umi

    g1, g2 = g1[filtered_idx], g2[filtered_idx]
    b = b[filtered_idx]
    umi = umi[filtered_idx]
    i1 = i1[filtered_idx]

    print("writing to files")
    # "w" rather than "a": the header is written unconditionally, so appending
    # would duplicate the header and the rows whenever a sample is re-run.
    with open(f"{output_dir}/new_processed_filtered_reads_{sample}.csv", "w") as f:
        f.write("p1_identity,p2_identity,p1_score,p2_score,barcode_mapped,well,UMI\n")

        # Rows are written in chunks so the joined strings are never all in
        # memory at once.
        size = 2000000

        for i in range(0, len(g1), size):
            chunk = slice(i, i + size)
            # Look each guide up once and reuse the rows for identity and score.
            g1_hits = alignments_p.loc[g1[chunk]]
            g2_hits = alignments_p.loc[g2[chunk]]

            lines = np.array([g1_hits.identity.values,
                              g2_hits.identity.values,
                              g1_hits.score.values.astype(int).astype(str),
                              g2_hits.score.values.astype(int).astype(str),
                              decoded_unique_b.loc[b[chunk]].barcode_mapped.values,
                              alignments_i1.loc[i1[chunk]].identity.values,
                              umi[chunk]])

            f.writelines(",".join(a) + "\n" for a in lines.T)

In [29]:
# Directory where process_sample writes the per-sample alignment and
# filtered-read CSV files.
output_dir = "/data1/normantm/angel/sequencing/original_single_cell_24wells_output"
# Prefix used by process_sample for the intermediate freebarcodes files.
run_name = "20240422_K562_AP1_100_single_cell_24wells"

In [None]:
# Process samples 'identity_1' and 'reporter_1'. The index map is the "index"
# column of the rows whose label, minus its first character, is '1', '2' or '3'
# (presumably plate columns 1-3 -- confirm against 96well_RT_oligo_info.csv).
# NOTE(review): this cell has no execution count and its saved output stops at
# "mapping barcodes" for 'reporter_1', so the run appears not to have finished;
# confirm the 'reporter_1' output before relying on it.
for sample in ['identity_1', 'reporter_1']:
    process_sample(sample, K562_separate_sgRNA1, K562_separate_sgRNA2,
                   well_index_df[well_index_df.index.str[1:].isin(['1','2','3'])]["index"],
                   run_name, output_dir)

reading UMI
reading guide 2


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:  1.3min
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  2.0min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  2.7min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  3.5min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  3.8min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  4.6min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  5.4min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  6.9min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  7.7min
[Parallel(n_jobs=12)]: Done 110 out of 120 | elapsed:  8.2min remaining:   44.8s
[Parallel(n_jobs=12)]: Done 120 out of 120 | elapsed:  8.4min finished


reading barcode


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:  1.2min
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  2.0min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  2.7min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  3.5min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  3.7min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  4.5min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  5.2min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  6.5min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  7.3min
[Parallel(n_jobs=12)]: Done 110 out of 120 | elapsed:  7.8min remaining:   42.6s
[Parallel(n_jobs=12)]: Done 120 out of 120 | elapsed:  8.0min finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.


reading guide 1


[Parallel(n_jobs=12)]: Done   2 tasks      | elapsed:   50.3s
[Parallel(n_jobs=12)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=12)]: Done  18 tasks      | elapsed:  1.9min
[Parallel(n_jobs=12)]: Done  27 tasks      | elapsed:  2.5min
[Parallel(n_jobs=12)]: Done  38 tasks      | elapsed:  2.8min
[Parallel(n_jobs=12)]: Done  49 tasks      | elapsed:  3.3min
[Parallel(n_jobs=12)]: Done  62 tasks      | elapsed:  3.9min
[Parallel(n_jobs=12)]: Done  75 tasks      | elapsed:  4.9min
[Parallel(n_jobs=12)]: Done  90 tasks      | elapsed:  5.5min
[Parallel(n_jobs=12)]: Done 110 out of 120 | elapsed:  6.0min remaining:   32.7s
[Parallel(n_jobs=12)]: Done 120 out of 120 | elapsed:  6.0min finished


reading index 1
mapping barcodes
aligning protospacers


100%|███████████████████████████████████████| 1376/1376 [00:52<00:00, 26.23it/s]


mapping indices


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.31it/s]

filtering reads





writing to files
reading UMI
reading guide 2


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:  1.4min
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  2.1min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  2.8min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  3.6min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  3.9min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  4.6min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  5.5min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  6.9min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  7.7min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  8.6min
[Parallel(n_jobs=12)]: Done 130 out of 139 | elapsed:  9.4min remaining:   39.2s
[Parallel(n_jobs=12)]: Done 139 out of 139 | elapsed:  9.6min finished


reading barcode


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:  1.2min
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  1.9min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  2.7min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  3.4min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  3.7min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  4.5min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  5.3min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  6.7min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  7.6min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  8.5min
[Parallel(n_jobs=12)]: Done 130 out of 139 | elapsed:  9.4min remaining:   38.9s
[Parallel(n_jobs=12)]: Done 139 out of 139 | elapsed:  9.5min finished


reading guide 1


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   51.9s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  1.4min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.9min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  2.5min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  2.7min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  3.3min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  4.0min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  5.0min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  5.7min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  6.4min
[Parallel(n_jobs=12)]: Done 130 out of 139 | elapsed:  7.1min remaining:   29.4s
[Parallel(n_jobs=12)]: Done 139 out of 139 | elapsed:  7.1min finished


reading index 1
mapping barcodes


In [45]:
# Process samples 'identity_2' and 'reporter_2'. The index map is the "index"
# column of the rows whose label, minus its first character, is '6', '7' or '8'
# (presumably plate columns 6-8 -- confirm against 96well_RT_oligo_info.csv).
for sample in ['identity_2', 'reporter_2']:
    process_sample(sample, K562_separate_sgRNA1, K562_separate_sgRNA2,
                   well_index_df[well_index_df.index.str[1:].isin(['6','7','8'])]["index"],
                   run_name, output_dir)

reading UMI
reading guide 2


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   46.6s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.6min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  2.1min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  2.3min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  2.7min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  3.3min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  4.1min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  4.6min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  5.1min
[Parallel(n_jobs=12)]: Done 130 out of 139 | elapsed:  5.7min remaining:   23.6s
[Parallel(n_jobs=12)]: Done 139 out of 139 | elapsed:  5.8min finished


reading barcode


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   40.6s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  1.0min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.5min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  2.1min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  2.6min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  3.9min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  4.5min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  5.0min
[Parallel(n_jobs=12)]: Done 130 out of 139 | elapsed:  5.5min remaining:   23.0s
[Parallel(n_jobs=12)]: Done 139 out of 139 | elapsed:  5.7min finished


reading guide 1


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   28.4s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:   44.5s
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  1.5min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  1.7min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  2.0min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  2.4min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  2.9min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  3.4min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  3.8min
[Parallel(n_jobs=12)]: Done 130 out of 139 | elapsed:  4.3min remaining:   17.7s
[Parallel(n_jobs=12)]: Done 139 out of 139 | elapsed:  4.3min finished


reading index 1
mapping barcodes
aligning protospacers


100%|███████████████████████████████████████| 1445/1445 [00:35<00:00, 40.71it/s]


mapping indices


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.12it/s]


filtering reads
writing to files
reading UMI
reading guide 2


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   47.6s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  1.2min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.6min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  2.0min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  2.2min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  2.7min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  3.2min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  4.0min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  4.5min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  5.0min
[Parallel(n_jobs=12)]: Done 129 out of 138 | elapsed:  5.6min remaining:   23.2s
[Parallel(n_jobs=12)]: Done 138 out of 138 | elapsed:  5.7min finished


reading barcode


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   41.1s
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:  1.5min
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:  1.9min
[Parallel(n_jobs=12)]: Done  37 tasks      | elapsed:  2.1min
[Parallel(n_jobs=12)]: Done  48 tasks      | elapsed:  2.6min
[Parallel(n_jobs=12)]: Done  61 tasks      | elapsed:  3.1min
[Parallel(n_jobs=12)]: Done  74 tasks      | elapsed:  3.9min
[Parallel(n_jobs=12)]: Done  89 tasks      | elapsed:  4.4min
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:  5.0min
[Parallel(n_jobs=12)]: Done 129 out of 138 | elapsed:  5.5min remaining:   23.1s
[Parallel(n_jobs=12)]: Done 138 out of 138 | elapsed:  5.7min finished


reading guide 1


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 tasks      | elapsed:   29.9s
[Parallel(n_jobs=12)]: Done   9 tasks      | elapsed:   48.0s
[Parallel(n_jobs=12)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done  27 tasks      | elapsed:  1.5min
[Parallel(n_jobs=12)]: Done  38 tasks      | elapsed:  1.7min
[Parallel(n_jobs=12)]: Done  49 tasks      | elapsed:  2.0min
[Parallel(n_jobs=12)]: Done  62 tasks      | elapsed:  2.4min
[Parallel(n_jobs=12)]: Done  75 tasks      | elapsed:  3.0min
[Parallel(n_jobs=12)]: Done  90 tasks      | elapsed:  3.4min
[Parallel(n_jobs=12)]: Done 105 tasks      | elapsed:  3.8min
[Parallel(n_jobs=12)]: Done 129 out of 138 | elapsed:  4.3min remaining:   17.9s
[Parallel(n_jobs=12)]: Done 138 out of 138 | elapsed:  4.3min finished


reading index 1
mapping barcodes
aligning protospacers


100%|███████████████████████████████████████| 1199/1199 [00:28<00:00, 42.10it/s]


mapping indices


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.03it/s]


filtering reads
writing to files
