In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from tqdm.notebook import tqdm
import re
import os

In [2]:
# Translation table (code point -> code point) that swaps each DNA base with
# its complement: A<->T and G<->C. Intended for use with str.translate.
trans = {ord(base): ord(partner) for base, partner in zip('ATGC', 'TACG')}

In [3]:
from isotonic_barcode_calling import *
import process_files

In [4]:
# Minimum score (out of 100) on fuzzy protospacer matching; process_sample
# discards every alignment whose score is below this value.
MATCHING_THRESHOLD = 90
# Number of workers for distance computation in calling; passed as
# num_workers to chunked_call_alignments_new.
NUM_WORKERS = 8

In [5]:
# Root directory holding one sub-directory of FASTQ files per sample.
DIRECTORY = "/data1/normantm/angel/sequencing/AP1_816_puro_virus_output/Samples/DefaultProject"
# Sample name: used both as the sub-directory under DIRECTORY and as the
# filename prefix of the FASTQ files read by process_sample.
# (Was misspelled `sampe`, which left `sample` undefined for the call below.)
sample = "AP1_816_puro_virus"

In [6]:
# Load the headerless barcode table, take column 1 and keep characters
# 30..54 of every entry as a numpy array of barcode strings.
barcodes = (
    pd.read_csv("120k_barcodes.csv", header=None)
    .loc[:, 1]
    .str.slice(30, 55)
    .values
)

In [7]:
# Guide library tables. Both CSVs are headerless; column 0 becomes the index
# and column 1 holds the sequence from which the protospacer is sliced.

# sgRNA1: characters 34..53, base-complemented with `trans` and then reversed
# (i.e. the reverse complement of that window).
K562_separate_sgRNA1 = pd.read_csv(
    "twist_K562_816_separate_sgRNA1.csv", header=None, index_col=0
).assign(
    sgRNA1=lambda table: table[1].str.slice(34, 54).str.translate(trans).str.slice(step=-1)
)

# sgRNA2: characters 70..89, taken as-is.
K562_separate_sgRNA2 = pd.read_csv(
    "twist_K562_816_separate_sgRNA2.csv", header=None, index_col=0
).assign(
    sgRNA2=lambda table: table[1].str.slice(70, 90)
)

In [8]:
def _membership_mask(reads, allowed):
    """Return a boolean numpy array marking which entries of `reads` are in `allowed`."""
    allowed = set(allowed)
    # np.fromiter always yields a bool array, even for zero reads (np.array([])
    # would be float64 and break the `&` combination of masks).
    return np.fromiter((read in allowed for read in reads), dtype=bool, count=len(reads))


def process_sample(sample, separate_sgRNA1, separate_sgRNA2, barcodes, run_name, output_dir):
    """Filter one sample's reads down to those with two matched guides and a known barcode.

    Reads four per-read arrays (UMI, guide 2, barcode, guide 1) from the
    sample's FASTQ files under ``DIRECTORY/<sample>/``, aligns the unique
    guide reads against the reference protospacers, and keeps only reads where
    both guide alignments score at least ``MATCHING_THRESHOLD`` and the barcode
    is present in ``barcodes``.

    Parameters
    ----------
    sample : str
        Sample name; used as sub-directory and filename prefix of the inputs
        and embedded in the output filenames.
    separate_sgRNA1, separate_sgRNA2 : pandas.DataFrame
        Reference tables exposing ``sgRNA1`` / ``sgRNA2`` columns respectively.
    barcodes : iterable of str
        Allowed barcode sequences (exact match).
    run_name : str
        Currently unused; kept so existing callers keep working.
    output_dir : str
        Existing directory that receives the two output files.

    Outputs
    -------
    ``processed_filtered_reads_<sample>_strict_barcodes.csv``
        One row per kept read: p1_identity, p2_identity, barcode_mapped, UMI.
        The file is overwritten on every call so re-runs do not accumulate
        duplicate headers and rows.
    ``filtered_idx_<sample>.npy``
        Boolean mask over all input reads marking the kept ones.

    Returns
    -------
    None
    """
    print("reading fastq files")
    umi, _ = process_files.read_fastq(f"{DIRECTORY}/{sample}/{sample}_UMI_I1.fastq.gz")
    # NOTE(review): R2 is parsed twice (guide 2 and barcode); presumably each
    # helper extracts a different window of the same read - confirm.
    g2 = process_files.read_fastq_g2(f"{DIRECTORY}/{sample}/{sample}_R2.fastq.gz")
    b = process_files.read_fastq_b(f"{DIRECTORY}/{sample}/{sample}_R2.fastq.gz")
    g1 = process_files.read_fastq_g1(f"{DIRECTORY}/{sample}/{sample}_R1.fastq.gz")

    # Align each distinct read only once.
    unique_p1 = np.unique(g1)
    unique_p2 = np.unique(g2)

    print("mapping barcodes")
    barcodes = set(barcodes)

    print("mapping guides")
    # Roughly 1000 unique reads per chunk, but never fewer than one chunk
    # (the plain division gives 0 for small inputs).
    alignments_p1 = chunked_call_alignments_new(
        unique_p1, separate_sgRNA1.sgRNA1,
        num_chunks=max(1, len(unique_p1) // 1000), num_workers=NUM_WORKERS)
    alignments_p2 = chunked_call_alignments_new(
        unique_p2, separate_sgRNA2.sgRNA2,
        num_chunks=max(1, len(unique_p2) // 1000), num_workers=NUM_WORKERS)
    # Keep only confident matches; reassign instead of mutating in place.
    alignments_p1 = alignments_p1.query("score >= @MATCHING_THRESHOLD")
    alignments_p2 = alignments_p2.query("score >= @MATCHING_THRESHOLD")

    print("filtering reads")
    idx_p1 = _membership_mask(g1, alignments_p1.index)
    idx_p2 = _membership_mask(g2, alignments_p2.index)
    idx_b = _membership_mask(b, barcodes)

    # A read survives only if both guides and the barcode are recognised.
    filtered_idx = idx_p1 & idx_p2 & idx_b

    g1, g2 = g1[filtered_idx], g2[filtered_idx]
    b = b[filtered_idx]
    umi = umi[filtered_idx]

    print("writing to files")
    # "w" (not "a"): the header is written unconditionally, so appending would
    # duplicate the header and all rows each time the cell is re-run.
    with open(f"{output_dir}/processed_filtered_reads_{sample}_strict_barcodes.csv", "w") as f:
        f.write("p1_identity,p2_identity,barcode_mapped,UMI\n")

        # Write in slices of `size` reads to bound the memory used for the
        # intermediate string table.
        size = 2000000
        chunks = np.arange(0, len(g1), size)

        for i in chunks:
            # Look up the matched guide identity for every read in the slice.
            # Assumes the alignment index is unique per read - TODO confirm.
            lines = np.array([alignments_p1.loc[g1[i:i+size]].identity.values,
                              alignments_p2.loc[g2[i:i+size]].identity.values,
                              b[i:i+size],
                              umi[i:i+size]])

            # Transpose so each row is one read, then join its four fields.
            lines = [",".join(a)+"\n" for a in lines.T]

            f.writelines(lines)

    with open(f"{output_dir}/filtered_idx_{sample}.npy", 'wb') as f:
        np.save(f, filtered_idx)

In [9]:
# Directory that receives process_sample's outputs (the filtered-reads CSV
# and the boolean read mask saved as .npy).
output_dir = "/data1/normantm/angel/sequencing/AP1_816_puro_virus_output"
# Label for this run; passed to process_sample, which accepts it as run_name.
run_name = "20250319_AP1_816_puro_virus"

In [10]:
# Run the full filtering pipeline for the configured sample; results are
# written into output_dir (nothing is returned).
process_sample(sample, K562_separate_sgRNA1, K562_separate_sgRNA2, barcodes, run_name, output_dir)

reading fastq files
mapping barcodes
mapping guides


100%|███████████████████████████████████████| 3128/3128 [11:47<00:00,  4.42it/s]
100%|███████████████████████████████████████| 4365/4365 [15:58<00:00,  4.55it/s]


filtering reads
writing to files
