In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import gzip
import gc

In [2]:
# Translation table that swaps each DNA base for its complement (A<->T, G<->C).
# Characters outside ATGC (e.g. N) have no entry and pass through str.translate unchanged.
trans = {ord(base): ord(complement) for base, complement in zip('ATGC', 'TACG')}

In [3]:
def read_fastq(fn):
    """Read a gzip-compressed FASTQ file into a ``{read name: sequence}`` dict.

    Records are consumed as consecutive groups of four lines (header,
    sequence, separator, quality). The read name is the header text up to the
    first space, without the leading ``@``. If a name occurs more than once
    the last record wins. A trailing group of fewer than four lines is ignored.

    Parameters
    ----------
    fn : str or path-like
        Path to the ``.fastq.gz`` file.

    Returns
    -------
    dict
        Mapping of read name to sequence string.

    Raises
    ------
    ValueError
        If a record header does not start with ``@``; this includes an empty
        header line.
    """
    lines_per_record = 4
    sequence = dict()

    with gzip.open(fn, 'rt') as fh:
        lines = []
        for line in fh:
            lines.append(line.rstrip())
            if len(lines) == lines_per_record:
                name = lines[0].split(" ")[0]
                # startswith() rather than name[0]: an empty header must be
                # reported as a malformed record, not crash with IndexError.
                if not name.startswith("@"):
                    raise ValueError(
                        "Records in Fastq files should start with '@' character"
                    )
                sequence[name[1:]] = lines[1]
                lines = []
    return sequence

In [4]:
def read_fastq_guides(fn):
    """Extract the protospacer and header fields from a gzip-compressed FASTQ.

    For every four-line record this keeps the first 20 bases of the read
    together with colon-separated header fields 3 to 7 (0-based, taken from
    the header text before the first space). The notebook later labels these
    six values ``protospacer, lane, tile, x, y, UMI``. A trailing group of
    fewer than four lines is ignored.

    Parameters
    ----------
    fn : str or path-like
        Path to the ``.fastq.gz`` file.

    Returns
    -------
    list of list of str
        One ``[sequence[:20], field3, field4, field5, field6, field7]`` entry
        per record, in file order.

    Raises
    ------
    ValueError
        If a record header does not start with ``@``; this includes an empty
        header line.
    IndexError
        If a header has fewer than eight colon-separated fields.
    """
    lines_per_record = 4
    sequence = []

    with gzip.open(fn, 'rt') as fh:
        lines = []
        for line in fh:
            lines.append(line.rstrip())
            if len(lines) == lines_per_record:
                name = lines[0].split(" ")[0]
                # startswith() rather than name[0]: an empty header must be
                # reported as a malformed record, not crash with IndexError.
                if not name.startswith("@"):
                    raise ValueError(
                        "Records in Fastq files should start with '@' character"
                    )
                name_fields = name.split(":")
                # Explicit indexing (not a slice) so a short header still fails loudly.
                sequence.append([
                    lines[1][:20],
                    name_fields[3],
                    name_fields[4],
                    name_fields[5],
                    name_fields[6],
                    name_fields[7],
                ])
                lines = []
    return sequence

In [5]:
def read_fastq_barcodes(fn):
    """Extract reverse-complemented barcodes from a gzip-compressed FASTQ.

    For every four-line record this takes the first 25 bases of the read,
    complements them with the module-level ``trans`` table and reverses the
    result. It is paired with colon-separated header field 7 (0-based, taken
    from the header text before the first space), which the notebook later
    labels ``UMI``. A trailing group of fewer than four lines is ignored.

    Parameters
    ----------
    fn : str or path-like
        Path to the ``.fastq.gz`` file.

    Returns
    -------
    list of list of str
        One ``[reverse_complement(sequence[:25]), field7]`` entry per record,
        in file order.

    Raises
    ------
    ValueError
        If a record header does not start with ``@``; this includes an empty
        header line.
    IndexError
        If a header has fewer than eight colon-separated fields.
    """
    lines_per_record = 4
    sequence = []

    with gzip.open(fn, 'rt') as fh:
        lines = []
        for line in fh:
            lines.append(line.rstrip())
            if len(lines) == lines_per_record:
                name = lines[0].split(" ")[0]
                # startswith() rather than name[0]: an empty header must be
                # reported as a malformed record, not crash with IndexError.
                if not name.startswith("@"):
                    raise ValueError(
                        "Records in Fastq files should start with '@' character"
                    )
                name_fields = name.split(":")
                # Complement first, then reverse -> reverse complement of the first 25 bases.
                sequence.append([lines[1][:25].translate(trans)[::-1], name_fields[7]])
                lines = []
    return sequence

In [6]:
# Root directory of the sequencing run: the per-sample FASTQ files are read from
# beneath it and the summary CSV is written to its processed/new subdirectory.
DIRECTORY = "/data/norman/angel/sequencing/Project_14568/MICHELLE_0647"

In [7]:
# Sample identifiers as they appear in the FASTQ file names; the name without
# its last three characters is the Sample_* directory name.
samples = [
    "rep1_GFP_IGO_14568_1_S1",
    "rep1_mCherry_IGO_14568_2_S2",
    "rep2_GFP_IGO_14568_3_S3",
    "rep2_mCherry_IGO_14568_4_S4",
]

In [8]:
# Load the guide library: the first CSV column becomes the index and characters
# 27-46 of column 1 are kept as the 20-character protospacer.
# NOTE(review): column 1 presumably holds the full oligo sequence - confirm.
CRISPRi_essentials = (
    pd.read_csv(
        "/home/tanga1/20221003_CRISPRi_essentials_trimmed_Twist.csv",
        header=None,
        index_col=0,
    )[1]
    .str[27:47]
    .to_frame("protospacer")
)
# Relabel each row as "<original label>:<protospacer>".
CRISPRi_essentials.index = CRISPRi_essentials.index + ":" + CRISPRi_essentials["protospacer"]

In [9]:
# Barcode library as a Series: characters 30-54 (25 characters) of each line of the file.
freebarcodes = (
    pd.read_csv("/home/tanga1/freebarcodes/10k_barcodes.txt", header=None)[0]
    .str[30:55]
)

In [10]:
# Run parameters: the single sample and sequencing lane processed by this
# execution of the notebook. Both are interpolated into the input FASTQ paths
# and the output CSV name.
s = samples[3]
lane = '2'

In [11]:
# R1 carries the guide (protospacer) read, R2 the barcode read of the same lane.
# s[:-3] drops the last three characters of the sample name to get its directory.
samples_guides = read_fastq_guides(
    f"{DIRECTORY}/Sample_{s[:-3]}/{s}_L00{lane}_R1_001.fastq.gz"
)
samples_barcodes = read_fastq_barcodes(
    f"{DIRECTORY}/Sample_{s[:-3]}/{s}_L00{lane}_R2_001.fastq.gz"
)

In [12]:
# One row per read. Column names are supplied at construction so that an empty
# read list yields an empty, correctly labelled frame instead of a
# length-mismatch error when assigning `.columns` afterwards.
guides_df = pd.DataFrame(
    samples_guides,
    columns=["protospacer", "lane", "tile", "x", "y", "UMI"],
)

barcodes_df = pd.DataFrame(samples_barcodes, columns=["barcode", "UMI"])

In [13]:
# The guide and barcode reads must describe the same reads in the same order,
# so the UMIs parsed from their headers have to agree row by row before the
# barcode column is attached. A mismatch is raised rather than printed:
# continuing without the `barcode` column only fails later with a less
# helpful error.
if len(guides_df) != len(barcodes_df):
    raise ValueError(
        f"Read counts differ: {len(guides_df)} guide reads vs {len(barcodes_df)} barcode reads"
    )
if not (guides_df["UMI"] == barcodes_df["UMI"]).all():
    raise ValueError("UMIs don't match!")
guides_df["barcode"] = barcodes_df["barcode"]

In [14]:
# Distinct observed protospacer and barcode sequences, so that matching and
# decoding are done once per sequence rather than once per read.
unique_p = guides_df["protospacer"].unique()
unique_b = guides_df["barcode"].unique()

In [15]:
from isotonic_barcode_calling import *

In [16]:
# Minimum score (out of 100) on fuzzy protospacer matching; reads whose
# p_score is below this value are discarded.
MATCHING_THRESHOLD = 90
# Number of workers for distance computation in calling (passed to
# chunked_call_alignments as num_workers).
NUM_WORKERS = 8

In [None]:
# Match every unique observed protospacer against the library protospacers.
# Roughly 1000 unique sequences per chunk, but always at least one chunk:
# with fewer than 1000 unique protospacers the original expression asked for
# zero chunks.
# NOTE(review): this call ran for hours in the saved output - consider caching the result.
alignments_p = chunked_call_alignments(
    unique_p,
    CRISPRi_essentials.protospacer,
    num_chunks=max(1, len(unique_p) // 1000),
    num_workers=NUM_WORKERS,
)

 35%|████████████▍                       | 818/2363 [4:36:19<8:39:03, 20.16s/it]

In [18]:
# Annotate every read with the match result looked up by its protospacer
# sequence; protospacers absent from the mapping get NaN.
# NOTE(review): assumes alignments_p['identity'] / ['score'] are keyed by the
# observed protospacer sequence - confirm against chunked_call_alignments.
guides_df['p_identity'] = guides_df['protospacer'].map(alignments_p['identity'])
guides_df['p_score'] = guides_df['protospacer'].map(alignments_p['score'])

In [19]:
# Keep only reads whose protospacer match score reaches the threshold
# (rows with a missing score are dropped by the comparison).
filtered_guides_df = guides_df.loc[guides_df["p_score"] >= MATCHING_THRESHOLD]

In [20]:
import re

In [21]:
# Write every unique observed barcode as a FASTQ record (named by its position
# in unique_b) for decoding with freebarcodes. N bases are replaced by C and
# flagged with quality '!'; all other bases get quality 'I'.
# The quality string is built from the barcode itself, so it always has the
# same length as the sequence, also for reads shorter than 25 bases.
# NOTE(review): the file is written relative to the current working directory,
# while the decode step reads it via an absolute path - confirm they coincide.
with open("unique_b.fastq", "w") as f:
    for record_id, barcode in enumerate(unique_b):
        sequence_line = barcode.replace("N", "C")
        quality_line = "".join("!" if base == "N" else "I" for base in barcode)
        f.write(f"@{record_id}\n")
        f.write(f"{sequence_line}\n")
        f.write("+\n")
        f.write(f"{quality_line}\n")

In [22]:
import os
import subprocess

# Decode the unique barcodes against the two barcode sets given as a
# comma-separated pair. check=True makes a failed decode raise
# CalledProcessError; with os.system the exit status was returned but never
# checked, so a stale unique_b_decoded.txt could be read by the next cell.
subprocess.run(
    [
        "freebarcodes",
        "decode",
        "/home/tanga1/freebarcodes/10k_barcodes15-2.txt,/home/tanga1/freebarcodes/10k_barcodes10-1.txt",
        "/home/tanga1/230303_Pscreen_pAT005-TelN/unique_b.fastq",
    ],
    check=True,
)

0

In [23]:
# Load the decoder output: tab-separated, no header row, first field used as the index.
decoded_unique_b = pd.read_table(
    "/home/tanga1/230303_Pscreen_pAT005-TelN/unique_b_decoded.txt",
    header=None,
    index_col=0,
)

In [24]:
# Name the three remaining columns of the decoder output.
# NOTE(review): presumably the decoded 15-mer, the decoded 10-mer and the input
# read sequence, in that order - confirm against the freebarcodes output format.
decoded_unique_b.columns = ["b15", "b10", "seq"]

In [25]:
# Concatenate the two decoded parts into one string so it can be compared with
# the 25-character entries of the freebarcodes library.
decoded_unique_b["barcode_mapped"] = decoded_unique_b["b15"] + decoded_unique_b["b10"]

In [26]:
# Drop rows that are identical in every column, keeping the first occurrence.
decoded_unique_b = decoded_unique_b.drop_duplicates()

In [27]:
# Keep only decodes whose concatenated barcode is an entry of the freebarcodes library.
decoded_unique_b = decoded_unique_b[
    decoded_unique_b["barcode_mapped"].isin(freebarcodes.values)
]

In [28]:
# Keep reads whose raw barcode was decoded to a library barcode, then attach
# the decoded barcode. The explicit .copy() gives an independent frame, so the
# column assignment below is not a write into a slice of another frame
# (SettingWithCopyWarning).
filtered_guides_df = filtered_guides_df.query("barcode in @decoded_unique_b.seq.values").copy()
filtered_guides_df['barcode_mapped'] = filtered_guides_df['barcode'].map(
    decoded_unique_b.set_index("seq")['barcode_mapped']
)

In [29]:
# Take the second ":"-separated field of the matched identity as the mapped protospacer.
# NOTE(review): presumably p_identity holds the "<label>:<protospacer>" labels
# of CRISPRi_essentials - confirm against chunked_call_alignments.
# assign() returns a new frame, so no column is written into a filtered slice
# (avoids SettingWithCopyWarning).
filtered_guides_df = filtered_guides_df.assign(
    protospacer_mapped=filtered_guides_df["p_identity"].str.split(":").str[1]
)

In [31]:
# Exact-match subset: reads whose protospacer and barcode both appear verbatim
# in the respective libraries (no fuzzy matching or decoding).
filtered_guides_df1 = guides_df[
    guides_df["protospacer"].isin(CRISPRi_essentials.protospacer.values)
    & guides_df["barcode"].isin(freebarcodes.values)
]

In [33]:
# Number of reads for each (mapped protospacer, mapped barcode, UMI) combination.
summary_df = (
    filtered_guides_df
    .groupby(["protospacer_mapped", "barcode_mapped", "UMI"])
    .size()
    .reset_index(name="count")
)

In [39]:
import os

# Short sample label: the first two "_"-separated parts of the sample name.
sample = "_".join(s.split("_")[:2])
# Create the output directory first, so the write cannot fail on a missing
# directory after the long-running steps above have finished.
output_dir = f"{DIRECTORY}/processed/new"
os.makedirs(output_dir, exist_ok=True)
summary_df.to_csv(f"{output_dir}/{sample}_L{lane}_summary.csv", index=False)