### Pipeline to extract barcodes from .fastq files of CATLAS

In [1]:
import os
import time

In [2]:
# Directory that holds the raw .fastq files; every entry in it is passed
# to extract_barcodes as an input file.
input_dir_of_fastq = "/mnt/workspace_stud/stud5/samples"

# Directory into which the generated "<name>_BC.fastq" barcode files are written.
output_dir_of_BC = "/mnt/workspace_stud/stud5/BC"

### Check different runtimes

In [3]:
# stack_size: number of records to collect before flushing them to the output at once -> avoids a lot of I/O time
# Extracts the barcodes of each sample twice (once from R1, once from R2). TODO: check if necessary

def extract_barcodes(input_dir_of_fastq: str, output_dir_of_BC: str, stack_size: int = 1_000_000) -> None:
    """Write one barcode .fastq file for every file in ``input_dir_of_fastq``.

    Every fourth line of an input file, starting with the first, is treated
    as a record header. The barcode is the text between the first character
    of the header and its first ``:``. For each header one four-line record
    is written to the output file: the unchanged header line, the barcode as
    the sequence, a ``+`` separator and a quality line consisting of one
    ``~`` per barcode character.

    The output file is created in ``output_dir_of_BC`` and named after the
    input file with its last extension replaced by ``_BC.fastq``. After each
    file the elapsed wall-clock time in seconds (rounded to two decimals) is
    printed.

    Args:
        input_dir_of_fastq: Directory containing the input .fastq files.
            Entries that are not regular files are skipped.
        output_dir_of_BC: Existing directory that receives the output files.
        stack_size: Number of records buffered in memory before they are
            written to the output in one call, to reduce I/O overhead.
    """
    for fastq_file in os.listdir(input_dir_of_fastq):
        full_path_IN = os.path.join(input_dir_of_fastq, fastq_file)
        # Sub-directories (e.g. notebook checkpoint folders) cannot be opened as files.
        if not os.path.isfile(full_path_IN):
            continue

        start_time = time.time()
        full_path_OUT = os.path.join(output_dir_of_BC, fastq_file.rsplit('.', 1)[0] + "_BC.fastq")
        stack = []

        with open(full_path_IN, 'r') as fastq_in, open(full_path_OUT, 'w') as BC_out:
            for index, line in enumerate(fastq_in):
                # Only the header line of each four-line record carries the barcode.
                if index % 4 != 0:
                    continue
                barcode = line.split(':', 1)[0][1:]
                stack.append(f"{line}{barcode}\n+\n{'~'*len(barcode)}\n")
                # ">=" rather than "==" so that a stack_size below 1 still flushes.
                if len(stack) >= stack_size:
                    BC_out.writelines(stack)
                    stack = []
            # Flush the records left over after the last full batch; without
            # this the tail of every file would be silently dropped.
            if stack:
                BC_out.writelines(stack)
        print(round(time.time() - start_time, 2))

In [4]:
# Run the extraction for every file in the input directory; the function
# prints the elapsed seconds per processed file (see the output below).
extract_barcodes(input_dir_of_fastq, output_dir_of_BC)

85.97
83.28
150.93
127.33


### Check how many barcodes one file contains

In [5]:
# Count how often each barcode occurs in the header lines of one sample file.
full_path = os.path.join(
    input_dir_of_fastq,
    "ENC-1LGRB-069-SM-A8WNZ_snATAC_right_lobe_of_liver_Rep1.demultiplexed.R1.fastq",
)
barcodes = {}

with open(full_path, 'r') as fastq:
    for index, line in enumerate(fastq):
        # Only every fourth line (the record header) carries a barcode.
        if index % 4 != 0:
            continue
        # The barcode sits between the leading character and the first ':'.
        barcode = line.partition(':')[0][1:]
        barcodes[barcode] = barcodes.get(barcode, 0) + 1

In [6]:
# Number of distinct barcodes found in the file.
print(len(barcodes))
# Number of barcodes that occur more than 100 times.
print(sum(1 for count in barcodes.values() if count > 100))

73685
13289
