In [39]:
# data files for assessing plasmid length
# Every input path is derived from base_directory, so pointing the notebook at a
# relocated copy of the run only requires editing that one line.
base_directory = "/analysis/2021_07_12_PlasmidSeq_publication_96xr941/PlasmidSeq"
barcoding_summary_file = base_directory + "/pipelines/results/guppy_demultiplex/saved_data/barcoding_summary.txt"
read_sequencing_file = base_directory + "/pipelines/results/guppy/basecalling/sequencing_summary.txt"
sample_sheet = base_directory + "/pipelines/fex_117_sample_sheet.tsv"

# Outputs written by this notebook all live under a single directory.
output_directory = "/analysis/2021_08_26_PlasmidSeq_paper"
output_sample_sheet = output_directory + "/fex_117_sample_sheet_with_annotations.tsv"
output_read_lengths = output_directory + "/fex_117_read_lengths.tsv"

In [37]:
import scipy

# Lookup filled while the sample sheet is read further down in this cell:
# barcode number (int) -> (plasmid_name, reference_sequence, reference_size)
barcode_fasta_sizes = dict()

def gc_content(string):
    """Return the fraction of bases in ``string`` that are G or C.

    Matching is case-insensitive. Every character, including N or any other
    symbol, counts towards the denominator.

    Args:
        string: nucleotide sequence.

    Returns:
        A float in [0.0, 1.0]; 0.0 for an empty sequence (previously this
        raised ZeroDivisionError).
    """
    if len(string) == 0:
        return 0.0
    upper = string.upper()
    # str.count scans in C and avoids building a per-character list.
    gcs = upper.count('G') + upper.count('C')
    return(float(gcs)/float(len(string)))

def base_to_index(base):
    """Map a nucleotide character to a slot number.

    A/a -> 0, C/c -> 1, G/g -> 2, T/t -> 3; anything else (N, gaps, ...) -> 4.
    """
    spellings_by_slot = (('A', 'a'), ('C', 'c'), ('G', 'g'), ('T', 't'))
    for slot, spellings in enumerate(spellings_by_slot):
        if base in spellings:
            return slot
    # Catch-all slot for every non-ACGT symbol.
    return 4

def count_bases(string):
    """Tally the A, C, G and T characters of ``string`` (either case).

    Returns a four-element list ordered [A, C, G, T]. Characters that are not
    one of those bases are tallied in a fifth slot that is discarded before
    returning.
    """
    tally = [0] * 5
    for base in string:
        tally[base_to_index(base)] += 1
    # Slot 4 holds everything that is not A/C/G/T; leave it out of the result.
    return tally[:4]
    
def calc_entropy(string):
    """Return the Shannon entropy of the A/C/G/T composition of ``string``.

    Base frequencies are computed over A, C, G and T only (as returned by
    ``count_bases``), so other symbols do not contribute. ``entr`` uses the
    natural logarithm, so the result is in nats. A string containing no
    A/C/G/T at all yields 0.0.

    Args:
        string: nucleotide sequence.

    Returns:
        The summed ``entr`` of the four base frequencies (a NumPy float).
    """
    # Import the submodule explicitly: a bare `import scipy` does not make
    # `scipy.special` available on SciPy releases without lazy submodule loading.
    from scipy.special import entr

    counts = count_bases(string)
    total = sum(counts)  # hoisted; was recomputed for every base
    # When total is 0 every count is 0, so the division below is never evaluated.
    frequencies = [x/total if x > 0 else 0.0 for x in counts]
    return(entr(frequencies).sum())

def kmer_complexity(string,kmer_size=10):
    """Summarise sequence complexity over all overlapping k-mer windows.

    The sequence is treated as linear (windows do not wrap around the end).
    Distinct k-mers are keyed by the raw substring, so counting is
    case-sensitive, whereas the per-window entropy is case-insensitive.

    Args:
        string: nucleotide sequence.
        kmer_size: window length in bases.

    Returns:
        A tuple ``(distinct_kmer_count, mean_window_entropy)``. When the
        sequence is shorter than ``kmer_size`` there are no windows and
        ``(0, 0.0)`` is returned (previously this raised ZeroDivisionError).
    """
    # A sequence of length n holds n - k + 1 windows of size k; the previous
    # bound of n - k silently skipped the final window.
    window_count = len(string) - kmer_size + 1
    if window_count <= 0:
        return(0,0.0)

    kmers = {}
    entropies = []
    for i in range(0,window_count):
        kmer = string[i:i+kmer_size]
        kmers[kmer] = kmers.get(kmer,0) + 1
        entropies.append(calc_entropy(kmer))
    return(len(kmers),float(sum(entropies))/float(len(entropies)))

# Annotate each sample-sheet row with statistics computed from its reference FASTA
# (length, GC fraction, distinct 10-mer count, mean 10-mer entropy) and write the
# result to output_sample_sheet. Both files are managed by `with`, so the output is
# flushed and closed even if a row fails to parse or a FASTA file is missing.
with open(sample_sheet) as file, open(output_sample_sheet,"w") as output_sheet:
    header = file.readline().strip("\n")
    # NOTE(review): the header keeps only the first three input columns, while each
    # data row below keeps every input column; the two agree only if the sample
    # sheet has exactly three populated columns -- confirm against the input file.
    output_sheet.write("\t".join(header.split("\t")[0:3]) + "\tbarcodeID\tlength\tgcPercentage\tkmerCount\taverageEntropy\n")
    for line in file:
        if not line.strip():
            # A blank (e.g. trailing) line carries no sample; int('') would crash.
            continue
        sp = line.strip("\n").split("\t")
        barcode = int(sp[0])
        name = sp[1]
        fasta = sp[2]
        with open(fasta) as fasta_file:
            # The first line is taken to be the ">" header and is not part of the
            # sequence. Assumes one record per file -- any further header lines
            # would be concatenated into the sequence.
            fasta_header = fasta_file.readline()
            # Join once instead of growing a string line by line (quadratic).
            fasta_seq = "".join(fasta_line.strip("\n") for fasta_line in fasta_file)
        fasta_length = len(fasta_seq)
        gc_pct = gc_content(fasta_seq)
        kmer_compl = kmer_complexity(fasta_seq)
        barcode_fasta_sizes[barcode] = (name,fasta_seq,fasta_length)
        annotations = ["barcode" + str(barcode).rjust(2, '0'), str(fasta_length), str(gc_pct)]
        annotations.extend(str(x) for x in kmer_compl)
        # Trailing tabs on the input row are dropped before the new columns are appended.
        output_sheet.write(line.strip("\n").strip("\t") + "\t" + "\t".join(annotations) + "\n")

In [40]:
# find the lengths of all the corrected reads
# /analysis/2021_07_12_PlasmidSeq_publication_96xr941/PlasmidSeq/pipelines/results/canu/01_canu_correct/reads.correctedReads.fasta.gz

import gzip
def _fasta_record_lengths(handle):
    """Yield the sequence length of each FASTA record read from ``handle``.

    Line terminators and surrounding whitespace are excluded from the length,
    and sequence lines belonging to the same record are summed, so wrapped and
    single-line FASTA give the same answer. Records with no sequence are skipped.
    """
    length = 0
    for line in handle:
        if line.startswith(">"):
            if length:
                yield length
            length = 0
        else:
            length += len(line.strip())
    if length:
        yield length

base_start = base_directory + "/pipelines/results/canu/"
base_end = "_canu_correct/reads.correctedReads.fasta.gz"

# One output row per corrected read; `with` guarantees the table is flushed and
# closed even if one of the per-barcode inputs is missing.
with open(output_read_lengths,"w") as filtered_read_output:
    filtered_read_output.write("barcode\twell\treadlength\n")
    for i in range (1,97):
        barcode_label = "barcode" + str(i).rjust(2, '0')
        filtered_reads = base_start + str(i).rjust(2, '0') + base_end
        with gzip.open(filtered_reads,'rt') as f:
            # Previously len(line) was written, which counted the trailing newline
            # and so over-reported every read length by one.
            for read_length in _fasta_record_lengths(f):
                filtered_read_output.write(str(i) + "\t" + barcode_label + "\t" + str(read_length) + "\n")
    

In [None]:
# find the lengths 