In [109]:

import os
import re
from Bio import SeqIO
import pysam
import csv
import pandas as pd

def get_coverage(input_dir: str = None, input_bam: str = None,
                 sample_name: str = None) -> pd.DataFrame:
    """
    Compute a coverage value for every antiSMASH region GenBank file
    found below ``input_dir``.

    input_dir:
        type: str
        This is the output of antiSMASH, where the *.region*.gbk files
        (i.e., identified BGCs) can be found. It is searched recursively.
    input_bam:
        type: str
        Path to the BAM file with the alignments of the paired end reads
        mapped on the assembled data. It is opened with pysam and queried
        with ``count_coverage`` (presumably this needs a sorted, indexed
        BAM -- TODO confirm).
    sample_name:
        type: str
        Sample name copied into the "sample_name" column of every row.
    returns:
        A data frame with the following columns: "acc", "bgc_class",
        "on_edge", "start", "end", "coverage", "sample_name"; one row per
        region file. Rows are indexed by "<acc>-<n>", where n numbers the
        region files sharing the same accession (1, 2, ...) in sorted
        path order. Column dtypes are inferred from the values. When no
        region file is found, an empty data frame with these columns is
        returned and the BAM file is not opened.
    raises:
        ValueError if a region file contains no "cand_cluster" feature.
    """

    column_names = ["acc", "bgc_class", "on_edge", "start", "end",
                    "coverage", "sample_name"]

    # ---- Start: subfunctions used by get_coverage ----

    def find_files(input_dir: str = None) -> list:
        """Return the paths of all *.region<digits>.gbk files below input_dir."""
        # Escaped dots and the end anchor keep look-alikes such as
        # "x.region001_gbk" or "x.region001.gbk.bak" out of the result.
        pattern = re.compile(r"\.region\d+\.gbk$")
        matching_files = []
        for root, dirs, files in os.walk(input_dir):
            # Sorting in place makes os.walk descend in a fixed order, so
            # the row order and the "<acc>-<n>" numbering are reproducible.
            dirs.sort()
            for file_name in sorted(files):
                if pattern.search(file_name):
                    matching_files.append(os.path.join(root, file_name))
        return matching_files

    def get_first_feature(record, feature_name):
        """Return the first feature of type feature_name, or None."""
        for feature in record.features:
            if feature.type == feature_name:
                return feature
        return None

    def gbk_parser(paths2gbk):
        """Map "<acc>-<n>" to the BGC annotation read from each region file."""
        acc2x = {}
        regions_per_acc = {}
        for input_gbk in paths2gbk:
            with open(input_gbk, "r") as gbk_handle:
                record = SeqIO.read(gbk_handle, "genbank")
            acc = record.annotations["accessions"][0]
            # Only the first cand_cluster feature of the record is used.
            cluster = get_first_feature(record, "cand_cluster")
            if cluster is None:
                raise ValueError(
                    "no 'cand_cluster' feature found in " + str(input_gbk))
            # Number the regions per accession: several region files can
            # share one contig accession and must not overwrite each other.
            regions_per_acc[acc] = regions_per_acc.get(acc, 0) + 1
            acc_i = acc + "-" + str(regions_per_acc[acc])
            # NOTE(review): start/end are taken as written in the region
            # file; if antiSMASH stores them relative to the excised region
            # rather than to the full contig, the BAM lookup below needs
            # the original contig coordinates instead -- confirm.
            acc2x[acc_i] = {
                "acc": acc,
                "bgc_class": cluster.qualifiers["product"][0],
                "on_edge": cluster.qualifiers["contig_edge"][0],
                "start": int(cluster.location.start),
                "end": int(cluster.location.end),
            }
        return acc2x

    def custom_coverage(input_bam_handle, gbk_parser_out):
        """Return {key: coverage value} for every parsed region."""
        coverage_dict = {}
        for key, region in gbk_parser_out.items():
            length = region["end"] - region["start"]
            if length <= 0:
                # Empty interval: the ratio is undefined, avoid dividing by 0.
                coverage_dict[key] = float("nan")
                continue
            # count_coverage yields four per-nucleotide depth arrays
            # (see the pysam documentation).
            per_base_depth = input_bam_handle.count_coverage(
                region["acc"], region["start"], region["end"],
                quality_threshold=0)
            # NOTE(review): this counts (position, base) pairs with
            # non-zero depth, so a position covered by reads showing two
            # different bases counts twice and the value can range from
            # 0 to 4. Kept as in the original; confirm whether breadth of
            # coverage or mean depth was intended.
            n = sum(1 for base_counts in per_base_depth
                    for depth in base_counts if depth != 0)
            coverage_dict[key] = n / length
        return coverage_dict

    # ---- End: subfunctions used by get_coverage ----

    # 1. Find all input gbk files
    paths2gbk = find_files(input_dir)
    if not paths2gbk:
        # Nothing to measure: return the empty table without opening the BAM.
        return pd.DataFrame(index=[], columns=column_names)

    # 2. Parse GBK
    gbk_parser_out = gbk_parser(paths2gbk)

    # 3. Compute coverage (the with-block closes the BAM even on errors)
    with pysam.AlignmentFile(input_bam, "rb") as input_bam_handle:
        coverage_out = custom_coverage(input_bam_handle, gbk_parser_out)

    # 4. Add coverage values and the sample name to every row
    for key, region in gbk_parser_out.items():
        region["coverage"] = coverage_out[key]
        region["sample_name"] = sample_name

    # 5. Convert dictionary to data frame; cells are looked up by column
    #    name so the result does not depend on dict insertion order.
    rows = [[region[column] for column in column_names]
            for region in gbk_parser_out.values()]
    return pd.DataFrame(rows, index=list(gbk_parser_out),
                        columns=column_names)

### Description of get_coverage function

This function computes the coverage of all BGCs identified by antiSMASH. It performs the following tasks: 1) Searches for the \*region\*.gbk files within the input folder; 2) Extracts the accession of the contig in which the BGC was identified, and the BGC class, contig-edge flag, and start and end coordinates from the features section; 3) Based on the accession id and the start and end positions, it computes the BGC coverage; 4) It outputs a data frame object with the columns "acc", "bgc_class", "on_edge", "start", "end", "coverage", "sample_name", where each row corresponds to an identified BGC.