In [4]:

import argparse
from Bio import SeqIO
import pysam
import csv
import pandas as pd

'''
Here we define all the subfunctions that we will be using within our function get_coverage
'''

'''
get_features
'''
def get_features(record, feature_name, qualifier_name):
   for feature in record.features:
       if feature.type == feature_name:
           return feature.qualifiers[qualifier_name][0]

'''
get_feature_location
'''
def get_feature_location(record, feature_name):
   for feature in record.features:
      if feature.type == feature_name:
         return [int(feature.location.start), int(feature.location.end)]

'''
gbk_parser
'''
def gbk_parser(input_gbk):
    acc2x = dict()  
    with open(input_gbk, "r") as gbk_handle:
        for record in SeqIO.parse(gbk_handle, "genbank"):
            acc = record.annotations['accessions'][0]
            bgc_class = get_features(record,"cand_cluster", "product")
            contg_edge = get_features(record,"cand_cluster", "contig_edge")  
            location = get_feature_location(record,"cand_cluster")
            acc2x[acc] = {"acc": acc, "bgc_class": bgc_class, "contg_edge": contg_edge, "start":location[0], "end":location[1]}
    return(acc2x)

'''
custom_coverage
'''
def custom_coverage(input_bam_handle, gbk_parser_out):
    coverage_dict = dict()
    seq_names = list(gbk_parser_out.keys())
    for i in seq_names:
        input_bam_cov = input_bam_handle.count_coverage(gbk_parser_out[i]["acc"], \
                                                        gbk_parser_out[i]["start"], \
                                                        gbk_parser_out[i]["end"],
                                                        quality_threshold = 0)
        n = 0
        for j in range (4):
            n += len([x for x in input_bam_cov[j] if x != 0])
            
        coverage_dict[i] = n/(gbk_parser_out[i]["end"] - gbk_parser_out[i]["start"]) 
    return(coverage_dict)

'''
Here we define the function get_coverage
'''
def get_coverage(input_gbk : str = None, input_bam : str = None, \
                 sample_name : str = None) -> pd.DataFrame:

    '''
    input_gbk:
        type: str 
        This a GenBank file with all the BGC sequences of a single samples 
        (contatenated in a single file) previously identified with antiSMASH. 
    input_bam:
        Thiese are the bam fies representing the alingments of paired end reads mapped 
        on the assembled data.
    sample_name:
        Sample name to be used in the output (tsv) table.   
    returns:
        A data frame with the following columns  
    '''

    '''
    1 Perse GBK
    '''

    gbk_parser_out = gbk_parser(input_gbk)
  
    '''
    2. Compute coverage
    '''

    input_bam_handle = pysam.AlignmentFile(input_bam, "rb")
    coverage_out = custom_coverage(input_bam_handle, gbk_parser_out)
    input_bam_handle.close()

    '''
    3. Format output
    '''

    output = dict()
    for i in gbk_parser_out.keys():
        gbk_parser_out[i]["coverage"] = coverage_out[i]
        gbk_parser_out[i]["sample"] = sample_name
        output[i] = list(gbk_parser_out[i].values())



NameError: name 'df' is not defined

### Description of create_dataset_table function

This function reads the names of the subfolders in the input directory (i.e., metagenomic datasets), to create the datasets.tsv tables needed to run [BiG-SLICE](https://github.com/pereiramemo/bigslice) as described [here](https://github.com/medema-group/bigslice/wiki/Input-folder#datasetstsv).

In [155]:
import os
import csv
import re

def find_files(folder_path : str = None) -> list:
  
    '''
    folder_path:
        type: str 
        Contains the relative path from the notebook to the input folder 
    returns:
        A list with all the files having named with the format *reion\d+.gbk    
        '''
  
    pattern = r".*\.region\d+.gbk"
    matching_files = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if re.search(pattern, file):
                matching_files.append(os.path.join(root, file))
    return matching_files
  
def create_taxonomy_tables(path_to_input_folder : str = None):
  
    '''
    path_to_data:
        type: str 
        Contains the relative path from the notebook to the input folder 
        conatining the metagenomic datasets and assemblies subfolders, the 
        latter with annotated BGC sequences to be clustered.
    returns:
        A tsv file per sample located in a taxonomy directory created in the 
        root directory of the input folder, having the columns: 
        1. Sequence identification.
        2. Taxonomy.
    '''
    
    if path_to_input_folder == None:
        print('You need to insert a path to the input folder')
        return False
    
    if not os.path.exists(path_to_input_folder):
        print('You need to insert a valid path to the input folder')
        return False
    
    '''
    Create output directory.
    '''
    
    taxomoy_dir = f"{path_to_input_folder}/taxonomy"
    if not os.path.exists(taxomoy_dir):
       os.makedirs(taxomoy_dir)
    
    '''
    Crete a dictionary of lists of lists containing the data do crate the taxonomy files. 
    Each element in the dictionary will be used to create a tsv file.
    '''
    
    output_dict = dict()  
    subfolders = os.listdir(path_to_input_folder)
    for subfolder in subfolders:
        if subfolder != "datasets.tsv" and subfolder != "taxonomy": 
            output_dict[subfolder] = list()
            path_to_input_subfolder = "/".join([path_to_input_folder, subfolder])
            path_to_gbk_files_list = find_files(path_to_input_subfolder)
            gbk_files = [x.split("/")[-1] for x in path_to_gbk_files_list]
            for gbk_file in gbk_files:
                gbk_file = gbk_file.replace(".gbk", '')
                output_dict[subfolder].append([gbk_file, "NA"])
    
    for key in output_dict:
        '''
        Define file name.
        '''
        output_list = output_dict[key]
        output_tsv = f'{taxomoy_dir}/{key}_taxonomy.tsv'
        '''
        Write outout tsv file
        ''' 
        try:
            with open(output_tsv, 'w', newline='') as tsv_file:
                tsv_writer = csv.writer(tsv_file, delimiter='\t')
                for row in output_list:
                     tsv_writer.writerow(row)  
        except OSError as e:
            print(f"Error: {e}")


### Description of create_dataset_table function

This function reads the names of the subfolders in the input directory (i.e., metagenomic datasets), and the GenBank files named with the format "\*.region\d+.gbk" as generated by [antiSMASH](https://github.com/antismash/antismash) in order the create the taxonomy files required to run [BiG-SLICE](https://github.com/pereiramemo/bigslice) as described [here](https://github.com/medema-group/bigslice/wiki/Input-folder#datasetstsv). At the moment, the function only generates the files without any taxonomic information, since the taxonomy is analyzed independently of BiG-SLICE.