In [67]:
import os
import csv

def create_dataset_table(path_to_input_folder : str = None):

    '''
    Write the datasets.tsv table for a BiG-SLiCE input folder.

    path_to_input_folder:
        type: str
        Contains the relative path from the notebook to the input folder
        containing the metagenomic datasets and assemblies subfolders, the
        latter with annotated BGC sequences to be clustered.
    returns:
        False (after printing a message) when no path is given or the path
        is not an existing directory; None otherwise.
        As a side effect, a tsv file named datasets.tsv is written in the
        root directory of the input folder, with one row per dataset
        subfolder (sorted by name) and the columns:
        1. Dataset name (the subfolder name).
        2. Path to dataset folder (always "./").
        3. Path to taxonomy file (taxonomy/<dataset>_taxonomy.tsv).
        4. Description of the dataset (dataset_<dataset>).
        Errors raised while writing the file are printed, not propagated.
    '''

    if path_to_input_folder is None:
        print('You need to insert a path to the input folder')
        return False

    # isdir (rather than exists) also rejects a path that points to a plain
    # file, which would otherwise make os.listdir raise below.
    if not os.path.isdir(path_to_input_folder):
        print('You need to insert a valid path to the input folder')
        return False

    # Only real subfolders are datasets: skip the outputs generated by this
    # notebook (datasets.tsv, taxonomy) and any stray plain file. Sorting
    # makes the table reproducible, since os.listdir order is arbitrary.
    reserved_names = {"datasets.tsv", "taxonomy"}
    datasets = sorted(
        entry
        for entry in os.listdir(path_to_input_folder)
        if entry not in reserved_names
        and os.path.isdir(os.path.join(path_to_input_folder, entry))
    )

    # One list per output line.
    # NOTE(review): every row uses "./" as the dataset folder path; confirm
    # this matches the folder layout BiG-SLiCE is run with.
    output_list = [
        [dataset, "./", f"taxonomy/{dataset}_taxonomy.tsv", f"dataset_{dataset}"]
        for dataset in datasets
    ]

    output_tsv = os.path.join(path_to_input_folder, 'datasets.tsv')

    try:
        with open(output_tsv, 'w', newline='') as tsv_file:
            csv.writer(tsv_file, delimiter='\t').writerows(output_list)
    except OSError as e:
        print(f"Error: {e}")
 

### Description of create_dataset_table function

This function reads the names of the subfolders in the input directory (i.e., the metagenomic datasets) to create the `datasets.tsv` table needed to run [BiG-SLiCE](https://github.com/pereiramemo/bigslice), as described [here](https://github.com/medema-group/bigslice/wiki/Input-folder#datasetstsv).

In [155]:
import os
import csv
import re

def find_files(folder_path : str = None) -> list:

    '''
    Recursively collect the antiSMASH-style region GenBank files in a folder.

    folder_path:
        type: str
        Contains the relative path from the notebook to the input folder
    returns:
        A sorted list with the paths (folder_path joined with the relative
        location) of all the files whose name ends with .region<digits>.gbk.
        The list is empty when folder_path is None, does not exist, or
        contains no matching file.
    '''

    if folder_path is None:
        return []

    # The dot before "gbk" is escaped and the pattern is anchored at the end
    # of the name, so files such as "x.region001.gbk.bak" or
    # "x.region001Xgbk" are not picked up.
    pattern = re.compile(r"\.region\d+\.gbk\Z")
    matching_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(folder_path)
        for file in files
        if pattern.search(file)
    ]
    # os.walk order depends on the filesystem; sort for reproducible output.
    matching_files.sort()
    return matching_files
  
def create_taxonomy_tables(path_to_input_folder : str = None):

    '''
    Write one placeholder taxonomy table per dataset subfolder.

    path_to_input_folder:
        type: str
        Contains the relative path from the notebook to the input folder
        containing the metagenomic datasets and assemblies subfolders, the
        latter with annotated BGC sequences to be clustered.
    returns:
        False (after printing a message) when no path is given, the path is
        not an existing directory, or the taxonomy directory cannot be
        created; None otherwise.
        As a side effect, a tsv file per dataset subfolder, named
        <subfolder>_taxonomy.tsv, is written in a taxonomy directory created
        in the root directory of the input folder, having the columns:
        1. Sequence identification (GenBank file name without ".gbk").
        2. Taxonomy (always "NA").
        Errors raised while writing a file are printed, not propagated.
    '''

    if path_to_input_folder is None:
        print('You need to insert a path to the input folder')
        return False

    # isdir (rather than exists) also rejects a path that points to a plain
    # file, which would otherwise make os.listdir raise below.
    if not os.path.isdir(path_to_input_folder):
        print('You need to insert a valid path to the input folder')
        return False

    # Create output directory.
    taxonomy_dir = os.path.join(path_to_input_folder, "taxonomy")
    try:
        os.makedirs(taxonomy_dir, exist_ok=True)
    except OSError as e:
        print(f"Error: {e}")
        return False

    # Build a dictionary mapping each dataset subfolder to the rows of its
    # taxonomy file. Only real subfolders are datasets: the outputs generated
    # by this notebook (datasets.tsv, taxonomy) and plain files are skipped.
    reserved_names = {"datasets.tsv", "taxonomy"}
    extension = ".gbk"
    output_dict = dict()
    for subfolder in sorted(os.listdir(path_to_input_folder)):
        path_to_input_subfolder = os.path.join(path_to_input_folder, subfolder)
        if subfolder in reserved_names or not os.path.isdir(path_to_input_subfolder):
            continue
        rows = []
        for path_to_gbk_file in find_files(path_to_input_subfolder):
            gbk_file = os.path.basename(path_to_gbk_file)
            # Strip only the trailing extension; str.replace would also
            # remove any ".gbk" occurring earlier in the file name.
            if gbk_file.endswith(extension):
                gbk_file = gbk_file[:-len(extension)]
            rows.append([gbk_file, "NA"])
        output_dict[subfolder] = rows

    # Write one tsv file per dataset.
    for key, output_list in output_dict.items():
        output_tsv = os.path.join(taxonomy_dir, f'{key}_taxonomy.tsv')
        try:
            with open(output_tsv, 'w', newline='') as tsv_file:
                csv.writer(tsv_file, delimiter='\t').writerows(output_list)
        except OSError as e:
            print(f"Error: {e}")


### Description of create_taxonomy_tables function

This function reads the names of the subfolders in the input directory (i.e., the metagenomic datasets) and the GenBank files named with the format `*.region\d+.gbk`, as generated by [antiSMASH](https://github.com/antismash/antismash), in order to create the taxonomy files required to run [BiG-SLiCE](https://github.com/pereiramemo/bigslice), as described [here](https://github.com/medema-group/bigslice/wiki/Input-folder#datasetstsv). At the moment, the function only generates the files without any taxonomic information (the taxonomy column is filled with `NA`), since the taxonomy is analyzed independently of BiG-SLiCE.