This code extracts the two files I will work with (blast_tile & TIGER_gff) from each folder, copies them into a new folder that contains only these two files, and then converts the GFF and BLAST tile files to CSV files.

In [5]:
import os
import shutil
import csv

# Parent directory containing the folders (10 expected) to scan for input files.
# This is a relative path, so it is resolved against the current working directory.
parent_directory = 'Coli_2'

In [7]:
# Exact names of the files to extract from each folder.
# Matching is done on the whole file name and is case-sensitive.
filenames = ['genome.IS.nonoverlap.gff', 'genome.IS.blast.tile']

In [8]:
# Destination folder that receives the extracted files (relative path).
new_folder_path = 'test_jobs'
# Create the destination if it is missing; exist_ok=True makes re-running
# this cell safe when the folder already exists.
os.makedirs(new_folder_path, exist_ok=True)

In [9]:
# Walk the immediate children of the parent directory and copy the wanted
# files of each subfolder into "<new_folder_path>/<folder_name>_files".
for folder_name in os.listdir(parent_directory):
    folder_path = os.path.join(parent_directory, folder_name)

    # Plain files sitting directly in the parent directory are ignored.
    if not os.path.isdir(folder_path):
        continue

    print(f"Processing folder {folder_name}")

    # Inspect every entry of the subfolder (non-recursive).
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Skip anything that is not one of the requested file names ...
        if file_name not in filenames:
            continue
        # ... or that carries a requested name but is not a regular file.
        if not os.path.isfile(file_path):
            continue

        # The per-folder output directory is only created once a match is
        # found, so folders without matching files leave no empty directory.
        output_folder_path = os.path.join(new_folder_path, f"{folder_name}_files")
        os.makedirs(output_folder_path, exist_ok=True)

        # Copy the file under its original name into the output directory.
        output_file_path = os.path.join(output_folder_path, file_name)
        shutil.copy(file_path, output_file_path)
# Loop through all subfolders recursively
# Convert every .gff / .tile file found anywhere under new_folder_path into a
# CSV file written next to it (same base name, '.csv' extension). Only the
# first 9 columns of each data line are kept, under a fixed header row.
for root, _dirs, files in os.walk(new_folder_path):
    # Loop through each file in the current subfolder
    for file_name in files:
        # Only GFF files and tile files are converted; skip everything else
        if not file_name.endswith(('.gff', '.tile')):
            continue

        # Create the output CSV file name
        csv_file_name = os.path.splitext(file_name)[0] + '.csv'

        # Open the input and output files
        with open(os.path.join(root, file_name), 'r') as input_file, \
                open(os.path.join(root, csv_file_name), 'w', newline='') as output_file:
            csv_writer = csv.writer(output_file)

            # Write the header row to the CSV file
            csv_writer.writerow(['contig_name', 'software', 'IS', 'LCOR', 'RCOR', 'supporting', 'orientation1', 'orientation2', 'INFO'])

            for line in input_file:
                # Skip comments and blank lines
                if line.startswith('#') or line.strip() == '':
                    continue

                if '\t' in line:
                    # Tab-delimited input (the GFF convention): split on tabs
                    # only, so a field that contains spaces (typically the
                    # 9th attribute column) is not cut at its first space and
                    # empty fields keep their column position.
                    fields = [field.strip() for field in line.rstrip('\r\n').split('\t')]
                else:
                    # No tabs on this line: fall back to splitting on any
                    # run of whitespace.
                    fields = line.split()

                # Keep only the first 9 columns, matching the header width
                csv_writer.writerow(fields[:9])

        print(f'Conversion of {file_name} complete!')



Processing folder Eco1001
Processing folder Eco1002
Processing folder Eco1004
Processing folder Eco10055
Processing folder Eco1007
Processing folder Eco10072
Conversion of genome.IS.blast.tile complete!
Conversion of genome.IS.nonoverlap.gff complete!
Conversion of genome.IS.blast.tile complete!
Conversion of genome.IS.nonoverlap.gff complete!
Conversion of genome.IS.blast.tile complete!
Conversion of genome.IS.nonoverlap.gff complete!
Conversion of genome.IS.blast.tile complete!
Conversion of genome.IS.nonoverlap.gff complete!
Conversion of genome.IS.blast.tile complete!
Conversion of genome.IS.nonoverlap.gff complete!
Conversion of genome.IS.blast.tile complete!
Conversion of genome.IS.nonoverlap.gff complete!
