In [2]:
import os

def calculate_gc_content(file_path, *, filter_valid_bases=False):
    """Return the GC content of a sequence text file as a percentage.

    Each line is stripped of leading/trailing whitespace and upper-cased.
    The number of ``G`` and ``C`` characters is then divided by the number
    of characters counted.

    Args:
        file_path: Path of the text file to read.
        filter_valid_bases: When true, characters other than ``G``, ``C``,
            ``T`` and ``A`` are discarded before counting, so they do not
            contribute to the total.  The default (false) counts every
            remaining character on every line, whatever it is.

    Returns:
        The GC percentage as a float, ``0.0`` when no characters were
        counted, or ``None`` when the file does not exist (a message is
        printed in that case).
    """
    valid_bases = frozenset('GCTA')
    gc_count = 0
    total_bases = 0

    try:
        with open(file_path, 'r') as f:
            for line in f:
                sequence = line.strip().upper()

                if filter_valid_bases:
                    sequence = ''.join(
                        base for base in sequence if base in valid_bases
                    )

                gc_count += sequence.count('G') + sequence.count('C')
                total_bases += len(sequence)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None

    if total_bases == 0:
        # Nothing was counted; avoid division by zero.
        return 0.0

    return (gc_count / total_bases) * 100

def main(directory="./organism_sequences_txt"):
    """Print the GC content of every ``.txt`` file in *directory*.

    Args:
        directory: Folder to scan.  Defaults to the directory containing
            all the ncrna.txt and cds.txt files.

    Files for which ``calculate_gc_content`` returns ``None`` are left out
    of the report.  If *directory* itself does not exist, a message is
    printed and nothing else is done.
    """
    try:
        filenames = os.listdir(directory)
    except FileNotFoundError:
        print(f"Directory not found: {directory}")
        return

    gc_contents = {}

    for filename in filenames:
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory, filename)
        gc_content = calculate_gc_content(file_path)

        if gc_content is not None:
            gc_contents[filename] = gc_content

    # Report only after every file has been processed.
    for filename, gc_content in gc_contents.items():
        print(f"{filename}: GC content: {gc_content:.2f}%")

# Run the report only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

caenorhabditis_elegans_cds.txt: GC content: 43.47%
caenorhabditis_elegans_ncrna.txt: GC content: 40.62%
candidatus_nitrosopelagicus_cds.txt: GC content: 33.36%
candidatus_nitrosopelagicus_ncrna.txt: GC content: 50.33%
Danio_rerio_cds.txt: GC content: 49.75%
Danio_rerio_ncrna.txt: GC content: 42.29%
Dictyostelium_discoideum_cds.txt: GC content: 27.40%
Dictyostelium_discoideum_ncrna.txt: GC content: 48.43%
escherichia_coli_cds.txt: GC content: 51.44%
escherichia_coli_ncrna.txt: GC content: 48.99%
escherichia_coli_rev_cds.txt: GC content: 51.44%
escherichia_coli_rev_ncrna.txt: GC content: 48.99%
Gallus_gallus_cds.txt: GC content: 50.12%
Gallus_gallus_ncrna.txt: GC content: 43.89%
halobacterium_salinarum_cds.txt: GC content: 66.97%
halobacterium_salinarum_ncrna.txt: GC content: 59.33%
ignicoccus_hospitalis_cds.txt: GC content: 56.53%
ignicoccus_hospitalis_ncrna.txt: GC content: 69.53%
methanosarcina_acetivorans_cds.txt: GC content: 45.16%
methanosarcina_acetivorans_ncrna.txt: GC content: 5