In [1]:
def parseFASTA(fasta_path):
    """Parse a FASTA file into a dictionary of sequences.

    A record starts with a header line beginning with '>'. The sequence ID is
    the first whitespace-delimited token after the '>'; the rest of the header
    is ignored. Every following line, up to the next header, is stripped of
    surrounding whitespace and concatenated to form the record's sequence.

    Lines that appear before the first header are discarded, and a file with
    no header lines yields an empty dictionary. If the same ID appears more
    than once, the last record with that ID overwrites the earlier ones.

    @param fasta_path: Path to the FASTA file.
    @return: Dictionary with sequence IDs as keys and sequences as values in
        string format.
    """
    sequences = {}

    with open(fasta_path, 'r') as fasta_file:
        seq_id = None
        seq_lines = []

        for line in fasta_file:
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if seq_id is not None:
                    sequences[seq_id] = ''.join(seq_lines)
                # A bare '>' header has no tokens at all; use an empty ID
                # instead of failing with IndexError on the [0] lookup.
                header_tokens = line[1:].split(None, 1)
                seq_id = header_tokens[0] if header_tokens else ''
                seq_lines = []
            elif line and seq_id is not None:
                # Blank lines and text before the first header contribute
                # nothing to any record, so they are skipped outright.
                seq_lines.append(line)
        if seq_id is not None:
            sequences[seq_id] = ''.join(seq_lines)  # Add the last sequence
    return sequences

In [2]:
def gc(sequence):
    """Calculate the GC content of a DNA sequence.

    Counting is case-insensitive, so lowercase bases ('g', 'c') are included.
    The percentage is taken over the full length of the sequence, whatever
    characters it contains.

    @param sequence: DNA sequence in string format.
    @return: GC content as a percentage (float). An empty sequence has no
        bases to count and returns 0.0 rather than raising
        ZeroDivisionError.
    """
    if len(sequence) == 0:
        return 0.0

    bases = sequence.upper()
    gc_count = bases.count('G') + bases.count('C')
    return (gc_count / len(sequence)) * 100

In [3]:
# Obtaining the sequence id with the highest GC content
fasta_path = '../data/rosalind_gc.txt'

sequences = parseFASTA(fasta_path)
if not sequences:
    # Without any record there is no "highest" entry to report.
    raise ValueError(f"No FASTA records found in {fasta_path!r}")

# Compute every GC content exactly once instead of re-evaluating the current
# best record on each comparison.
gc_by_id = {seq_id: gc(sequence) for seq_id, sequence in sequences.items()}

# max() returns the first key holding the maximal value, so ties resolve to
# the earliest record in the file, the same as a strict '>' comparison loop.
highest_gc_id = max(gc_by_id, key=gc_by_id.get)

print(highest_gc_id)
print(gc_by_id[highest_gc_id])

Rosalind_4947
53.67647058823529
