In [2]:
# Parse the FASTA file into `sequences`, a dict that maps each header line
# (kept whole, including the leading '>') to a record dict whose 'seq' entry
# holds the concatenated sequence text of that record.
sequences = {}

# Sequence lines are gathered per record and joined once at the end, which
# avoids re-copying an ever-growing string for every line of a long record.
seq_chunks = {}
current_chunks = None  # chunk list of the record being read; None before the first header

# The `with` block guarantees the file is closed even if parsing fails.
with open("FinalProject.fasta", 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines carry no sequence data.
            continue
        if line.startswith('>'):
            # A new header starts a new record (a repeated header restarts it).
            current_chunks = seq_chunks[line] = []
        elif current_chunks is None:
            # Fail with a clear message instead of a NameError (or, in a
            # re-run notebook, silently appending to a stale record).
            raise ValueError(
                "FinalProject.fasta: sequence data found before the first '>' header"
            )
        else:
            current_chunks.append(line)

for seq_id, chunks in seq_chunks.items():
    sequences[seq_id] = {'seq': ''.join(chunks)}

# Display number of sequences
print("Number of sequences in the file:", len(sequences), end="\n\n")


def _split_exons_introns(sequence):
    """Split a sequence into exons (uppercase runs) and introns (lowercase runs).

    Lowercase text before the first exon or after the last exon is treated as
    flanking sequence and is not reported as an intron. Characters that are
    neither uppercase nor lowercase letters are ignored.

    Returns a tuple ``(exons, introns)`` of lists of strings.
    """
    exons, introns = [], []
    exon, intron = '', ''
    for base in sequence:
        if base.isupper():
            if intron:
                # An uppercase base ends the intron collected so far.
                introns.append(intron)
                intron = ''
            exon += base
        elif base.islower() and (exons or exon):
            # Only count lowercase runs once at least one exon has started.
            if exon:
                exons.append(exon)
                exon = ''
            intron += base
    if exon:
        # Keep an exon that runs to the very end of the sequence.
        exons.append(exon)
    return exons, introns


def _average_length(segments):
    """Return the mean segment length rounded to 2 places, or 0.0 if there are none."""
    if not segments:
        return 0.0
    return round(sum(len(segment) for segment in segments) / len(segments), 2)


def _base_percentages(segments, bases):
    """Return {base: percentage of all characters in `segments`}, rounded to 2 places.

    Every percentage is 0.0 when `segments` holds no characters.
    """
    joined = ''.join(segments)
    total = len(joined)
    if not total:
        return {base: 0.0 for base in bases}
    return {base: round((joined.count(base) / total) * 100, 2) for base in bases}


# Annotate every record with its exon/intron lists and summary statistics.
for seq in sequences.values():
    exons, introns = _split_exons_introns(seq['seq'])
    seq['exons'] = exons
    seq['introns'] = introns
    seq['AvgExonLength'] = _average_length(exons)
    seq['AvgIntronLeng'] = _average_length(introns)

    # Exon bases are uppercase and intron bases lowercase, so each group is
    # counted with the matching letter case.
    exon_pct = _base_percentages(exons, 'ACGT')
    intron_pct = _base_percentages(introns, 'acgt')
    for base in 'ACGT':
        seq[base + '_in_Exon'] = exon_pct[base]
    for base in 'ACGT':
        seq[base + '_in_Intron'] = intron_pct[base.lower()]

# Column titles shared by the console table and the output file
headers = [
    "SequenceID", "#Exons", "#Introns", "AvgExonLength", "AvgIntronLeng",
    "%A in Exon", "%C In Exon", "%G In Exon", "%T In Exon",
    "%A in Intron", "%C In Intron", "%G In Intron", "%T In Intron",
]

# One left-aligned, 15-character-wide slot per column
row_template = "{:<15}" * len(headers)

# Record keys of the statistic columns, in the order they are displayed
stat_keys = (
    'AvgExonLength', 'AvgIntronLeng',
    'A_in_Exon', 'C_in_Exon', 'G_in_Exon', 'T_in_Exon',
    'A_in_Intron', 'C_in_Intron', 'G_in_Intron', 'T_in_Intron',
)

# Title row first, then one row per sequence record
print(row_template.format(*headers))
for seq_id, seq in sequences.items():
    row = [seq_id, len(seq['exons']), len(seq['introns'])]
    row.extend(seq[key] for key in stat_keys)
    print(row_template.format(*row))

# Write the statistics to DNAstats.txt as tab-separated text: one header row
# followed by one row per sequence. The `with` block closes (and flushes) the
# file even if a write fails part-way through.
with open('DNAstats.txt', 'w') as output:
    # Write header
    output.write('\t'.join(headers) + "\n")
    # Loop through data stored in sequences dictionary
    # and write the data to the file
    for seq_id, seq in sequences.items():
        fields = [
            seq_id, len(seq['exons']), len(seq['introns']),
            seq['AvgExonLength'], seq['AvgIntronLeng'],
            seq['A_in_Exon'], seq['C_in_Exon'], seq['G_in_Exon'], seq['T_in_Exon'],
            seq['A_in_Intron'], seq['C_in_Intron'], seq['G_in_Intron'], seq['T_in_Intron'],
        ]
        output.write('\t'.join(map(str, fields)) + "\n")

# Reported only after the file has been closed, i.e. fully written to disk.
print("\n{} file is created.".format(output.name))

Number of sequences in the file: 6

SequenceID     #Exons         #Introns       AvgExonLength  AvgIntronLeng  %A in Exon     %C In Exon     %G In Exon     %T In Exon     %A in Intron   %C In Intron   %G In Intron   %T In Intron   
>HCN1          8              7              1241.62        61642.86       30.25          20.22          20.0           29.53          32.1           16.94          17.69          33.27          
>HCN2          8              7              427.5          3408.43        15.12          38.71          32.11          14.06          17.53          28.71          33.35          20.42          
>KCNA1         2              1              1309.0         4062.0         27.73          24.41          21.28          26.59          28.39          20.83          20.7           30.08          
>KCNA2         5              4              391.8          8984.0         24.4           26.9           24.2           24.5           25.49          23.22          23.41          