In [1]:

# Parse the FASTA file into {sequence_id: {'seq': <concatenated sequence text>}}.
# A header line starts with '>' and names the record; every following
# non-blank line, up to the next header, is part of that record's sequence.
sequences = {}
seq_chunks = {}  # sequence ID -> list of its sequence lines, joined once at the end
seq_id = None    # ID of the record being read; None until the first header is seen

# The context manager closes the file even if parsing raises.
with open("FinalProject.fasta", 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines carry no sequence data.
            continue
        if line.startswith('>'):
            # Drop only the leading marker so a '>' inside the ID is preserved.
            seq_id = line[1:]
            # A repeated ID restarts its record (the later record wins).
            seq_chunks[seq_id] = []
        elif seq_id is None:
            # Fail loudly instead of attaching the data to an undefined/stale ID.
            raise ValueError("FASTA file contains sequence data before the first '>' header line")
        else:
            seq_chunks[seq_id].append(line)

for record_id, chunks in seq_chunks.items():
    sequences[record_id] = {'seq': ''.join(chunks)}

print("Number of sequences in the file:", len(sequences), end="\n\n")


def _split_exons_introns(sequence):
    """Split a mixed-case sequence into exons, introns and the leading region.

    Runs of uppercase characters are exons. Runs of any other characters are
    introns when they lie between two exons. A leading non-uppercase run is
    returned separately as the region before the first exon, and a trailing
    non-uppercase run (after the last exon) is not counted as an intron.

    Returns a tuple ``(exons, introns, seq_before_exon)`` where the first two
    are lists of strings and the last is a string (empty if the sequence
    starts with an exon).
    """
    from itertools import groupby

    # Consecutive characters with the same "is uppercase" flag form one run.
    runs = [(is_upper, ''.join(chars))
            for is_upper, chars in groupby(sequence, key=str.isupper)]

    seq_before_exon = runs[0][1] if runs and not runs[0][0] else ''
    # Every uppercase run is an exon, including one that ends the sequence.
    exons = [text for is_upper, text in runs if is_upper]
    # Runs alternate, so a non-uppercase run that is neither first nor last
    # is necessarily flanked by exons on both sides.
    introns = [text for index, (is_upper, text) in enumerate(runs)
               if not is_upper and 0 < index < len(runs) - 1]
    return exons, introns, seq_before_exon


def _mean_length(segments):
    """Return the mean length of the strings in *segments*, rounded to 2 places.

    Returns 0.0 for an empty list (e.g. a single-exon record has no introns).
    """
    if not segments:
        return 0.0
    return round(sum(len(segment) for segment in segments) / len(segments), 2)


def _percent(text, base):
    """Return how often *base* occurs in *text*, as a percentage rounded to 2 places.

    Returns 0.0 for empty *text* instead of dividing by zero.
    """
    if not text:
        return 0.0
    return round((text.count(base) / len(text)) * 100, 2)


# Annotate every record with its exons/introns and the derived statistics.
for seq in sequences.values():
    exons, introns, seq_before_exon = _split_exons_introns(seq['seq'])
    exon_bases = ''.join(exons)      # joined once instead of once per statistic
    intron_bases = ''.join(introns)

    seq['exons'] = exons
    seq['introns'] = introns
    seq['seq_before_exon'] = seq_before_exon
    seq['AvgExonLength'] = _mean_length(exons)
    seq['AvgIntronLeng'] = _mean_length(introns)
    # Exon bases are uppercase, intron bases lowercase.
    for base in 'ACGT':
        seq[base + '_in_Exon'] = _percent(exon_bases, base)
    for base in 'ACGT':
        seq[base + '_in_Intron'] = _percent(intron_bases, base.lower())


# Column titles, shared by the console table and the tab-separated file.
headers = ["SequenceID", "#Exons", "#Introns", "AvgExonLength", "AvgIntronLeng", "%A in Exon",
           "%C In Exon", "%G In Exon", "%T In Exon", "%A in Intron", "%C In Intron", "%G In Intron",
           "%T In Intron"]

# Keys of the per-sequence statistics, in the column order that follows
# SequenceID / #Exons / #Introns.
stat_keys = ['AvgExonLength', 'AvgIntronLeng',
             'A_in_Exon', 'C_in_Exon', 'G_in_Exon', 'T_in_Exon',
             'A_in_Intron', 'C_in_Intron', 'G_in_Intron', 'T_in_Intron']

# Build each row once so the console table and the file cannot drift apart.
rows = [[seq_id, len(seq['exons']), len(seq['introns'])] + [seq[key] for key in stat_keys]
        for seq_id, seq in sequences.items()]

# Console output: every column left-aligned in a 15-character field.
row_format = "{:<15}" * len(headers)
print(row_format.format(*headers))
for row in rows:
    print(row_format.format(*row))

# File output: tab-separated values with a header line. The context manager
# closes (and flushes) the file even if a write fails.
with open('DNAstats.txt', 'w') as output:
    output.write('\t'.join(headers) + "\n")
    for row in rows:
        output.write('\t'.join(map(str, row)) + "\n")

# Reported only after the file has been closed successfully.
print("\n{} file is created.".format(output.name), end="\n\n")



# Ask for a sequence ID and show its statistics from DNAstats.txt, asking
# again until the ID matches a row (the comparison ignores case).
seq_id = input("Enter ID of sequence: ")

# Read the whole file up front; the context manager closes it right away.
with open('DNAstats.txt', 'r') as sequence_file:
    sequence_data = sequence_file.readlines()

# The first line holds the column titles; they label the printed values.
labels = sequence_data[0].rstrip('\n').split('\t') if sequence_data else []

# Index the data rows by lowercased ID for the case-insensitive lookup.
records = {}
for row in sequence_data[1:]:
    if not row.strip():
        continue
    # Strip the newline so the last field does not print a stray blank line.
    fields = row.rstrip('\n').split('\t')
    records[fields[0].lower()] = fields

while seq_id.lower() not in records:
    print(f"\n{seq_id} is NOT there in the file.\n")
    seq_id = input("Enter another Sequence ID: ")

data = records[seq_id.lower()]

print("\nStatistics of {}:".format(seq_id))
for label, value in zip(labels, data):
    print(label + ":", value)

# Continue with the ID exactly as stored in the file so that later lookups
# keyed by the original ID succeed even if the user typed a different case.
seq_id = data[0]


# RNA codon -> one-letter amino-acid code; '_' marks the three stop codons.
# The table is written grouped by residue and then inverted into the
# codon-keyed lookup used for translation.
_codons_by_residue = {
    'I': 'AUA AUC AUU',
    'M': 'AUG',
    'T': 'ACA ACC ACG ACU',
    'N': 'AAC AAU',
    'K': 'AAA AAG',
    'S': 'AGC AGU UCA UCC UCG UCU',
    'R': 'AGA AGG CGA CGC CGG CGU',
    'L': 'CUA CUC CUG CUU UUA UUG',
    'P': 'CCA CCC CCG CCU',
    'H': 'CAC CAU',
    'Q': 'CAA CAG',
    'V': 'GUA GUC GUG GUU',
    'A': 'GCA GCC GCG GCU',
    'D': 'GAC GAU',
    'E': 'GAA GAG',
    'G': 'GGA GGC GGG GGU',
    'F': 'UUC UUU',
    'Y': 'UAC UAU',
    '_': 'UAA UAG UGA',
    'C': 'UGC UGU',
    'W': 'UGG',
}

codon_table = {}
for _residue, _codons in _codons_by_residue.items():
    for _codon in _codons.split():
        codon_table[_codon] = _residue


# Transcribe: join the chosen sequence's exons and swap every T for U.
coding_dna = ''.join(sequences[seq_id]['exons'])
rna_string = coding_dna.replace('T', 'U')
print("\nmRNA Sequence:", rna_string, end="\n\n")

# Translate: read the mRNA in consecutive, non-overlapping triplets starting
# at position 0. Triplets missing from the codon table (for example an
# incomplete one at the very end) contribute nothing to the protein.
triplets = (rna_string[start:start + 3] for start in range(0, len(rna_string), 3))
protein_string = ''.join(codon_table[triplet] for triplet in triplets if triplet in codon_table)
print("\nProtein Sequence:", protein_string, end="\n\n")



# Melting temperature of every 20-base window in the region that precedes
# the first exon of the chosen sequence.
seq_before_exon = sequences[seq_id]['seq_before_exon']

window_size = 20

# One value per window position: 2 for each a/t plus 4 for each c/g.
# Only lowercase letters are counted.
Tm = []
for start in range(len(seq_before_exon) - window_size + 1):
    window = seq_before_exon[start:start + window_size]
    at_count = window.count('a') + window.count('t')
    gc_count = window.count('c') + window.count('g')
    Tm.append(at_count * 2 + gc_count * 4)

if Tm:
    print("\nMelting Temperature Range: {}°C — {}°C".format(min(Tm), max(Tm)))
    print("\nAverage Melting Temperature: {}°C".format(round(sum(Tm)/len(Tm),2)))
else:
    # Fewer than window_size bases upstream: there is no window to score, so
    # report that instead of failing in min()/max() on an empty list.
    print("\nMelting temperature cannot be computed: the sequence before the "
          "first exon is shorter than {} bases.".format(window_size))

Number of sequences in the file: 6

SequenceID     #Exons         #Introns       AvgExonLength  AvgIntronLeng  %A in Exon     %C In Exon     %G In Exon     %T In Exon     %A in Intron   %C In Intron   %G In Intron   %T In Intron   
HCN1           8              7              1241.62        61642.86       30.25          20.22          20.0           29.53          32.1           16.94          17.69          33.27          
HCN2           8              7              427.5          3408.43        15.12          38.71          32.11          14.06          17.53          28.71          33.35          20.42          
KCNA1          2              1              1309.0         4062.0         27.73          24.41          21.28          26.59          28.39          20.83          20.7           30.08          
KCNA2          5              4              391.8          8984.0         24.4           26.9           24.2           24.5           25.49          23.22          23.41          