In [None]:
pip install Bio

Collecting Bio
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.1-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading bio-1.7.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.0/281.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.wh

In [None]:
import pandas as pd
import Bio
from Bio import SeqIO
import gzip
import io
import math

from collections import Counter


In [None]:
def fasta_file_to_txt(file_name,txt_file):
    df=pd.read_csv(file_name)
    colum=df.columns[0]
    merged_text=' '.join(df[colum])
    with open(txt_file,'w') as file:
        file.write(merged_text)


In [None]:
def open_reading_frame(file_name,file_type,orf_file):
    record = SeqIO.read(file_name, file_type)
    min_pro_len = 100
    results = []
    orfs=[]
    for strand, nuc in [(+1, record.seq), (-1, record.seq.reverse_complement())]:
         for frame in range(3):
             length = 3 * ((len(record) - frame) // 3)
             for pro in nuc[frame : frame + length].split("*"):
                 if len(pro) >= min_pro_len:
                     result=(
                         "%s...%s - length %i, strand %i, frame %i  "
                         % (pro[:30], pro[-3:], len(pro), strand, frame)
                     )
                     orfs.append(str(pro))
                     results.append(result)
    with open(orf_file,'w') as file:
        for line in str(orfs):
            file.write(line)

    return orf_file

In [None]:
file_name='GCF_000441575.1_ASM44157v1_genomic.fna'
orf_file='Candidatus_Carsonella_ruddii_DC.txt'
open_reading_frame(file_name,'fasta',orf_file)

'Candidatus_Carsonella_ruddii_DC.txt'

In [None]:
genome_sequence=[]
with open(orf_file,'r') as file:
  genome_sequence=file.read()

In [None]:
#splitting introns that start with 'GT' and end with 'AG'
def intron_split(genome_sequence):
    introns = []
    i = 0
    while i < len(genome_sequence):
        if genome_sequence[i:i+2] == 'GT':
            j = i + 2
            while j < len(genome_sequence):
                if genome_sequence[j:j+2] == 'AG':
                    introns.append(genome_sequence[i:j+2])
                    i = j + 2
                    break
                j += 1
            else:
                i = j
        else:
          i+=1
    return introns

introns = intron_split(genome_sequence)

print("Introns:", introns[:20])
print(len(introns))


Introns: ['GTTGCAAAAG', 'GTGTTAG', 'GTGTTTTTCAAATCACCTAAATCATTAACAG', 'GTGAAG', 'GTTATCGAG', 'GTAATTTATATTTGTCAACAATAATAATGGAATTTTTGATTAAG', 'GTAAAATTTCTTTAATGGAATGTGAAATGATTAATAATAAAATTATTTATAATAATGAG', 'GTTTAAG', 'GTATAATTAAAAATTCAAG', 'GTTTAG', 'GTTATTAAG', 'GTAG', 'GTTGGGAAATCAACTTTATTTAATAAATTATGTTTACAATATGATTCTATTGTAACAAATATACCTGGAACAACAACAAATACGATTACTAAACAAATATATTTTACTTCAAAAACAATTAATTTAAACGATACAG', 'GTTTAAAAATAAAAACAAAAAATTTAATAG', 'GTATAATGAAAAATATCAATAAATCTTACGAAG', 'GTTCTTTATATTATTGATAAATTTGATTTAAG', 'GTATTTTATAATACTCCATTAG', 'GTTAATAAATGTGATATTTTTGGAATAAAAG', 'GTAAAAAATTTATTTGTTATATTTTTATCTGCTAAACATAG', 'GTTTTAATTTTTAG']
15171


In [None]:
#splitting exons
def exon_split(genome_sequence):
    exons = []
    i = 0
    while i < len(genome_sequence):
        if genome_sequence[i:i+2] == 'GT':
            j = i + 2
            while j < len(genome_sequence):
                if genome_sequence[j:j+2] == 'AG':
                    i = j + 2
                    break
                j += 1
            else:
                exons.append(genome_sequence[i:])
                break
        else:
            exon_start = i
            while i < len(genome_sequence) and genome_sequence[i:i+2] != 'GT':
                i += 1
            exons.append(genome_sequence[exon_start:i])

        i += 1

    return exons

exons=exon_split(genome_sequence)
print('exons: ',exons[:20])
print(len(exons))

exons:  ["['ATGAAAAATATTATT", 'TTGCAAAA', 'TTACTCCTGATGATTTAACATCAATTTGCATTATTAGATTATCTGGAAAAAAATTAAGAAAATTTATAAAACCTTTAATTAAAAAAAAATTAAAGATTCAAAAATTAGAATATACAAAATTATATGGATTAAATAATCAATTTATAGACTTT', 'T', 'TTA', 'TA', 'T', 'TTTTTCAAATCACCTAAATCATTAACAG', 'TGAAGAT', 'TTATCGA', 'TTTCATTTACACG', 'TAATTTATATTT', 'TCAACAATAATAATGGAATTTTTGATTAA', 'TTAGGAGCTAAAATTGCAAAACCAGGGGAATTTTTAGAGAGAAGATATTTGAATG', 'TAAAATTTCTTTAATGGAAT', 'TGAAATGATTAATAATAAAATTATTTATAATAATGAGAATAT', 'TTTAA', 'TTAACTTCAAATTCTGAAAAAGATATATATCTTT', 'TATAATTAAAAATTCAAGATTTAGAATAAATATGCTAATTATTT', 'TTTAGAATTTATTTTAATAAATGAAAAAGAATC']
29224


In [None]:
#Compute introns and exons compressibility
def compressibility(sequence):
    data = sequence.encode('utf-8')

    compressed_data = io.BytesIO()
    with gzip.GzipFile(fileobj=compressed_data, mode='wb') as f:
        f.write(data)

    original_length = len(data)
    compressed_length = compressed_data.tell()
    print('original length: ',original_length)
    print('compressed length: ',compressed_length)
    print('The compressed length is ',(compressed_length*100/original_length),'% smaller than the original length')

    return compressed_length / original_length

print('for introns:')
compressibility(str(introns))
print('for exons:')
compressibility(str(exons))

for introns:
original length:  599058
compressed length:  131037
The compressed length is  21.873841931832978 % smaller than the original length
for exons:
original length:  1131769
compressed length:  264649
The compressed length is  23.38365867946551 % smaller than the original length


0.2338365867946551

In [None]:
#same function as in Tiana's notebook to compute entropy
def shannon_entropy(s):
    frequency = Counter(s)
    probabilities = [freq / len(s) for freq in frequency.values()]
    entropy = -sum(p * math.log2(p) for p in probabilities)
    return entropy
print('introns entropy: ',shannon_entropy(str(introns)))
print('exons entropy: ',shannon_entropy(str(exons)))


introns entropy:  2.150352133552201
exons entropy:  2.0804171579968145
