In [None]:
#load modules
import pandas as pd
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import AlignIO
import subprocess
from io import StringIO
import urllib
#The alignment section also requires the MAFFT command line tool to be installed and on the PATH, e.g. run `conda install -c bioconda mafft` in a terminal

## Simple tools for sequence investigation

In [None]:
#Input the sequence
#The DNA string is wrapped in a Biopython Seq object so the sequence methods
#used in the following cells (complement, transcribe, translate, ...) are available
DNA_input = Seq("CAACAATCTCGACAGGTGCCTCAGGGACAGCTTCTTCAGCAG")

#a bare name on the last line of a cell displays its value in the notebook
DNA_input

In [None]:
#Produce the complement of the sequence
#The result is only displayed, not stored; DNA_input itself is left unchanged
DNA_input.complement()

In [None]:
#Produce the reverse complement of the sequence
#The result is only displayed, not stored; DNA_input itself is left unchanged
DNA_input.reverse_complement()

In [None]:
#transcribe the DNA into RNA
#the RNA sequence is stored in transcribed_RNA and then displayed
transcribed_RNA = DNA_input.transcribe()
transcribed_RNA

In [None]:
#back transcribe RNA to DNA
#the DNA sequence recovered from transcribed_RNA is stored in DNA_from_RNA and then displayed
DNA_from_RNA = transcribed_RNA.back_transcribe()
DNA_from_RNA

In [None]:
#check whether the back transcribe has worked
#displays True when the DNA -> RNA -> DNA round trip gives back the original sequence
DNA_from_RNA == DNA_input

In [None]:
#translate the sequence into amino acids
#note: translate() is called on the DNA sequence (DNA_input), not on transcribed_RNA
#no codon table argument is passed, so Biopython's default table is used
protein = DNA_input.translate()

#display the resulting amino acid sequence
protein

In [None]:
#prepare the DNA sequence for the GC% calculation
#the sequence is converted to upper case so that counting the upper case
#letters "A", "T", "C" and "G" does not miss any lower case bases
DNA_upper = DNA_input.upper()
print(DNA_upper)

In [None]:
#output tells you the percentage of the GC content in the sequence
#count each base in the upper case sequence
A = DNA_upper.count("A")
T = DNA_upper.count("T")
C = DNA_upper.count("C")
G = DNA_upper.count("G")

#GC% = (G + C) / (A + T + C + G) * 100, rounded to 2 decimal places
#G and C must be added together BEFORE dividing; without the brackets only C is
#divided by the total and G is added on afterwards as a raw count
#only A, T, C and G are counted, so any other characters (e.g. N) are left out of the total
total_bases = G + C + T + A
#a sequence with no A/T/C/G would otherwise cause a division by zero
GC_count = round((G + C) / total_bases * 100, 2) if total_bases else 0.0
print(GC_count)

## Alignment of multiple sequences

In [None]:
from urllib.request import urlretrieve

#where the example FASTA file is hosted, and the local name to save it under
url = "https://raw.githubusercontent.com/lottiewilson02/SWBio_Short_Project/refs/heads/main/Sequences_Fasta2.fa"
filename = "Sequences_Fasta2.fa"

In [None]:
#download the FASTA file into the working directory
#uses the url and filename defined in the previous cell so the address and the
#local file name are only written in one place
urlretrieve(url, filename)


In [None]:
#print the identifier (record.id) of each sequence in the FASTA file
#SeqIO.parse reads the file one record at a time
for record in SeqIO.parse("Sequences_Fasta2.fa", "fasta"):
    print (record.id)

In [None]:
#read every record of the FASTA file into a list and print that list
#unpacking the parser into a list literal reads all of the records at once
list_fasta = [*SeqIO.parse("Sequences_Fasta2.fa", "fasta")]
print(list_fasta)

In [None]:
#Convert the sequence records into one FASTA formatted string
#each record gives a '>' header line holding its description followed by a line
#holding its sequence; the pieces are joined in a single step rather than
#growing the string with += on every pass through a loop
seq_str = ''.join(
    '>' + seq.description + '\n' + str(seq.seq) + '\n'
    for seq in list_fasta
)

#display the string that will be passed to the aligner
seq_str

In [None]:
#make a subprocess for the alignment - runs a linux command line command from python
#mafft is started with '-' as the input file so it reads the sequences from stdin,
#and the aligned FASTA is collected from stdout
subpro = subprocess.Popen(['mafft', '--quiet', '-'], stdin = subprocess.PIPE, stdout = subprocess.PIPE)

#communicate(input=...) sends the sequences, closes stdin and reads all of the
#output in one call; writing to stdin by hand first can stall if a pipe fills up,
#and stdin does not need to be closed again afterwards
subpro_out = subpro.communicate(input = seq_str.encode())[0].decode('utf8')

#stop here if mafft reported an error instead of parsing incomplete output
if subpro.returncode != 0:
    raise subprocess.CalledProcessError(subpro.returncode, subpro.args)

#parse the aligned FASTA text back into a list of sequence records
seq_aligned = list(SeqIO.parse(StringIO(subpro_out), 'fasta'))

#view alignment
seq_aligned

In [None]:
#view length of all alignment sequences to check they match
#the lengths are only printed, one per line; comparing them is done by eye
for seq in seq_aligned:
    print(len(seq))