In [1]:
########## Read in raw sequence in FASTA  ################

from Bio import SeqIO
# Walk through every record in the FASTA file and report, one item per line,
# its identifier, the repr of its sequence, and its length.
for seq_record in SeqIO.parse("PKTest.fasta", "fasta"):
    print(seq_record.id, repr(seq_record.seq), len(seq_record), sep='\n')

gi|6273291|gb|AF191665.1|AF191665
Seq('TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAAAAAA...AGA', SingleLetterAlphabet())
902
gi|6273290|gb|AF191664.1|AF191664
Seq('TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAAAAAA...AGA', SingleLetterAlphabet())
899
gi|6273289|gb|AF191663.1|AF191663
Seq('TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAAAAAA...AGA', SingleLetterAlphabet())
899
gi|6273287|gb|AF191661.1|AF191661
Seq('TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAAAAAA...AGA', SingleLetterAlphabet())
895
gi|6273286|gb|AF191660.1|AF191660
Seq('TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAAAAAA...AGA', SingleLetterAlphabet())
893
gi|6273285|gb|AF191659.1|AF191659
Seq('TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAAAAAA...AGA', SingleLetterAlphabet())
894
gi|6273284|gb|AF191658.1|AF191658
Seq('TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAGAAAGAAAAAA...AGA', SingleLetterAlphabet())
896


In [2]:
######## Transcription #################

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Coding (sense) strand of the DNA, written 5' -> 3'.
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
print('The coding sequence is', coding_dna)

# Template (antisense) strand, obtained as the reverse complement of the coding strand.
# NOTE(review): because the result is the *reverse* complement, the printed string
# reads 5' -> 3' along the template strand, not 3' -> 5' -- confirm the intended labelling.
template_dna = coding_dna.reverse_complement()
print('The template sequence is', template_dna)

# mRNA (single strand): transcribe() is called on the coding strand, and the printed
# result is the coding sequence with every T replaced by U.
messenger_rna = coding_dna.transcribe()
print('The mRNA sequence (of the coding sequence) is', messenger_rna)

The coding sequence is ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG
The template sequence is CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT
The mRNA sequence (of the coding sequence) is AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


In [2]:
##### DNA Translation ############   
# DNA codon -> one-letter amino-acid lookup table.
# Each line holds one "codon box" (four codons sharing the same first two bases).
# Stop codons (TAA, TAG, TGA) are represented by '_'.
codonmatrix = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',  # AT*
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',  # AC*
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',  # AA*
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',  # AG*
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',  # CT*
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',  # CC*
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',  # CA*
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',  # CG*
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',  # GT*
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',  # GC*
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',  # GA*
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',  # GG*
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',  # TC*
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',  # TT*
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',  # TA*
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',  # TG*
}
 
# a function to translate a single codon
def translate_codon(codon):
    """Return the one-letter amino-acid code for a single DNA codon.

    The codon is upper-cased before being looked up in ``codonmatrix``.
    Anything that is not a key of that table yields the placeholder ``'x'``.
    """
    try:
        return codonmatrix[codon.upper()]
    except KeyError:
        # Unknown codon: report the placeholder instead of failing.
        return 'x'
 
# a function to split a sequence into codons
def split_into_codons(dna, frame):
    """Split ``dna`` into consecutive, non-overlapping three-base codons.

    Parameters
    ----------
    dna : str (or any sliceable sequence)
        The sequence to split.
    frame : int
        1-based reading frame: ``1`` starts at the first base, ``2`` at the
        second, ``3`` at the third.

    Returns
    -------
    list
        The three-base slices in order. Trailing bases that do not fill a
        complete codon are dropped.

    Raises
    ------
    ValueError
        If ``frame`` is smaller than 1. Such a value would produce a negative
        start index, which Python interprets as "count from the end" and
        which silently yields empty or wrong codons.
    """
    if frame < 1:
        raise ValueError("frame must be >= 1, got %r" % (frame,))
    # Stop at len(dna) - 2 so that every slice contains a full three bases.
    return [dna[start:start + 3] for start in range(frame - 1, len(dna) - 2, 3)]
 
# a function to translate a dna sequence in a single frame
def translate_dna_single(dna, frame=1):
    """Translate ``dna`` in one reading frame and return the result as a string.

    The sequence is cut into codons with ``split_into_codons(dna, frame)``;
    each codon is converted with ``translate_codon`` and the resulting
    letters are concatenated in order.
    """
    return ''.join(translate_codon(triplet) for triplet in split_into_codons(dna, frame))
 
# a function to translate a dna sequence in 3 forward frames
def translate_dna(dna):
    """Translate ``dna`` in forward reading frames 1, 2 and 3.

    Returns a list of three strings, one per frame, in frame order, each
    produced by ``translate_dna_single``.
    """
    return [translate_dna_single(dna, reading_frame) for reading_frame in (1, 2, 3)]
    
split = split_into_codons('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', 1)
print('The first frame codons are', split)

translation = translate_dna('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG')
print("The translation at each frame is", translation)

The first frame codons are ['ATG', 'GCC', 'ATT', 'GTA', 'ATG', 'GGC', 'CGC', 'TGA', 'AAG', 'GGT', 'GCC', 'CGA', 'TAG']
The translation at each frame is ['MAIVMGR_KGAR_', 'WPL_WAAERVPD', 'GHCNGPLKGCPI']


In [3]:
############ Sequence alignment

# Longer sequence that is searched; passed to try_all_matches below as `subject`.
reference_sequence = 'actgatcgattgatcgatcgatcg'
# Shorter sequence compared against the reference; passed below as `query`.
query_sequence   = 'tttagatcgatctttgatc'

# here are the five bits of information we described before
def score_match(subject, query, subject_start, query_start, length):
    """Score an ungapped match between a window of ``subject`` and of ``query``.

    The two windows start at ``subject_start`` and ``query_start`` and are
    ``length`` positions long. Each position where the two sequences agree
    contributes +1 and each position where they differ contributes -1.
    Positions are read by direct indexing, so a window that runs past the end
    of either sequence raises ``IndexError``.
    """
    return sum(
        1 if subject[subject_start + offset] == query[query_start + offset] else -1
        for offset in range(length)
    )
    
# Score the 8-base window that starts at index 7 of the reference and index 4 of the query.
print("The first sequence similarity score is", score_match(reference_sequence, query_sequence, 7, 4, 8))
 
# the arguments are the five bits of information that define a match
def pretty_print_match(subject, query, subject_start, query_start, length):
    """Print one candidate match as a small diagram followed by a separator.

    The five arguments are the pieces of information that define a match:
    the two sequences, the start index in each, and the match length.

    Output layout (one ``print`` per line):
      1. start and end index of the match in ``subject``
      2. the matching slice of ``subject`` (indented by one space)
      3. the matching slice of ``query`` (indented by one space)
      4. start and end index of the match in ``query``
      5. a dashed separator surrounded by blank lines

    End indices are exclusive (``start + length``). Returns ``None``.
    """
    subject_end = subject_start + length
    query_end = query_start + length

    # Start/stop positions for the subject, spaced out by the match length.
    print(str(subject_start) + (' ' * length) + str(subject_end))

    # The two aligned slices, one above the other.
    print(' ' + subject[subject_start:subject_end])
    print(' ' + query[query_start:query_end])

    # Start/stop positions for the query.
    print(str(query_start) + (' ' * length) + str(query_end))

    # The separator previously read 'n----n': the backslashes of the newline
    # escapes had been lost, so literal 'n' characters were printed.
    print('\n--------------------\n')
    
    
def try_all_matches(subject, query, score_limit):
    """Enumerate every ungapped match and print those scoring >= ``score_limit``.

    Each reported match is printed on one line as
    ``subject_start query_start length score``.

    NOTE: a second function with the same name is defined further down in
    this cell; when the cell runs, that later definition replaces this one.

    The longest length tried for a given pair of start positions is the
    number of bases remaining in the shorter of the two tails, so matches
    that extend to the very last base of either sequence are included.
    (The earlier ``start + length < len(...)`` test stopped one base short.)
    """
    for subject_start in range(len(subject)):
        for query_start in range(len(query)):
            # Bases available from each start position to the end of its sequence.
            longest = min(len(subject) - subject_start, len(query) - query_start)
            for length in range(longest + 1):
                score = score_match(subject, query, subject_start, query_start, length)
                # only print a line of output if the score reaches the limit
                if score >= score_limit:
                    print(subject_start, query_start, length, score)
                        
                        
def try_all_matches(subject, query, score_limit):
    """Enumerate every ungapped match and pretty-print those scoring >= ``score_limit``.

    For every start position in ``subject`` and in ``query``, every window
    length that fits inside both sequences is scored with ``score_match``.
    Each match whose score reaches ``score_limit`` is reported as a
    ``Score : N`` line followed by the ``pretty_print_match`` diagram.

    The longest length tried for a given pair of start positions is the
    number of bases remaining in the shorter of the two tails, so matches
    that extend to the very last base of either sequence are included.
    (The earlier ``start + length < len(...)`` test stopped one base short,
    so e.g. two identical sequences never reported their full-length match.)
    """
    for subject_start in range(len(subject)):
        for query_start in range(len(query)):
            # Bases available from each start position to the end of its sequence.
            longest = min(len(subject) - subject_start, len(query) - query_start)
            for length in range(longest + 1):
                score = score_match(subject, query, subject_start, query_start, length)
                # only print output if the score reaches the limit
                if score >= score_limit:
                    print('Score : ' + str(score))
                    pretty_print_match(subject, query, subject_start, query_start, length)
 
# Report every match between the reference and the query whose score is at least 7.
try_all_matches(reference_sequence, query_sequence, 7)


The first sequence similarity score is 6
Score : 7
3       10
 gatcgat
 gatcgat
4       11
n--------------------n
Score : 7
8           19
 attgatcgatc
 ttagatcgatc
1           12
n--------------------n
Score : 7
9         18
 ttgatcgat
 tagatcgat
2         11
n--------------------n
Score : 8
9          19
 ttgatcgatc
 tagatcgatc
2          12
n--------------------n
Score : 7
9           20
 ttgatcgatcg
 tagatcgatct
2           13
n--------------------n
Score : 7
9             22
 ttgatcgatcgat
 tagatcgatcttt
2             15
n--------------------n
Score : 7
10         19
 tgatcgatc
 agatcgatc
3         12
n--------------------n
Score : 7
11       18
 gatcgat
 gatcgat
4       11
n--------------------n
Score : 8
11        19
 gatcgatc
 gatcgatc
4        12
n--------------------n
Score : 7
11         20
 gatcgatcg
 gatcgatct
4         13
n--------------------n
Score : 7
11           22
 gatcgatcgat
 gatcgatcttt
4           15
n--------------------n
Score : 7
12           23
 atcgatcgatc


In [4]:
#### Sequence alignment of p53
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
 
# Scoring scheme: BLOSUM62 substitution matrix plus gap-open / gap-extend penalties.
matrix = matlist.blosum62
gap_open = -10
gap_extend = -0.5

# Human and mouse amino acid sequence
p53_h = "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP"
p53_m = "MEESQSDISLELPLSQETFSGLWKLLPPEDILPSPHCMDDLLLPQDVEEFFEGPSEALRV"

# Align the two sequences using the matrix and gap penalties defined above.
alns = pairwise2.align.globalds(p53_h, p53_m, matrix, gap_open, gap_extend)

# Keep only the first alignment returned; it unpacks into five fields.
top_aln = alns[0]
aln_human, aln_mouse, score, begin, end = top_aln

# Print the label on its own line so that both aligned rows start in the same
# column; with the label on the first row the two rows were offset and the
# alignment could not be read column by column.
print("The pairwise alignment for p53 is\n" + aln_human + '\n' + aln_mouse)



The pairwise alignment for p53 is MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP------
MEESQSDISLELPLSQETFSGLWKLLPPEDIL-PSP-HCMDDLLL-PQDVEEFF---EGPSEALRV


In [None]:
def ThanksCris(python):
    """Look up ``python.concepts()`` in ``pythonclass``, defaulting to ``'Paul'``.

    NOTE(review): ``pythonclass`` is not defined anywhere in the visible part
    of this notebook, and the cell has never been executed (``In [None]``).
    Unless the name is defined elsewhere, calling this raises ``NameError``
    -- confirm whether this cell is meant to be runnable.
    """
    return pythonclass.get(python.concepts(), 'Paul')