In [3]:
def read_file(file):
  with open(file, "r") as f:
    return f.read().strip()

In [4]:
def read_fasta(file):
  res = []
  i = -1
  with open(file, "r") as f:
    for line in f:
      if line.startswith(">"):
        res.append([])
        i += 1
      else:
        res[i].append(line[:-1])

  return "\n".join("".join(x) for x in res)

## Counting DNA Nucleotides

In [5]:
def count_nucleotides(s):
  a, c, g, t = s.count("A"), s.count("C"), s.count("G"), s.count("T")

  return f"{a} {c} {g} {t}"

In [6]:
print(count_nucleotides(read_file("rosalind_dna.txt")))

202 235 205 221


## Transcribing DNA into RNA

In [7]:
def dna2rna(s):
  return s.replace("T", "U")

In [8]:
print(dna2rna(read_file("rosalind_rna.txt")))

GUGUGUCAACAGGAACACCUGACAAGACAUAACAACCGACUCCUCUUUGGUGACGGCCGUGGUUGAACCGUCGAACUAUCAGAAGAUCACUUGAUUUAUUUAAUCCACUGCCCUCGCGCAUCGCACCUCGCGAUCGCCUCGCUUAGCAUAGAUGAGAACUGGCUAUUGACUGCGAUCGCACUGGUAUUAAGCGCAGGAUGUUAAACACUGAUUCGGAGCCGCUCAGGUAUGUCCUCCUGGGGGUCCCCUCCACUGCGAUUGUAGAGCGGGAACCUUCACUUGUCGGACGUAUUUCGACGUCUUCGAUCACCAUGCAGGAGACCAAACAUGGGUUGCAACGACCCAUCUUGCAUUAGCGAUUCCUAUAAGACGACGCACGAAUACUACGGCAGAAGUACAGAAAGUUUUAAACGUAGACUGUACUCCAGAUUACAGCAUCCGUGUUAGAUGCUUAAUGGAACCCGAGUUCUCGACAAACAUUGAUUCUUUGACGGGUGAUACAAAGUUGGUGGCCUCAUGCCUAGAAGUUUGAUUCCGUUGAAUAGGAGCGCAACACUCGGCUCUCUUUUUGUACGGAACAACGCUGCUGCACGCCAGUCUUAGCGUCCGAGUAUUCCACGCUGGUGGUCUACUGGCUUUUAGAGUAUUAGAUCUCGUAGGAUGUAGGCAUACUGCGUUACCCGCCACUGAAUCGAAGAAUCGUUCCGCGAUUAACCUCGGAAGCGGUCAAAAGCUCCAAAAAACUCACACUCAGGAGUCCUGCAAUGGUUCACGCCAAUACAUUACGAGUUAGUCACUUCCGCAGAUUUCAGCCACUGGACCGGACAACGGGGUGUGAAAGACUAACUCGGUUAUCAAACGGCGAUACAAACAUUUAAACGCAGUGAUGCACGGUUGAGCAAACACCGGGAAUCCUUUGUACAUAACAUAACCGUGCCUUUCUGCG


## Complementing a Strand of DNA

In [9]:
def reverse_complement(s):
  return s.translate(str.maketrans("ACGT", "TGCA"))[::-1]

In [10]:
print(reverse_complement(read_file("rosalind_revc.txt")))

ATTATGTATTGTACGCTCCTACGTGAAGGAATCCATTGATGAATACTGTGAAAGGTTGCTGCAAAAGAGACTCATATGCTTCAATAGCCTCCTTAGACAGTGGCAGGACGAAGCAACCTTTCCCAAAAATTTTCTTAGAACCAAGGACAACGTGGTGAGCTAGGTTTTATCTGGTAGAGCTCTCGATACAGTGCCTTGATTCAACTGCGCAGGTTATGACAGGCCCAGTATGTATGGTAGCGAAACTCAATAATAACAGTAGATGCCGAGAGTTCGACCTTTCTTGTCTGGTTCCAACTATGTACGCTATATTCTGCAATAAGATATAAGCCGATTACACTTCACGTAGGGGAACCTCGGGCGCGGTTACGACACTTGTAAACCGGTTTCTTTACGTTATTGAGTACTTAACACTTTGACGCCGGTTGAGAATATTTCGTCGCACAGACCCCCTATACTATGATTTTCGACGCTCTTTTCAACTATCGCTGGGGTTGCTATCATTTCCGTTAGTTGTCACATAAATGTCATGTCACTGAAGGTTAACCCGATCATCTGTGTGCTGGAACCCCTCTAGTGGTTTCTCATGCGGGTTGGGCCTTGGTCCGACTGCCGGGATCACACGAATATCAACCAGAGACTGCGTAGCACGGTGACCCTCGACGCCACCGCCAATGACTGTCCCGAATTTGCGTGGAATGTCTATGCTCTCATAAGTGGGCCCTAGGGGGTCTGGCCGTGCTTGCTCGGTAGATGTGTGATGATAACGGGGTAACTAGTTTATTATTCCGCTATGAGCATTGCTTGAACCACGTTCGGCATTTGAACTATTTTTGGTATTGGCGTGACCACGAGGGCATTTAACGCCTCCGAGAGAACCGGCTGTGTGCGTACATGTGGTACCTAGGACCGTGCTCCCACCCGCGATTGGAGCGCTGTCTAATGCACTCGGC


## Finding a Motif in DNA

In [11]:
def find_motif(p, t: str):
  return " ".join(str(i + 1) for i in range(len(t)) if t.startswith(p, i))

In [12]:
subs = read_file("rosalind_subs.txt").split("\n")
print(find_motif(subs[1], subs[0]))

2 20 188 257 289 315 322 361 368 384 402 446 487 523 559 631 673 709 716 734 803 828 835 856 910


## Translating RNA into Protein

In [13]:
START_CODON_DNA = "ATG"
STOP_CODON_AMBER_DNA = "TAG"
STOP_CODON_OPAL_DNA = "TGA"
STOP_CODON_OCHRE_DNA = "TAA"

GENETIC_CODE_DNA = {
  "AAA": "K", "AAC": "N", "AAG": "K", "AAT": "N",
  "ACA": "T", "ACC": "T", "ACG": "T", "ACT": "T",
  "AGA": "R", "AGC": "S", "AGG": "R", "AGT": "S",
  "ATA": "I", "ATC": "I", START_CODON_DNA: "M", "ATT": "I",
  "CAA": "Q", "CAC": "H", "CAG": "Q", "CAT": "H",
  "CCA": "P", "CCC": "P", "CCG": "P", "CCT": "P",
  "CGA": "R", "CGC": "R", "CGG": "R", "CGT": "R",
  "CTA": "L", "CTC": "L", "CTG": "L", "CTT": "L",
  "GAA": "E", "GAC": "D", "GAG": "E", "GAT": "D",
  "GCA": "A", "GCC": "A", "GCG": "A", "GCT": "A",
  "GGA": "G", "GGC": "G", "GGG": "G", "GGT": "G",
  "GTA": "V", "GTC": "V", "GTG": "V", "GTT": "V",
  STOP_CODON_OCHRE_DNA: "*", "TAC": "Y", STOP_CODON_AMBER_DNA: "*", "TAT": "Y",
  "TCA": "S", "TCC": "S", "TCG": "S", "TCT": "S",
  STOP_CODON_OPAL_DNA: "*", "TGC": "C", "TGG": "W", "TGT": "C",
  "TTA": "L", "TTC": "F", "TTG": "L", "TTT": "F"
}

def rna2protein(s):
  return "".join(GENETIC_CODE_DNA[s[i:i + 3].replace("U", "T")] for i in range(0, len(s), 3))

In [14]:
print(rna2protein(read_file("rosalind_prot.txt")))

MEPCTSPWRRPKNLRICLSNSTGREIGLVVTLRTSTTAAISCILITARGVLKIKLAALPRGEDELGVILAYLSPPDTNIRSRLLSPRNGGLFRHEAGPKLSRSINARRSKSPVHKSLRKQNFRGNLPCPPALLHKRSPVHDYLLFFYACIGVKARAAGAKYLVSVPKFRCRDSRDVSLNNFNSHWHVNVDSTKYQQDQLHTRESNLLQSLSHSFLASRTVISDRQRFQFALIPTRSGTDTRQRLSQAPACRVACSTEESPYKQLYTGIAWLSPCRVDIVELASLCVGRRRTSLKVDKKVIALTSVWCVWRGVSRPHNGRHEGGDGRGFQITTGPRRDQSIRFLGVIARSLYSLDKATRSFSRTLGHDVTLPKSDQRFGGRICTLVGKNNTPYAANTATCLSCWCGCQDSARNNETKRLIRLVSYTQAPSETSFTPTLHPTLPVFQATQQTVSTKRTPSKRSGNRRIQRRPPPRQMPAIREPTCRDNKGGSFRGNCKRVGDYHVRTREGGLDCAVPVFLDPDKTNHEDVLVTTLMLGLVSLGGYDVLRACVGTRTAGRHFVMCRGWAILRGHFSVPKAPGHDHDLITSGVVFCWGYTTIGPKLKGDAGLYETLPRAAGSAWAYQVRQSSMGTRRFWTAAGCTRSSLSDGIEMLNLYSHGLLTLMRSRNPGVPVLCSPGKRRQPHQSVSECPTLSKPTWVMTTKSKGASASPEGTVNSTTPACSIFAQALPFLDYLEVCSILLLAYSTLLCKLLLHPPTGDPGQQPANTMEFMRQLFSHAVSTRPKYRPSPRRRHRVQPGLTVRLRGYNSRFSSVSSLHLRCRPVRITLLIVLRGSRDINAACLKGEQYTMQARLTGELFSPAVRRQQKPESTLVTHCVCRSGLRQELLQCSKVSCYLEGYAWTTNFGFAHIICFTSSSRWEHGLRTKRERSTYRSYTLDTAAGGVTYAFVSLQFHCRLSSAEILIMRVSSFSTRRKLAFSLCAAYDRKVLVYGNLRHGRTR

## Open Reading Frames

In [15]:
from collections import defaultdict

def ssorfs(s):
  res = defaultdict(list)
  curi = defaultdict(list)

  for i in range(len(s)):
    codon = s[i:i + 3]
    frame = i % 3

    if codon == START_CODON_DNA:
      curi[frame].append(i)

    for j in curi[frame]:
      if codon in GENETIC_CODE_DNA:
        res[f"{frame}_{j}"].append(GENETIC_CODE_DNA[codon])

    if codon == STOP_CODON_AMBER_DNA or codon == STOP_CODON_OCHRE_DNA or codon == STOP_CODON_OPAL_DNA:
      curi[frame] = []

  return ["".join(orf[:-1]) for orf in res.values() if orf[-1] == "*"]

def orfs(s):
  return "\n".join(list({*ssorfs(s), *ssorfs(reverse_complement(s))}))

In [16]:
print(orfs(read_fasta("rosalind_orf.txt")))

MPVGEPSQSMCERYSTQSADNAKATLPGATYHACDTNIVHSKTRQGYEHGVNFTGLTRVVLILLPATIRFLVRR
MLIPCPDSIECSCYGDCRAKHASFWFM
M
MTFHQRGRLPAPI
MCERYSTQSADNAKATLPGATYHACDTNIVHSKTRQGYEHGVNFTGLTRVVLILLPATIRFLVRR
MEWPCLYRAQIPSSVLATATVALNTRVFGLCSVRSLT
MTV
MYAIFPDL
MLP
MLVPLACLRMYYVGIARVIGSTRQCCLSVVGGLCRIAFTHRLAGLAYRHDVDVVHANLRQRISMTFHQRGRLPAPI
MCKEPTERQFQRLTRKRIVAGKRISTTLVSPVKFTPCSYPWRVFECTMLVSHA
MRFPSASTRILSVGGCGEPDN
MQDLQFQRTESAHDCMTSRVENVRHLSRFVATNLERWRTFVEANLFLVVVTKPASVPLRLRSVFYDFPPVSQGHLVGV
MTSRVENVRHLSRFVATNLERWRTFVEANLFLVVVTKPASVPLRLRSVFYDFPPVSQGHLVGV
MAYIRGSKPLLSGSHETSLRPS
MCTLRPLKLQVLH
MATPLGNSRPSLCVSRPQAQGFYP
MLVSHA
MAYILYTTCHTVMCTLRPLKLQVLH
MESGHGISMATPLGNSRPSLCVSRPQAQGFYP
MG
MM
MSQWRGSF
MSPSTGSLASASLPIGSLPTAED
MYYVGIARVIGSTRQCCLSVVGGLCRIAFTHRLAGLAYRHDVDVVHANLRQRISMTFHQRGRLPAPI


## RNA Splicing

In [17]:
def splice_rna(s, introns):
  for intron in introns:
    s = s.replace(intron, "")

  return rna2protein(s)

In [18]:
s, *introns = read_fasta("rosalind_splc.txt").split("\n")
print(splice_rna(s, introns))

MVILHLRHSCVKYSPLSVSHWIYGVLVLAFSLVNGVPLLAARIVRASKIESLSTNCVIVGVASKRDSILRCSKLSVLILANNIDRPTLTAASANYRPGYTYGRPLRKKHLFTPRSQLLQNRRFARLPYGFCRSYPGTIRRRVGLDDALKKFLSHVSGPSTDSLWYRVRPTRSDPNGLPSSYRKLRLQ*


## Finding a Spliced Motif

In [19]:
def find_spliced_motif(t: str, p: str):
  res = []
  i = 0
  for c in p:
    while i < len(t):
      i += 1
      if t[i - 1] == c:
        res.append(i)
        break

  return " ".join(map(str, res))

In [34]:
print(find_spliced_motif(*read_fasta("rosalind_sseq.txt").split("\n")))

3 4 5 10 11 12 15 20 24 25 27 38 40 47 49 50 52 58 59


## Counting Point Mutations

In [23]:
def count_point_mutations(s: str, t: str):
  return sum(s[i] != t[i] for i in range(len(s)))

In [26]:
print(count_point_mutations(*read_file("rosalind_hamm.txt").split("\n")))

484


## Finding a Shared Motif

In [477]:
def lcs(a: str, b: str):
  m = len(a) + 1
  n = len(b) + 1

  table = [[0] * n for _ in range(2)]

  maxlen = 0
  maxmatch = ""

  for i in range(1, m):
    for j in range(1, n):
      if a[i - 1] == b[j - 1]:
        table[-1][j] = table[-2][j - 1] + 1

        if table[-1][j] > maxlen:
          maxlen = table[-1][j]
          maxmatch = a[i - maxlen:i]
      else:
        table[-1][j] = 0

    table[-2] = table[-1]
    table[-1] = [0] * n

  return maxmatch

def find_shared_motif(ss: list[str]):
  # i = 0
  # sorted_ss = sorted(ss, key=lambda s: len(s))
  # motif = lcs(sorted_ss[0], sorted_ss[1])
  # if i in range(2, len(sorted_ss)):
  #   motif = lcs(motif, sorted_ss[i])

  # return motif

  a, b, *rest = sorted(ss, key=lambda s: len(s))
  motif = lcs(a, b)

  l = len(motif)
  while l > 0:
    for i in range(len(motif) - l + 1):
      lmer = motif[i:i + l]

      shared = True
      for s in rest:
        shared = shared and lmer in s
        if not shared:
          break

      if shared:
        return lmer

    l -= 1

  return ""

In [478]:
print(find_shared_motif(read_fasta("rosalind_lcsm.txt").split("\n")))

TTGGTTACCAGGTTGAAGATACAGTAGGTCACCGCATAACTATGTTATCCGGCGAGTGAGGCCTAGAACGAAAGGGTTACTGGCCCGTCCGCACGGTTTCCGGAGTGCCTTCTTCGGATTAGGATGCTGAAGACCATGTAAGCGAGGATCTCATTCTGGTGGATTATCGCTTCTAACTAGTTCCATACCCGCAGGGTCCTCTAGCCTAATAACTTCCCCGTGGAACTGACCGTTTCTCTACTCGCGCGGAGTAGCCTTACGCGTGCAGCTCTATTGTGGGAGCGCA


## Enumerating Gene Orders

In [115]:
from itertools import permutations

def permutations(iterable):
  return [(*p[:i], iterable[0], *p[i:]) for p in permutations(iterable[1:]) for i in range(len(p) + 1)] if len(iterable) > 1 else [tuple(iterable)]
 
def enumerate_gene_orders(n: int):
  orders = list(permutations(range(1, n + 1)))
  l = len(orders)

  return l, orders

In [116]:
l, orders = enumerate_gene_orders(int(read_file("rosalind_perm.txt")))
print(l)
for o in orders:
  print(o)

120
(1, 2, 3, 4, 5)
(2, 1, 3, 4, 5)
(2, 3, 1, 4, 5)
(2, 3, 4, 1, 5)
(2, 3, 4, 5, 1)
(1, 3, 2, 4, 5)
(3, 1, 2, 4, 5)
(3, 2, 1, 4, 5)
(3, 2, 4, 1, 5)
(3, 2, 4, 5, 1)
(1, 3, 4, 2, 5)
(3, 1, 4, 2, 5)
(3, 4, 1, 2, 5)
(3, 4, 2, 1, 5)
(3, 4, 2, 5, 1)
(1, 3, 4, 5, 2)
(3, 1, 4, 5, 2)
(3, 4, 1, 5, 2)
(3, 4, 5, 1, 2)
(3, 4, 5, 2, 1)
(1, 2, 4, 3, 5)
(2, 1, 4, 3, 5)
(2, 4, 1, 3, 5)
(2, 4, 3, 1, 5)
(2, 4, 3, 5, 1)
(1, 4, 2, 3, 5)
(4, 1, 2, 3, 5)
(4, 2, 1, 3, 5)
(4, 2, 3, 1, 5)
(4, 2, 3, 5, 1)
(1, 4, 3, 2, 5)
(4, 1, 3, 2, 5)
(4, 3, 1, 2, 5)
(4, 3, 2, 1, 5)
(4, 3, 2, 5, 1)
(1, 4, 3, 5, 2)
(4, 1, 3, 5, 2)
(4, 3, 1, 5, 2)
(4, 3, 5, 1, 2)
(4, 3, 5, 2, 1)
(1, 2, 4, 5, 3)
(2, 1, 4, 5, 3)
(2, 4, 1, 5, 3)
(2, 4, 5, 1, 3)
(2, 4, 5, 3, 1)
(1, 4, 2, 5, 3)
(4, 1, 2, 5, 3)
(4, 2, 1, 5, 3)
(4, 2, 5, 1, 3)
(4, 2, 5, 3, 1)
(1, 4, 5, 2, 3)
(4, 1, 5, 2, 3)
(4, 5, 1, 2, 3)
(4, 5, 2, 1, 3)
(4, 5, 2, 3, 1)
(1, 4, 5, 3, 2)
(4, 1, 5, 3, 2)
(4, 5, 1, 3, 2)
(4, 5, 3, 1, 2)
(4, 5, 3, 2, 1)
(1, 2, 3, 5, 4)
(2, 1, 3, 5, 4)
(2, 

## Enumerating k-mers Lexicographically

In [199]:
def enumerate_kmers(alphabet: list[str], k=None):
  if k is None:
    k = len(alphabet)

  if k == 0:
    return [()]
  elif k == 1:
    return [(c,) for c in alphabet]

  res = []
  for c in alphabet:
    new_alphabet = [a for a in alphabet]
    res.extend(map(lambda x: (c, *x), enumerate_kmers(new_alphabet, k - 1)))

  return res

In [201]:
alphabet, k = read_file("rosalind_lexf.txt").split("\n")
for kmer in enumerate_kmers(alphabet.split(), int(k)):
  print("".join(kmer))

AAA
AAB
AAC
AAD
AAE
AAF
AAG
AAH
ABA
ABB
ABC
ABD
ABE
ABF
ABG
ABH
ACA
ACB
ACC
ACD
ACE
ACF
ACG
ACH
ADA
ADB
ADC
ADD
ADE
ADF
ADG
ADH
AEA
AEB
AEC
AED
AEE
AEF
AEG
AEH
AFA
AFB
AFC
AFD
AFE
AFF
AFG
AFH
AGA
AGB
AGC
AGD
AGE
AGF
AGG
AGH
AHA
AHB
AHC
AHD
AHE
AHF
AHG
AHH
BAA
BAB
BAC
BAD
BAE
BAF
BAG
BAH
BBA
BBB
BBC
BBD
BBE
BBF
BBG
BBH
BCA
BCB
BCC
BCD
BCE
BCF
BCG
BCH
BDA
BDB
BDC
BDD
BDE
BDF
BDG
BDH
BEA
BEB
BEC
BED
BEE
BEF
BEG
BEH
BFA
BFB
BFC
BFD
BFE
BFF
BFG
BFH
BGA
BGB
BGC
BGD
BGE
BGF
BGG
BGH
BHA
BHB
BHC
BHD
BHE
BHF
BHG
BHH
CAA
CAB
CAC
CAD
CAE
CAF
CAG
CAH
CBA
CBB
CBC
CBD
CBE
CBF
CBG
CBH
CCA
CCB
CCC
CCD
CCE
CCF
CCG
CCH
CDA
CDB
CDC
CDD
CDE
CDF
CDG
CDH
CEA
CEB
CEC
CED
CEE
CEF
CEG
CEH
CFA
CFB
CFC
CFD
CFE
CFF
CFG
CFH
CGA
CGB
CGC
CGD
CGE
CGF
CGG
CGH
CHA
CHB
CHC
CHD
CHE
CHF
CHG
CHH
DAA
DAB
DAC
DAD
DAE
DAF
DAG
DAH
DBA
DBB
DBC
DBD
DBE
DBF
DBG
DBH
DCA
DCB
DCC
DCD
DCE
DCF
DCG
DCH
DDA
DDB
DDC
DDD
DDE
DDF
DDG
DDH
DEA
DEB
DEC
DED
DEE
DEF
DEG
DEH
DFA
DFB
DFC
DFD
DFE
DFF
DFG
DFH
DGA
DGB
DGC
DGD
DGE
DGF
DGG
DGH
DHA
DHB


## k-Mer Composition

In [260]:
def kmer_composition(s: str, k = 4):
  kmers = ["".join(kmer) for kmer in enumerate_kmers(["A", "C", "G", "T"], k)]

  counts = {kmer: 0 for kmer in kmers}
  for i in range(len(s) - k + 1):
    kmer = s[i:i + k]
    counts[kmer] += 1

  return list(counts.values())

In [261]:
print(" ".join(map(str, kmer_composition(read_fasta("rosalind_kmer.txt")))))

304 308 310 290 310 319 313 329 305 314 322 346 302 320 323 303 330 318 296 309 326 319 311 317 308 313 286 305 321 308 308 323 326 308 325 324 298 307 332 311 337 301 336 290 334 308 283 320 318 300 306 310 258 336 368 295 300 314 318 296 320 296 322 336 319 336 341 298 333 309 263 313 325 319 320 315 301 294 273 306 347 281 308 294 310 331 313 312 340 299 331 283 293 323 325 314 306 292 341 345 307 293 311 307 307 329 351 323 297 313 333 302 313 328 311 324 320 284 304 314 299 321 311 292 280 333 320 296 310 312 322 325 305 314 318 320 327 305 337 287 322 324 324 350 304 301 343 299 300 301 315 310 313 310 344 315 332 299 308 294 315 359 311 340 318 316 311 302 335 318 344 333 288 333 298 325 308 330 280 300 319 297 337 297 306 313 307 296 324 323 319 292 279 315 314 335 305 331 318 298 326 310 285 297 309 319 308 314 313 318 332 272 294 315 314 316 323 296 349 342 330 292 282 298 322 298 279 311 324 310 328 313 346 299 299 298 299 296 308 311 304 294 321 316 338 323 301 296 305 327 

## Finding a Shared Spliced Motif

In [567]:
def find_shared_spliced_motif(s: str, t: str):
  m = len(s) + 1
  n = len(t) + 1

  table = [[0] * n for _ in range(m)]

  for i in range(1, m):
    for j in range(1, n):
      table[i][j] = max(
        table[i][j - 1],
        table[i - 1][j],
        table[i - 1][j - 1] + (s[i - 1] == t[j - 1]),
      )

  lcs = []
  i, j = m - 1, n - 1
  while i > 0 and j > 0:
    if s[i - 1] == t[j - 1]:
      lcs.append(s[i - 1])
      i -= 1
      j -= 1
    elif table[i - 1][j] > table[i][j - 1]:
      i -= 1
    else:
      j -= 1

  return "".join(reversed(lcs))

In [568]:
print(find_shared_spliced_motif(*read_fasta("rosalind_lcsq.txt").split("\n")))

GATTAAATAACGGTTGCGTCCTGGGCATCGGGACGAACCTAATGGGAAAGCAAGACGGTTGGTTTGGACGCGATTAAACTTATTTTGTTGCCTGAGCGGCACTATGTACTGGGCATTCGGGTGACTTGACGATCAGCTCAGGGCATTTTACGCTTCGTCGGGCATTGTGAGCGTTACCTATTGTCTGGCCAAAAGAACGTAATGAAGTTACCCACCGCGTTTCTTATTAATTGTCTGCCTGCACCATCGGCTACATGCAAATCTCGAGGCGAATAAAAACCGCATCCGTTACATGGGCATAAGAATCGTGCCATTGATGACATAAATGAAAGATGCAATATCATATTTACCACTTGCACGAAATAATTTCCCCCTACCCCATGTTCACCTGATGGCGGGATGTGACTCTCCTGGCCATTAATATATTTAGTCCTCTGATGGCTAGAGCATGGGGAGTGGAGGATGATGGTCTTCAATCCTTCGGAGTAATTGTTAAGGCTTCTACATACTGGTAGCAACCGCTAGCGGTAGGCGGACCTAAGTCATCCCCGCCCCGATTGCTGTATAGAGGGGAA


## Edit Distance

In [569]:
def edit_distance(s: str, t: str):
  m = len(s) + 1
  n = len(t) + 1

  table = [[0] * n for _ in range(m)]
  for i in range(m):
    table[i][0] = i
  for j in range(n):
    table[0][j] = j

  for i in range(1, m):
    for j in range(1, n):
      table[i][j] = min(
        table[i][j - 1] + 1,
        table[i - 1][j] + 1,
        table[i - 1][j - 1] + (s[i - 1] != t[j - 1]),
      )

  return table[-1][-1]

In [571]:
print(edit_distance(*read_fasta("rosalind_edit.txt").split("\n")))

418


## Edit Distance Alignment

In [629]:
def align(s: str, t: str):
  m = len(s) + 1
  n = len(t) + 1

  table = [[0] * n for _ in range(m)]
  for i in range(m):
    table[i][0] = i
  for j in range(n):
    table[0][j] = j

  for i in range(1, m):
    for j in range(1, n):
      table[i][j] = min(
        table[i][j - 1] + 1,
        table[i - 1][j] + 1,
        table[i - 1][j - 1] + (s[i - 1] != t[j - 1]),
      )

  i, j = m - 1, n - 1
  s_aligned = []
  t_aligned = []
  while i > 0 and j > 0:
    if table[i][j] == table[i - 1][j - 1] + (s[i - 1] != t[j - 1]):
      s_aligned.append(s[i - 1])
      t_aligned.append(t[j - 1])
      i -= 1
      j -= 1
    elif table[i][j] == table[i - 1][j] + 1:
      s_aligned.append(s[i - 1])
      t_aligned.append("-")
      i -= 1
    else:
      s_aligned.append("-")
      t_aligned.append(t[j - 1])
      j -= 1

  return table[-1][-1], "".join(reversed(s_aligned)), "".join(reversed(t_aligned))

In [630]:
print(*align(*read_fasta("rosalind_edta.txt").split("\n")), sep="\n")

349
RKAYDKVPT----FPLMEEENCQWRPWYNSATENEGPSRCYYGK---T-PIDALTNA--------MAPGTDFHSDT-G----IGTT-RGQSYTFWFQTVRANPKHGDTSWCWWCMVRMIVRPDEMHIMEIYNDAQYMVSLCRVDLHHTPPHWPMTAAHVCMCATGVWGGYFVEPVRECQIRIQDGIADKKDHPRNCMPTSQMLTHHQWHPDGEICDVIYDVHQEIWE-YQSNITNGMRYPVMMLCREYMGILTDCPHMETHITTAQIQFHPSSEGQPPDTPSGDSTKYNKMIMRLVHWHLIISCL--RLYAERGRCCFIASRQWFQWKKASEEIS--NFPRRYG---LDKVFNMESQWIV-CMDLRAIPPYARWSHE-----VSSHWRG--W---Q--------EN--SKKY---ALCSWGFRQVMT-------TNKHRFWDHT-----HAMYRRIVGNVWNRATT----V--NHEMQVVT-------GMGDSRAHWPA-YHDVMLRYRPECADSNVFATPQIPAAFAKGIRNPM-YYREIMLIGICRYRVFHKPTEAS-A--SDAVDCGCPKRYAPINQYSCDTGEQMESRVLAGRS---LH---RMSVAKLLPVAENNRRQVKRHANHLLKIA----QPMHCVDQAYPGCNGPDNRFEIPDNFGGMLERFSPMKYLYCSWNSVLIIDDYKSQPIRMVNCRPPPEKCYRSQAMMQWCSDNPRDDGSGCPEFIFLWYDADEPKKHKFDCYWYWKGECVAAMKVKYSRKFYYTVGDLVEHDCNMLPKIKEVGL---TQSMPHTRWDEGENYSVSGNVRRTIRRPQFHENQEHANVHLITWSAERCMSDGGKSGGDRKMRGVMDMRLFSHICYGRNNIRRPCRDQVSMGYALTWHRCIFEAEINPICCSATRVMITWSDWFQTLITVWPEDMLHFWDQSMTCPLHEHMAG
RKAYDLVPTERIAKQDVGLEN-GWRPLYNSATENEGPSRCYYGKNYI

## Counting Optimal Alignments

In [636]:
def count_optimal_alignments(s: str, t: str):
  # TODO
  return s, t

In [637]:
print(count_optimal_alignments(*read_fasta("rosalind_ctea.txt").split("\n")))

('PLEASANTLY', 'MEANLY')
