# Counting DNA Nucleotides

In [2]:
def count_symbols_in_dna(dna_string):
    """Count the occurrences of "A", "C", "G" and "T" in *dna_string*.

    Symbols other than these four upper-case letters are ignored.

    Returns:
        The four counts as a single space-separated string, in the
        order A, C, G, T.
    """
    tally = {"A": 0, "C": 0, "G": 0, "T": 0}

    for base in dna_string:
        if base in tally:
            tally[base] += 1

    # Emit the counts in a fixed A, C, G, T order.
    return " ".join(str(tally[base]) for base in "ACGT")


# Sanity check: the sample string must produce the expected "A C G T" counts.
sample_input = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"
expected_output = "20 12 17 21"
counts = count_symbols_in_dna(sample_input)
assert counts == expected_output

In [3]:
# Load the dataset file and drop surrounding whitespace before counting.
fn = "data/rosalind_dna.txt"
with open(fn, "r") as file:
    dna_string = file.read().strip()

# Print the space-separated A, C, G, T counts for the dataset.
print(count_symbols_in_dna(dna_string))

194 197 226 188


# Transcribing DNA into RNA

In [4]:
def transcribe(t):
    """Return a copy of *t* with every upper-case "T" replaced by "U".

    All other characters (including lower-case "t") are left unchanged.
    """
    thymine_to_uracil = str.maketrans("T", "U")
    return t.translate(thymine_to_uracil)

In [5]:
# Sanity check: transcribing the sample string must give the expected result.
sample_input = "GATGGAACTTGACTACGTAAATT"
expected_output = "GAUGGAACUUGACUACGUAAAUU"
output = transcribe(sample_input)
assert output == expected_output

In [6]:
# Load the dataset file and drop surrounding whitespace before transcribing.
fn = "data/rosalind_rna.txt"
with open(fn, "r") as file:
    dna_string = file.read().strip()

# Print the transcribed string for the dataset.
print(transcribe(dna_string))

CCCUUCCUAGAACCGCAGUGUGUUCGUGUGGAGCAUAAUGUCCUAAUGAGAGGUUUAGGAGACGGGACUCGGUCACGCAACGGUAAGCUUGACAUGGUGGCGGACCUGGUGUUGAUACCUUACGUUGUCAUUACGUCCAGGGCCCGCUGUAGGCCCCUGCGGUACGCGACGAGUAUCUUUGCGAGACCGCCAGUACUAAAUCGCAUCGUGGUGUUCCUACACCAGUUUCGUCUUGCCAUCUCCUGUCCAACGCUCAUAAAUAGUAUUCGAAUAGGAACUCUGGAGGCCAGUGGGAAGUCACCCCGAUUAUCUAGGUGGAGGCAAGGCAAAGCGUGUGGCAAAUUAAGGCGCGUCAAAGAAUACCUGGACUCCUACCGGUUGUCUAUGGACCUCGUUACGUCAAGGGCUUUCCAUAGAUCGAUCUAGCGUUCGGCCUGGAGGAGAGAAUCGUAACAUGCUAAGUUGCUAUUGAAGUCAAGGCAGCGGCCUGCCAUGACCUUUGGACCGCCGUCGCCUCACAAGGAGCAUGUUGCCAGAAUCGAGGUUAUCGCAGAGUCAGUGCGACGUAGUGCCCUACGAUAUAAAGCCACUGUUGUGGUAGAUAUGAAAUCUUGAUCCGAUUCGUCAAGAAAAUCUAAAUGGCACGCCAUAUGGAUGGUUCGUCCUCCAUAAACGCCUGCGAGAUCCUCAGCAAUUCUAGAUUGAGAGUUCUACGUUCCCAUGAUAGUACGGAAUUUCUGUCCUAACAUUGCGUAGAAGUCCUUUGGUCUCCCAGAGUCCUAAGGUUCGUUUGGCAAUUGUAGCCGAGGGCAGAAGUUUUGUGGAAUCCAUCUAAUGGCGGCUUACGUCCAUACAAAACUAAUUUAACGGAACGGCAUACUACAAGCUGUGACUAUAGCAUACAGUGGCUUCAUAUAAGCGGCGCUCCAGGAGUGAAGAUAUAGUCGCGUCGCCCGCCUGUAUGCGUAGGCACGUUCGGCUCUGGCA


# Complementing a Strand of DNA

In [15]:
def complement_strand(dna):
    """Return the reverse complement of *dna*, in upper case.

    The input is lower-cased and reversed, then each of a/t/c/g is swapped
    for its pairing base (a<->t, c<->g). Any other character passes through
    the mapping unchanged. The whole result is upper-cased before returning.
    """
    pairing = str.maketrans("atcg", "tagc")
    backwards = dna.lower()[::-1]
    return backwards.translate(pairing).upper()

In [17]:
# Sanity check: the sample strand must map to the expected reverse complement.
sample_input = "AAAACCCGGT"
expected_output = "ACCGGGTTTT"
output = complement_strand(sample_input)
assert output == expected_output

In [20]:
# Load the dataset file and drop surrounding whitespace before processing.
fn = "data/rosalind_revc.txt"
with open(fn, "r") as file:
    dna_string = file.read().strip()

# Print the result of complement_strand for the dataset.
print(complement_strand(dna_string))

CGAACTGTTGCCGTTGTGGGTAAATCTTAGTACCGCTCTTTGTTCAAGATAAGCGATTCTTTACCACATCACCCCACAGACTAGCTTACAGGTAATTCACCAAGCCCTGCTGCATACCATTCAGTTGTTTGTAGCTGACGAAATCGTGAATTAATCGTGCGCCTGCCTAATTGAAGAACTAATCCACCCAGAGCGCACGAGCCTCCATGGTAAGTTCCGCATAGTTTGATACGTCCATCTCCCATTGGATTCCTAACAGGTTCGTGTCACGTCCGAGAACCTAACGGATGTGAATAGCTCACATTGATGCCGCTAGCAGGGTCCCAGAAGCAACGGGGCCGTGGCAATTTCGCCAGTTATGCGATAGGCCTATTACGGATAGTCCAGTTGCAGGGCCACTGGTCGAGGATTCGGGCGTTGCCGATAAGGTAATCAAGGGCCGCGGCGGTAGGTTCTGTCCCACCATTAGTTTTGCCTCGTACGTGGCGCTTCGACTAATCATACCTGTCCTACGTTCGGTGCCCCTCATCTATTCTCCTTGCTGGATGGCATTAGAGGACTACGGTGTTGTTTCGCAACAATCAGGTTTGGCTAGTAATGGTCCGTTCAAAGGATCCACTCTAATTAATCTAATAGTTCGGCTGACTACGATTAACCAACGGCGGGTACCACTTTTCTAGTGGTTGCTAAAAATCCACTTTGCACGTAGACAGTTCCGCCACTCGGACCAAGAATACCGGCTAGCGAGCACTTTTTGACCGTCAGGATGCTGCGTACTGGGGTCACATGTGGCTCTCCAGACGTGTACGATCTAAGACAGTTTGCCAAGTAGTCCCGTCGGATTACGGGCTGGCCTAGTGCGTACTATGTTAGTTTGCTAGTCTACGTAAGACTGTCGTGGATGTTCTACAGCCCGAATTTCAGAGTGATTCACTGCTTACTCCTACTATAGATATAA


# Computing GC Content

In [24]:
def get_gc_content(input_string):
    """Find the FASTA record with the highest GC content.

    The text is split on ">" into records. The first line of each record is
    its label; the remaining lines are concatenated into the sequence. GC
    content is the percentage of upper-case "C" and "G" characters in the
    sequence, rounded to 6 decimal places.

    Args:
        input_string: FASTA-formatted text. Any line-ending convention
            ("\\n", "\\r\\n", "\\r") is accepted, and whitespace around
            each line is ignored.

    Returns:
        A two-line string: the label of the winning record, a newline, and
        its GC percentage. On a tie the earliest record wins.

    Raises:
        ValueError: If no record with a non-empty sequence is found.
    """
    best_label = None
    best_gc = None

    for record in input_string.split(">"):
        # splitlines() copes with "\r\n" as well as "\n", so a stray "\r"
        # never ends up in the label or inflates the sequence length.
        lines = record.strip().splitlines()
        if not lines:
            continue

        label = lines[0].strip()
        sequence = "".join(line.strip() for line in lines[1:])
        if not sequence:
            # Header without sequence data: nothing to measure, and
            # dividing by its zero length would fail.
            continue

        gc_count = sequence.count("C") + sequence.count("G")
        gc = round(gc_count / len(sequence) * 100, 6)

        # Strict ">" keeps the first record when several share the maximum.
        if best_gc is None or gc > best_gc:
            best_label = label
            best_gc = gc

    if best_gc is None:
        raise ValueError("no FASTA record with sequence data found")

    return f"{best_label}\n{best_gc}"

In [27]:
# Sanity check: three FASTA records; the expected output names the record
# with the highest GC content followed by its percentage.
sample_input = """>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT"""
expected_output = """Rosalind_0808
60.91954"""
output = get_gc_content(sample_input)
print(output)
assert output == expected_output

Rosalind_0808
60.91954


In [28]:
# Load the FASTA dataset file and drop surrounding whitespace.
fn = "data/rosalind_gc.txt"
with open(fn, "r") as file:
    gc_string = file.read().strip()

# Print the label and GC percentage reported by get_gc_content.
print(get_gc_content(gc_string))

Rosalind_9583
53.139535


# Finding a Motif in DNA

In [23]:
def get_sub_str_idx(d):
    """List every position at which a motif occurs in a string.

    *d* must contain exactly two whitespace-separated tokens: the string to
    search followed by the motif to look for. Overlapping occurrences are
    all reported.

    Returns:
        The 1-based start positions in ascending order, joined by single
        spaces (an empty string when the motif never occurs).
    """
    haystack, motif = d.split()

    hits = [
        offset + 1  # report 1-based positions
        for offset in range(len(haystack))
        if haystack.startswith(motif, offset)
    ]

    return " ".join(map(str, hits))

In [24]:
# Sanity check: first line is the string to search, second line is the motif.
sample_input = """GATATATGCATATACTT
ATAT"""
expected_output = "2 4 10"
output = get_sub_str_idx(sample_input)
print(output)
assert output == expected_output

2 4 10


In [25]:
# Load the dataset file and drop surrounding whitespace.
fn = "data/rosalind_subs.txt"
with open(fn, "r") as file:
    dna = file.read().strip()

# Print the 1-based motif positions for the dataset.
print(get_sub_str_idx(dna))

6 23 78 95 129 147 154 161 200 342 349 400 433 451 458 501 510 527 534 558 595 613 665 702 759 796 803 812 853 860 867 885
