# Overlap Graphs
**Given:** A collection of DNA strings in FASTA format having total length at most 10 kbp.

**Return:** The adjacency list corresponding to $O_3$. You may return edges in any order.


# Sample Dataset

In [1]:
%%file Sample_Dataset.txt
>Rosalind_0498
AAATAAA
>Rosalind_2391
AAATTTT
>Rosalind_2323
TTTTCCC
>Rosalind_0442
AAATCCC
>Rosalind_5013
GGGTGGG


Overwriting Sample_Dataset.txt


# Sample Output

In [2]:
%%file Sample_Output.txt
Rosalind_0498 Rosalind_2391
Rosalind_0498 Rosalind_0442
Rosalind_2391 Rosalind_2323



Overwriting Sample_Output.txt


# Solution

In [3]:
import collections

def parseFastaFile(fasta_file_path):
    """Parse a FASTA file into an ordered mapping of record name to sequence.

    A line starting with ">" begins a new record; the rest of that line
    (trailing whitespace stripped) is the record name. Every following
    non-header line is stripped of trailing whitespace and appended to the
    current record's sequence, so multi-line sequences are joined.

    Blank lines are ignored. A repeated header name restarts that record
    with an empty sequence.

    Args:
        fasta_file_path: Path of the FASTA file to read.

    Returns:
        collections.OrderedDict mapping record names to sequences, in the
        order the names first appear in the file.

    Raises:
        ValueError: If sequence data appears before the first header line.
    """
    fasta_records = collections.OrderedDict()
    current_name = None

    # The context manager closes the file even if parsing raises.
    with open(fasta_file_path, 'r') as fasta_file:
        for raw_line in fasta_file:
            line = raw_line.rstrip()
            if not line:
                continue
            if line.startswith(">"):
                current_name = line[1:]
                fasta_records[current_name] = ""
            elif current_name is None:
                raise ValueError(
                    "Sequence data found before the first FASTA header in %s"
                    % fasta_file_path
                )
            else:
                # Append to the record named by the most recent header rather
                # than to whichever key happens to be last in the mapping.
                fasta_records[current_name] += line

    return fasta_records


In [4]:
import itertools

def getOverlapGraph(fasta_records, n = 3):
    """Return the adjacency list of the overlap graph O_n for the given records.

    There is a directed edge (s, t) when the length-n suffix of record s
    equals the length-n prefix of record t and s and t are different
    records (compared by name, so a record never links to itself).

    Args:
        fasta_records: Mapping of record name to sequence string.
        n: Overlap length. Defaults to 3.

    Returns:
        List of (from_name, to_name) tuples. Edges are ordered by the
        source record's position in fasta_records, then by the target
        record's position.

    Raises:
        ValueError: If n is less than 1.
    """
    if n < 1:
        raise ValueError("overlap length n must be a positive integer")

    # Index record names by their length-n prefix so each suffix is a single
    # lookup. Sequences shorter than n have no length-n prefix or suffix and
    # are left out (plain slicing would silently compare truncated strings).
    names_by_prefix = {}
    for name, sequence in fasta_records.items():
        if len(sequence) >= n:
            names_by_prefix.setdefault(sequence[:n], []).append(name)

    adjacency_list = []
    for from_sequence_name, sequence in fasta_records.items():
        if len(sequence) < n:
            continue
        for to_sequence_name in names_by_prefix.get(sequence[-n:], ()):
            if to_sequence_name != from_sequence_name:
                adjacency_list.append((from_sequence_name, to_sequence_name))

    return adjacency_list


In [5]:
def getOverlapGraphFromFileToFile(input_file_path, output_file_path, n = 3):
    """Read FASTA records from a file and write their overlap graph to a file.

    Wraps parseFastaFile and getOverlapGraph. Each edge is written as
    "from_name to_name" on its own line, followed by a final newline.

    Args:
        input_file_path: Path of the FASTA file to read.
        output_file_path: Path of the text file to write (overwritten).
        n: Overlap length passed through to getOverlapGraph. Defaults to 3.

    Returns:
        None.
    """
    # Build the full result before touching the output file, so a failure
    # while reading or parsing does not leave a truncated output behind.
    adjacency_list = getOverlapGraph(parseFastaFile(input_file_path), n)
    output_string = "\n".join(" ".join(graph_edge) for graph_edge in adjacency_list)

    # The context manager closes the file even if the write fails.
    with open(output_file_path, 'w') as output_file:
        output_file.write("%s\n" % output_string)


# Test Solution

In [6]:
# Run the solution on Sample_Dataset.txt and write the edges to Test_Output.txt.
getOverlapGraphFromFileToFile("Sample_Dataset.txt", "Test_Output.txt")

In [7]:
%%bash
# Show the file name, MD5 digest, and contents of Sample_Output.txt.
echo Sample_Output.txt
md5sum Sample_Output.txt
cat Sample_Output.txt

Sample_Output.txt
3c576ec187807fb487117153703216b5  Sample_Output.txt
Rosalind_0498 Rosalind_2391
Rosalind_0498 Rosalind_0442
Rosalind_2391 Rosalind_2323


In [8]:
%%bash
# Show the file name, MD5 digest, and contents of Test_Output.txt.
echo Test_Output.txt
md5sum Test_Output.txt
cat Test_Output.txt

Test_Output.txt
3c576ec187807fb487117153703216b5  Test_Output.txt
Rosalind_0498 Rosalind_2391
Rosalind_0498 Rosalind_0442
Rosalind_2391 Rosalind_2323


In [9]:
%%bash
# Compare the MD5 digests of Sample_Output.txt and Test_Output.txt.
# The digests are captured and quoted: with unquoted substitutions, two failed
# md5sum calls reduce the test to `[ == ]`, which is true and reports a match.
sample_md5=$(md5sum Sample_Output.txt | cut -f1 -d' ')
test_md5=$(md5sum Test_Output.txt | cut -f1 -d' ')
if [ -n "$sample_md5" ] && [ "$sample_md5" = "$test_md5" ]
then
    echo Sample output matches test output.
else
    echo Sample output does not Match test output.
fi

Sample output matches test output.


# Downloaded Dataset

In [10]:
%%bash
# Copy rosalind_grph.txt from ~/Downloads into the current directory and print it.
cp ~/Downloads/rosalind_grph.txt ./
cat rosalind_grph.txt

>Rosalind_5452
AGAAGGACGTGTAGCCAAACTGAGTACCTCCCCGAATAGGGAGATCCCTACGTCAAACGC
TGAGCTGGCAAACTTTCCATTTTTATGTTTGCT
>Rosalind_8325
TGCGTTTGGCTTAGCTAGTTCAGCAGATTGCCGTATTAATTGGTTCCAGCTGTCCAATCC
TGATTCAGGATTCCCTCCGAGTCCGATTTATTAGCCGCC
>Rosalind_7543
TTTTAAGAACTGTTCGCAGCACAGGGTTGCACGAAGCTATAGCGTGCTAGGGATAATCTA
AAAGGACCGCTCAACTAGCTTC
>Rosalind_9575
TATTGTAGGACTTAAACGCACGCTTAGTTAGGCCCGTACCACTGCTATACTCGGCATGAA
CCTCTCAACCTGTGAGATGATTTAATT
>Rosalind_5189
ATCTCTGACTATCTCGCCACCTTCGTGACATTGACCAGTGTAGGCTTGAGAAGACCGTCG
TCAGGGGCCGGTCCTGAAGCTACCGCTTTGATTCGT
>Rosalind_6489
GGCGTTGACTACGGAAGAGAAGGCTCCCCTAATACCACAAACACCTTGTCATGTTGTAGT
GGAGTCAAGCCTTTCGATGGTCGGTATCAGAGGTATT
>Rosalind_5708
GAGTGGACTGGCGAAATCCACTTTTAGTGCTTAAACACATGAGGAGAGAGTTTGCATTAA
TGATCGCTTCGCACATCTTTGTAAAACATCTTTCAGAGA
>Rosalind_2003
CTCCGATTGCGACTCGGCGGTGGACAATCCCGGTGGTAGACCAGCCATTATGTTGCAGTA
GACTCCTGCCGACCCTCCGAGACTAAGCCTCC
>Rosalind_6605
AGTCTCGCGGAGAGAGGTGTCCCGAGAAAACCATCTGACTCCCCTTATACAGAGAATCAC
GAACAAGTTAGTAGTCCAGCAAAGCGAAGCAATAC
>Rosali

# Solution to Downloaded Dataset

In [11]:
# Run the solution on rosalind_grph.txt and write the edges to Solution_Output.txt.
getOverlapGraphFromFileToFile("rosalind_grph.txt", "Solution_Output.txt")

In [12]:
%%bash
# Print the contents of Solution_Output.txt.
cat Solution_Output.txt

Rosalind_5452 Rosalind_2327
Rosalind_5452 Rosalind_4665
Rosalind_8325 Rosalind_0257
Rosalind_7543 Rosalind_0456
Rosalind_5189 Rosalind_2190
Rosalind_5708 Rosalind_5452
Rosalind_5708 Rosalind_6206
Rosalind_5708 Rosalind_1315
Rosalind_2003 Rosalind_6571
Rosalind_2003 Rosalind_3251
Rosalind_8501 Rosalind_5755
Rosalind_8501 Rosalind_4248
Rosalind_3345 Rosalind_7543
Rosalind_2722 Rosalind_2327
Rosalind_2722 Rosalind_4665
Rosalind_5309 Rosalind_3015
Rosalind_5309 Rosalind_3828
Rosalind_5309 Rosalind_9214
Rosalind_4570 Rosalind_5708
Rosalind_4570 Rosalind_4015
Rosalind_4570 Rosalind_8844
Rosalind_4570 Rosalind_6465
Rosalind_4015 Rosalind_6605
Rosalind_4015 Rosalind_3402
Rosalind_2226 Rosalind_2003
Rosalind_2226 Rosalind_1886
Rosalind_2327 Rosalind_4665
Rosalind_1703 Rosalind_2226
Rosalind_1703 Rosalind_1084
Rosalind_1703 Rosalind_7151
Rosalind_1703 Rosalind_2101
Rosalind_1703 Rosalind_4817
Rosalind_7017 Rosalind_1884
Rosalind_7017 Rosalind_9315
Rosalind_7017 Rosalind_5408
Rosalind_9961 Rosali