In [92]:
import urllib.request
import math
import numpy
import sys

In [32]:
def dist(a_comp_vector, b_comp_vector):
    d = [((a_comp_vector[0] - b_comp_vector[0])**2), ((a_comp_vector[1] - b_comp_vector[1])**2), 
         ((a_comp_vector[2] - b_comp_vector[2])**2)]
    return round(math.sqrt(0.25*(d[0] + d[1] + d[2])), 2)

In [2]:
def readGenome(filename):
    """
    readGenome is a function that opens a Fasta format file available in the wd and reads the sequences on it.
    :param filename: name of the file on the wd.
    :return: a dictionary with genome name as key and the sequence as keyvalue.
    """
    # dna_sequences_by_name_dict is a dictionary with keys: genome name, values: sequence.
    dna_sequences_by_name_dict = dict()
    # Opens the file listed in filenames.
    with open(filename, 'r') as f:
        # for each line on the file:
        for line in f:
            # removes the entry key.
            line = line.strip()
            # if the line starts with > the key value will be empty, while the key will take the value of the line.
            if line[0] == '>':
                genome = ''
                sequence_name = line
            # if the line do not starts with > the keyvalue will take the line value and will join all of the next strings until next >.
            if line[0] != '>':
                genome += line
            dna_sequences_by_name_dict[sequence_name] = genome
    # returns the dictionary with all the read sequences of the file.
    return dna_sequences_by_name_dict

In [89]:
#for sequence_name, dna_sequence in translate_seq.items():
def composition_vector(dna_sequence):
    
    content_A = float(dna_sequence.count('A'))
    content_C = dna_sequence.count('C')
    content_G = dna_sequence.count('G')
    content_T = dna_sequence.count('T')
    length = float(content_A + content_C + content_G + content_T)
    return (
        round(float((content_A/length)), 2),
        round(float(content_C/length), 2),
        round(float(content_G/length), 2),
        round(float(content_T/length), 2)
    )

In [3]:
urls =["http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/simple1.fa",
    "http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/simple2.fa",
    "http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/human.fa",
    "http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/mouse.fa",
    "http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/fly.fa",
    "http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/yeast.fa",
    "http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/ecoli.fa",
    "http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/plasmodium.fa",
    "http://www.csc.kth.se/utbildning/kth/kurser/DD2404/appbio12/labs/lab2/thermus.fa"]
filenames = []
for gene in urls:
    filenames.append(gene.split('lab2/')[1])

url_by_genome_name = dict(zip(filenames, urls))

In [4]:
# for key, keyvalue in the dictionary items.
for filename, url in url_by_genome_name.items():
    with urllib.request.urlopen(url) as url:
        gene = url.read()
        gene = gene.strip().decode("utf-8")
    with open(filename, 'w') as f_out:
        f_out.write(gene)
       

In [45]:
# for filename in filenames:
#     read_gene =readGenome('simple1.fa')
#     read_gene2 = readGenome('simple2.fa')
#filenames_list = ['simple1.fa', 'simple2.fa', 'human.fa', 'mouse.fa']

In [111]:
dna_sequence_by_name = dict()
for filename in filenames_list:
    translate_seq = readGenome(filename)
    dna_sequence_by_name = {**dna_sequence_by_name, **translate_seq}
composition_dna_sequence_by_name = {
    sequ_name[:10]: composition_vector(dna_sequence) for sequ_name, dna_sequence in dna_sequence_by_name.items()
}

In [112]:
composition_dna_sequence_by_name

{'>simple1': (0.25, 0.25, 0.25, 0.25),
 '>simple2': (0.3, 0.2, 0.2, 0.3),
 '>Hsapiens ': (0.28, 0.23, 0.21, 0.28),
 '>Mus_muscu': (0.28, 0.22, 0.22, 0.28)}

In [119]:
my_little_matrix = numpy.zeros((len(composition_dna_sequence_by_name), len(composition_dna_sequence_by_name)))
for index, sequence_vector_a in enumerate(composition_dna_sequence_by_name.values()):
    for index_2, sequence_vector_b in enumerate(composition_dna_sequence_by_name.values()):
        my_little_matrix[index,index_2] = dist(sequence_vector_a,sequence_vector_b)
for index, sequence_name in enumerate(composition_dna_sequence_by_name.keys()):
    print(sequence_name, my_little_matrix[index])
        

    

>simple1 [0.   0.04 0.03 0.03]
>simple2 [0.04 0.   0.02 0.02]
>Hsapiens  [0.03 0.02 0.   0.01]
>Mus_muscu [0.03 0.02 0.01 0.  ]


In [44]:
if __name__ == '__main__':
    # filenames_list = list of entered arguments from command line.
    filenames_list = sys.argv[1:]
    dna_sequence_by_name = dict()
    for filename in filenames_list:
        translate_seq = readGenome(filename)
        dna_sequence_by_name = {**dna_sequence_by_name, **translate_seq}
    composition_dna_sequence_by_name = {
        sequ_name[:10]: composition_vector(dna_sequence) for sequ_name, dna_sequence in dna_sequence_by_name.items()
    }
    

[]


In [118]:
my_little_matrix

array([[0.  , 0.04, 0.03, 0.03],
       [0.04, 0.  , 0.02, 0.02],
       [0.03, 0.02, 0.  , 0.01],
       [0.03, 0.02, 0.01, 0.  ]])

In [64]:
new_dna_sequence_by_name = {seq_name: value**2 for seq_name, value in dna_sequence_by_name.items()}
new_dna_sequence_by_name

{'my_little_baby_0': 0, 'my_little_baby_1': 1, 'my_little_baby_2': 4}

In [61]:
dna_sequence_by_name

{'my_little_baby_0': 0, 'my_little_baby_1': 1, 'my_little_baby_2': 2}

In [85]:
read_gene['>simple1']

'ACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNNNNNNNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT'

In [90]:
composition_vector(read_gene['>simple1'])

(0.25, 0.25, 0.25, 0.25)

In [120]:
dna_sequence_by_name = dict()
for i in range(3):
    print('check our dna sequence dict before adding new one')
    print(dna_sequence_by_name)
    
    new_dict = {f'my_little_baby_{i}': i}
    dna_sequence_by_name = {**new_dict,**dna_sequence_by_name }
    
    print('check our dna sequence dict after adding new one')
    print(dna_sequence_by_name)
    print(' ')

check our dna sequence dict before adding new one
{}
check our dna sequence dict after adding new one
{'my_little_baby_0': 0}
 
check our dna sequence dict before adding new one
{'my_little_baby_0': 0}
check our dna sequence dict after adding new one
{'my_little_baby_1': 1, 'my_little_baby_0': 0}
 
check our dna sequence dict before adding new one
{'my_little_baby_1': 1, 'my_little_baby_0': 0}
check our dna sequence dict after adding new one
{'my_little_baby_2': 2, 'my_little_baby_1': 1, 'my_little_baby_0': 0}
 
