In [1]:
%%time
# Input file: one Rmap per line, formatted as "<name> <fragment> <fragment> ...".
RMAP_PATH = 'omsim_ecoli_05_015'
# Number of leading lines to read from the file.
# NOTE(review): 8001 (not a round 8000) is what the previous counter check
# actually loaded and what the recorded output reports -- confirm it is intended.
MAX_RMAPS = 8001

# Rmap name -> list of fragment values (floats)
rmap_data = {}
# `with` guarantees the handle is closed even if a line fails to parse.
with open(RMAP_PATH, 'r') as f:
    for n, line in enumerate(f):
        if n >= MAX_RMAPS:
            break
        # split() with no argument tolerates repeated/trailing whitespace and
        # strips the newline, so float() never sees an empty or '\n' token.
        fields = line.split()
        if not fields:
            # Blank line: nothing to record.
            continue
        rmap_data[fields[0]] = list(map(float, fields[1:]))
print(len(rmap_data.keys()), "Rmaps,", sum(map(len, rmap_data.values())), "Fragments.")

8001 Rmaps, 232626 Fragments.
CPU times: user 64 ms, sys: 12 ms, total: 76 ms
Wall time: 83.7 ms


In [2]:
%%time
import itertools

def create_kmers(li, k=5):
    '''
    Return every contiguous window of length k from li, in order, as tuples.

    The result is an empty list when li has fewer than k elements.
    '''
    window_count = len(li) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(tuple(li[start:start + k]))
    return kmers
# Build the lookups used by the clustering step:
#   rmap_to_k : rmap name -> list of that rmap's kmers, in positional order
#   kmer_to_r : kmer -> name of the rmap it was cut from
#   kmer_list : flat list of every kmer; this is what is passed to DBSCAN
#
# kmer_to_r can remember only one rmap per kmer, so the whole pipeline assumes
# that no two kmers are exactly equal -- neither inside one rmap nor across
# different rmaps. The assertion below enforces that assumption globally.
rmap_to_k = {}
kmer_to_r = {}
kmer_list = []

# rmap_data is a dict, so every rmap name is seen exactly once here; no
# duplicate-name check is needed at this stage.
for rmap, values in rmap_data.items():
    kmrs = create_kmers(values)
    for kmer in kmrs:
        # Checking against kmer_to_r (rather than only within kmrs) also catches a
        # kmer shared by two rmaps, which would otherwise silently overwrite the
        # earlier owner and mis-attribute the kmer when clusters are grouped.
        assert kmer not in kmer_to_r, "Error: A duplicate kmer was encountered."
        kmer_to_r[kmer] = rmap
    rmap_to_k[rmap] = kmrs
    kmer_list.extend(kmrs)
print(len(kmer_list), "Kmers")

200622 Kmers
CPU times: user 276 ms, sys: 4 ms, total: 280 ms
Wall time: 281 ms


In [3]:
%%time
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KDTree
import numpy as np
import sklearn
import itertools
from operator import itemgetter
from collections import defaultdict
from pprint import pprint

# -*- Params here -*-
alpha = 10    # passed to DBSCAN as min_samples
epsilon = .3  # passed to DBSCAN as eps

# fit_predict yields one cluster label per entry of kmer_list, in the same order.
cluster_labels = DBSCAN(
    eps=epsilon,
    min_samples=alpha,
    n_jobs=-1,
    metric='euclidean',
).fit_predict(kmer_list)

# label -> set of rmap names / set of kmers that landed in that cluster
clusters_rmap = defaultdict(set)
clusters_kmer = defaultdict(set)

# Bucket every kmer under its label; label -1 (DBSCAN's noise label) is skipped.
for label, kmer in zip(cluster_labels, kmer_list):
    if label == -1:
        continue
    clusters_kmer[label].add(kmer)
    clusters_rmap[label].add(kmer_to_r[kmer])


CPU times: user 4.38 s, sys: 280 ms, total: 4.66 s
Wall time: 3.64 s


In [5]:
from collections import Counter
def kmers_between(r1, r2):
    '''
    Collect the kmers that tie two rmaps together.

    Walks every cluster in clusters_rmap that contains both r1 and r2 and, from
    the kmers of those clusters, keeps the ones that belong to r1 or r2.
    Returns a list of (rmap name, kmer) tuples.
    '''
    shared = []
    for label, members in clusters_rmap.items():
        # Only clusters in which both rmaps appear are of interest.
        if r1 not in members or r2 not in members:
            continue
        for kmer in clusters_kmer[label]:
            owner = kmer_to_r[kmer]
            if owner == r1 or owner == r2:
                shared.append((owner, kmer))
    return shared

def rangeOf(rdata):
    '''
    Locate a kmer inside the values of its rmap.

    rdata is a (name, kmer) pair, where name is a key of rmap_data and kmer is a
    sequence of values. Returns the half-open index range (start, end) of the
    first position at which kmer occurs contiguously in rmap_data[name], or None
    when it does not occur. An empty kmer matches at index 0.
    Raises KeyError if name is not in rmap_data.
    '''
    name, kmer = rdata
    values = rmap_data[name]
    width = len(kmer)
    # Naive window scan. Only start positions that leave room for a full kmer are
    # tried, so a partial match near the end can no longer index past the list.
    for start in range(len(values) - width + 1):
        if all(values[start + offset] == kmer[offset] for offset in range(width)):
            return (start, start + width)
    return None

# Exploratory data analysis

# Show each kmer linking the two rmaps together with its index range in the rmap.
k = kmers_between('Rmap_2029', 'Rmap_3861')
print("Rmaps with kmer and it's range")
pprint([(entry, rangeOf(entry)) for entry in k])

# Count, for every pair of rmaps, how many clusters contain both of them.
# Each cluster is sorted first so that a given pair is always produced in the same
# (lexicographic) order and therefore lands on the same Counter key.
rmap_relations = Counter(
    pair
    for cluster in clusters_rmap.values()
    for pair in itertools.combinations(sorted(cluster), 2)
)
print("Printing matches found")
pprint(rmap_relations.most_common(20))


Rmaps with kmer and it's range
[(('Rmap_2029', (11.574, 4.465, 13.174, 4.794, 3.264)), (2, 7)),
 (('Rmap_3861', (11.611, 4.62, 13.296, 4.827, 3.385)), (8, 13)),
 (('Rmap_3861', (4.62, 13.296, 4.827, 3.385, 2.81)), (9, 14)),
 (('Rmap_2029', (4.465, 13.174, 4.794, 3.264, 2.97)), (3, 8)),
 (('Rmap_2029', (13.174, 4.794, 3.264, 2.97, 4.454)), (4, 9)),
 (('Rmap_3861', (13.296, 4.827, 3.385, 2.81, 4.679)), (10, 15)),
 (('Rmap_3861', (4.827, 3.385, 2.81, 4.679, 3.379)), (11, 16)),
 (('Rmap_2029', (4.794, 3.264, 2.97, 4.454, 3.347)), (5, 10)),
 (('Rmap_2029', (3.264, 2.97, 4.454, 3.347, 7.346)), (6, 11)),
 (('Rmap_3861', (3.385, 2.81, 4.679, 3.379, 7.581)), (12, 17)),
 (('Rmap_2029', (2.97, 4.454, 3.347, 7.346, 2.092)), (7, 12)),
 (('Rmap_3861', (2.81, 4.679, 3.379, 7.581, 2.131)), (13, 18)),
 (('Rmap_3861', (4.679, 3.379, 7.581, 2.131, 4.523)), (14, 19)),
 (('Rmap_2029', (4.454, 3.347, 7.346, 2.092, 4.559)), (8, 13)),
 (('Rmap_2029', (3.347, 7.346, 2.092, 4.559, 24.342)), (9, 14)),
 (('Rmap_3

In [5]:
'''
Random code here.
'''
# NOTE(review): everything below is commented-out scratch code. The only thing this
# cell executes is the string literal above, whose repr is echoed as the cell output.
# The commented lines reference names (kmap, clusters, clustering, pformat) that are
# not defined or imported in any cell visible here, so they would not run as-is if
# uncommented. Consider deleting this cell once it is no longer needed.
# kd = KDTree(np.array(kmer_list))
# results = kd.query_radius(kmer_list, .5)
# clusts = set(frozenset(map(lambda x: kmer_list[x], r)) for r in results if len(r) > 1)
# clusts = set(frozenset(map(lambda x: kmap[kmer_list[x]], r)) for r in results if len(r) > 1)
# clusts = set(frozenset(r) for r in results if len(r) > 1)
# print(len(clusts))
# rmap_relations = defaultdict(int)
# for s in clusts:
#     for i_j in itertools.combinations(s, 2):
#         rmap_relations[i_j] += 1
# pprint(rmap_relations)
# pprint()
# print(clusters)
# clust2 = set(frozenset(k) for k in clusters.values())
# f = open('yeet.txt', 'w')
# f2 = open('yeet2.txt', 'w')
# f3 = open('y3.txt', 'w')
# f.write(pformat(clusts))
# f2.write(pformat(clust2))
# f3.write(pformat(clusts ^ clust2))
# f.close()
# f2.close()
# f3.close()
# print(sklearn.metrics.silhouette_score(kmer_list, clustering, metric='euclidean'))


'\nRandom code here.\n'