In [None]:
%matplotlib inline
#%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
import pickle
import sys
from collections import defaultdict
from scipy import spatial
from Bio import pairwise2
from tqdm import tqdm_notebook
from glob import glob

# Add the parent directory to the import path (once) so that the `utils`
# module imported below can be found -- presumably it lives one level up;
# verify against the repository layout.
if '../' not in sys.path:
    sys.path.append('../')

# Must stay after the sys.path adjustment above.
from utils import number2patten

# Use matplotlib's 'ggplot' style sheet for the figures created afterwards.
plt.style.use('ggplot')

### Load Embeddings

In [None]:
# k-mer lengths assumed to be stored in the embedding matrix, smallest first
# (TODO confirm against the code that produced the pickle).
kmer_sizes = np.array([3, 4, 5])
# Cumulative row offsets [0, 4**3, 4**3 + 4**4, ...]:
# index_offset[j]:index_offset[j + 1] spans 4**kmer_sizes[j] rows.
index_offset = np.concatenate(([0], (4**kmer_sizes))).cumsum()

glob_str = '../max5_min3_mers_10padding_64embedding_epoch1_batch*.pickle'
# Map batch number -> checkpoint path. The extension is sliced off explicitly:
# str.rstrip('.pickle') removes a *set of characters*, not a suffix, and only
# happened to work because digits are not part of that set.
emb_files = {
    int(f[:-len('.pickle')].rsplit('batch', 1)[1]): f
    for f in glob(glob_str)
}
if not emb_files:
    # Fail with a readable message instead of an IndexError further down.
    raise FileNotFoundError('No embedding files match {!r}'.format(glob_str))

# The checkpoint with the highest batch number is taken as the latest one.
latest_emb_file = emb_files[max(emb_files)]
print(latest_emb_file)

# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by a trusted training run.
with open(latest_emb_file, 'rb') as emb_fh:
    kmer_emb = pickle.load(emb_fh)
print('All:', kmer_emb.shape)

# Get 5-mers
kmer_emb = kmer_emb[index_offset[2]:index_offset[3]]
print('Only 5mers:', kmer_emb.shape)

### Computing Needleman-Wunsch Score and Cosine Similarity

In [None]:
def get_comb(kmer_size=5):
    """Lazily enumerate all unordered pairs of distinct k-mer indices.

    Indices range over ``range(4**kmer_size)``. Each yielded tuple is
    ``(low, high)`` with ``low < high``, produced in lexicographic order.
    """
    total = 4**kmer_size
    yield from (
        (low, high)
        for low in range(total)
        for high in range(low + 1, total)
    )

kmer_size = 5
num_kmers = 4**kmer_size
# Number of unordered pairs of distinct k-mers: n * (n - 1) / 2.
combinations = (num_kmers * (num_kmers - 1)) // 2
global_align_scores = np.zeros(combinations, dtype=np.int8)
cosine_similarity = np.zeros(global_align_scores.size)

# Fail fast if the embedding matrix does not hold one row per k-mer index;
# otherwise the loop below would index the wrong rows or crash midway.
if len(kmer_emb) != num_kmers:
    raise ValueError('Expected {} embeddings for {}-mers, got {}'.format(
        num_kmers, kmer_size, len(kmer_emb)))

# Decode every k-mer index once instead of twice per pair.
# Assumes number2patten is a pure index -> sequence conversion -- TODO confirm.
kmer_patterns = [number2patten(num, kmer_size) for num in range(num_kmers)]

for i, (num_seq1, num_seq2) in tqdm_notebook(enumerate(get_comb(kmer_size)), total=combinations):
    seq1 = kmer_patterns[num_seq1]
    seq2 = kmer_patterns[num_seq2]
    # score_only=True returns just the best global alignment score, so the
    # alignments themselves are never built and collected.
    global_align_scores[i] = pairwise2.align.globalxx(seq1, seq2, score_only=True)
    # scipy returns the cosine *distance*; convert it to a similarity.
    cosine_similarity[i] = 1 - spatial.distance.cosine(kmer_emb[num_seq1], kmer_emb[num_seq2])

# Saving vectors to file (np.save appends the '.npy' extension).
np.save('global_align_scores_{}-mers'.format(kmer_size), global_align_scores)
np.save('cosine_similarity_{}-mers'.format(kmer_size), cosine_similarity)

In [None]:
# Read both score vectors back from their .npy files.
global_align_scores, cosine_similarity = (
    np.load(name_template.format(kmer_size))
    for name_template in ('global_align_scores_{}-mers.npy',
                          'cosine_similarity_{}-mers.npy')
)

### Plotting Needleman-Wunsch Score vs. Emb. Cosine Similarity

In [None]:
# Bucket the cosine similarities by the alignment score of the same pair.
box_data = defaultdict(list)
for score, similarity in zip(global_align_scores, cosine_similarity):
    box_data[score].append(similarity)

# Scores in ascending order, with the similarity list belonging to each one.
box_data_labels = sorted(box_data)
box_data_lists = [box_data[label] for label in box_data_labels]

In [None]:
# One box per alignment score, showing the distribution of the embedding
# cosine similarities of all k-mer pairs that share that score.
fig, ax = plt.subplots()
ax.boxplot(box_data_lists, labels=box_data_labels)
# Axis label spelling corrected: the algorithm is Needleman-Wunsch.
ax.set_xlabel('Needleman-Wunsch Score')
ax.set_ylabel('Emb. Cosine Similarity')
plt.show()