This notebook uses the same similarity-comparison functions that are used for comparing the lists of embeddings. Here, however, the squiggles are normalised, split into chunks, and those chunks are passed directly to the comparison algorithm, bypassing the autoencoder. A positive result from this notebook indicates that the autoencoder is not capturing the relevant information from the squiggles.

In [1]:
from Settings import *
from HelperFunctions import loadReads, fullNormalise, getMinDist
import torch
import time
import json



In [2]:
# Load the raw squiggles together with their read ids. Later cells pair the two lists
# by position (ids[k] labels the k-th squiggle), so their order must not be changed.
squiggles, ids = loadReads()
ids  # display the read ids that were loaded

reading squiggles file...


['29fd6fb8-0f2f-4aa2-aa1a-c367f327acb6',
 '68b93e56-74fa-431b-9f40-a214a8bfc0b8',
 '7f7dfe85-7276-4a9d-ad84-2fa6dfa67635',
 '83e5572b-b883-4c88-a500-777144d48134',
 '8edd07ab-777f-4784-ab0b-c268772d3113',
 '93ab5242-dab6-4609-b82c-04746416b6f1',
 '9ee3897c-2329-4fc6-9ef3-d8eb97cd4919',
 'b61cb37c-ea80-4011-ac7a-6c853fd73eb6',
 'edd987c6-0a3f-4214-a630-a1883b4f0f1a',
 'f5e994e9-38a1-4079-adb5-bdacf43e0a17']

In [3]:
# Normalise every raw squiggle with fullNormalise and wrap the result in a 1-D torch tensor.
# NOTE: this rebinds `squiggles`, so re-running this cell on its own would feed the
# already-normalised tensors back into fullNormalise; re-run the loading cell first.
squiggles = [torch.tensor(fullNormalise(rawSignal)) for rawSignal in squiggles]
squiggles[0]  # preview the first normalised squiggle


tensor([ 0.5236,  0.5671,  0.6729,  ..., -0.7513, -0.8819, -0.7513],
       dtype=torch.float64)

In [4]:
# Slice each normalised squiggle into windows of `sequenceLength` samples, starting a new
# window every `nGramStep` samples (both values come from Settings). Each entry of
# `chunkedSquiggles` is therefore a 2-D tensor with one row per window.
chunkedSquiggles = [
    signal.unfold(dimension=0, size=sequenceLength, step=nGramStep)
    for signal in squiggles
]
chunkedSquiggles[0][0]  # first window of the first squiggle

tensor([0.5236, 0.5671, 0.6729, 0.5858, 0.4179, 0.6853, 0.6729, 0.7040, 0.6977,
        0.6542, 0.6480, 0.6604, 0.6356, 0.7848, 0.7662, 0.6915, 0.6542, 0.6853,
        0.7537, 0.7288, 0.6045, 0.5796, 0.7102, 0.6480, 0.6045, 0.5236, 0.5858,
        0.5298, 0.4987, 0.7537, 0.8159, 0.6977, 0.7351, 0.6542, 0.6915, 0.6418,
        0.7226, 0.6604, 0.7288, 0.6231, 0.6231, 0.4739, 0.5982, 0.6231, 0.6542,
        0.7040, 0.7662, 0.6915, 0.6356, 0.5671, 0.6853, 0.5920, 0.6542, 0.7102,
        0.5485, 0.6356, 0.6356, 0.6418, 0.6107, 0.6604, 0.5485, 0.5734, 0.6729,
        0.7164, 0.6791, 0.6853, 0.5858, 0.6231, 0.5858, 0.6977, 0.6542, 0.5236,
        0.5982, 0.7102, 0.4925, 0.5360, 0.5796, 0.5858, 0.5858, 0.6853, 0.5547,
        0.6729, 0.6604, 0.6231, 0.6666, 0.6169, 0.5547, 0.7973, 0.5858, 0.5734,
        0.6542, 0.5360, 0.6293, 0.5298, 0.5982, 0.6915, 0.7537, 0.7040, 0.6729,
        0.6915, 0.7351, 0.7351, 0.7599, 0.5485, 0.5858, 0.4987, 0.6853, 0.7413,
        0.6045, 0.6480, 0.5796, 0.6231, 

In [5]:
# Second window of the first squiggle. It starts nGramStep samples after window 0, so
# comparing it with the previous cell's output is a quick check of the window step.
chunkedSquiggles[0][1]

tensor([0.6045, 0.5796, 0.7102, 0.6480, 0.6045, 0.5236, 0.5858, 0.5298, 0.4987,
        0.7537, 0.8159, 0.6977, 0.7351, 0.6542, 0.6915, 0.6418, 0.7226, 0.6604,
        0.7288, 0.6231, 0.6231, 0.4739, 0.5982, 0.6231, 0.6542, 0.7040, 0.7662,
        0.6915, 0.6356, 0.5671, 0.6853, 0.5920, 0.6542, 0.7102, 0.5485, 0.6356,
        0.6356, 0.6418, 0.6107, 0.6604, 0.5485, 0.5734, 0.6729, 0.7164, 0.6791,
        0.6853, 0.5858, 0.6231, 0.5858, 0.6977, 0.6542, 0.5236, 0.5982, 0.7102,
        0.4925, 0.5360, 0.5796, 0.5858, 0.5858, 0.6853, 0.5547, 0.6729, 0.6604,
        0.6231, 0.6666, 0.6169, 0.5547, 0.7973, 0.5858, 0.5734, 0.6542, 0.5360,
        0.6293, 0.5298, 0.5982, 0.6915, 0.7537, 0.7040, 0.6729, 0.6915, 0.7351,
        0.7351, 0.7599, 0.5485, 0.5858, 0.4987, 0.6853, 0.7413, 0.6045, 0.6480,
        0.5796, 0.6231, 0.6666, 0.7102, 0.6791, 0.6915, 0.6729, 0.6729, 0.6666,
        0.5920, 0.6418, 0.6480, 0.6293, 0.6418, 0.5920, 0.5796, 0.4863, 0.5796,
        0.2624, 0.3868, 0.3806, 0.6045, 

In [6]:
# Same comparison code as in Tester.ipynb, but only comparing 1 set of vector sets
# (the raw normalised chunks) instead of autoencoder embeddings.
#
# Builds `similarities` as {read id -> {other read id -> score}}. The score for the
# ordered pair (A, B) is the sum, over every chunk of A, of getMinDist(chunk, all chunks
# of B). A -> B and B -> A are accumulated separately, so every ordered pair of distinct
# reads is evaluated.
# NOTE(review): presumably a lower score means the reads are more alike (the helper is
# called getMinDist) - confirm against HelperFunctions.
#
# Durations are measured with time.perf_counter(): it is monotonic, so the reported
# times cannot be skewed by system clock adjustments during this multi-hour run.
similarities = {}

outerStart = time.perf_counter()
for squiggleIndex, queryChunks in enumerate(chunkedSquiggles):
    print(ids[squiggleIndex])
    squiggleStart = time.perf_counter()
    innerDict = {}
    chunkCount = len(queryChunks)  # loop-invariant, looked up once per outer read

    for innerSquiggleIndex, referenceChunks in enumerate(chunkedSquiggles):
        if squiggleIndex == innerSquiggleIndex:
            continue  # a read is never compared with itself

        print(f"    comparing squiggle {squiggleIndex} with squiggle {innerSquiggleIndex}")
        innerSquiggleStart = time.perf_counter()
        otherId = ids[innerSquiggleIndex]

        # Accumulate in a local instead of re-hashing the dictionary key for every chunk.
        total = 0
        for e, chunk in enumerate(queryChunks):
            if e % 1000 == 0:
                print(f"        chunk {e}/{chunkCount}")
            total += getMinDist(chunk, referenceChunks)
        innerDict[otherId] = total

        print(f"    {squiggleIndex} to {innerSquiggleIndex} comparisons took {time.perf_counter()-innerSquiggleStart}s.")

    # innerDict is created fresh for every outer read, so it can be stored without copying.
    similarities[ids[squiggleIndex]] = innerDict

    print(f"All {ids[squiggleIndex]} comparisons took {time.perf_counter()-squiggleStart}s.")

print(f"All comparisons took {time.perf_counter()-outerStart}s.")

29fd6fb8-0f2f-4aa2-aa1a-c367f327acb6
    comparing squiggle 0 with squiggle 1
        chunk 0/12758
        chunk 1000/12758
        chunk 2000/12758
        chunk 3000/12758
        chunk 4000/12758
        chunk 5000/12758
        chunk 6000/12758
        chunk 7000/12758
        chunk 8000/12758
        chunk 9000/12758
        chunk 10000/12758
        chunk 11000/12758
        chunk 12000/12758
    0 to 1 comparisons took 1788.0688486099243s.
    comparing squiggle 0 with squiggle 2
        chunk 0/12758
        chunk 1000/12758
        chunk 2000/12758
        chunk 3000/12758
        chunk 4000/12758
        chunk 5000/12758
        chunk 6000/12758
        chunk 7000/12758
        chunk 8000/12758
        chunk 9000/12758
        chunk 10000/12758
        chunk 11000/12758
        chunk 12000/12758
    0 to 2 comparisons took 1460.9816801548004s.
    comparing squiggle 0 with squiggle 3
        chunk 0/12758
        chunk 1000/12758
        chunk 2000/12758
        chunk 3000/1

In [7]:
# Persist the nested {read id -> {other read id -> score}} dictionary as JSON.
# `default=float` is only consulted for values json cannot serialise natively. It lets
# single-element tensor / NumPy scalar scores (which getMinDist may return - TODO confirm)
# be written as plain numbers instead of raising TypeError after the long comparison run.
# NOTE(review): the file name is spelled "Similarties"; it is left unchanged because
# other code may read this exact path.
with open("SquiggleSimilarties.json", "w") as file:
    json.dump(similarities, file, default=float)
