In [1]:
from ont_fast5_api.fast5_interface import get_fast5_file
import pandas as pd
from Settings import *
from HelperFunctions import fullNormalise
import LSTMAutoencoder
import numpy as np
from numpy.linalg import norm
import time

Old test set (read IDs):
1 - 00014eb4-4e2c-4087-ac24-57dea735b7b4

2 - 601f7bc7-3c09-4a78-a9b7-097bddcde809

3 - 1b6939ba-4a35-4696-bc6f-2ddb3368266e

4 - 95c03c50-57a5-4092-9a5f-87182d06a12c

5 - b6d8845b-3eec-42f6-9510-b8ba1fc1ec44


1 <--> 2    90.1

3 <--> 4    91.2

4 <--> 5    95.9 or 90.1

New test set:

9ee3897c-2329-4fc6-9ef3-d8eb97cd4919 --> 29fd6fb8-0f2f-4aa2-aa1a-c367f327acb6

f5e994e9-38a1-4079-adb5-bdacf43e0a17 --> 83e5572b-b883-4c88-a500-777144d48134

b61cb37c-ea80-4011-ac7a-6c853fd73eb6 --> 68b93e56-74fa-431b-9f40-a214a8bfc0b8

93ab5242-dab6-4609-b82c-04746416b6f1 --> 8edd07ab-777f-4784-ab0b-c268772d3113

edd987c6-0a3f-4214-a630-a1883b4f0f1a --> 7f7dfe85-7276-4a9d-ad84-2fa6dfa67635


In [2]:
# Location of the multi-read fast5 file holding the test squiggles.
filepath = "../../similar_testdata/similar_squiggles.fast5"

def loadReads():
    """Collect the raw signal and read id of every wanted read in ``filepath``.

    A read is kept when its ``read_id`` is contained in ``subset``
    (not defined in this notebook; presumably supplied by the star import
    from ``Settings`` -- TODO confirm).

    Returns
    -------
    tuple(list, list)
        ``(reads, ids)``: the raw data of each kept read and, at the same
        index, its read id, in the order the file yields them.
    """
    print("reading squiggles file...")

    selected = []
    with get_fast5_file(filepath, mode="r") as f5:
        for read in f5.get_reads():
            # guard clause: ignore everything outside the requested subset
            if read.read_id not in subset:
                continue
            selected.append((read.get_raw_data(), read.read_id))

    # split the (signal, id) pairs back into two parallel lists
    reads = [signal for signal, _ in selected]
    ids = [readId for _, readId in selected]
    return reads, ids

In [3]:
# Load the raw signals (squiggles) and the matching read ids; loadReads()
# keeps only reads whose id is in `subset`.
squiggles, ids = loadReads()
# Bare expression: echoes the loaded read ids as this cell's output.
ids

reading squiggles file...


['00014eb4-4e2c-4087-ac24-57dea735b7b4',
 '1b6939ba-4a35-4696-bc6f-2ddb3368266e',
 '601f7bc7-3c09-4a78-a9b7-097bddcde809',
 '95c03c50-57a5-4092-9a5f-87182d06a12c',
 'b6d8845b-3eec-42f6-9510-b8ba1fc1ec44']

In [4]:
# Instantiate the network and restore its saved weights.
# `model_path` and `torch` are not defined/imported in this notebook directly;
# presumably they come from the star import of Settings -- TODO confirm.
model = LSTMAutoencoder.LSTM()
# map_location="cpu": without it a checkpoint saved from a GPU cannot be
# deserialised on a CPU-only machine. The embedding cell feeds the model
# tensors created on the default (CPU) device, so CPU weights are what is needed.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()            #setting model to "predict mode"

LSTM(
  (lstm): LSTM(1, 30, num_layers=2)
  (fc): Linear(in_features=30, out_features=1, bias=True)
)

In [5]:
from sklearn.preprocessing import MinMaxScaler

# For every loaded read: normalise the signal, cut it into sliding windows and
# run each window through the model one sample at a time, starting from zeroed
# states. The final hidden/cell state of a window is that window's embedding.
# stateHs[i] / stateCs[i] end up holding one tensor per window of read i.
scaler = MinMaxScaler(feature_range=(-1,1))
stateHs = []
stateCs = []
for i in range(len(ids)):
    print(f"embedding {i}...")
    s = time.time()

    # normalise, rescale to [-1, 1] (the scaler is re-fitted for each read),
    # then flatten back to a 1-D float tensor
    scaled = scaler.fit_transform(fullNormalise(squiggles[i]).reshape(-1, 1))
    x = torch.tensor(scaled).squeeze(-1).type(torch.FloatTensor)

    state_hAcc, state_cAcc = [], []

    # sliding windows of `sequenceLength` samples, advanced by `nGramStep`
    chunks = x.unfold(dimension=0, size=sequenceLength, step=nGramStep)

    for chunkIndex, chunk in enumerate(chunks):

        if chunkIndex % 1000 == 0:
            print(f"Chunk {chunkIndex}/{len(chunks)}")

        # every window starts from fresh zero states
        state_h = torch.zeros(1, hidden_size)
        state_c = torch.zeros(1, hidden_size)

        with torch.no_grad():
            for value in chunk:
                pred, (state_h, state_c) = model(value, (state_h, state_c))

        state_hAcc.append(state_h.squeeze(0))
        state_cAcc.append(state_c.squeeze(0))

    # Keep every per-window state (a mean over the windows was an earlier variant).
    stateHs.append(state_hAcc)
    stateCs.append(state_cAcc)
    print(f"Took {time.time()-s}s.")

    

embedding 0...
Chunk 0/5989
Chunk 1000/5989
Chunk 2000/5989
Chunk 3000/5989
Chunk 4000/5989
Chunk 5000/5989
Took 168.62953281402588s.
embedding 1...
Chunk 0/9533
Chunk 1000/9533
Chunk 2000/9533
Chunk 3000/9533
Chunk 4000/9533
Chunk 5000/9533
Chunk 6000/9533
Chunk 7000/9533
Chunk 8000/9533
Chunk 9000/9533
Took 258.8833661079407s.
embedding 2...
Chunk 0/13764
Chunk 1000/13764
Chunk 2000/13764
Chunk 3000/13764
Chunk 4000/13764
Chunk 5000/13764
Chunk 6000/13764
Chunk 7000/13764
Chunk 8000/13764
Chunk 9000/13764
Chunk 10000/13764
Chunk 11000/13764
Chunk 12000/13764
Chunk 13000/13764
Took 361.98648953437805s.
embedding 3...
Chunk 0/10512
Chunk 1000/10512
Chunk 2000/10512
Chunk 3000/10512
Chunk 4000/10512
Chunk 5000/10512
Chunk 6000/10512
Chunk 7000/10512
Chunk 8000/10512
Chunk 9000/10512
Chunk 10000/10512
Took 275.84047627449036s.
embedding 4...
Chunk 0/5345
Chunk 1000/5345
Chunk 2000/5345
Chunk 3000/5345
Chunk 4000/5345
Chunk 5000/5345
Took 142.83982515335083s.


In [6]:
# Sanity check for the first read: number of per-window cell-state embeddings
# collected above versus the number of raw samples in its signal.
print(len(stateCs[0]))
print(len(squiggles[0]))
# Debug leftover: dumps every cell-state embedding of the second read.
#print(stateCs[1])


5989
119987


In [7]:
def getMaxSim(vec, vecList):
    """Return the largest COSINE SIMILARITY between ``vec`` and the vectors in ``vecList``.

    The value is the raw maximum; it is NOT divided by ``len(vecList)``.

    Parameters
    ----------
    vec : 1-D array-like
        The query vector.
    vecList : iterable of 1-D array-likes
        Candidate vectors with the same length as ``vec``.

    Returns
    -------
    number
        The highest cosine similarity found. This can be negative when every
        candidate points away from ``vec``. ``0`` is returned when ``vecList``
        is empty or when no similarity is comparable (e.g. zero-norm vectors
        give NaN, which never wins a comparison).
    """
    vecNorm = norm(vec)  # loop-invariant, so computed once

    # Start below every possible cosine value. Starting at 0 would hide
    # the true maximum whenever all similarities are negative.
    maxSim = -np.inf
    for other in vecList:
        sim = np.dot(vec, other) / (vecNorm * norm(other))
        if sim > maxSim:
            maxSim = sim

    # Nothing comparable was seen: keep the historical result of 0.
    return maxSim if maxSim > -np.inf else 0

In [8]:
def getMinDist(vec, vecList):
    """Return the smallest EUCLIDEAN DISTANCE between ``vec`` and the vectors in ``vecList``.

    The value is the raw minimum; it is NOT divided by ``len(vecList)``.

    Parameters
    ----------
    vec : 1-D array-like
        The query vector (anything ``np.asarray`` accepts).
    vecList : iterable of 1-D array-likes
        Candidate vectors with the same length as ``vec``.

    Returns
    -------
    number
        The smallest distance found, or ``inf`` when ``vecList`` is empty.
    """
    vecArr = np.asarray(vec)  # convert the query once instead of once per candidate

    # Start from +inf rather than a magic cap of 1000, which silently
    # truncated every true minimum larger than 1000.
    minDist = np.inf
    for other in vecList:
        dist = norm(vecArr - np.asarray(other))
        if dist < minDist:
            minDist = dist

    return minDist

In [9]:
# Pairwise dissimilarity between reads, built from their window embeddings.
# For each read `el` and each other read `innerel`:
#     for each embedding of `el`:
#         take the minimum Euclidean distance to any embedding of `innerel`
#     sum those minima -> score stored under [ids[el]][ids[innerel]]
# A lower score means more similar. The score is directional (el -> innerel)
# and is not normalised by the number of embeddings.
# TODO: decide whether to divide by the list length.
# The spelling "similarties" is kept because later cells use these names.
similartiesH = {}
similartiesC = {}

outerStart = time.time()
for el in range(len(stateHs)):
    print(ids[el])
    elStart = time.time()
    innerDictH = {}
    innerDictC = {}

    for innerel in range(len(stateHs)):
        if innerel == el:
            # a read is never compared with itself
            continue

        print(f"    comparing embeddings of squiggle {el} with embeddings of squiggle {innerel}")
        innerelStart = time.time()
        innerDictH[ids[innerel]] = 0.0
        innerDictC[ids[innerel]] = 0.0

        for e in range(len(stateHs[el])):
            if e % 1000 == 0:
                print(f"        embedding {e}/{len(stateHs[el])}")
            # float(): getMinDist returns a NumPy scalar (possibly float32).
            # Accumulating plain Python floats keeps double precision over
            # thousands of additions and keeps the totals JSON-serialisable.
            innerDictH[ids[innerel]] += float(getMinDist(stateHs[el][e], stateHs[innerel]))
            innerDictC[ids[innerel]] += float(getMinDist(stateCs[el][e], stateCs[innerel]))

        print(f"    {el} to {innerel} comparisons took {time.time()-innerelStart}s.")

    similartiesH[ids[el]] = innerDictH.copy()
    similartiesC[ids[el]] = innerDictC.copy()

    print(f"All {ids[el]} comparisons took {time.time()-elStart}s.")

print(f"All comparisons took {time.time()-outerStart}s.")



00014eb4-4e2c-4087-ac24-57dea735b7b4
    comparing embeddings of squiggle 0 with embeddings of squiggle 1
        embedding 0/5989
        embedding 1000/5989
        embedding 2000/5989
        embedding 3000/5989
        embedding 4000/5989
        embedding 5000/5989
    0 to 1 comparisons took 1395.260793209076s.
    comparing embeddings of squiggle 0 with embeddings of squiggle 2
        embedding 0/5989
        embedding 1000/5989
        embedding 2000/5989
        embedding 3000/5989
        embedding 4000/5989
        embedding 5000/5989
    0 to 2 comparisons took 2141.911410331726s.
    comparing embeddings of squiggle 0 with embeddings of squiggle 3
        embedding 0/5989
        embedding 1000/5989
        embedding 2000/5989
        embedding 3000/5989
        embedding 4000/5989
        embedding 5000/5989
    0 to 3 comparisons took 1707.9949309825897s.
    comparing embeddings of squiggle 0 with embeddings of squiggle 4
        embedding 0/5989
        embedding 1000

In [10]:
import json

# Persist both nested score tables ({read id: {other read id: score}}).
# default=float: the scores may be NumPy scalars (e.g. float32), which the json
# module cannot serialise. The fallback converts them to plain Python floats
# instead of raising TypeError.
with open("hSimilarties.json", "w") as file:
    json.dump(similartiesH, file, default=float)

with open("cSimilarties.json", "w") as file:
    json.dump(similartiesC, file, default=float)

In [11]:
# Print every read id next to the number of raw samples in its signal.
# The pairs are iterated directly: using `id` as a loop index rebound the
# built-in id() for the rest of the kernel session.
for readId, squiggle in zip(ids, squiggles):
    print(readId, len(squiggle))

00014eb4-4e2c-4087-ac24-57dea735b7b4 119987
1b6939ba-4a35-4696-bc6f-2ddb3368266e 191113
601f7bc7-3c09-4a78-a9b7-097bddcde809 275855
95c03c50-57a5-4092-9a5f-87182d06a12c 210492
b6d8845b-3eec-42f6-9510-b8ba1fc1ec44 107116
