In [1]:
import time

import numpy as np
import pandas as pd
import torch
from numpy.linalg import norm
from ont_fast5_api.fast5_interface import get_fast5_file

# NOTE(review): the star import presumably supplies model_path, sequenceLength,
# hidden_size and subset used further down -- confirm against Settings.py.
from Settings import *
from HelperFunctions import fullNormalise
import LSTMAutoencoder

Read IDs used in this notebook (numbered for reference below):

1 - 00014eb4-4e2c-4087-ac24-57dea735b7b4

2 - 601f7bc7-3c09-4a78-a9b7-097bddcde809

3 - 1b6939ba-4a35-4696-bc6f-2ddb3368266e

4 - 95c03c50-57a5-4092-9a5f-87182d06a12c

5 - b6d8845b-3eec-42f6-9510-b8ba1fc1ec44


Reference pairs (numbers refer to the read list above) and their recorded scores:

1 <--> 2    90.1

3 <--> 4    91.2

4 <--> 5    95.9 or 90.1


In [2]:
# fast5 file opened by loadReads(); the path is relative to the current working directory.
filepath = "../../similar_testdata/similar_squiggles.fast5"

def loadReads():
    """Collect the raw signal and id of every read in `filepath` whose id is in `subset`.

    Both `filepath` and `subset` are taken from the enclosing namespace.

    Returns:
        (signals, read_ids): two parallel lists in the order the file yields
        the reads; signals[k] is the result of get_raw_data() for read_ids[k].
    """
    print("reading squiggles file...")

    signals, read_ids = [], []
    with get_fast5_file(filepath, mode="r") as fast5:
        for record in fast5.get_reads():
            # Skip everything that is not one of the reads of interest.
            if record.read_id not in subset:
                continue
            signals.append(record.get_raw_data())
            read_ids.append(record.read_id)

    return signals, read_ids

In [3]:
# Load the raw signals and their read ids; squiggles[k] belongs to ids[k].
# The ids come back in the order the file yields them.
squiggles, ids = loadReads()
ids

reading squiggles file...


['00014eb4-4e2c-4087-ac24-57dea735b7b4',
 '1b6939ba-4a35-4696-bc6f-2ddb3368266e',
 '601f7bc7-3c09-4a78-a9b7-097bddcde809',
 '95c03c50-57a5-4092-9a5f-87182d06a12c',
 'b6d8845b-3eec-42f6-9510-b8ba1fc1ec44']

In [4]:
# Rebuild the autoencoder and restore its trained weights from model_path.
model = LSTMAutoencoder.LSTM()
# map_location="cpu": the embedding cell feeds CPU tensors to the model, so the
# checkpoint is mapped onto the CPU even if it was saved from a GPU (without
# this, loading a CUDA checkpoint on a CPU-only machine raises an error).
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()            #setting model to "predict mode"

LSTM(
  (lstm): LSTM(1, 200)
  (fc): Linear(in_features=200, out_features=1, bias=True)
)

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Embed every squiggle as the mean, over fixed-length chunks of the signal, of
# the LSTM's final hidden (h) and cell (c) states.
# Results: stateHs[k] / stateCs[k] hold the embedding of squiggles[k].
scaler = MinMaxScaler(feature_range=(-1,1))
stateHs = []
stateCs = []
for readIdx, squiggle in enumerate(squiggles):
    print(f"embedding {readIdx}...")
    startTime = time.time()

    # Normalise, rescale this read on its own to [-1, 1] (the scaler is
    # re-fitted for each read), and flatten to a 1-D float32 tensor.
    x = fullNormalise(squiggle)
    x = torch.tensor(scaler.fit_transform(x.reshape(-1, 1))).squeeze(-1)
    x = x.type(torch.FloatTensor)

    state_hAcc = []
    state_cAcc = []

    # Pure inference: one no_grad scope covers all chunks of this read.
    with torch.no_grad():
        # NOTE(review): torch.split makes the last chunk shorter when len(x) is
        # not a multiple of sequenceLength, yet it gets the same weight in the
        # mean below as the full-length chunks -- confirm this is intended.
        for chunk in torch.split(x, sequenceLength):
            # Each chunk starts from a fresh zero state, so chunks are
            # embedded independently of one another.
            state_h = torch.zeros(1, hidden_size)
            state_c = torch.zeros(1, hidden_size)

            # Feed the chunk one sample at a time, threading the recurrent
            # state through; only the final state of the chunk is kept.
            for value in chunk:
                pred, (state_h, state_c) = model(value, (state_h, state_c))

            state_hAcc.append(state_h.squeeze(0))
            state_cAcc.append(state_c.squeeze(0))

    # Average the per-chunk final states into one vector per read.
    stateHs.append(torch.mean(torch.stack(state_hAcc), dim=0))
    stateCs.append(torch.mean(torch.stack(state_cAcc), dim=0))
    print(f"Took {time.time()-startTime}s.")

    

embedding 0...
Took 15.782442569732666s.
embedding 1...
Took 23.97514319419861s.
embedding 2...
Took 35.45511484146118s.
embedding 3...
Took 27.64120388031006s.
embedding 4...
Took 14.246598958969116s.


In [6]:
# Sanity check: look at the averaged cell-state embedding of the second read.
sampleCellState = stateCs[1]
print(sampleCellState.shape)
print(sampleCellState.squeeze(0))


torch.Size([200])
tensor([-4.5373e-03,  6.7985e-03, -9.1541e-03, -7.3665e-02,  3.2414e-03,
        -1.0073e-02, -3.1867e-02, -3.6576e-02,  7.4433e-02,  3.7925e-02,
         2.7838e-02,  3.3201e-02,  2.1279e-02, -1.1884e-02, -2.8673e-02,
         3.3179e-02,  1.1229e-02,  1.6927e-02, -2.4000e-03,  1.2468e-01,
         2.1081e-02,  5.2922e-03, -1.3932e-02, -3.5839e-02, -4.7292e-01,
         1.4587e-02,  4.0346e-02,  4.4681e-03, -2.0754e-01,  9.6141e-03,
        -1.3412e-02, -1.7826e-01, -1.6193e-02, -1.4085e-01,  3.5205e-02,
         9.7193e-02,  3.9148e-03,  1.8299e-01, -1.2537e-01, -1.7144e-03,
         4.6300e-02,  5.1215e-02, -1.6271e-01,  1.0593e-01,  1.8448e-02,
        -7.6781e-03,  4.9559e-02, -4.6380e-02, -1.3024e-02,  1.9300e-02,
         2.7752e-02,  1.6751e-02,  2.9881e-02,  1.8914e-01,  6.1130e-01,
        -1.6562e-02,  6.9398e-03, -1.4370e-03,  6.4158e-02, -6.8973e-02,
        -1.1417e-02,  6.9219e-02, -6.8066e-03, -6.2245e-04,  1.7441e-02,
        -2.8285e-04, -2.1248e-02,

In [7]:
def _cosineAsStr(a, b):
    """Return the cosine similarity of vectors a and b, formatted with str().

    The value is stringified presumably so it can be passed to json.dump
    later on -- TODO confirm.
    """
    return str(np.dot(a,b)/(norm(a)*norm(b)))


# similartiesX[id_a][id_b] = cosine similarity between the embeddings of reads
# id_a and id_b (X = H for hidden states, C for cell states); a read is never
# compared with itself.
similartiesH = {}
similartiesC = {}

for anchorIdx, anchorId in enumerate(ids):
    otherIdxs = [k for k in range(len(ids)) if k != anchorIdx]

    similartiesH[anchorId] = {
        ids[k]: _cosineAsStr(stateHs[anchorIdx], stateHs[k]) for k in otherIdxs
    }
    similartiesC[anchorId] = {
        ids[k]: _cosineAsStr(stateCs[anchorIdx], stateCs[k]) for k in otherIdxs
    }


In [8]:
import json

# Persist both similarity tables as JSON files in the current working directory.
for outName, table in (
    ("hSimilarties.json", similartiesH),
    ("cSimilarties.json", similartiesC),
):
    with open(outName, "w") as outFile:
        json.dump(table, outFile)