# Main

In [1]:
import mydatasets
import mymodels
import utils
import numpy as np
import torch
import copy
import sys
import os
import json 
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from torchinfo import summary
import pickle

In [2]:
X = pd.read_csv('../data/gene_data.csv', header=None, index_col=0)
X

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,3958,3959,3960,3961,3962,3963,3964,3965,3966,3967
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRR1166318,0,0,1,0,0,4,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
ERR176810,0,0,1,0,2,3,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
ERR181956,0,0,1,0,1,3,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
SRR2100379,0,0,1,0,0,4,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
SRR924706,0,0,1,0,2,3,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR671746,0,0,1,0,0,3,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
SRR1146372,0,0,2,0,0,3,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
ERR176477,0,0,1,0,1,3,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
SRR1049074,0,0,1,0,0,4,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [3]:
Y = pd.read_csv('../data/AllLabels.csv', index_col='id')
Y = Y[Y.index.isin(X.index)]
Y

Unnamed: 0_level_0,amikacin,capreomycin,ciprofloxacin,ethambutol,ethionamide,isoniazid,kanamycin,moxifloxacin,ofloxacin,pyrazinamide,rifampicin,streptomycin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SRR3675211,,,,0.0,,0.0,,,,0.0,0.0,
SRR3675215,,,,0.0,,0.0,,,,0.0,0.0,
SRR3675217,,,,1.0,,1.0,,,,0.0,1.0,
SRR3675218,,,,1.0,,1.0,,,,1.0,0.0,
SRR3675224,,,,0.0,,0.0,,,,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
SRR924705,,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
SRR924706,,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
SRR924707,,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
SRR924708,,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0


In [4]:
NoYs = X[np.logical_not(X.index.isin(Y.index))]
print('Xs with no Y found:', NoYs.shape[0])
X = X.drop(NoYs.index)

Xs with no Y found: 115


In [5]:
X = X.sort_index().reset_index().rename(columns={0: 'id'})
Y = Y.sort_index().reset_index()

In [6]:
X_mat = X.iloc[:, 1:].to_numpy()
Y_mat = Y.iloc[:, 1:].to_numpy()

In [7]:
val_test_count = int(X_mat.shape[0] * 0.1)
val_test_indcs = (np.random.permutation(X_mat.shape[0])[:2*val_test_count]).reshape((2, -1))

X_val = X_mat[val_test_indcs[0], :]
Y_val = Y_mat[val_test_indcs[0], :]

X_test = X_mat[val_test_indcs[1], :]
Y_test = Y_mat[val_test_indcs[1], :]

X_train = np.delete(X_mat, val_test_indcs.flatten(), axis=0)
Y_train = np.delete(Y_mat, val_test_indcs.flatten(), axis=0)
print(f'Train Count: {X_train.shape[0]}')
print(f'Validation Count: {X_val.shape[0]}')
print(f'Test Count: {X_test.shape[0]}')

shuffle_indcs = np.random.permutation(X_train.shape[0])
X_train = X_train[shuffle_indcs, :]
Y_train = Y_train[shuffle_indcs, :]

Train Count: 6277
Validation Count: 784
Test Count: 784


In [8]:
def get_roc(x_train: np.ndarray, y_train: np.ndarray, x_eval: np.ndarray, y_eval: np.ndarray, model, n_neighbors=5):
    emb_train = model(torch.from_numpy(x_train.astype('float32'))).detach().numpy()
    emb_val = model(torch.from_numpy(x_eval.astype('float32'))).detach().numpy()
    KNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(emb_train)

    nbr_dists, nbr_indcs = KNN.kneighbors(emb_val)
    y_k_neghbors = y_train[nbr_indcs, :]
    y_pred = np.nanmean(y_k_neghbors, axis=1)
    y_pred[np.where(np.isnan(y_pred))] = 0.5
    #y_pred = np.round(y_pred)

    condition = np.where(np.logical_and(np.logical_not(np.isnan(y_eval)), np.logical_not(np.isnan(y_pred))))

    #### CALCULATE ROC

    #return np.sum(y_pred[condition] == y_eval[condition]) / y_eval[condition].shape[0]

In [9]:
def get_acc(x_train: np.ndarray, y_train: np.ndarray, x_eval: np.ndarray, y_eval: np.ndarray, model, n_neighbors=5):
    emb_train = model(torch.from_numpy(x_train.astype('float32'))).detach().numpy()
    emb_val = model(torch.from_numpy(x_eval.astype('float32'))).detach().numpy()
    KNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(emb_train)

    nbr_dists, nbr_indcs = KNN.kneighbors(emb_val)
    y_k_neghbors = y_train[nbr_indcs, :] #y_eval x 5 x 12
    y_pred = np.nanmean(y_k_neghbors, axis=1) #y_eval x 12
    y_pred = np.round(y_pred) #0.5 > --> 1

    condition = np.where(np.logical_and(np.logical_not(np.isnan(y_eval)), np.logical_not(np.isnan(y_pred))))

    return np.sum(y_pred[condition] == y_eval[condition]) / y_eval[condition].shape[0]

In [10]:
def get_acc_weighted(x_train: np.ndarray, y_train: np.ndarray, x_eval: np.ndarray, y_eval: np.ndarray, model, n_neighbors=5):
    emb_train = model(torch.from_numpy(x_train.astype('float32'))).detach().numpy()
    emb_val = model(torch.from_numpy(x_eval.astype('float32'))).detach().numpy()
    KNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(emb_train)

    nbr_dists, nbr_indcs = KNN.kneighbors(emb_val)
    nbr_weights = np.exp(-nbr_dists)
    y_k_neghbors = y_train[nbr_indcs, :] #y_eval x 5 x 12
    y_k_neighbors_weighted = y_k_neghbors * nbr_weights.reshape(-1, n_neighbors, 1)
    #y_pred = np.nanmean(y_k_neighbors_weighted, axis=1) #y_eval x 12
    y_pred = np.nansum(y_k_neighbors_weighted, axis=1) / np.sum(nbr_weights.reshape(-1, n_neighbors, 1), axis=1)
    y_pred = np.round(y_pred) #0.5 > --> 1

    condition = np.where(np.logical_and(np.logical_not(np.isnan(y_eval)), np.logical_not(np.isnan(y_pred))))

    return np.sum(y_pred[condition] == y_eval[condition]) / y_eval[condition].shape[0]

In [11]:
def get_acc_per_drug(x_train: np.ndarray, y_train: np.ndarray, x_eval: np.ndarray, y_eval: np.ndarray, model, n_neighbors=5):
  emb_train = model(torch.from_numpy(x_train.astype('float32'))).detach().numpy()
  emb_val = model(torch.from_numpy(x_eval.astype('float32'))).detach().numpy()
  KNN = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(emb_train)

  nbr_dists, nbr_indcs = KNN.kneighbors(emb_val)
  y_k_neghbors = Y_train[nbr_indcs, :] #y_eval x 5 x 12
  y_pred = np.nanmean(y_k_neghbors, axis=1) #y_eval x 12
  y_pred = np.round(y_pred) #0.5 > --> 1

  acc_per_drug = np.zeros(y_eval.shape[1])
  for i in range(y_eval.shape[1]):
    condition = np.where(np.logical_and(np.logical_not(np.isnan(y_eval[:, i])), np.logical_not(np.isnan(y_pred[:, i]))))
    acc_per_drug[i] = np.sum(y_pred[condition, i] == y_eval[condition, i]) / y_eval[condition, i].shape[1]
  return acc_per_drug

In [12]:
def get_acc_per_drug_weighted(x_train: np.ndarray, y_train: np.ndarray, x_eval: np.ndarray, y_eval: np.ndarray, model, n_neighbors=5):
    emb_train = model(torch.from_numpy(x_train.astype('float32'))).detach().numpy()
    emb_val = model(torch.from_numpy(x_eval.astype('float32'))).detach().numpy()
    KNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(emb_train)

    nbr_dists, nbr_indcs = KNN.kneighbors(emb_val)
    nbr_weights = np.exp(-nbr_dists)
    y_k_neghbors = y_train[nbr_indcs, :] #y_eval x 5 x 12
    y_k_neighbors_weighted = y_k_neghbors * nbr_weights.reshape(-1, n_neighbors, 1)
    #y_pred = np.nanmean(y_k_neighbors_weighted, axis=1) #y_eval x 12
    y_pred = np.nansum(y_k_neighbors_weighted, axis=1) / np.sum(nbr_weights.reshape(-1, n_neighbors, 1), axis=1)
    y_pred = np.round(y_pred) #0.5 > --> 1

    acc_per_drug = np.zeros(y_eval.shape[1])
    for i in range(y_eval.shape[1]):
        condition = np.where(np.logical_and(np.logical_not(np.isnan(y_eval[:, i])), np.logical_not(np.isnan(y_pred[:, i]))))
        acc_per_drug[i] = np.sum(y_pred[condition, i] == y_eval[condition, i]) / y_eval[condition, i].shape[1]
    return acc_per_drug

In [13]:
def get_triplets(embeddings_tensor, y_batch, max_negatives_per_positive, max_trips_per_anchor, factors):
    triplets = []
    embeddings = embeddings_tensor.detach().numpy()
    num_fine_trips = 0
    num_coarse_trips = 0
    for i in range(embeddings.shape[0]):
        num_anchor_trips = 0
        anchor_emb = embeddings[i, :]
        anchor_y = y_batch[i, :]

        #get similarities
        #sim_scores = np.nansum(y_batch * (anchor_y * factors), axis=1) #dot product similarity
        sim_scores = utils.my_sim(y_batch, anchor_y, factors) #Equality Similarity

        #get embedding distances
        distances_emb = np.sqrt(np.sum((embeddings - anchor_emb)**2, axis=1))

        #sort similarities w.r.t. distances
        sorted_distances_indcs = np.argsort(distances_emb)
        similarities_sorted_by_distance = sim_scores[sorted_distances_indcs[:50]]

        for pos_sim_limit in reversed(range(y_batch.shape[1])):
            if pos_sim_limit == 0: continue
            positive_indcs = np.nonzero(similarities_sorted_by_distance == pos_sim_limit)[0]
            
            for positive_idx in np.flip(positive_indcs):
                num_negatives = 0
                #Excluding the Anchor
                if sorted_distances_indcs[positive_idx] == i:
                    continue

                positive_similarity = similarities_sorted_by_distance[positive_idx]
                
                #Points with a lower similarity and also a lower distance
                positive_misorderings_condition = np.logical_and(similarities_sorted_by_distance[:positive_idx] < positive_similarity, similarities_sorted_by_distance[:positive_idx] > 0)

                for negative_idx in np.nonzero(positive_misorderings_condition)[0]:
                    triplets.append((i, sorted_distances_indcs[positive_idx], sorted_distances_indcs[negative_idx]))
                    num_anchor_trips += 1
                    num_negatives += 1
                    num_fine_trips +=1
                    if num_negatives > max_negatives_per_positive: break
                
                if num_anchor_trips >= max_trips_per_anchor: break
                zero_condition = np.nonzero(similarities_sorted_by_distance[:positive_idx] == 0)[0]
                if len(zero_condition) == 0: continue
                num_negatives = np.minimum(zero_condition.shape[0], max_negatives_per_positive)
                for _ in range(num_negatives):
                    idx = np.random.randint(len(zero_condition))
                    zero_idx = zero_condition[idx]
                    triplets.append((i, sorted_distances_indcs[positive_idx], sorted_distances_indcs[zero_idx]))
                    num_anchor_trips += 1
                    num_coarse_trips += 1


                if num_anchor_trips >= max_trips_per_anchor: break
            if num_anchor_trips >= max_trips_per_anchor: break
        
    if len(triplets) == 0: return None

    anchors = []
    positives = []
    negatives = []
    for (a,p,n) in triplets:
        anchors.append(embeddings_tensor[a, :].reshape(1, -1))
        positives.append(embeddings_tensor[p, :].reshape(1, -1))
        negatives.append(embeddings_tensor[n, :].reshape(1, -1))
    
    anchors = torch.cat(anchors, dim=0)
    positives = torch.cat(positives, dim=0)
    negatives = torch.cat(negatives, dim=0)
    return anchors, positives, negatives, num_fine_trips, num_coarse_trips, pos_sim_limit
                
            
                
        

In [14]:
def get_triplets2(embeddings_tensor, y_batch, max_negatives_per_positive, max_trips_per_anchor, factors):
    triplets = []
    embeddings = embeddings_tensor.detach().numpy()
    num_fine_trips = 0
    num_coarse_trips = 0
    for i in range(embeddings.shape[0]):
        num_anchor_trips = 0
        anchor_emb = embeddings[i, :]
        anchor_y = y_batch[i, :]

        #get similarities
        #sim_scores = np.nansum(y_batch * (anchor_y * factors), axis=1) #dot product similarity
        sim_scores = utils.my_sim(y_batch, anchor_y, factors) #Equality Similarity

        #get embedding distances
        distances_emb = np.sqrt(np.sum((embeddings - anchor_emb)**2, axis=1))

        #sort similarities w.r.t. distances
        sorted_distances_indcs = np.argsort(distances_emb)
        similarities_sorted_by_distance = sim_scores[sorted_distances_indcs[:50]]

        for pos_sim_limit in reversed(range(y_batch.shape[1])):
            if pos_sim_limit == 0: continue
            positive_indcs = np.nonzero(similarities_sorted_by_distance == pos_sim_limit)[0]
            
            for positive_idx in np.flip(positive_indcs):
                num_negatives = 0
                #Excluding the Anchor
                if sorted_distances_indcs[positive_idx] == i:
                    continue

                positive_similarity = similarities_sorted_by_distance[positive_idx]
                
                #Points with a lower similarity and also a lower distance
                positive_misorderings_condition = np.logical_and(
                    similarities_sorted_by_distance[positive_idx:] < positive_similarity,
                    distances_emb[sorted_distances_indcs[positive_idx:50]] < distances_emb[sorted_distances_indcs[positive_idx]] + 1,
                    similarities_sorted_by_distance[positive_idx:] > 0)

                for negative_idx in np.nonzero(positive_misorderings_condition)[0]:
                    triplets.append((i, sorted_distances_indcs[positive_idx], sorted_distances_indcs[negative_idx]))
                    num_anchor_trips += 1
                    num_negatives += 1
                    num_fine_trips +=1
                    if num_negatives > max_negatives_per_positive: break
                
                if num_anchor_trips >= max_trips_per_anchor: break
                zero_condition = np.nonzero(similarities_sorted_by_distance[:positive_idx] == 0)[0]
                if len(zero_condition) == 0: continue
                num_negatives = np.minimum(zero_condition.shape[0], max_negatives_per_positive)
                for _ in range(num_negatives):
                    idx = np.random.randint(len(zero_condition))
                    zero_idx = zero_condition[idx]
                    triplets.append((i, sorted_distances_indcs[positive_idx], sorted_distances_indcs[zero_idx]))
                    num_anchor_trips += 1
                    num_coarse_trips += 1


                if num_anchor_trips >= max_trips_per_anchor: break
            if num_anchor_trips >= max_trips_per_anchor: break
        
    if len(triplets) == 0: return None

    anchors = []
    positives = []
    negatives = []
    for (a,p,n) in triplets:
        anchors.append(embeddings_tensor[a, :].reshape(1, -1))
        positives.append(embeddings_tensor[p, :].reshape(1, -1))
        negatives.append(embeddings_tensor[n, :].reshape(1, -1))
    
    anchors = torch.cat(anchors, dim=0)
    positives = torch.cat(positives, dim=0)
    negatives = torch.cat(negatives, dim=0)
    return anchors, positives, negatives, num_fine_trips, num_coarse_trips, pos_sim_limit
                
            
                
        

In [15]:
model=mymodels.SimpleNet(X_train.shape[1], 30, [X_train.shape[1], 1500, 30])
loss_list = []
acc_train_list = []
acc_eval_list = []
factors=np.zeros(Y_train.shape[1])+1.0
log_every=1

In [16]:
np.sum(Y_mat == 0)/np.sum(np.logical_not(np.isnan(Y_mat)))

0.6738830892827951

In [17]:
# #Change the Folder Name
# #ONLY Run if you want to load a model
# model = torch.load('model.pth')
# with open('../FC_weightedKNN_newtrips/lists.list', 'rb') as f:
#   loss_list, acc_train_list, acc_eval_list = pickle.load(f)

In [18]:
summary(model, input_size=(500,3967), device='cpu', verbose=0, col_names=['kernel_size', 'output_size', 'num_params'])

Layer (type:depth-idx)                   Kernel Shape              Output Shape              Param #
SimpleNet                                --                        --                        --
├─ModuleList: 1-1                        --                        --                        --
│    └─Linear: 2-1                       [3967, 1500]              [500, 1500]               5,952,000
│    └─Linear: 2-2                       [1500, 30]                [500, 30]                 45,030
Total params: 5,997,030
Trainable params: 5,997,030
Non-trainable params: 0
Total mult-adds (G): 3.00
Input size (MB): 7.93
Forward/backward pass size (MB): 6.12
Params size (MB): 23.99
Estimated Total Size (MB): 38.04

In [None]:
for epoch in range(200):
	# get scheduled values of hyper params
	tmargin=1
	batch_size=500
	lrate=0.001
	max_trips=100
	max_neg=3
	print("Epoch ",epoch,(tmargin,batch_size,lrate,max_trips,max_neg))
	# define loss and create optimizer
	triplet_loss = torch.nn.TripletMarginLoss(margin=tmargin, p=2)
	triplet_loss2 = torch.nn.TripletMarginLoss(margin=tmargin, p=2, reduction='none')	
	optimizer = torch.optim.Adam(model.parameters(),lr=lrate)
	# get batches
	mini_batches=utils.make_batches(X_train, Y_train, batch_size)
	loss_values=[]
	for batch_num,batch in enumerate(mini_batches):
		x_batch,y_batch=batch
		# generate embeddings
		embeddings=model(torch.from_numpy(x_batch.astype('float32')))
		# generate triplets (online)
		trips=get_triplets2(embeddings,y_batch,max_neg,max_trips,factors)
		# trips=utils.get_triplets(embeddings,y_batch,max_neg,max_trips,factors,debug=False)
		if trips is None:
			continue
		# anch, pos, neg=trips
		anch, pos, neg, num_fine_trips, num_coarse_trips, last_pos_sim_limit=trips
		# compute loss
		loss_triplet=triplet_loss(anch,pos,neg)
		#loss_var = (1 - torch.mean(torch.var(embeddings, dim=1)))**2
		loss_batch = loss_triplet
		loss_batch2=triplet_loss2(anch,pos,neg).detach()
		loss_values.append(loss_batch.detach().numpy())
		# backprop
		optimizer.zero_grad()
		loss_batch.backward(retain_graph=True)
		optimizer.step()
		print("Batch size :",anch.shape[0]," - Fine Trips: ",num_fine_trips, " - Coarse Trips:", num_coarse_trips," - Last pos limit: ",last_pos_sim_limit,",- Loss value :",loss_batch.detach().numpy())
		print("Below Margin? ", torch.sum(loss_batch2 < tmargin).item(), " - Triplet Loss: ", loss_triplet.item())
		# print("Batch size :",anch.shape[0],",- Loss value :",loss_batch.detach().numpy())
	loss_mean=np.mean(np.array(loss_values))
	train_acc = get_acc_weighted(X_train, Y_train, X_train, Y_train, model, n_neighbors=5)
	val_acc = get_acc_weighted(X_train, Y_train, X_val, Y_val, model, n_neighbors=5)
	loss_list.append(loss_mean)
	acc_train_list.append(train_acc)
	acc_eval_list.append(val_acc)
	print("\tTrain Loss for this epoch :",loss_mean)
	print("\tTrain Accuracy for this epoch:", train_acc)
	print("\tValidation Accuracy for this epoch:", val_acc)

	if (epoch+1)%5 == 0:
		#Change Folder Name
		torch.save(model, '../test/model.pth')
		with open('../test/lists.list', 'wb') as f:
			pickle.dump((loss_list, acc_train_list, acc_eval_list), f)
		print('model saved!')

	if (epoch+1)%log_every==0:
		utils.log_epoch_metrics('t.txt',epoch,loss_mean,model,X_train,Y_train,X_val,Y_val,5)
	print('='*60)
	# # evaluate model
	# if (epoch+1)%checkpoint_every==0:
	# 	torch.save(model,args["run_dir"]+"/model_"+str(epoch+1))

Epoch  0 (1, 500, 0.001, 100, 3)


  sims = np.sum((y_batch * (anchor_y*factors)) == 1, axis=1) / np.sum(np.logical_or((y_batch * (anchor_y*factors)) == 1, (y_batch * (anchor_y*factors))==-1), axis=1)


Batch size : 35543  - Fine Trips:  21148  - Coarse Trips: 14395  - Last pos limit:  0 ,- Loss value : 1.1006488
Below Margin?  472  - Triplet Loss:  1.1006487607955933
Batch size : 33740  - Fine Trips:  20460  - Coarse Trips: 13280  - Last pos limit:  1 ,- Loss value : 1.0430931
Below Margin?  488  - Triplet Loss:  1.0430930852890015
Batch size : 33534  - Fine Trips:  20764  - Coarse Trips: 12770  - Last pos limit:  0 ,- Loss value : 1.022487
Below Margin?  669  - Triplet Loss:  1.0224870443344116
Batch size : 33416  - Fine Trips:  19952  - Coarse Trips: 13464  - Last pos limit:  1 ,- Loss value : 1.0149903
Below Margin?  443  - Triplet Loss:  1.0149903297424316
Batch size : 35834  - Fine Trips:  23100  - Coarse Trips: 12734  - Last pos limit:  0 ,- Loss value : 1.0098954
Below Margin?  1948  - Triplet Loss:  1.0098954439163208
Batch size : 35264  - Fine Trips:  24277  - Coarse Trips: 10987  - Last pos limit:  0 ,- Loss value : 1.008948
Below Margin?  1630  - Triplet Loss:  1.008947968

In [None]:
get_acc_weighted(X_train, Y_train, X_test, Y_test, model, 15)

# Plots

In [None]:
fig, ax = plt.subplots()
x = np.arange(len(loss_list))
y = np.array(loss_list)
ax.plot(x, y)

In [None]:
fig, ax = plt.subplots()
ax.plot(x, np.array(acc_train_list))
ax.plot(x, np.array(acc_eval_list))
ax.set_ylim((0,1))

In [None]:
acc_per_drug = get_acc_per_drug(X_train, Y_train, X_val, Y_val, model, 5)
acc_per_drug_test = get_acc_per_drug(X_train, Y_train, X_test, Y_test, model, 5)
drug_names = Y.columns[1:].to_numpy()
fig, ax = plt.subplots()
x = np.arange(drug_names.shape[0])
ax.plot(x, acc_per_drug)
ax.plot(x, acc_per_drug_test)
ax.set_xticks(x)
ax.set_ylim((0,1.1))
ax.set_xticklabels(drug_names, rotation=70)
ax.legend(['eval', 'test'])
fig.set_size_inches((12,7))
fig.set_facecolor('w')

In [None]:
def plot_sim_count(
        X_plt, Y_plt, trained_model, untrained_model, inter_threshold=5, intra_threshold=0, max_points_per_sim=1, weighted_radius=False, custom_anchor_idx=None,
        colors = ['bisque', 'forestgreen', 'slategrey', 'royalblue', 'lawngreen', 'red', 'magenta', 'cyan', 'gold', 'lime', 'peru', 'indigo']
    ):
    if custom_anchor_idx is not None:
        maxi = custom_anchor_idx
        maxlen = np.unique(np.sum(Y_plt == Y_plt[maxi], axis=1)).shape[0]
        print('Custom Anchor:')
        print(f'Max Similarity Count: {maxlen}')
        print(f'Custom Anchor Index: {maxi}')
    else:
        # Find a good anchor
        maxlen = 0
        maxi = 0
        Y_plt_maskable = np.ma.array(Y_plt, mask=False)
        for i in tqdm(range(Y_plt.shape[0])):
            Y_plt_maskable.mask[i] = True
            sim_uniq, sim_counts = np.unique(np.sum(Y_plt_maskable == Y_plt[i], axis=1), return_counts=True)
            sim_uniq = sim_uniq.compressed()
            if maxlen < sim_uniq.shape[0] and np.all(sim_counts[:-1] > 10):
                maxlen = sim_uniq.shape[0]
                maxi = i
            Y_plt_maskable.mask[i] = False
        print(f'Max Similarity Count: {maxlen}')
        print(f'Best Anchor Index: {maxi}')


    #Seperate each similarity count
    sim_list_indcs = []
    for i in range(maxlen):
        sim_list_indcs.append(np.where(np.sum(Y_plt == Y_plt[maxi], axis=1) == i)[0])


    #TSNE
    emb_plt = trained_model(torch.from_numpy(X_plt.astype('float32'))).detach().numpy()
    tsne_plt = TSNE(n_components=2, metric='euclidean').fit_transform(emb_plt)

    # Sort indcs w.r.t. their distance from anchor
    for i in range(maxlen): 
        criteria = np.argsort(np.linalg.norm(tsne_plt[sim_list_indcs[i]] - tsne_plt[maxi], axis=1))
        sim_list_indcs[i] = sim_list_indcs[i][criteria]

    #Calculate points radius
    if weighted_radius:
        radius = [sim.shape[0] for sim in sim_list_indcs]
        radius = np.array(radius) / np.sum(radius) * 100000
    else:
        radius = [150 for sim in sim_list_indcs]


    
    # Plot
    fig, ax = plt.subplots()
    #prev_norm = np.linalg.norm(tsne_plt[sim_list_indcs[-1][1]] - tsne_plt[maxi]) - inter_threshold
    prev_norm = 0.01
    ax.scatter(tsne_plt[maxi, 0], tsne_plt[maxi, 1], s=150)
    ax.annotate('A', (tsne_plt[maxi, 0], tsne_plt[maxi, 1]), fontsize= 20)
    selected_points_dict = {'A': (maxi, X.iloc[maxi, 0])}
    for i in reversed(range(maxlen)):
        for j in range(sim_list_indcs[i].shape[0]):
            current_norm = np.linalg.norm(tsne_plt[sim_list_indcs[i][j]] - tsne_plt[maxi])
            if current_norm >= prev_norm + inter_threshold:
                prev_norm = current_norm
                current_centroid = tsne_plt[sim_list_indcs[i][j]]
                selected_points_dict[i] = (sim_list_indcs[i][j], X.iloc[sim_list_indcs[i][j], 0])
                point_count = 0
                while intra_threshold >= np.linalg.norm(tsne_plt[sim_list_indcs[i][j]] - current_centroid) and point_count < max_points_per_sim:
                    ax.scatter(tsne_plt[sim_list_indcs[i][j], 0], tsne_plt[sim_list_indcs[i][j], 1], s=150, c=colors[i])
                    ax.annotate(str(i), (tsne_plt[sim_list_indcs[i][j], 0], tsne_plt[sim_list_indcs[i][j], 1]), fontsize=20)
                    point_count += 1
                    j += 1
                    if j == sim_list_indcs[i].shape[0]: break
                break

    fig.set_facecolor('w')
    fig.set_size_inches((15,10))
    fig.savefig('tsne_trained.png', dpi=300)



    ####### plot the same points with no train
    #TSNE
    emb_plt = untrained_model(torch.from_numpy(X_plt.astype('float32'))).detach().numpy()
    tsne_plt = TSNE(n_components=2, metric='euclidean').fit_transform(emb_plt)


    fig, ax = plt.subplots()
    #prev_norm = np.linalg.norm(tsne_plt[sim_list_indcs[-1][1]] - tsne_plt[maxi]) - inter_threshold
    for key in selected_points_dict.keys():
        idx = selected_points_dict[key][0]
        ax.scatter(tsne_plt[idx, 0], tsne_plt[idx, 1], s=150, c=colors[key if key != 'A' else -1])
        ax.annotate(str(key), (tsne_plt[idx, 0], tsne_plt[idx, 1]), fontsize=20)

    fig.set_facecolor('w')
    fig.set_size_inches((15,10))
    fig.savefig('tsne_untrained.png', dpi=300)

    return selected_points_dict

In [None]:
untrained = mymodels.SimpleNet(X_train.shape[1], 30, [X_train.shape[1], 1500, 30])
plot_sim_count(
    X_mat,
    Y_mat,
    model,
    untrained,
    inter_threshold=5,
    intra_threshold=0,
    max_points_per_sim=1,
    weighted_radius=False
)

# PROTOTYPE CODE - NO NEED TO RUN ANYTHING BELOW THIS TEXT

In [None]:
maxlen = 0
maxi = 0
Y_train_maskable = np.ma.array(Y_mat, mask=False)
for i in tqdm(range(Y_mat.shape[0])):
    Y_train_maskable.mask[i] = True
    sim_uniq, sim_counts = np.unique(np.sum(Y_train_maskable == Y_mat[i], axis=1), return_counts=True)
    sim_uniq = sim_uniq.compressed()
    if maxlen < sim_uniq.shape[0] and np.all(sim_counts[:-1] > 10):
        maxlen = sim_uniq.shape[0]
        maxi = i
    Y_train_maskable.mask[i] = False

print(f'Max Similarity Count: {maxlen}')
print(f'Best Anchor Index: {maxi}')

In [None]:
sim_list_indcs = []
for i in range(maxlen):
    sim_list_indcs.append(np.where(np.sum(Y_mat == Y_mat[maxi], axis=1) == i)[0])

In [None]:
Y_mat[1514]

In [None]:
emb_train = model(torch.from_numpy(X_mat.astype('float32'))).detach().numpy()
tsne_train = TSNE(n_components=2, metric='euclidean').fit_transform(emb_train)

In [None]:
for i in range(maxlen): 
    criteria = np.argsort(np.linalg.norm(tsne_train[sim_list_indcs[i]] - tsne_train[maxi], axis=1))
    sim_list_indcs[i] = sim_list_indcs[i][criteria]

In [None]:
radius = [sim.shape[0] for sim in sim_list_indcs]
radius = np.array(radius) / np.sum(radius) * 100000

In [None]:
np.linalg.norm(tsne_train[sim_list_indcs[11]] - tsne_train[maxi], axis=1)

In [None]:
model(torch.tensor(X_mat[[maxi]].astype('float32')))

In [None]:
mmmm = copy.deepcopy(model)
for layer in mmmm.children():
   if hasattr(layer, 'reset_parameters'):
       layer.reset_parameters()

In [None]:
fig, ax = plt.subplots()
threshold = 5
prev_norm = np.linalg.norm(tsne_train[sim_list_indcs[-1][1]] - tsne_train[maxi]) - threshold
prev_norm = 0.01
threshold2 = 0
max_points = 1
colors = ['bisque', 'forestgreen', 'slategrey', 'royalblue', 'lawngreen', 'red', 'magenta', 'cyan', 'gold', 'lime', 'peru', 'indigo']
ax.scatter(tsne_train[maxi, 0], tsne_train[maxi, 1], s=150)
ax.annotate('A', (tsne_train[maxi, 0], tsne_train[maxi, 1]), fontsize= 20)
selected_points_dict = {'A': X.iloc[maxi, 0]}
for i in reversed(range(maxlen)):
    for j in range(sim_list_indcs[i].shape[0]):
        current_norm = np.linalg.norm(tsne_train[sim_list_indcs[i][j]] - tsne_train[maxi])
        if current_norm >= prev_norm + threshold:
            prev_norm = current_norm
            current_centroid = tsne_train[sim_list_indcs[i][j]]
            selected_points_dict[i] = X.iloc[sim_list_indcs[i][j], 0]
            point_count = 0
            while threshold2 >= np.linalg.norm(tsne_train[sim_list_indcs[i][j]] - current_centroid) and point_count < max_points:
                ax.scatter(tsne_train[sim_list_indcs[i][j], 0], tsne_train[sim_list_indcs[i][j], 1], s=150, c=colors[i])
                ax.annotate(str(i), (tsne_train[sim_list_indcs[i][j], 0], tsne_train[sim_list_indcs[i][j], 1]), fontsize=20)
                point_count += 1
                j += 1
                if j == sim_list_indcs[i].shape[0]:
                    break
            break

fig.set_facecolor('w')
fig.set_size_inches((15,10))
fig.savefig('tsne.png', dpi=300)

In [None]:
fig, ax = plt.subplots()

ax.scatter(tsne_train[maxi, 0], tsne_train[maxi, 1], s=300, c='black')
ax.annotate('A', (tsne_train[maxi, 0], tsne_train[maxi, 1]), fontsize= 30)
colors = ['bisque', 'forestgreen', 'slategrey', 'royalblue', 'purple', 'red', 'magenta', 'cyan', 'gold', 'lime', 'peru', 'indigo']
for i in range(maxlen):        
    ax.scatter(tsne_train[sim_list_indcs[i][:], 0], tsne_train[sim_list_indcs[i][:], 1], s=20, c=colors[i])
    #ax.annotate(str(i), (tsne_train[sim_list_indcs[i][0], 0], tsne_train[sim_list_indcs[i][0], 1]), fontsize=20)
ax.legend([str(i) for i in range(maxlen)])
fig.set_size_inches((15,10))