In [1]:
import numpy as np
from numpy import random
import pandas as pd

from sklearn.model_selection import KFold
import torch.utils.data
import matplotlib
import os
import sys
matplotlib.use('Agg')
import multiprocessing as mp

import aitac
import plot_utils
import time

<function get_worker_info at 0x7fed6f076160>


In [2]:
# Create the output directory (if needed); its path is later handed to the
# training and plotting helpers.
output_file_path = "../outputs/valid10x10/"
directory = os.path.dirname(output_file_path)
if os.path.isdir(directory):
    print("Directory %s exists" % output_file_path)
else:
    print("Creating directory %s" % output_file_path)
    # exist_ok=True tolerates the directory being created between the check
    # above and this call; a regular file at that path still raises.
    os.makedirs(directory, exist_ok=True)

Directory ../outputs/valid10x10/ exists


In [3]:
# Device configuration: first CUDA device when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [4]:
# Hyper parameters
# Passed to aitac.train_model as its num_epochs argument.
num_epochs = 10
# First argument of aitac.ConvNet; presumably the number of output columns of y — TODO confirm.
num_classes = 141
# Batch size used by both the training and the evaluation DataLoader.
batch_size = 10
# Learning rate given to the Adam optimizer.
learning_rate = 0.001
# Second argument of aitac.ConvNet; presumably the number of convolutional filters — TODO confirm.
num_filters = 300
# Run label; not referenced by any cell visible in this notebook section.
run_num = 'second'

In [5]:
# Load all data: model inputs (x), targets (y) and the per-sample peak names.
# x and y are converted to float32 immediately after loading.
x = np.load('../BRCA_data/mini_sample_one_hot_seqs.npy').astype(np.float32)
y = np.load('../BRCA_data/mini_sample_cell_type_array.npy').astype(np.float32)
peak_names = np.load('../BRCA_data/mini_sample_peak_names.npy')

In [6]:
# Create the 10 shuffled folds that will be processed separately; the fixed
# random_state keeps the partition the same across runs.
# Each entry of `folds` is a (train_indices, test_indices) pair.
splitter = KFold(n_splits=10, shuffle=True, random_state=123)
folds = [(train_idx, test_idx) for train_idx, test_idx in splitter.split(x, y)]

In [7]:
def benchmark_models(x, y, peak_names, output_file_path, split):
    """
    Train and evaluate one model on a single cross-validation split.

    The model, optimizer and training length are configured through the
    notebook-level globals ``num_classes``, ``num_filters``, ``device``,
    ``learning_rate``, ``batch_size`` and ``num_epochs``.

    Parameters
    ----------
    x : numpy array
        Model inputs, indexed here as ``x[samples, :, :]``.
    y : numpy array
        Targets, indexed here as ``y[samples, :]``.
    peak_names : numpy array
        Per-sample names, indexed with the same sample indices as ``x`` and ``y``.
    output_file_path : str
        Forwarded unchanged to ``aitac.train_model`` and ``plot_utils.plot_cors``.
    split : tuple
        Training and test indices: ``(split[0], split[1])``.

    Returns
    -------
    list
        ``[predictions, correlations, eval_name]``: the output of
        ``aitac.test_model`` on the test samples, the value returned by
        ``plot_utils.plot_cors`` and the peak names of the test samples.
    """

    # Presumably limits torch to one thread so that several pool workers do not
    # compete for the same cores — TODO confirm.
    torch.set_num_threads(1)

    # split the data
    train_idx, test_idx = split[0], split[1]

    train_data, eval_data = x[train_idx, :, :], x[test_idx, :, :]
    train_labels, eval_labels = y[train_idx, :], y[test_idx, :]
    # Only the test-set names are returned, so the training names are not extracted.
    eval_name = peak_names[test_idx]

    # Data loaders
    # NOTE(review): the training loader does not shuffle; kept as in the original
    # so results stay comparable — confirm this is intended.
    train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(train_data), torch.from_numpy(train_labels))
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    eval_dataset = torch.utils.data.TensorDataset(torch.from_numpy(eval_data), torch.from_numpy(eval_labels))
    eval_loader = torch.utils.data.DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    # create model
    model = aitac.ConvNet(num_classes, num_filters).to(device)

    # Loss and optimizer
    criterion = aitac.pearson_loss
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # train model (the second returned value, the best loss, is not used here)
    model, _best_loss = aitac.train_model(train_loader, eval_loader, model, device, criterion,  optimizer, num_epochs, output_file_path)

    # Predict on test set (the activation outputs are not used here)
    predictions, _max_activations, _max_act_index = aitac.test_model(eval_loader, model, device)

    # plot the correlations histogram
    correlations = plot_utils.plot_cors(eval_labels, predictions, output_file_path)

    return [predictions, correlations, eval_name]
    

We start the parallelization.

In [9]:

# Worker pool used to run the cross-validation folds (alternative tried: mp.Pool(3)).
# Python can count the available cores for you in most cases: mp.cpu_count()
# Timing notes from earlier runs (presumably minutes — TODO confirm):
#   2 cpus: 9' | 3 cpus: 9' | 10 processes: 9'; other settings noted: processes = 10 | 4
pool = mp.Pool(processes=1)

Saving the best model weights at Epoch [1], Best Valid Loss: 0.6827
Saving the best model weights at Epoch [2], Best Valid Loss: 0.6592
Saving the best model weights at Epoch [3], Best Valid Loss: 0.6461
Saving the best model weights at Epoch [4], Best Valid Loss: 0.6382
Saving the best model weights at Epoch [5], Best Valid Loss: 0.6357
weighted_cor is 0.3453173649766286
number of NaN values: 0


In [10]:
# Accumulators filled by log_result, one entry per finished task.
predictions_all, correlations_all, peak_order_all = [], [], []


def log_result(results):
    """
    Collect the output of one cross-validation task.

    ``results[0]``, ``results[1]`` and ``results[2]`` are appended, in that
    order, to ``predictions_all``, ``correlations_all`` and ``peak_order_all``
    so they can be combined in the next step.
    """
    accumulators = (predictions_all, correlations_all, peak_order_all)
    for position, bucket in enumerate(accumulators):
        bucket.append(results[position])
    

In [10]:
#datos = pool.map(benchmark_models, folds) # 8'

In [11]:
start_time = time.time()

# Number of folds to dispatch: 1 runs only the first fold, len(folds) would run
# the complete k-fold cross validation.
num_folds_to_run = 1


def log_error(error):
    """Report an exception raised inside a worker.

    Without an error_callback (and without calling .get() on the result),
    a failing task submitted with apply_async goes unnoticed.
    """
    print("Fold failed: %r" % (error,))


for fold in range(num_folds_to_run):
    pool.apply_async(
        benchmark_models,
        args=(x, y, peak_names, output_file_path, folds[fold]),
        callback=log_result,
        error_callback=log_error,
    )

In [12]:
# No more tasks will be submitted. Wait for every submitted task to finish so
# that the elapsed time and the collected results are complete before use.
pool.close()
pool.join()

In [14]:
# Wall-clock time elapsed since start_time was recorded.
end_time = time.time()
elapsed_seconds = end_time - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 138.28150033950806 seconds ---


In [20]:
# Stack the collected prediction arrays row-wise into a single array,
# then show its shape followed by its contents.
predictions_all = np.vstack(predictions_all)
print(predictions_all.shape, predictions_all, sep="\n")

(100, 141)
[[-4.1411767  -4.4942946   2.8506343  ...  1.9243295   0.7322075
   0.63073987]
 [-2.6575267  -2.8265278   1.477456   ...  1.060821    0.6428787
   0.6315372 ]
 [-2.5525937  -2.8361564   1.0452759  ...  1.1196467   0.55728346
   0.43491462]
 ...
 [-3.939746   -4.416634    2.4228427  ...  1.6891941   1.0567546
   0.7317235 ]
 [-3.5343888  -4.1001406   1.394879   ...  1.1335149   0.9797529
   0.7878465 ]
 [-2.9661171  -3.5968142   1.5589011  ...  1.1235495   0.8489444
   0.6138202 ]]


In [21]:
# Join the collected correlation arrays into a single array,
# then show its shape followed by its contents.
correlations_all = np.hstack(correlations_all)
print(correlations_all.shape, correlations_all, sep="\n")

(100,)
[-0.03796543 -0.07729042  0.6312153   0.52034206  0.67949516  0.48805899
  0.62585262  0.28420466  0.58397668  0.51585311 -0.03796543 -0.07729042
  0.6312153   0.52034206  0.67949516  0.48805899  0.62585262  0.28420466
  0.58397668  0.51585311 -0.03796543 -0.07729042  0.6312153   0.52034206
  0.67949516  0.48805899  0.62585262  0.28420466  0.58397668  0.51585311
 -0.0396199  -0.07178639  0.63946932  0.52415306  0.68731179  0.48257162
  0.61297366  0.2606187   0.57776137  0.512845   -0.0396199  -0.07178639
  0.63946932  0.52415306  0.68731179  0.48257162  0.61297366  0.2606187
  0.57776137  0.512845   -0.0396199  -0.07178639  0.63946932  0.52415306
  0.68731179  0.48257162  0.61297366  0.2606187   0.57776137  0.512845
 -0.05519678 -0.07679066  0.62896369  0.52993627  0.69207883  0.48150297
  0.61888756  0.27522814  0.58279148  0.5138411  -0.05519678 -0.07679066
  0.62896369  0.52993627  0.69207883  0.48150297  0.61888756  0.27522814
  0.58279148  0.5138411  -0.05519678 -0.0767906

In [19]:
# Join the collected peak-name arrays into a single array and show its shape.
peak_order_all = np.hstack(peak_order_all)
print(np.shape(peak_order_all))

(100,)
