# Without 10 - fold - cross - validation and Leave out methods

In [None]:
import numpy as np
from numpy import random
from sklearn.model_selection import train_test_split
import torch.utils.data
import matplotlib
import os
import sys
import pathlib
matplotlib.use('Agg')

import aitac
import plot_utils

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
# Hyper parameters
num_epochs = 10
num_classes = 141
batch_size = 10
learning_rate = 0.001
num_filters = 300

In [4]:
#create output figure directory
model_name = 'mini_sample'
output_file_path = "../outputs/" + model_name + "/training/"
directory = os.path.dirname(output_file_path)
if not os.path.exists(directory):
    print("Creating directory %s" % output_file_path)
    pathlib.Path(output_file_path).mkdir(parents=True, exist_ok=True) 
else:
     print("Directory %s exists" % output_file_path)

Directory ../outputs/mini_sample/training/ exists


In [5]:
x = np.load('../BRCA_data/mini_sample_one_hot_seqs.npy')
x = x.astype(np.float32)
y = np.load('../BRCA_data/mini_sample_cell_type_array.npy')
y = y.astype(np.float32)
peak_names = np.load('../BRCA_data/mini_sample_peak_names.npy')
peak_names = peak_names.astype(np.bytes_)

In [6]:
# split the data into training and test sets
train_data, eval_data, train_labels, eval_labels, train_names, eval_names = train_test_split(x, y, peak_names, test_size=0.1, random_state=40)

In [7]:
#print(train_data.shape)
#print(train_labels.shape)
#print(train_names.shape)


In [8]:
# Data loader
train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(train_data), torch.from_numpy(train_labels))
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)


eval_dataset = torch.utils.data.TensorDataset(torch.from_numpy(eval_data), torch.from_numpy(eval_labels))
eval_loader = torch.utils.data.DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)



In [9]:
# create model 
model = aitac.ConvNet(num_classes, num_filters).to(device)


In [10]:
# Loss and optimizer
criterion = aitac.pearson_loss
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [11]:
# train model
model, best_loss = aitac.train_model(train_loader, eval_loader, model, device, criterion,  optimizer, num_epochs, output_file_path)


Saving the best model weights at Epoch [1], Best Valid Loss: 0.5331
Saving the best model weights at Epoch [2], Best Valid Loss: 0.5167
Saving the best model weights at Epoch [3], Best Valid Loss: 0.5057


In [12]:
# save the model checkpoint
models_file_path = "../models/"
models_directory = os.path.dirname(models_file_path)
if not os.path.exists(models_directory):
    print("Creating directory %s" % models_file_path)
    os.makedirs(models_directory)
else:
     print("Directory %s exists" % models_file_path)

torch.save(model.state_dict(), '../models/' + model_name + '.ckpt')


Directory ../models/ exists


In [13]:
#save the whole model
torch.save(model, '../models/' + model_name + '.pth')


In [14]:
# Predict on test set
predictions, max_activations, max_act_index = aitac.test_model(eval_loader, model, device)


In [15]:
# plot the correlations histogram
# returns correlation measurement for every prediction-label pair
print("Creating plots...")


correlations = plot_utils.plot_cors(eval_labels, predictions, output_file_path)

Creating plots...
weighted_cor is 0.5043595900345061
number of NaN values: 0


In [16]:
plot_utils.plot_corr_variance(eval_labels, correlations, output_file_path)


In [17]:
quantile_indx = plot_utils.plot_piechart(correlations, eval_labels, output_file_path)



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = ret.dtype.type(ret / rcount)


1st sd: nan, 2nd sd: 0.8692730069160461, 3rd sd: 0.812441349029541, 4th sd: 0.8164322376251221
1st qr: nan, 2nd qr: 0.9810259103775024, 3rd qr: 1.1518596336245537, 4th qr: 0.7607938684523106


In [18]:
#plot_utils.plot_random_predictions(eval_labels, predictions, correlations, quantile_indx, eval_names, output_file_path)


In [19]:
#save predictions
np.save(output_file_path + "predictions.npy", predictions)

#save correlations
np.save(output_file_path + "correlations.npy", correlations)

#save max first layer activations
np.save(output_file_path + "max_activations.npy", max_activations)
np.save(output_file_path + "max_activation_index.npy", max_act_index)

#save test data set
np.save(output_file_path + "test_OCR_names.npy", eval_names)


In [20]:
preds = np.load('../outputs/first_approach/training/predictions.npy')
preds = preds.astype(np.float32)
print(preds.shape)
print(preds)

(10, 81)
[[-2.86464393e-02 -2.54055023e-01 -1.96895465e-01 -3.39933425e-01
  -2.34695658e-01 -1.55599892e-01 -1.52297348e-01  5.08297741e-01
   8.93353596e-02 -2.96935141e-01 -1.87630534e-01 -1.86651647e-01
  -1.00746080e-01 -2.76303232e-01 -1.77532941e-01 -2.63246357e-01
  -2.50320673e-01 -1.19697228e-01 -2.84811527e-01 -3.20420861e-01
  -3.39516193e-01  1.99190304e-01  3.97057712e-01  2.24804968e-01
   4.20901537e-01  2.93686897e-01  2.77857572e-01 -7.96647817e-02
  -3.47397089e-01 -1.15854129e-01 -3.83361243e-02  2.47826427e-03
  -2.50325114e-01 -2.69547999e-01 -1.96070820e-01  1.87034428e-01
   1.39741868e-01 -2.04218522e-01 -2.59569705e-01  1.44902915e-01
   1.81711257e-01  2.11720482e-01  1.47468239e-01  1.59467697e-01
   1.45012185e-01  1.62869155e-01 -5.96384704e-02  4.40161452e-02
  -1.52916070e-02  2.73716431e-02 -1.92085996e-01  6.70953095e-02
   1.31481430e-02  6.95892274e-02 -6.29362538e-02 -5.15135340e-02
  -2.54632309e-02 -6.22182526e-03 -7.54150599e-02  1.97923943e-01
 

In [21]:
corr = np.load('../outputs/first_approach/training/correlations.npy')
corr = corr.astype(np.float32)
print(corr.shape)
print(corr)

(10,)
[ 0.25016937  0.1202623   0.45097762  0.29923952  0.32403216  0.39362302
  0.17025955  0.57016045  0.35901845 -0.10707181]


In [22]:
m_act = np.load('../outputs/first_approach/training/max_activations.npy')
m_act = m_act.astype(np.float32)
print(m_act.shape)
print(m_act)

(10, 300)
[[0.7485167  0.798457   0.49164894 ... 0.78751886 0.8667381  0.88092035]
 [0.8127548  0.7196656  0.58814245 ... 0.85957146 0.876065   0.90636307]
 [0.8324806  0.7036093  0.5567735  ... 0.78909206 0.7539064  0.77290446]
 ...
 [0.91021234 0.9436616  0.6237757  ... 0.75349617 1.1155276  0.8165227 ]
 [0.8433365  0.78347164 0.707476   ... 0.72404957 0.8136774  0.95867497]
 [0.8048836  0.7528756  0.8367028  ... 0.7510655  0.8746428  0.8321827 ]]


In [23]:
m_acti = np.load('../outputs/first_approach/training/max_activation_index.npy')
m_acti = m_acti.astype(np.float32)
print(m_acti.shape)
print(m_acti)

(10, 300)
[[224. 114. 203. ...  94.  16.  75.]
 [204. 236.  24. ... 152.  13. 228.]
 [106. 156. 188. ...   3. 141. 189.]
 ...
 [219. 235.  66. ...  77. 229. 127.]
 [  6. 236. 105. ... 129. 239. 195.]
 [ 28.  70.  33. ...  88. 230. 208.]]


In [24]:
t_ocr = np.load('../outputs/first_approach/training/test_OCR_names.npy')
#t_ocr = t_ocr.astype(np.float32)
print(t_ocr.shape)
print(t_ocr)

(10,)
[b'ImmGenATAC1219.peak_127' b'ImmGenATAC1219.peak_123'
 b'ImmGenATAC1219.peak_102' b'ImmGenATAC1219.peak_24'
 b'ImmGenATAC1219.peak_67' b'ImmGenATAC1219.peak_20'
 b'ImmGenATAC1219.peak_69' b'ImmGenATAC1219.peak_77'
 b'ImmGenATAC1219.peak_68' b'ImmGenATAC1219.peak_100']


# With 10 - fold - cross - validation and Leave out methods