In [54]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import numpy
import torch
import pandas
import matplotlib.pyplot as plt

from bpnetlite.io import extract_loci
from bpnetlite.performance import calculate_performance_measures

timestamp = "2023-06-14_10:16:58"

In [55]:
root = '/users/myin25/projects/human_proseq/data/coPRO_3prime/K562/'

peaks = root + 'peaks_fold1_val.bed.gz'
seqs = '/users/myin25/projects/human_proseq/refs/hg38.fasta'
signals = [root + '3prime.pos.bigWig', root + '3prime.neg.bigWig']
controls = None

valid_chroms = ['chr{}'.format(i) for i in range(0, 23)]
valid_chroms.append('chrX')
valid_chroms.append('chrY')

model_directory = '/users/myin25/projects/human_proseq/models'
model_path = model_directory + '/{}.final.torch'.format(timestamp)

val_save_dir = root + "model_out/"
os.makedirs(val_save_dir, exist_ok=True)

pred_counts_path = val_save_dir + timestamp + "_val.counts.npy"
pred_profiles_path = val_save_dir + timestamp + "_val.profs.npy"
metrics_path = val_save_dir + timestamp + "_metrics.tsv"

In [72]:
# Load Model
model = torch.load(model_path)
model.eval()
model = model.cuda()
    
# Load Data
val_sequences, val_profs = extract_loci(peaks, seqs, signals, controls, chroms=valid_chroms, max_jitter=0)

# Predict on Validation Set
with torch.no_grad():
    val_sequences = torch.tensor(val_sequences, dtype=torch.float32).cuda()
    pred_profile, pred_counts = model.predict(val_sequences)
    
    
# Save Predictions
numpy.save(pred_profiles_path, pred_profile)
numpy.save(pred_counts_path, pred_counts)


# re-format arrays for performance metrics code

#val_profs = val_profs.reshape(val_profs.shape[0], -1)
val_profs = numpy.swapaxes(numpy.expand_dims(val_profs, 1), 2, 3)
val_counts = val_profs.sum(axis=2)
val_counts = numpy.sum(val_counts, axis=2)
val_counts = val_counts[...,None]
print(val_counts.shape)


#pred_profile = pred_profile.reshape(pred_profile.shape[0], -1)
pred_profile = numpy.swapaxes(numpy.expand_dims(pred_profile, 1),2,3)
pred_counts = numpy.expand_dims(pred_counts, 1)
# pred_log_counts = log1p(pred_counts)

# convert everything to tensors
val_counts = torch.tensor(val_counts, dtype=torch.float32)
pred_profile = torch.tensor(pred_profile, dtype=torch.float32)
pred_counts = torch.tensor(pred_counts, dtype=torch.float32)

# Compute Performance Metrics

print(val_counts.shape)
print(pred_profile.shape)
print(pred_counts.shape)

metrics = calculate_performance_measures(pred_profile, val_counts, pred_counts,
    kernel_sigma=7, kernel_width=81, smooth_true=False, 
    smooth_predictions=False, measures=None)

metrics_to_save = ["nll", "jsd", "profile_pearson"]
metrics_dict = { metric : list(vals.squeeze()) for metric, vals in metrics.items() if metric in metrics_to_save }
metrics_df = pandas.DataFrame(metrics_dict)
metrics_df.to_csv(metrics_path, sep="\t", index=False)

metrics_to_report = ["nll", "jsd", "profile_pearson", "count_pearson", "count_mse"]
metrics_summary = [str(metrics[metric].mean()) for metric in metrics_to_report] 

print("Peaks: " + peaks)
print("Model: " + model_path)
print("Pred_profiles: " + pred_profiles_path)
print("Pred_counts: " + pred_counts_path)
mean_metrics = ["Mean " + metric + ": " + val for metric, val in zip(metrics_to_report, metrics_summary)]
print("\n".join(mean_metrics))

  val_sequences = torch.tensor(val_sequences, dtype=torch.float32).cuda()


(3699, 1, 1)
torch.Size([3699, 1, 1])
torch.Size([3699, 1, 1000, 2])
torch.Size([3699, 1, 1])


RuntimeError: expand(torch.FloatTensor{[200, 200, 1000]}, size=[200, 1]): the number of sizes provided (2) must be greater or equal to the number of dimensions in the tensor (3)

In [None]:
_, true_profiles = extract_loci(peaks, seqs, signals, controls, chroms=valid_chroms, max_jitter=0)

In [None]:
val_save_dir = proj_root + "model_out/" + expt_name + "/"
pred_counts_path = val_save_dir + timestamp + "_val.counts.npy"
pred_profiles_path = val_save_dir + timestamp + "_val.profs.npy"
metrics_path = val_save_dir + timestamp + "_metrics.tsv"

true_counts = true_profiles.sum(axis=2)
true_logcounts = numpy.log1p(true_counts)
pred_logcounts = numpy.load(pred_counts_path).squeeze()

In [71]:
print(val_counts.shape)

torch.Size([3699, 1, 1])
