# 实验1：random_seed的影响

In [1]:
import sys
sys.path.append("..")

from torch.utils.data import DataLoader
from enformer_pytorch import Enformer, from_pretrained

from MPRA_predict.utils import *
from MPRA_predict.datasets import *

In [3]:
for random_seed in [0,1]:
    set_seed(random_seed)
    model = from_pretrained('../../pretrained_models/enformer_weights')
    dataset = SeqLabelDataset(
        data_path='../../data/SirajMPRA/SirajMPRA_100.csv',
        seq_column='seq', 
        padding=True, 
        padded_length=196_608, 
        N_fill_value=0)
    test_data_loader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=4, pin_memory=True)
    y_pred = get_pred(model, test_data_loader)

    torch.cuda.empty_cache()
    np.save(f'data/Enformer_Siraj_pred_random_seed={random_seed}.npy', y_pred)

 24%|██▍       | 6/25 [00:02<00:08,  2.28it/s]

# 结论1：random_seed不影响输出

# 实验2：batch_size的影响

In [3]:
for batch_size in [1,2,4]:
    model = from_pretrained('../../pretrained_models/enformer_weights')
    dataset = SeqLabelDataset(
        data_path='../../data/SirajMPRA/SirajMPRA_100.csv',
        seq_column='seq', 
        padding=True, 
        padded_length=196_608, 
        N_fill_value=0)
    test_data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    y_pred = get_pred(model, test_data_loader)

    torch.cuda.empty_cache()
    np.save(f'data/Enformer_Siraj_pred_batch_size={batch_size}.npy', y_pred)

set all labels to 0


100%|██████████| 100/100 [00:11<00:00,  8.99it/s]


set all labels to 0


100%|██████████| 50/50 [00:10<00:00,  4.56it/s]


set all labels to 0


100%|██████████| 25/25 [00:10<00:00,  2.33it/s]


In [4]:
y_pred_list = []
for batch_size in [1,2,4]:
    y_pred = np.load(f'data/Enformer_Siraj_pred_batch_size={batch_size}.npy')
    print(y_pred.shape)
    y_pred_list.append(y_pred)

y_pred_list = np.array(y_pred_list).reshape(-1)
print(np.corrcoef(y_pred_list))

(100, 896, 5313)
(100, 896, 5313)
(100, 896, 5313)
1.0


# 实验2：batch_size不影响输出

# 实验3：target_length的影响

In [None]:
for target_length in [2,4,896]:
    model = from_pretrained('../../pretrained_models/enformer_weights', target_length=target_length)
    dataset = SeqLabelDataset(
        data_path='../../data/SirajMPRA/SirajMPRA_100.csv',
        seq_column='seq', 
        padding=True, 
        padded_length=196_608, 
        N_fill_value=0)
    test_data_loader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=4, pin_memory=True)
    y_pred = get_pred(model, test_data_loader)

    torch.cuda.empty_cache()
    np.save(f'data/Enformer_Siraj_pred_target_length={target_length}.npy', y_pred)

set all labels to 0


100%|██████████| 25/25 [00:10<00:00,  2.45it/s]


set all labels to 0


100%|██████████| 25/25 [00:10<00:00,  2.48it/s]


set all labels to 0


100%|██████████| 25/25 [00:10<00:00,  2.34it/s]


In [None]:
y_pred_list = []

y_pred = np.load(f'data/Enformer_Siraj_pred_target_length=2.npy')
print(y_pred.shape)
y_pred_list.append(y_pred)

y_pred = np.load(f'data/Enformer_Siraj_pred_target_length=4.npy')
print(y_pred.shape)
y_pred_list.append(y_pred[:, 1:3])

y_pred = np.load(f'data/Enformer_Siraj_pred_target_length=896.npy')
print(y_pred.shape)
y_pred_list.append(y_pred[:, 447:449])

y_pred_list = np.array(y_pred_list)

print(np.corrcoef(y_pred_list.reshape(y_pred_list.shape[0], -1)))

(100, 2, 5313)
(100, 4, 5313)
(100, 896, 5313)
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


# 结论3：target_length不影响速度和结果, 轻微影响显存

# TF gamma

In [4]:
for target_length in [2]:
    model = from_pretrained('../pretrained_models/enformer_weights', target_length=target_length, use_tf_gamma=True)

    dataset = SeqLabelDataset(
        data_path='data/enformer_sequences_test_100.csv',
        data_type='seq',
        input_column='seq',
        # data_type='bed',
        # genome_path='../../../genome/hg38.fa',
        crop=False,
        cropped_length=196608,
        padding=False,
        N_fill_value=0)
    test_data_loader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=4, pin_memory=True)
    y_pred = get_pred(model, test_data_loader)

    torch.cuda.empty_cache()
    np.save(f'data/Enformer_pred_use_tf_gamma_true.npy', y_pred)

100%|██████████| 25/25 [00:10<00:00,  2.41it/s]


In [5]:
for target_length in [2]:
    model = from_pretrained('../pretrained_models/enformer_weights', target_length=target_length, use_tf_gamma=False)

    dataset = SeqLabelDataset(
        data_path='data/enformer_sequences_test_100.csv',
        data_type='seq',
        input_column='seq',
        # data_type='bed',
        # genome_path='../../../genome/hg38.fa',
        crop=False,
        cropped_length=196608,
        padding=False,
        N_fill_value=0)
    test_data_loader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=4, pin_memory=True)
    y_pred = get_pred(model, test_data_loader)

    torch.cuda.empty_cache()
    np.save(f'data/Enformer_pred_use_tf_gamma_false.npy', y_pred)

100%|██████████| 25/25 [00:10<00:00,  2.45it/s]


In [6]:
y_pred_list = []
y_pred = np.load(f'data/Enformer_pred_use_tf_gamma_true.npy')
y_pred_list.append(y_pred)

y_pred = np.load(f'data/Enformer_pred_use_tf_gamma_false.npy')
y_pred_list.append(y_pred)

y_pred_list = np.stack(y_pred_list)
print(y_pred_list.shape)
print(np.corrcoef(y_pred_list.reshape(y_pred_list.shape[0], -1)))

(2, 100, 2, 5313)
[[1.         0.95737918]
 [0.95737918 1.        ]]
