# 实验5：augmentation (rc, shift) 的影响

In [1]:
import sys
# Add the directory two levels up to the import path; presumably that is where
# the MPRA_predict package lives relative to this notebook — confirm.
sys.path.append("../../")

from torch.utils.data import DataLoader
from enformer_pytorch import from_pretrained

# NOTE(review): the cells below use np, pd, torch, SeqLabelDataset, get_pred
# and seqs_rc without importing them explicitly; they presumably come from
# these two wildcard imports — verify, and prefer explicit imports.
from MPRA_predict.utils import *
from MPRA_predict.datasets import *

# Widen printed rows so the 7x7 correlation matrices fit on one line per row.
np.set_printoptions(linewidth=120)

# 5.1 reverse complement

In [2]:
# Disabled cell: an alternative dataset setup using data_type='bed' with a
# genome FASTA path, centre crop to 65536 and N-padding to 131072.
# Everything below is commented out, so nothing in this cell executes.
# df = pd.read_csv('data/enformer_sequences_test_100.csv')
# dataset = SeqLabelDataset(
#     data_df=df,
#     data_type='bed',
#     genome_path='../../../../genome/hg38.fa',
#     crop=True, 
#     crop_method='center',
#     cropped_length=65536,
#     padding=True, 
#     padding_method='N',
#     padded_length=131072,
#     N_fill_value=0)
# print(dataset[0]['seq'].shape)

In [7]:
# only rc, crop
#
# Run the model on the seqs_rc-transformed test sequences (reverse complement,
# per the section heading) at several centre-crop lengths, without padding.
# One prediction array is written per crop length to
# data/Enformer_pred_crop_{cropped_length}_only_rc.npy.

df = pd.read_csv('data/enformer_sequences_test_100.csv')
df['seq'] = seqs_rc(df['seq'])

# The checkpoint path and target_length do not depend on the crop length, so
# the weights are loaded once here instead of once per loop iteration.
# NOTE(review): assumes get_pred does not change the model's weights — confirm.
model = from_pretrained('../../pretrained_models/enformer_weights', target_length=2)

for cropped_length in [256, 1024, 4096, 16384, 65536, 196608]:
    # padding is disabled here; padding_method / padded_length / N_fill_value
    # are still passed and are presumably ignored — confirm in SeqLabelDataset.
    dataset = SeqLabelDataset(
        data_df=df,
        data_type='seq',
        input_column='seq',
        crop=True,
        crop_method='center',
        cropped_length=cropped_length,
        padding=False,
        padding_method='N',
        padded_length=196608,
        N_fill_value=0)

    # shuffle=False keeps predictions in the same row order as the CSV.
    test_data_loader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=1)
    y_pred = get_pred(model, test_data_loader)

    # Release cached GPU blocks before the next (longer) crop length.
    torch.cuda.empty_cache()
    np.save(f'data/Enformer_pred_crop_{cropped_length}_only_rc.npy', y_pred)

100%|██████████| 25/25 [00:00<00:00, 120.68it/s]
100%|██████████| 25/25 [00:00<00:00, 97.67it/s] 
100%|██████████| 25/25 [00:00<00:00, 69.82it/s]
100%|██████████| 25/25 [00:00<00:00, 28.17it/s]
100%|██████████| 25/25 [00:03<00:00,  7.78it/s]
100%|██████████| 25/25 [00:06<00:00,  3.86it/s]
100%|██████████| 25/25 [00:10<00:00,  2.46it/s]


In [3]:
# Load the saved reverse-complement predictions (one array per crop length)
# and stack them along a new leading axis.
crop_lengths = [256, 1024, 4096, 16384, 65536, 196608]
rc_stack = np.stack([
    np.load(f'data/Enformer_pred_crop_{cropped_length}_only_rc.npy')
    for cropped_length in crop_lengths
])
# Reverse axis 2 (length 2 in the recorded output; presumably the output-bin
# axis, given target_length=2 at inference time — confirm).
rc_stack = np.flip(rc_stack, axis=2)

# Append the targets as one extra row so the last row/column of the
# correlation matrix compares each crop length against the targets.
target = np.load('data/enformer_targets_test_100.npy')
pred_list = np.concatenate([rc_stack, np.expand_dims(target, 0)], axis=0)
print(pred_list.shape)

# One flattened vector per row; np.corrcoef correlates every pair of rows.
flat_rows = pred_list.reshape(len(pred_list), -1)
corr = np.corrcoef(flat_rows)
print(corr)

(7, 100, 2, 5313)
[[1.         0.70612375 0.6293935  0.59162501 0.54703034 0.52029941 0.40075368]
 [0.70612375 1.         0.83789585 0.77859312 0.70366764 0.65053733 0.51687827]
 [0.6293935  0.83789585 1.         0.88607703 0.80093604 0.75080774 0.60675379]
 [0.59162501 0.77859312 0.88607703 1.         0.92260414 0.86011253 0.68952862]
 [0.54703034 0.70366764 0.80093604 0.92260414 1.         0.96524066 0.71557783]
 [0.52029941 0.65053733 0.75080774 0.86011253 0.96524066 1.         0.7073936 ]
 [0.40075368 0.51687827 0.60675379 0.68952862 0.71557783 0.7073936  1.        ]]


In [4]:
# Keep the reverse-complement stack (crop-length predictions plus the appended
# target row) under its own name, because the next cell rebinds `pred_list`.
# This is an alias, not a copy.
pred_list_only_rc = pred_list

In [6]:
# Load the forward-strand predictions saved for each crop length, printing
# every array's shape as it is read.
forward_arrays = []
for cropped_length in (256, 1024, 4096, 16384, 65536, 196608):
    pred = np.load(f'data/Enformer_pred_crop_{cropped_length}.npy')
    print(pred.shape)
    forward_arrays.append(pred)

# Stack along a new leading axis, then append the targets as one extra row so
# the last row/column of the correlation matrix is "vs. target".
target = np.load('data/enformer_targets_test_100.npy')
pred_list = np.concatenate(
    [np.stack(forward_arrays), np.expand_dims(target, 0)], axis=0)
print(pred_list.shape)

# One flattened vector per row; np.corrcoef correlates every pair of rows.
flat_rows = pred_list.reshape(len(pred_list), -1)
corr = np.corrcoef(flat_rows)
print(corr)

(100, 2, 5313)
(100, 2, 5313)
(100, 2, 5313)
(100, 2, 5313)
(100, 2, 5313)
(100, 2, 5313)
(7, 100, 2, 5313)
[[1.         0.71225783 0.63852997 0.60830955 0.55520755 0.51254985 0.39979347]
 [0.71225783 1.         0.85230813 0.78216532 0.69783025 0.64432036 0.51913269]
 [0.63852997 0.85230813 1.         0.89114613 0.79638438 0.7340984  0.6019289 ]
 [0.60830955 0.78216532 0.89114613 1.         0.91738696 0.85833112 0.68609631]
 [0.55520755 0.69783025 0.79638438 0.91738696 1.         0.96676081 0.71926996]
 [0.51254985 0.64432036 0.7340984  0.85833112 0.96676081 1.         0.71367927]
 [0.39979347 0.51913269 0.6019289  0.68609631 0.71926996 0.71367927 1.        ]]


In [7]:
# Element-wise mean of the reverse-complement stack and the forward stack.
# Both stacks had the same target array appended as their last row, so the
# last row of the average is still the target.
pred_average = np.add(pred_list_only_rc, pred_list) / 2

# One flattened vector per row; np.corrcoef correlates every pair of rows.
flat_average = pred_average.reshape(len(pred_average), -1)
corr = np.corrcoef(flat_average)
print(corr)

[[1.         0.72493006 0.64354748 0.60834189 0.55795098 0.52298575 0.40448726]
 [0.72493006 1.         0.85925448 0.79192229 0.7113229  0.65752534 0.52488829]
 [0.64354748 0.85925448 1.         0.89562007 0.80451738 0.74786613 0.61009971]
 [0.60834189 0.79192229 0.89562007 1.         0.92401923 0.86374077 0.6926645 ]
 [0.55795098 0.7113229  0.80451738 0.92401923 1.         0.96809289 0.72150798]
 [0.52298575 0.65752534 0.74786613 0.86374077 0.96809289 1.         0.7149443 ]
 [0.40448726 0.52488829 0.61009971 0.6926645  0.72150798 0.7149443  1.        ]]


In [11]:
# only rc, crop + pad
#
# Run the model on the seqs_rc-transformed test sequences (reverse complement,
# per the section heading) at several centre-crop lengths, with padding
# enabled (padding_method='N') up to padded_length.
# One prediction array is written per crop length to
# data/Enformer_pred_crop_{cropped_length}_pad_{padded_length}_only_rc.npy.

df = pd.read_csv('data/enformer_sequences_test_100.csv')
df['seq'] = seqs_rc(df['seq'])

# Single source for the padded length: it is used both by the dataset and in
# the output filename, so the two can no longer drift apart.
padded_length = 196608

# The checkpoint path and target_length do not depend on the crop length, so
# the weights are loaded once here instead of once per loop iteration.
# NOTE(review): assumes get_pred does not change the model's weights — confirm.
model = from_pretrained('../../pretrained_models/enformer_weights', target_length=2)

for cropped_length in [256, 1024, 4096, 16384, 65536, 196608]:
    dataset = SeqLabelDataset(
        data_df=df,
        data_type='seq',
        input_column='seq',
        crop=True,
        crop_method='center',
        cropped_length=cropped_length,
        padding=True,
        padding_method='N',
        padded_length=padded_length,
        N_fill_value=0)

    # shuffle=False keeps predictions in the same row order as the CSV.
    test_data_loader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=1)
    y_pred = get_pred(model, test_data_loader)

    # Release cached GPU blocks between runs.
    torch.cuda.empty_cache()
    np.save(f'data/Enformer_pred_crop_{cropped_length}_pad_{padded_length}_only_rc.npy', y_pred)

100%|██████████| 25/25 [00:10<00:00,  2.43it/s]
100%|██████████| 25/25 [00:10<00:00,  2.46it/s]
100%|██████████| 25/25 [00:10<00:00,  2.46it/s]
100%|██████████| 25/25 [00:10<00:00,  2.46it/s]
100%|██████████| 25/25 [00:10<00:00,  2.46it/s]
100%|██████████| 25/25 [00:10<00:00,  2.46it/s]
100%|██████████| 25/25 [00:10<00:00,  2.46it/s]


In [13]:
# Load the saved reverse-complement predictions for the crop + pad runs (one
# array per crop length) and stack them along a new leading axis.
crop_lengths = [256, 1024, 4096, 16384, 65536, 196608]
rc_stack = np.stack([
    np.load(f'data/Enformer_pred_crop_{cropped_length}_pad_{196608}_only_rc.npy')
    for cropped_length in crop_lengths
])
# Reverse axis 2 (length 2 in the recorded output; presumably the output-bin
# axis, given target_length=2 at inference time — confirm).
rc_stack = np.flip(rc_stack, axis=2)

# Append the targets as one extra row so the last row/column of the
# correlation matrix compares each crop length against the targets.
target = np.load('data/enformer_targets_test_100.npy')
pred_list = np.concatenate([rc_stack, np.expand_dims(target, 0)], axis=0)
print(pred_list.shape)

# One flattened vector per row; np.corrcoef correlates every pair of rows.
flat_rows = pred_list.reshape(len(pred_list), -1)
corr = np.corrcoef(flat_rows)
print(corr)

(7, 100, 2, 5313)
[[1.         0.95677731 0.86985035 0.70680899 0.58357178 0.59107797 0.44318385]
 [0.95677731 1.         0.91142401 0.7392873  0.61210595 0.61861607 0.46606412]
 [0.86985035 0.91142401 1.         0.84528424 0.69048596 0.68514539 0.52014077]
 [0.70680899 0.7392873  0.84528424 1.         0.87561296 0.85288352 0.64663612]
 [0.58357178 0.61210595 0.69048596 0.87561296 1.         0.9732937  0.70090694]
 [0.59107797 0.61861607 0.68514539 0.85288352 0.9732937  1.         0.7073936 ]
 [0.44318385 0.46606412 0.52014077 0.64663612 0.70090694 0.7073936  1.        ]]


In [14]:
# Keep the reverse-complement crop + pad stack (crop-length predictions plus
# the appended target row) under its own name, because the next cell rebinds
# `pred_list`. This is an alias, not a copy.
pred_list_only_rc = pred_list

In [15]:
# Load the forward-strand crop + pad predictions saved for each crop length,
# printing every array's shape as it is read.
forward_arrays = []
for cropped_length in (256, 1024, 4096, 16384, 65536, 196608):
    pred = np.load(f'data/Enformer_pred_crop_{cropped_length}_pad_{196608}.npy')
    print(pred.shape)
    forward_arrays.append(pred)

# Stack along a new leading axis, then append the targets as one extra row so
# the last row/column of the correlation matrix is "vs. target".
target = np.load('data/enformer_targets_test_100.npy')
pred_list = np.concatenate(
    [np.stack(forward_arrays), np.expand_dims(target, 0)], axis=0)
print(pred_list.shape)

# One flattened vector per row; np.corrcoef correlates every pair of rows.
flat_rows = pred_list.reshape(len(pred_list), -1)
corr = np.corrcoef(flat_rows)
print(corr)

(100, 2, 5313)
(100, 2, 5313)
(100, 2, 5313)
(100, 2, 5313)
(100, 2, 5313)
(100, 2, 5313)
(7, 100, 2, 5313)
[[1.         0.95455736 0.86542427 0.68575154 0.56846538 0.57255159 0.4404765 ]
 [0.95455736 1.         0.91101703 0.72208454 0.59874756 0.60434005 0.46874516]
 [0.86542427 0.91101703 1.         0.83634713 0.68089644 0.66930191 0.51791235]
 [0.68575154 0.72208454 0.83634713 1.         0.88303408 0.85181005 0.65401343]
 [0.56846538 0.59874756 0.68089644 0.88303408 1.         0.97222186 0.70434762]
 [0.57255159 0.60434005 0.66930191 0.85181005 0.97222186 1.         0.71367927]
 [0.4404765  0.46874516 0.51791235 0.65401343 0.70434762 0.71367927 1.        ]]


In [16]:
# Element-wise mean of the reverse-complement stack and the forward stack for
# the crop + pad runs. Both stacks had the same target array appended as
# their last row, so the last row of the average is still the target.
pred_average = np.add(pred_list_only_rc, pred_list) / 2

# One flattened vector per row; np.corrcoef correlates every pair of rows.
flat_average = pred_average.reshape(len(pred_average), -1)
corr = np.corrcoef(flat_average)
print(corr)

[[1.         0.95706816 0.87086308 0.70047873 0.5782828  0.58421287 0.4426665 ]
 [0.95706816 1.         0.91388396 0.73508161 0.60764326 0.61400025 0.46817002]
 [0.87086308 0.91388396 1.         0.84620891 0.68946656 0.68155718 0.52098455]
 [0.70047873 0.73508161 0.84620891 1.         0.88422577 0.85756708 0.65520819]
 [0.5782828  0.60764326 0.68946656 0.88422577 1.         0.97450285 0.70649129]
 [0.58421287 0.61400025 0.68155718 0.85756708 0.97450285 1.         0.7149443 ]
 [0.4426665  0.46817002 0.52098455 0.65520819 0.70649129 0.7149443  1.        ]]


In [None]:
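# TODO: needs investigation — averaging the "N" and "no" predictions seems to work better??
# (presumably "N" = N-padded runs and "no" = runs without padding; confirm with the author)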