This notebook creates the alanine scan mutations. Predictions for the resulting sequences can then be obtained using 'predict.py --data_path \<mutated_df_path\>'.

In [1]:
# Largest alanine-substitution window, in residues. The scan cell uses window
# size 1 plus every multiple of 5 up to and including this value.
MAX_WINDOW_SIZE = 40

In [2]:
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from IPython.display import display, Markdown

# Raise pandas' display limits so wide/long frames and long sequence strings
# are not truncated when rendered in the notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.max_seq_items', 500),
    ("display.max_colwidth", 150),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the input table. Later cells read its 'seq', 'label', 'protein_ac' and
# 'group_split_0' .. 'group_split_4' columns. The path is relative to the
# kernel's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../../data/all_with_candidates.pickle')

In [4]:
# Restrict the scan to host factors, i.e. the rows whose label equals 1.0.
# The copy detaches the subset from the originally loaded frame.
is_host_factor = df['label'] == 1.0
df = df[is_host_factor].copy()

In [5]:
# Upper bound on the number of window start positions scanned per window size.
# NOTE(review): presumably matches the maximum sequence length the downstream
# model accepts — confirm against predict.py.
MAX_SCAN_POSITIONS = 1024

# Per-protein columns copied unchanged onto every wild-type / mutant row.
METADATA_COLUMNS = (
    'group_split_0',
    'group_split_1',
    'group_split_2',
    'group_split_3',
    'group_split_4',
    'label',
    'protein_ac',
)


def alanine_scan_records(seq, window_sizes, max_positions=MAX_SCAN_POSITIONS):
    """Build the wild-type record plus every alanine-window mutant of ``seq``.

    Parameters
    ----------
    seq : str
        Amino-acid sequence to scan.
    window_sizes : iterable of int
        Lengths of the stretch that is replaced by alanines.
    max_positions : int, optional
        Maximum number of window start positions scanned per window size.

    Returns
    -------
    list of dict
        One record per sequence with keys 'original_seq', 'seq', 'position'
        (1-based start of the window), 'window_size' and 'original_aa' (the
        residues that were replaced). The first record is the unmodified
        wild type, marked with -1 in 'position', 'window_size' and
        'original_aa'.
    """
    records = [{
        'original_seq': seq,
        'seq': seq,
        'position': -1,
        'window_size': -1,
        'original_aa': -1,
    }]
    for window_size in window_sizes:
        # Only windows that fit entirely inside the sequence are generated;
        # a non-positive count (window longer than the sequence) yields none.
        n_starts = min(len(seq) - window_size + 1, max_positions)
        for pos in range(n_starts):
            window_end = pos + window_size
            records.append({
                'original_seq': seq,
                'seq': seq[:pos] + ('A' * window_size) + seq[window_end:],
                'position': pos + 1,
                'window_size': window_size,
                'original_aa': seq[pos:window_end],
            })
    return records


# Window sizes are identical for every protein, so compute them once:
# 1, then every multiple of 5 up to and including MAX_WINDOW_SIZE.
window_sizes = [1] + list(range(5, MAX_WINDOW_SIZE + 1, 5))

per_protein_frames = []
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    mutated_df = pd.DataFrame(alanine_scan_records(row['seq'], window_sizes))

    # Repeat the protein-level metadata on every row. An explicit list is used
    # so each cell receives the value as-is, whatever its type.
    for column in METADATA_COLUMNS:
        mutated_df[column] = [row[column]] * mutated_df.shape[0]

    per_protein_frames.append(mutated_df)

# The index is intentionally not reset: it restarts at 0 for each protein,
# which keeps the saved table identical to earlier runs of this notebook.
mutated_dfs = pd.concat(per_protein_frames)

100%|██████████████████████████████████████| 1045/1045 [00:08<00:00, 119.83it/s]


In [6]:
# Sanity check: (rows, columns) of the combined wild-type + mutant table.
mutated_dfs.shape

(5244902, 12)

In [7]:
# Persist the scan table so it can be passed to predict.py via --data_path.
# The file is written relative to the kernel's working directory; pandas
# infers zip compression from the '.zip' suffix.
mutated_dfs.to_pickle('mutated_data.pickle.zip')