In [9]:
import logging

import pandas as pd

from shmple import AttentionModel

from epam.mutsel_simulation import parent_position_is_hydrophobic, parent_position_is_next_to_hydrophobic, MutSelSimulator, HYDROPHOBIC_AAS
from epam.toy_simulation import mimic_mutations
from epam.sequences import translate_sequences

In [2]:
# Load the SHMple attention model from a local weights directory, logging only
# warnings and above.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot
# be re-run on another machine without editing it.
shmple_weights_directory = "/Users/matsen/re/epam/data/shmple_weights/my_shmoof"
shmple_model = AttentionModel(weights_dir=shmple_weights_directory, log_level=logging.WARNING)

# Build the simulator from a site criterion and the SHMple model. The criterion
# chosen here is the "next to hydrophobic" one imported from
# epam.mutsel_simulation.
criterion = parent_position_is_next_to_hydrophobic
simulator = MutSelSimulator(criterion, shmple_model)
# Smoke test on a short toy parent sequence.
parent_dna_sequence = "GCTGTCGCAAAGCGT"
# Presumably the number of substitutions the simulator should introduce —
# TODO confirm against MutSelSimulator.simulate_child_sequence.
target_mut_count = 5
child_dna_sequence = simulator.simulate_child_sequence(parent_dna_sequence, target_mut_count)
print("Simulated child DNA sequence:", child_dna_sequence)




Simulated child DNA sequence: ATTGCTGCAGAGCGT


In [3]:
# Load the parent-child pair (PCP) table; the first CSV column becomes the index.
pcp_df = pd.read_csv("~/data/wyatt-10x-1p5m_pcp_2023-10-07.csv", index_col=0)

# Keep only pairs whose child differs from its parent, then renumber rows from 0.
parent_differs_from_child = pcp_df["parent"] != pcp_df["child"]
pcp_df = pcp_df[parent_differs_from_child].reset_index(drop=True)


def _count_mismatches(row):
    """Count the aligned positions at which a row's parent and child differ."""
    return sum(
        1
        for parent_base, child_base in zip(row["parent"], row["child"])
        if parent_base != child_base
    )


# Per-pair substitution count between parent and child.
pcp_df["sub_count"] = pcp_df.apply(_count_mismatches, axis=1)
pcp_df

Unnamed: 0,sample_id,family,parent,child,v_gene,child_is_leaf,sub_count
0,0,149198,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,IGHV3-33*01,False,3
1,0,149198,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,IGHV3-33*01,False,4
2,0,149198,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,CAAGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,IGHV3-33*01,False,16
3,0,149198,CAAGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,CAAATGCAGATGGTGGAGTCGGGGGGAGGCGTGGTCCAGCCAGGGA...,IGHV3-33*01,True,19
4,0,149198,CAAGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,CAAGTGCAACTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,IGHV3-33*01,True,7
...,...,...,...,...,...,...,...
60288,0,17934,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,IGHV4-39*01,True,17
60289,0,17934,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,IGHV4-39*01,False,2
60290,0,17934,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTTGTGAAGCCTTCGG...,IGHV4-39*01,True,8
60291,0,269306,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,IGHV3-23*01,True,3


In [4]:
# Simulate one child per real parent, passing the real per-pair substitution
# count alongside each parent so the simulated data mimics the observed data.
fake_df = pd.DataFrame({"parent": pcp_df["parent"], "child": mimic_mutations(simulator.simulate_child_sequence, pcp_df["parent"], pcp_df["sub_count"])})

# Count substitutions in the *simulated* pairs. This must read from fake_df:
# computing it from pcp_df would simply copy the real counts and make any
# later comparison against pcp_df["sub_count"] pass vacuously.
fake_df["sub_count"] = fake_df.apply(lambda row: sum(p != c for p, c in zip(row["parent"], row["child"])), axis=1)

Mutating sequences: 100%|██████████| 60293/60293 [56:54<00:00, 17.66it/s]  


In [5]:
# Sanity check: each simulated pair should carry the same substitution count
# as the real pair it mimics.
# NOTE(review): this is only informative if fake_df["sub_count"] was computed
# from fake_df's own parent/child columns — confirm in the cell that builds it.
assert (fake_df["sub_count"] == pcp_df["sub_count"]).all()

In [6]:
# Persist the simulated pairs; the file name records the site criterion used.
# The commented-out line is an alternative output name, presumably for a run
# with the parent_position_is_hydrophobic criterion — TODO confirm.
#fake_df.to_csv("~/data/wyatt-10x-1p5m_pcp_2023-10-07.mimic-only_allow_mutations_at_hydrophobic_sites.csv")
fake_df.to_csv("~/data/wyatt-10x-1p5m_pcp_2023-10-07.mimic-only_allow_mutations_next_to_hydrophobic_sites.csv")

In [7]:
# Add translated versions of the parent and simulated child sequences.
# Presumably translate_sequences (from epam.sequences) converts nucleotide
# sequences to amino-acid sequences — verify against that module.
fake_df["aa_parent"] = translate_sequences(fake_df["parent"])
fake_df["aa_child"] = translate_sequences(fake_df["child"])

In [8]:
# One-letter codes of the amino acids treated as hydrophobic in this tally.
hydrophobic_aas = set("AILMFWV")


def _hydrophobic_site_count(aa_seq):
    """Return how many residues of ``aa_seq`` belong to ``hydrophobic_aas``."""
    return sum(1 for aa in aa_seq if aa in hydrophobic_aas)


# Hydrophobic residue counts for each parent and each simulated child.
fake_df["hydrophobic_count_parent"] = fake_df["aa_parent"].apply(_hydrophobic_site_count)
fake_df["hydrophobic_count_child"] = fake_df["aa_child"].apply(_hydrophobic_site_count)

# Net change in hydrophobic residues per pair (child minus parent).
fake_df["hydrophobic_count_diff"] = fake_df["hydrophobic_count_child"] - fake_df["hydrophobic_count_parent"]

# Distribution of the net change across all pairs.
fake_df["hydrophobic_count_diff"].value_counts()

hydrophobic_count_diff
 0     27338
 1     12459
-1      9821
 2      4497
-2      2709
 3      1687
-3       699
 4       614
 5       171
-4       152
 6        75
-5        31
 7        25
-6         9
 8         4
 10        1
 9         1
Name: count, dtype: int64

In [10]:
def is_next_to_hydrophobic(aa_str, pos):
    """Report whether a position has a hydrophobic residue immediately beside it.

    Args:
        aa_str: Indexable sequence of residues (e.g. an amino-acid string).
        pos: Index into ``aa_str`` whose neighbors are inspected.

    Returns:
        True if the residue at ``pos - 1`` or at ``pos + 1`` is a member of
        ``HYDROPHOBIC_AAS``; False otherwise. The first position has no left
        neighbor and the last position has no right neighbor.
    """
    hydrophobic_on_left = False
    if pos != 0:
        hydrophobic_on_left = aa_str[pos - 1] in HYDROPHOBIC_AAS

    hydrophobic_on_right = False
    if pos != len(aa_str) - 1:
        hydrophobic_on_right = aa_str[pos + 1] in HYDROPHOBIC_AAS

    return hydrophobic_on_left or hydrophobic_on_right

def check_differences_concentrated_near_hydrophobic(fake_df):
    """Tally amino-acid differences and how many sit beside a hydrophobic residue.

    Args:
        fake_df: Mapping-like object (e.g. a DataFrame) whose ``"aa_parent"``
            and ``"aa_child"`` entries are parallel iterables of sequences.

    Returns:
        A ``(adjacent, total)`` tuple: ``total`` is the number of aligned
        positions where parent and child differ, and ``adjacent`` is how many
        of those positions ``is_next_to_hydrophobic`` accepts when evaluated
        on the parent sequence.
    """
    n_adjacent = 0
    n_total = 0

    paired_sequences = zip(fake_df["aa_parent"], fake_df["aa_child"])
    for parent_seq, child_seq in paired_sequences:
        # Positions at which the child residue differs from the parent residue.
        changed_positions = [
            idx
            for idx, (parent_aa, child_aa) in enumerate(zip(parent_seq, child_seq))
            if parent_aa != child_aa
        ]
        n_total += len(changed_positions)
        # Adjacency is judged against the parent sequence, not the child.
        n_adjacent += sum(
            1 for idx in changed_positions if is_next_to_hydrophobic(parent_seq, idx)
        )

    return n_adjacent, n_total

# Run the tally on the simulated pairs and report the raw counts.
count_diff_adjacent_to_hydrophobic, count_total_differences = check_differences_concentrated_near_hydrophobic(fake_df)

print(f"Differences adjacent to hydrophobic sites: {count_diff_adjacent_to_hydrophobic}")
print(f"Total differences: {count_total_differences}")

# Only report a percentage when there is at least one difference to divide by.
if count_total_differences <= 0:
    print("No differences found.")
else:
    percentage = (count_diff_adjacent_to_hydrophobic / count_total_differences) * 100
    print(f"Percentage of differences adjacent to hydrophobic sites: {percentage:.2f}%")


Differences adjacent to hydrophobic sites: 321259
Total differences: 327574
Percentage of differences adjacent to hydrophobic sites: 98.07%
