In [1]:
# Standard imports
import os
import shutil

# Imports for data handling
import numpy as np
import pandas as pd

# Imports for efficient string matching
import ahocorasick

# Imports for progress tracking
from tqdm import tqdm

# Imports for data visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns

In [2]:
# Root folder of the DMS data. The trailing slash is required because the paths
# below are built by plain string concatenation.
DMS_files_path = "/Volumes/Intenso_SSD/Georgakopoulos_lab/nullp_pathogenesis/DMS/"
DMS_samples_with_subs_path = f"{DMS_files_path}Substitutions"
DMS_samples_with_indels_path = f"{DMS_files_path}Indels"

# Folder holding the nullpeptide lists.
nullpeptides_path = '/Volumes/Intenso_SSD/Georgakopoulos_lab/nullp_pathogenesis/null_peptides/'

# Read the reference tables for analysis as pandas dataframes.
DMS_samples_with_subs_info_df = pd.read_csv(f"{DMS_files_path}DMS_substitutions.csv")
DMS_samples_with_indels_info_df = pd.read_csv(f"{DMS_files_path}DMS_indels.csv")

# Keep only Human experiments.
human_only_DMS_samples_with_subs_info_df = DMS_samples_with_subs_info_df[DMS_samples_with_subs_info_df['taxon'] == 'Human']
human_only_DMS_samples_with_indels_info_df = DMS_samples_with_indels_info_df[DMS_samples_with_indels_info_df['taxon'] == 'Human']

# Save the Human-only reference tables next to the originals. The targets are derived
# from DMS_files_path so they follow the root folder if it is ever changed.
human_only_DMS_samples_with_subs_info_df.to_csv(f"{DMS_files_path}human_only_DMS_substitutions.csv")
human_only_DMS_samples_with_indels_info_df.to_csv(f"{DMS_files_path}human_only_DMS_indels.csv")

# Set paths to save human samples.
human_DMS_samples_with_subs_path = f"{DMS_samples_with_subs_path}/human_samples"
human_DMS_samples_with_indels_path = f"{DMS_samples_with_indels_path}/human_samples"

# Define an output directory for each mutation type (samples with the wild type row added).
modified_subs_path = f"{human_DMS_samples_with_subs_path}/modified_human_samples"
modified_indels_path = f"{human_DMS_samples_with_indels_path}/modified_human_samples"

# Placeholders for the per-sample dataframes; they are filled in once the samples are processed.
modified_human_DMS_samples_with_subs_list = []
modified_human_DMS_samples_with_indels_list = []

In [3]:
def create_directory(path):
    """
    Creates a directory at the specified path if it does not already exist.

    Missing parent directories are created as well. Calling this on an existing
    directory is a no-op; `exist_ok=True` avoids the check-then-create race of
    testing `os.path.exists` first.

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def copy_human_samples(source_path, destination_path):
    """
    Copies files containing '_HUMAN_' in their name from the source path to the destination path.

    Only regular files are copied; a sub-directory whose name happens to contain
    '_HUMAN_' is skipped instead of making `shutil.copy` fail. The destination
    directory must already exist.
    """
    for filename in os.listdir(source_path):
        source_file = os.path.join(source_path, filename)
        if "_HUMAN_" in filename and os.path.isfile(source_file):
            shutil.copy(source_file, os.path.join(destination_path, filename))

# Create the output folders if they don't exist yet (the modified-sample folders
# live inside the human-sample folders).
create_directory(human_DMS_samples_with_subs_path)
create_directory(human_DMS_samples_with_indels_path)
create_directory(modified_subs_path)
create_directory(modified_indels_path)

# Copy the human DMS samples (files with '_HUMAN_' in their name) to their respective new folders.
copy_human_samples(DMS_samples_with_subs_path, human_DMS_samples_with_subs_path)
copy_human_samples(DMS_samples_with_indels_path, human_DMS_samples_with_indels_path)

In [4]:
def add_wild_type_row_and_save(df_info, source_path, destination_path, filename, mutation_type):
    """
    For a given DMS sample file, adds a new first row with the wild type sequence and
    saves the modified DataFrame as 'modified_<filename>' in the destination path.

    The inserted row carries the target sequence of the assay, the assay's
    binarization cutoff as its 'DMS_score', and 'DMS_score_bin' = 1.

    Parameters:
        df_info: reference table with the columns 'DMS_filename', 'target_seq'
            and 'DMS_binarization_cutoff'.
        source_path: folder containing the sample file.
        destination_path: folder the modified file is written to.
        filename: name of the sample CSV file.
        mutation_type: 'subs' or 'indels'; decides the layout of the inserted row.

    Raises:
        ValueError: if `mutation_type` is neither 'subs' nor 'indels'.
    """
    # Fail early with a clear message instead of hitting an unbound `new_row` later.
    if mutation_type not in ('subs', 'indels'):
        raise ValueError(f"Unknown mutation_type {mutation_type!r}; expected 'subs' or 'indels'.")

    file_path = os.path.join(source_path, filename)

    # Extract information based on the DMS filename
    DMS_file_info = df_info[df_info['DMS_filename'] == filename]
    if DMS_file_info.empty:
        # Without a reference entry there is no target sequence to insert.
        print(f"No reference entry found for {filename}; skipping.")
        return

    target_seq = DMS_file_info['target_seq'].values[0]
    DMS_binarization_cutoff = DMS_file_info['DMS_binarization_cutoff'].values[0]

    DMS_df = pd.read_csv(file_path)

    # Define the new row to be added
    if mutation_type == 'subs':
        new_row = {'mutant': 'Wild Type', 'mutated_sequence': target_seq, 'DMS_score': DMS_binarization_cutoff, 'DMS_score_bin': 1}
    else:
        # Indel rows store the target sequence in the 'mutant' column.
        new_row = {'mutant': target_seq, 'DMS_score': DMS_binarization_cutoff, 'DMS_score_bin': 1}

    # Put the wild type row first so that it ends up at index 0.
    DMS_df = pd.concat([pd.DataFrame([new_row]), DMS_df], ignore_index=True)

    # Save the updated DataFrame to a new CSV file
    modified_filename = os.path.join(destination_path, f'modified_{filename}')
    DMS_df.to_csv(modified_filename, index=False)


# Process substitution samples: every CSV in the human substitutions folder gets a
# wild type row and is written to modified_subs_path with a 'modified_' prefix.
for filename in os.listdir(human_DMS_samples_with_subs_path):
    if filename.endswith('.csv'):
        add_wild_type_row_and_save(human_only_DMS_samples_with_subs_info_df, human_DMS_samples_with_subs_path, modified_subs_path, filename, 'subs')

# Process indel samples the same way, writing to modified_indels_path.
for filename in os.listdir(human_DMS_samples_with_indels_path):
    if filename.endswith('.csv'):
        add_wild_type_row_and_save(human_only_DMS_samples_with_indels_info_df, human_DMS_samples_with_indels_path, modified_indels_path, filename, 'indels')

In [5]:
#Define a function to scale all DMS protein fitness scores relative to the cutoff stored in the first row.
def scale_DMS_scores(data, score_column):
    """
    Scale DMS protein fitness scores relative to a cutoff value, where scores are positively scaled if above
    the cutoff and negatively scaled if below.

    The cutoff is the score stored in the row labelled 0 of `data`. Scores at or above
    the cutoff are divided by (max - cutoff), scores below it by (cutoff - min), so the
    cutoff maps to 0, the maximum score to +1 and the minimum score to -1.

    Parameters:
        data: DataFrame holding the scores; a 'Scaled_<score_column>' column is added in place.
        score_column: name of the column with the raw scores.

    Returns:
        A tuple (data, cutoff_value) with the same DataFrame object and the cutoff used.
    """
    # Identify the cutoff value, which is the first row for each DMS sample.
    cutoff_value = data.loc[0, score_column]

    scores = data[score_column]
    offsets = scores - cutoff_value

    # Distances from the cutoff to both extremes, used as normalisation factors.
    upper_span = scores.max() - cutoff_value
    lower_span = cutoff_value - scores.min()

    # When the cutoff itself is the maximum (or minimum) the span is zero and every
    # score on that side has an offset of exactly zero; dividing by 1 keeps those
    # at 0 instead of producing 0/0 = NaN.
    if upper_span == 0:
        upper_span = 1.0
    if lower_span == 0:
        lower_span = 1.0

    # Scores at or above the cutoff are scaled with the upper span, the rest with the lower span.
    data['Scaled_' + score_column] = np.where(
        offsets >= 0, offsets / upper_span, offsets / lower_span
    )

    return data, cutoff_value

def process_DMS_sample(file_path, score_column, drop_columns, rename_columns=None):
    """
    Loads one DMS sample CSV, scales its scores, labels each row and trims it.

    Rows whose raw score is at or above the cutoff returned by `scale_DMS_scores`
    are labelled 'Benign', all others 'Pathogenic'. Optional column renames are
    applied before `drop_columns` are removed, and the first row (the wild type
    sequence) is dropped from the result.

    Any exception is reported on stdout and turned into a `None` return value so
    that one broken file does not stop a batch run.

    NOTE: a later cell of this notebook defines another function with the same
    name; after that cell has run, this version is no longer reachable.
    """
    def _label(score):
        if score >= cutoff_value:
            return 'Benign'
        return 'Pathogenic'

    try:
        sample = pd.read_csv(file_path, encoding='utf-8')
        sample, cutoff_value = scale_DMS_scores(sample, score_column)
        sample['State'] = sample[score_column].apply(_label)
        if rename_columns:
            sample = sample.rename(columns=rename_columns)
        trimmed = sample.drop(columns=drop_columns)
        # The wild type row sits at position 0 and is not a mutant.
        return trimmed.iloc[1:]
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def process_directory(directory_path, score_column, drop_columns, rename_columns=None):
    """
    Processes every '.csv' file of a directory with `process_DMS_sample`.

    Files are visited in sorted name order: `os.listdir` returns entries in
    arbitrary order, which would make the position of a sample in the returned
    list (and anything numbered from it) differ between runs.

    Returns:
        A list with one processed DataFrame per file; files that failed to
        process (reported as `None`) are left out.
    """
    processed_samples = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(directory_path, filename)
        processed_df = process_DMS_sample(file_path, score_column, drop_columns, rename_columns)
        if processed_df is not None:
            processed_samples.append(processed_df)
    return processed_samples

# Process both folders of modified samples; this replaces the empty placeholder
# lists with one scaled and labelled DataFrame per sample file.
modified_human_DMS_samples_with_subs_list = process_directory(
    modified_subs_path, 
    'DMS_score', 
    drop_columns=['mutant', 'DMS_score_bin'])

# For indels the 'mutant' column is renamed to 'mutated_sequence' so that both
# mutation types expose their sequences under the same column name.
modified_human_DMS_samples_with_indels_list = process_directory(
    modified_indels_path,'DMS_score',
    drop_columns=['DMS_score_bin'],
    rename_columns={'mutant': 'mutated_sequence'})

In [6]:
#Define a function to plot histograms of the scaled DMS scores, to study their distribution in each study.
def plot_histogram(df, index, output_directory, state_column='State', score_column='Scaled_DMS_score'):
    """
    Saves a histogram of the scaled DMS scores of one sample, coloured by state.

    Parameters:
        df: DataFrame with the score and state columns.
        index: zero-based position of the sample; the plot title and file name use index + 1.
        output_directory: folder the PNG is written to; it is created if missing.
        state_column: column holding 'Pathogenic' / 'Benign'.
        score_column: column holding the scaled scores.
    """
    # savefig does not create missing folders.
    os.makedirs(output_directory, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Using seaborn's histplot for a more integrated approach
    sns.histplot(data=df, x=score_column, hue=state_column,
                 palette={"Pathogenic": "red", "Benign": "green"},
                 bins=50, alpha=0.6, ax=ax)

    ax.set_title(f'Distribution of Scaled DMS Scores - Sample {index+1}')
    ax.set_xlabel('Scaled DMS Score')
    ax.set_ylabel('Frequency')

    fig.savefig(f'{output_directory}/Histogram_{index+1}.png')
    # Close explicitly: this function is called once per sample in a loop.
    plt.close(fig)

#Specify the root directory for all plots (the trailing slash is relied upon by later cells).
plots_directory = '/Volumes/Intenso_SSD/Georgakopoulos_lab/nullp_pathogenesis/DMS/Plots/'

#Specify the output directories for each mutation case.
subs_output_directory = os.path.join(plots_directory, 'Subs')
indels_output_directory = os.path.join(plots_directory, 'Indels')

#Make sure the plot folders exist; savefig does not create missing directories.
os.makedirs(subs_output_directory, exist_ok=True)
os.makedirs(indels_output_directory, exist_ok=True)

#Plot one histogram per sample for each mutation case.
for i, df in enumerate(modified_human_DMS_samples_with_subs_list):
    plot_histogram(df, i, subs_output_directory)

for i, df in enumerate(modified_human_DMS_samples_with_indels_list):
    plot_histogram(df, i, indels_output_directory)



In [7]:
# Build the Aho-Corasick automaton used to search for one nullpeptide category.
def build_nullpeptide_automaton(nullpeptides):
    """
    Builds an Aho-Corasick automaton that holds the given nullpeptides.

    Every nullpeptide is added as a key whose stored value is the nullpeptide
    itself, so a search hit hands back the matched peptide string.
    """
    trie = ahocorasick.Automaton()
    for peptide in nullpeptides:
        # Key and stored value are the same string.
        trie.add_word(peptide, peptide)
    # Convert the filled trie into a searchable automaton.
    trie.make_automaton()
    return trie

def read_and_build_automatons(nullpeptides_path, nullpeptide_lengths):
    """
    Reads nullpeptides of specific lengths and builds corresponding automatons.

    For every length the file
    'nullpeptides_<length>amino_acids_gencode.v43.pc_translations.txt' inside
    `nullpeptides_path` is read as CSV and its 'nullpeptides' column is turned
    into an automaton.

    Parameters:
        nullpeptides_path: folder containing the nullpeptide files, with or
            without a trailing path separator.
        nullpeptide_lengths: iterable of peptide lengths to load.

    Returns:
        A dictionary mapping 'automaton_<length>mer' to the built automaton.
        Lengths whose file could not be processed are reported on stdout and
        are missing from the dictionary.
    """
    automatons = {} # Initialize an empty dictionary to store automatons.

    # Loop through each specified nullpeptide length.
    for nullpeptide_length in nullpeptide_lengths:
        try:
            # os.path.join works whether or not the folder ends with a separator.
            file_path = os.path.join(
                nullpeptides_path,
                f"nullpeptides_{nullpeptide_length}amino_acids_gencode.v43.pc_translations.txt",
            )

            # Read the nullpeptide sequences from the file into a pandas DataFrame.
            nullpeptides_df = pd.read_csv(file_path)

            # Convert the 'nullpeptides' column of the DataFrame into a list.
            nullpeptides = nullpeptides_df['nullpeptides'].tolist()

            # Build an automaton for the current list of nullpeptides.
            automatons[f'automaton_{nullpeptide_length}mer'] = build_nullpeptide_automaton(nullpeptides)

        # Catch and report any errors that occur during the process.
        except Exception as e:
            print(f"Error processing nullpeptides of length {nullpeptide_length}: {e}")

    return automatons

nullpeptide_lengths = [5, 6] # Nullpeptide lengths (in amino acids) to analyse.

# Create one automaton per length; the dictionary keys are 'automaton_5mer' and 'automaton_6mer'.
automatons = read_and_build_automatons(nullpeptides_path, nullpeptide_lengths)

In [8]:
# Functions that search the mutated sequences of the DMS experiments for nullpeptides.
def find_nullpeptides(sequence, nullpeptide_automaton):
    """
    Returns every nullpeptide occurrence the automaton finds in `sequence`.

    Each hit is reported as a tuple (start_index, end_index, pattern); both
    indices are inclusive positions within `sequence`.
    """
    # The automaton yields the index of the last matched character, so the start
    # is derived from the pattern length.
    return [
        (last_index - len(hit) + 1, last_index, hit)
        for last_index, hit in nullpeptide_automaton.iter(sequence)
    ]

def process_DMS_sample(DMS_sample, nullpeptide_automaton, automaton_key):
    """
    Searches DMS samples for present nullpeptides in mutated sequences.

    Two columns are added to `DMS_sample` in place, named after the length tag
    taken from `automaton_key` (e.g. 'automaton_5mer' -> '5mer'):
      * '<tag>_Nullpeptides': the matched nullpeptides joined by ', ' ('' if none),
      * '<tag>_Nullpeptides_Counts': the number of matches.

    Results are assigned by row position, so the function also works when the
    index of `DMS_sample` contains repeated labels.

    NOTE: this definition shares its name with the file-processing function
    defined in an earlier cell and replaces it once this cell has run.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Determine the nullpeptide length from the automaton key
    length = automaton_key.split('_')[1]

    # Column names for nullpeptides and their counts
    found_nullpeptides_col = f'{length}_Nullpeptides'
    found_nullpeptides_counts_col = f'{length}_Nullpeptides_Counts'

    found_nullpeptides = []
    found_counts = []

    # Scan each sequence once and collect the results in row order.
    for sequence in tqdm(DMS_sample['mutated_sequence'], total=len(DMS_sample), desc=f"Processing for {length}"):
        matches = find_nullpeptides(sequence, nullpeptide_automaton)
        found_nullpeptides.append(', '.join(match[2] for match in matches))
        found_counts.append(len(matches))

    # Assign whole columns at once instead of writing cell by cell.
    DMS_sample[found_nullpeptides_col] = np.array(found_nullpeptides, dtype=object)
    DMS_sample[found_nullpeptides_counts_col] = np.array(found_counts, dtype='int64')

    return DMS_sample

def thorough_nullpeptide_search(samples_list, automatons):
    """
    Stacks all samples of one DMS experiment category and searches them for nullpeptides.

    The samples are concatenated with a fresh index and ordered by 'State' and
    'DMS_score', both descending. Every automaton in `automatons` then adds its
    own nullpeptide and count columns via `process_DMS_sample`. Finally the
    per-length results are merged into 'Total_Nullpeptides' (non-empty entries
    joined by ', ') and 'Total_Nullpeptide_Counts' (sum of the count columns).
    """
    def _join_non_empty(row):
        return ', '.join(value for value in row if value)

    combined = pd.concat(samples_list, ignore_index=True).sort_values(
        by=['State', 'DMS_score'], ascending=[False, False]
    )

    # Run each automaton and remember which columns it contributes.
    peptide_columns = []
    count_columns = []
    for automaton_key, automaton in automatons.items():
        combined = process_DMS_sample(combined, automaton, automaton_key)
        length_tag = automaton_key.split("_")[1]
        peptide_columns.append(f'{length_tag}_Nullpeptides')
        count_columns.append(f'{length_tag}_Nullpeptides_Counts')

    # Merge the per-length peptide lists and add up the per-length counts.
    joined = combined[peptide_columns].apply(_join_non_empty, axis=1)
    combined['Total_Nullpeptides'] = joined.str.strip(', ')
    combined['Total_Nullpeptide_Counts'] = combined[count_columns].sum(axis=1)

    return combined


In [9]:
#Run the nullpeptide search separately for the substitution and the indel samples.
total_human_DMS_samples_with_subs_df = thorough_nullpeptide_search(modified_human_DMS_samples_with_subs_list, automatons)
total_human_DMS_samples_with_indels_df = thorough_nullpeptide_search(modified_human_DMS_samples_with_indels_list, automatons)
#Stack both results and drop the raw 'DMS_score' column, keeping the scaled score.
#NOTE(review): pd.concat is called without ignore_index, so index labels of the two frames repeat in the combined frame.
total_human_DMS_samples_df = pd.concat([total_human_DMS_samples_with_subs_df, total_human_DMS_samples_with_indels_df]).drop(columns='DMS_score')

Processing for 5mer: 100%|██████████| 485830/485830 [00:15<00:00, 31045.00it/s]
Processing for 6mer: 100%|██████████| 485830/485830 [00:27<00:00, 17859.15it/s]
Processing for 5mer: 100%|██████████| 6967/6967 [00:00<00:00, 35730.58it/s]
Processing for 6mer: 100%|██████████| 6967/6967 [00:00<00:00, 21518.36it/s]


In [10]:
#Plot the distribution of the scaled DMS scores over all samples, coloured by state (pathogenic vs. benign).
plt.figure(figsize=(10, 6))
sns.histplot(data=total_human_DMS_samples_df, x="Scaled_DMS_score", hue="State", palette={"Pathogenic": "red", "Benign": "green"}, bins=100)
plt.title("Distribution of Scaled DMS Scores")
plt.xlabel("Scaled DMS Score")
plt.ylabel("Frequency")
#The figure is written to the plots directory and closed without being shown inline.
plt.savefig(f'{plots_directory}/total_scaled_score_distribution.png')
plt.close()


In [None]:
#Preview the first rows only; the combined frame is too large to render in full.
total_human_DMS_samples_df.head()

In [None]:
#Define a function that plots, for one nullpeptide length, the share of sequences with nullpeptides per state.
def barplot_nullpeptides(DMS_df, nullpeptide_column, output_filename):
    """
    Plots the proportion of sequences that contain at least one nullpeptide, per state.

    For 'Benign' and 'Pathogenic' rows separately, the number of non-null
    'mutated_sequence' entries whose `nullpeptide_column` count is above zero is
    divided by the number of non-null 'mutated_sequence' entries of that state.
    The two proportions are drawn as a bar plot saved to `output_filename`.

    Returns:
        A tuple (benign_proportion, pathogenic_proportion).
    """
    has_nullpeptide = DMS_df[nullpeptide_column] > 0

    #Proportion of sequences with at least one nullpeptide, keyed by state.
    proportions = {}
    for state in ('Benign', 'Pathogenic'):
        in_state = DMS_df['State'] == state
        state_total = DMS_df.loc[in_state, 'mutated_sequence'].count()
        state_with_nullpeptide = DMS_df.loc[in_state & has_nullpeptide, 'mutated_sequence'].count()
        proportions[state] = state_with_nullpeptide / state_total

    fig, ax = plt.subplots(1, 1, figsize=(6, 4))

    ax.bar(list(proportions), list(proportions.values()), color=['blue', 'red'])
    ax.set_ylabel('Proportion with nullpeptides', labelpad=15)
    ax.grid(axis='both', linestyle='-', alpha=0.7)

    plt.tight_layout()
    plt.savefig(output_filename)
    plt.close()

    return proportions['Benign'], proportions['Pathogenic']

# Calculate the proportions and create the bar plots for the combined (substitution + indel) samples,
# once per nullpeptide length. Each result is a (benign, pathogenic) tuple of proportions.
DMS_barplot_5mers = barplot_nullpeptides(total_human_DMS_samples_df, '5mer_Nullpeptides_Counts', f'{plots_directory}5mer_barplot.png')
DMS_barplot_6mers = barplot_nullpeptides(total_human_DMS_samples_df, '6mer_Nullpeptides_Counts', f'{plots_directory}6mer_barplot.png')

In [None]:
#Separate the DataFrame into two based on the 'State' column; both results are
#boolean-mask selections of total_human_DMS_samples_df.
total_human_DMS_samples_df_benign_only = total_human_DMS_samples_df[total_human_DMS_samples_df['State'] == 'Benign']
total_human_DMS_samples_df_pathogenic_only = total_human_DMS_samples_df[total_human_DMS_samples_df['State'] == 'Pathogenic']

In [None]:
#Extract the per-sequence 5mer and 6mer nullpeptide counts of the pathogenic rows.
counts_5mer_pathogenic = total_human_DMS_samples_df_pathogenic_only['5mer_Nullpeptides_Counts']
counts_6mer_pathogenic = total_human_DMS_samples_df_pathogenic_only['6mer_Nullpeptides_Counts']

#Create a function to plot the distribution of per-sequence nullpeptide counts as proportions of all sequences.
def plot_proportion_histogram(data, max_count, group_label, title, xlabel, ylabel, output_filepath):
    """
    Plots which proportion of the entries in `data` has 0, 1, ..., max_count - 1
    nullpeptides, with a final bar that groups every count >= max_count.

    Parameters:
        data: pandas Series of non-negative counts.
        max_count: first count that falls into the grouped last bar.
        group_label: tick label of the grouped last bar (e.g. '3 or more').
        title, xlabel, ylabel: texts for the figure.
        output_filepath: path the figure is saved to.
    """
    total = data.count()

    #Upper edge of the grouped bin. It must lie above max_count even when no value
    #reaches max_count (or data is empty), otherwise the bin edges would not be
    #increasing and np.histogram would reject them.
    observed_max = data.max()
    if pd.isna(observed_max):
        observed_max = max_count
    upper_edge = max(observed_max, max_count) + 1

    #One bin per count below max_count, plus the last bin grouping all counts >= max_count.
    bins = list(range(max_count + 1)) + [upper_edge]

    #Calculate histogram data.
    counts, _ = np.histogram(data, bins=bins)
    proportions = counts / total if total else np.zeros(len(counts))

    #Set bin labels, with the last label being 'max_count or more'.
    bin_labels = [str(count) for count in range(max_count)] + [group_label]

    fig, ax = plt.subplots()
    ax.bar(bin_labels, proportions, color='grey')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    #Rotate the existing tick labels instead of replacing them without fixed ticks.
    ax.tick_params(axis='x', labelrotation=45)

    ax.grid(True, which='both', axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.savefig(output_filepath)
    plt.close(fig)

#5mers: counts of 3 and above are grouped into the last bar.
plot_proportion_histogram(
data = counts_5mer_pathogenic,
max_count = 3, 
group_label = '3 or more',
title = 'Proportion of 5mer Nullpeptides in Pathogenic Data',
xlabel = 'Number of Nullpeptides',
ylabel = 'Proportion',
output_filepath = f'{plots_directory}/pathogenic_nullpeptide_proportion_5mer.png'
)

#6mers: counts of 6 and above are grouped into the last bar.
plot_proportion_histogram(
data = counts_6mer_pathogenic,
max_count = 6, 
group_label = '6 or more',
title = 'Proportion of 6mer Nullpeptides in Pathogenic Data',
xlabel = 'Number of Nullpeptides',
ylabel = 'Proportion',
output_filepath = f'{plots_directory}/pathogenic_nullpeptide_proportion_6mer.png'
)

In [None]:
#Specify the file path the scatter plot is saved to.
scatter_plot = f'{plots_directory}/scatter_plot.png'

#Create the scatter plot of total nullpeptide counts against the scaled DMS score.
#Pathogenic points are drawn first, so benign points are painted on top where they overlap.
plt.figure(figsize=(10, 6))
plt.scatter(total_human_DMS_samples_df_pathogenic_only['Scaled_DMS_score'], total_human_DMS_samples_df_pathogenic_only['Total_Nullpeptide_Counts'], color='red', label='Pathogenic')
plt.scatter(total_human_DMS_samples_df_benign_only['Scaled_DMS_score'], total_human_DMS_samples_df_benign_only['Total_Nullpeptide_Counts'], color='blue', label='Benign')
plt.xlabel('Scaled DMS Score')
plt.ylabel('Total Nullpeptide Counts')
plt.title('Nullpeptides vs. Scaled DMS Score')
plt.legend()
plt.savefig(scatter_plot)
plt.close()

In [None]:
#Define a function to plot how often the top 10 nullpeptides are present in each state.
def plot_nullpeptide_counts(df, column_name, save_file, top_n=10):
    """
    Plots the `top_n` most frequent nullpeptides of the pathogenic and of the
    benign sequences side by side and saves the figure to `save_file`.

    Parameters:
        df: DataFrame with a 'State' column and the nullpeptide column.
        column_name: column holding the found nullpeptides as one comma-separated
            string per row ('' for rows without any nullpeptide).
        save_file: path of the output image.
        top_n: number of nullpeptides shown per state.
    """
    df_pathogenic = df[df['State'] == 'Pathogenic']
    df_benign = df[df['State'] == 'Benign']

    #Function to calculate counts
    def calculate_counts(dataframe):
        #Entries are separated by ', ', so strip the whitespace left after splitting
        #on commas; otherwise ' ABCDE' and 'ABCDE' are counted as different peptides.
        peptides = dataframe[column_name].str.split(',').explode().str.strip()
        #Rows without any nullpeptide hold '' and must not be counted as a peptide.
        peptides = peptides[peptides != '']
        return peptides.value_counts().head(top_n)

    #Calculate counts for each state
    pathogenic_counts = calculate_counts(df_pathogenic)
    benign_counts = calculate_counts(df_benign)

    #Preparing data for plotting
    plot_data_pathogenic = pathogenic_counts.reset_index()
    plot_data_benign = benign_counts.reset_index()
    plot_data_pathogenic.columns = ['Nullpeptide', 'Count']
    plot_data_benign.columns = ['Nullpeptide', 'Count']

    #Plotting
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    sns.barplot(ax=axes[0], x='Count', y='Nullpeptide', data=plot_data_pathogenic, palette="viridis")
    axes[0].set_title('Top Nullpeptides Counts in Pathogenic Sequences')
    axes[0].set_xlabel('Count')
    axes[0].set_ylabel('Nullpeptides')

    sns.barplot(ax=axes[1], x='Count', y='Nullpeptide', data=plot_data_benign, palette="mako")
    axes[1].set_title('Top Nullpeptides Counts in Benign Sequences')
    axes[1].set_xlabel('Count')
    axes[1].set_ylabel('Nullpeptides')

    plt.tight_layout()
    plt.savefig(save_file)
    plt.close(fig)

#Plot the most frequent nullpeptides per state, once for the 5mers and once for the 6mers.
plot_nullpeptide_counts(total_human_DMS_samples_df, '5mer_Nullpeptides', save_file = f'{plots_directory}/5mer_nullpeptide_counts.png')
plot_nullpeptide_counts(total_human_DMS_samples_df, '6mer_Nullpeptides', save_file = f'{plots_directory}/6mer_nullpeptide_counts.png')


In [None]:
#Define a function to plot how often the top 10 nullpeptides that occur in only one state are present.
def plot_unique_nullpeptides_by_state(df, column_name, save_file, top_n=10):
    """
    Plots the `top_n` most frequent nullpeptides that occur exclusively in
    pathogenic sequences and exclusively in benign sequences, side by side.

    Parameters:
        df: DataFrame with a 'State' column and the nullpeptide column.
        column_name: column holding the found nullpeptides as one comma-separated
            string per row ('' for rows without any nullpeptide).
        save_file: path of the output image.
        top_n: number of nullpeptides shown per state.
    """
    def explode_peptides(dataframe):
        #Entries are separated by ', ', so strip the whitespace left after splitting
        #on commas; otherwise ' ABCDE' and 'ABCDE' are treated as different peptides,
        #which breaks the set comparison between the states.
        peptides = dataframe[column_name].str.split(',').explode().str.strip()
        #Drop missing values and the '' placeholder of rows without nullpeptides.
        return peptides[peptides.notna() & (peptides != '')]

    pathogenic_peptides = explode_peptides(df[df['State'] == 'Pathogenic'])
    benign_peptides = explode_peptides(df[df['State'] == 'Benign'])

    pathogenic_nullpeptides = set(pathogenic_peptides)
    benign_nullpeptides = set(benign_peptides)

    #Identify unique nullpeptides for each state
    unique_pathogenic = pathogenic_nullpeptides - benign_nullpeptides
    unique_benign = benign_nullpeptides - pathogenic_nullpeptides

    #Count occurrences of the nullpeptides
    pathogenic_counts = pathogenic_peptides.value_counts()
    benign_counts = benign_peptides.value_counts()

    #Filter the counts for unique nullpeptides
    pathogenic_unique_counts = pathogenic_counts[pathogenic_counts.index.isin(unique_pathogenic)]
    benign_unique_counts = benign_counts[benign_counts.index.isin(unique_benign)]

    #Prepare plotting data
    plot_data_pathogenic = pathogenic_unique_counts.head(top_n).reset_index()
    plot_data_benign = benign_unique_counts.head(top_n).reset_index()
    plot_data_pathogenic.columns = ['Nullpeptide', 'Count']
    plot_data_benign.columns = ['Nullpeptide', 'Count']

    #Plotting
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    sns.barplot(ax=axes[0], x='Count', y='Nullpeptide', data=plot_data_pathogenic, palette="viridis")
    axes[0].set_title('Top Unique Nullpeptides in Pathogenic State')
    axes[0].set_xlabel('Count')
    axes[0].set_ylabel('Nullpeptides')

    sns.barplot(ax=axes[1], x='Count', y='Nullpeptide', data=plot_data_benign, palette="mako")
    axes[1].set_title('Top Unique Nullpeptides in Benign State')
    axes[1].set_xlabel('Count')
    axes[1].set_ylabel('Nullpeptides')

    plt.tight_layout()
    plt.savefig(save_file)
    plt.close(fig)

#Plot the state-exclusive nullpeptides, once for the 5mers and once for the 6mers.
plot_unique_nullpeptides_by_state(total_human_DMS_samples_df, '5mer_Nullpeptides', save_file = f'{plots_directory}/unique_5mer_nullpeptide_counts.png')
plot_unique_nullpeptides_by_state(total_human_DMS_samples_df, '6mer_Nullpeptides', save_file = f'{plots_directory}/unique_6mer_nullpeptide_counts.png')


In [None]:
#Define a function to plot the top 10 nullpeptides found in the sequences with the lowest and the highest Scaled DMS scores.
def plot_critical_nullpeptides_by_dms_score(df, column_name, save_file, top_n=10, percentile=10):
    """
    Plots the most frequent nullpeptides among the most extreme sequences of each state.

    For pathogenic rows with a negative 'Scaled_DMS_score' the bottom `percentile`
    percent of scores is kept; for benign rows with a positive score the top
    `percentile` percent. The `top_n` most frequent nullpeptides of each selection
    are drawn side by side and saved to `save_file`.

    Parameters:
        df: DataFrame with 'State', 'Scaled_DMS_score' and the nullpeptide column.
        column_name: column holding the found nullpeptides as one comma-separated
            string per row ('' for rows without any nullpeptide).
        save_file: path of the output image.
        top_n: number of nullpeptides shown per panel.
        percentile: size of each tail in percent.
    """
    def count_peptides(dataframe):
        #Entries are separated by ', ', so strip the whitespace left after splitting
        #on commas; otherwise ' ABCDE' and 'ABCDE' are counted as different peptides.
        peptides = dataframe[column_name].str.split(',').explode().str.strip()
        #Rows without any nullpeptide hold '' and must not be counted as a peptide.
        return peptides[peptides != ''].value_counts()

    df_pathogenic = df[(df['State'] == 'Pathogenic') & (df['Scaled_DMS_score'] < 0)]
    df_benign = df[(df['State'] == 'Benign') & (df['Scaled_DMS_score'] > 0)]

    #Determine the bottom and top percentiles
    low_threshold_pathogenic = df_pathogenic['Scaled_DMS_score'].quantile(percentile / 100.0)
    high_threshold_benign = df_benign['Scaled_DMS_score'].quantile(1 - percentile / 100.0)

    #Filter based on these thresholds
    df_critical_pathogenic = df_pathogenic[df_pathogenic['Scaled_DMS_score'] <= low_threshold_pathogenic]
    df_critical_benign = df_benign[df_benign['Scaled_DMS_score'] >= high_threshold_benign]

    #Count occurrences of nullpeptides in these filtered dataframes
    pathogenic_counts = count_peptides(df_critical_pathogenic)
    benign_counts = count_peptides(df_critical_benign)

    #Prepare plotting data
    plot_data_pathogenic = pathogenic_counts.head(top_n).reset_index()
    plot_data_benign = benign_counts.head(top_n).reset_index()
    plot_data_pathogenic.columns = ['Nullpeptide', 'Count']
    plot_data_benign.columns = ['Nullpeptide', 'Count']

    #Plotting
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    sns.barplot(ax=axes[0], x='Count', y='Nullpeptide', data=plot_data_pathogenic, palette="viridis")
    axes[0].set_title(f'Top Nullpeptides in Bottom {percentile}% of Pathogenic Scores')
    axes[0].set_xlabel('Count')
    axes[0].set_ylabel('Nullpeptides')
    sns.barplot(ax=axes[1], x='Count', y='Nullpeptide', data=plot_data_benign, palette="mako")
    axes[1].set_title(f'Top Nullpeptides in Top {percentile}% of Benign Scores')
    axes[1].set_xlabel('Count')
    axes[1].set_ylabel('Nullpeptides')

    plt.tight_layout()
    plt.savefig(save_file)
    plt.close(fig)

#Plot the nullpeptides of the 10% most extreme scores per state, once for the 5mers and once for the 6mers.
plot_critical_nullpeptides_by_dms_score(total_human_DMS_samples_df, '5mer_Nullpeptides', save_file = f'{plots_directory}/critical_5mer_nullpeptide_counts.png', top_n=10, percentile=10)
plot_critical_nullpeptides_by_dms_score(total_human_DMS_samples_df, '6mer_Nullpeptides', save_file = f'{plots_directory}/critical_6mer_nullpeptide_counts.png', top_n=10, percentile=10)