In [1]:
# Standard imports
import os
import shutil

# Imports for data handling
import numpy as np
import pandas as pd

# Imports for efficient string matching
import ahocorasick

# Imports for progress tracking
from tqdm import tqdm

# Imports for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Imports for data processing
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from scipy import stats

In [2]:
# Root folder of the project data. Defaults to the original external-drive
# location; set the NULLP_PATHOGENESIS_DIR environment variable to run the
# notebook on another machine without editing any path below.
project_root = os.environ.get(
    "NULLP_PATHOGENESIS_DIR",
    "/Volumes/Intenso_SSD/Georgakopoulos_lab/nullp_pathogenesis",
).rstrip("/")

# Set working paths. DMS_files_path keeps its trailing slash because the
# paths below (and later cells) are built by plain string concatenation.
DMS_files_path = f"{project_root}/DMS/"
DMS_samples_with_subs_path = f"{DMS_files_path}Substitutions"
DMS_samples_with_indels_path = f"{DMS_files_path}Indels"

# Set nullpeptides path (trailing slash required by read_and_build_automatons).
nullpeptides_path = f"{project_root}/null_peptides/"

# Read files for analysis as pandas dataframes.
DMS_samples_with_subs_info_df = pd.read_csv(f"{DMS_files_path}DMS_substitutions.csv")
DMS_samples_with_indels_info_df = pd.read_csv(f"{DMS_files_path}DMS_indels.csv")

# Keep only Human experiments.
human_only_DMS_samples_with_subs_info_df = DMS_samples_with_subs_info_df[DMS_samples_with_subs_info_df['taxon'] == 'Human']
human_only_DMS_samples_with_indels_info_df = DMS_samples_with_indels_info_df[DMS_samples_with_indels_info_df['taxon'] == 'Human']

# Save Human only analyses to csv (the DataFrame index is written as the first column).
human_only_DMS_samples_with_subs_info_df.to_csv(f"{DMS_files_path}human_only_DMS_substitutions.csv")
human_only_DMS_samples_with_indels_info_df.to_csv(f"{DMS_files_path}human_only_DMS_indels.csv")

# Set paths to save human samples.
human_DMS_samples_with_subs_path = f"{DMS_samples_with_subs_path}/human_samples"
human_DMS_samples_with_indels_path = f"{DMS_samples_with_indels_path}/human_samples"

# Define an output directory for each mutation type.
modified_subs_path = f"{human_DMS_samples_with_subs_path}/modified_human_samples"
modified_indels_path = f"{human_DMS_samples_with_indels_path}/modified_human_samples"

# Specify empty lists to store the dataframe for each sample.
modified_human_DMS_samples_with_subs_list = []
modified_human_DMS_samples_with_indels_list = []

# Root directory for all plots. The trailing slash is kept on purpose: later
# cells build file names as f'{plots_directory}<name>.png'.
plots_directory = f"{DMS_files_path}Plots/"

# Plot output directories for each mutation case (no doubled '/' in the path).
subs_output_directory = f'{plots_directory}Subs'
indels_output_directory = f'{plots_directory}Indels'


In [3]:
def create_directory(path):
    """
    Creates a directory at the specified path if it does not already exist.

    Missing parent directories are created as well. The call is idempotent:
    an already existing directory is left untouched. ``exist_ok=True`` replaces
    the former check-then-create sequence, which could fail if the directory
    appeared between the check and the creation.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def copy_human_samples(source_path, destination_path):
    """
    Copies files containing '_HUMAN_' in their name from the source path to the destination path.

    Only regular files are copied. Sub-directories are skipped even when their
    name contains '_HUMAN_', because shutil.copy cannot copy a directory and
    would abort the whole loop. The destination directory must already exist.
    """
    for filename in os.listdir(source_path):
        if "_HUMAN_" not in filename:
            continue
        source_file = os.path.join(source_path, filename)
        if not os.path.isfile(source_file):
            # Directory (or other non-file entry) that happens to match the name filter.
            continue
        shutil.copy(source_file, os.path.join(destination_path, filename))

# Create every directory the notebook writes to, if it doesn't exist yet.
# The plot directories are included so that the savefig calls in later cells
# do not fail on a fresh machine.
for required_directory in (
    human_DMS_samples_with_subs_path,
    human_DMS_samples_with_indels_path,
    modified_subs_path,
    modified_indels_path,
    plots_directory,
    subs_output_directory,
    indels_output_directory,
):
    create_directory(required_directory)

# Copy human DMS samples to their respective new folders
copy_human_samples(DMS_samples_with_subs_path, human_DMS_samples_with_subs_path)
copy_human_samples(DMS_samples_with_indels_path, human_DMS_samples_with_indels_path)

In [4]:
def add_wild_type_row_and_save(df_info, source_path, destination_path, filename, mutation_type):
    """
    For a given DMS sample file, adds a new row with the wild type sequence and saves the modified DataFrame.

    The wild type row is inserted as the FIRST row. Its 'DMS_score' is set to the
    study's 'DMS_binarization_cutoff' and its 'DMS_score_bin' to 1. The result is
    written to ``destination_path`` as ``modified_<filename>`` without the index.

    Parameters:
        df_info: DataFrame with the columns 'DMS_filename', 'target_seq' and
            'DMS_binarization_cutoff'; the first row matching ``filename`` is used.
        source_path: directory containing ``filename``.
        destination_path: existing directory receiving the modified CSV.
        filename: name of the DMS sample CSV.
        mutation_type: 'subs' (wild type sequence stored in 'mutated_sequence',
            'mutant' set to 'Wild Type') or 'indels' (wild type sequence stored
            in 'mutant').

    Returns:
        None. Nothing is written when ``df_info`` has no entry for ``filename``.

    Raises:
        ValueError: if ``mutation_type`` is neither 'subs' nor 'indels'.
    """
    # Validate first; an unknown type previously failed later with an UnboundLocalError.
    if mutation_type not in ('subs', 'indels'):
        raise ValueError(f"mutation_type must be 'subs' or 'indels', got {mutation_type!r}")

    # Extract information based on the DMS filename
    DMS_file_info = df_info[df_info['DMS_filename'] == filename]
    if DMS_file_info.empty:
        return

    target_seq = DMS_file_info['target_seq'].values[0]
    DMS_binarization_cutoff = DMS_file_info['DMS_binarization_cutoff'].values[0]

    DMS_df = pd.read_csv(os.path.join(source_path, filename))

    # Define the wild type row to be added
    if mutation_type == 'subs':
        new_row = {'mutant': 'Wild Type', 'mutated_sequence': target_seq, 'DMS_score': DMS_binarization_cutoff, 'DMS_score_bin': 1}
    else:
        new_row = {'mutant': target_seq, 'DMS_score': DMS_binarization_cutoff, 'DMS_score_bin': 1}

    # The wild type row goes first; later processing reads the reference score from row 0.
    DMS_df = pd.concat([pd.DataFrame([new_row]), DMS_df], ignore_index=True)

    # Save the updated DataFrame to a new CSV file
    modified_filename = os.path.join(destination_path, f'modified_{filename}')
    DMS_df.to_csv(modified_filename, index=False)


# Add the wild type row to every human sample CSV, first for the substitution
# samples and then for the indel samples.
for info_df, samples_directory, modified_directory, mutation_type in (
    (human_only_DMS_samples_with_subs_info_df, human_DMS_samples_with_subs_path, modified_subs_path, 'subs'),
    (human_only_DMS_samples_with_indels_info_df, human_DMS_samples_with_indels_path, modified_indels_path, 'indels'),
):
    for filename in os.listdir(samples_directory):
        if not filename.endswith('.csv'):
            continue
        add_wild_type_row_and_save(info_df, samples_directory, modified_directory, filename, mutation_type)

In [5]:
#Define a function to scale all DMS protein fitness scores to specific scale.
def scale_DMS_scores(data, score_column):
    """
    Calculate a modified Z-score for each mutation, using the score stored in the first row (the wild type row)
    instead of the mean. This centers the data around the wild type score and expresses every score as a
    deviation from it, divided by the root-mean-square deviation from that score.

    Parameters:
        data: DataFrame whose FIRST row holds the wild type (reference) score. It is modified in place:
            a column named 'Scaled_<score_column>' is added.
        score_column: name of the column holding the raw scores.

    Returns:
        (data, wild_type_score): the same DataFrame object and the reference score taken from the first row.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("Cannot scale DMS scores of an empty DataFrame: the wild type row is missing.")

    # Positional access: the wild type row is the first row whatever the index labels are.
    wild_type_score = data[score_column].iloc[0]

    # Deviations from the wild type score
    deviations = data[score_column] - wild_type_score

    # "Custom deviation": like a standard deviation, but centered on the wild type score
    custom_deviation = np.sqrt((deviations ** 2).mean())

    if custom_deviation == 0:
        # Every score equals the wild type score; dividing would give 0/0 = NaN.
        # Multiplying by 0.0 yields 0.0 while keeping missing scores missing.
        data['Scaled_' + score_column] = deviations * 0.0
    else:
        data['Scaled_' + score_column] = deviations / custom_deviation

    return data, wild_type_score

def process_DMS_sample(file_path, score_column, drop_columns, rename_columns=None):
    """
    Loads one modified DMS sample CSV and prepares it for the analysis.

    Steps: scale the scores relative to the first (wild type) row, label every
    row 'Benign' when its raw score is at least the wild type score and
    'Pathogenic' otherwise, optionally rename columns, drop ``drop_columns``
    and finally remove the wild type row itself.

    Returns the processed DataFrame, or None (after printing the error) when
    anything goes wrong while reading or processing the file.
    """
    try:
        sample = pd.read_csv(file_path, encoding='utf-8')
        sample, cutoff_value = scale_DMS_scores(sample, score_column)

        # Scores at or above the wild type score count as benign.
        reaches_cutoff = sample[score_column].ge(cutoff_value)
        sample['State'] = reaches_cutoff.map({True: 'Benign', False: 'Pathogenic'})

        if rename_columns:
            sample = sample.rename(columns=rename_columns)

        # Remove the helper columns, then the leading wild type row.
        return sample.drop(columns=drop_columns).iloc[1:]
    except Exception as error:
        print(f"Error processing {file_path}: {error}")
        return None

def process_directory(directory_path, score_column, drop_columns, rename_columns=None):
    """
    Processes every '.csv' file of a directory with process_DMS_sample.

    Files are visited in sorted name order. os.listdir returns entries in
    arbitrary order, which made the position of a sample in the returned list
    (and therefore the 'Sample N' numbering of the histograms) differ between runs.

    Returns:
        list of processed DataFrames; files for which process_DMS_sample
        returned None are left out.
    """
    processed_samples = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(directory_path, filename)
        processed_df = process_DMS_sample(file_path, score_column, drop_columns, rename_columns)
        if processed_df is not None:
            processed_samples.append(processed_df)
    return processed_samples

# Build the per-sample DataFrame lists for both mutation types.
# Substitution files already carry a 'mutated_sequence' column, so 'mutant' is dropped.
modified_human_DMS_samples_with_subs_list = process_directory(
    directory_path=modified_subs_path,
    score_column='DMS_score',
    drop_columns=['mutant', 'DMS_score_bin'],
)

# Indel files keep their sequence in 'mutant'; rename it to match the substitution samples.
modified_human_DMS_samples_with_indels_list = process_directory(
    directory_path=modified_indels_path,
    score_column='DMS_score',
    drop_columns=['DMS_score_bin'],
    rename_columns={'mutant': 'mutated_sequence'},
)

In [6]:
# Define a function that plots histograms of the raw and scaled DMS scores, to study their distribution in each study.
def plot_histogram(df, index, output_directory, state_column='State', score_column='DMS_score', scaled_score_column='Scaled_DMS_score'):
    """
    Saves a two-panel histogram (raw scores on the left, scaled scores on the right) for one DMS sample.

    Bars are colored by ``state_column`` (red = 'Pathogenic', green = 'Benign').
    The figure is written to ``output_directory`` as
    'Histogram_Comparison_<index+1>.png'; the directory is created when missing.
    The figure is closed afterwards, so nothing is displayed inline.

    Parameters:
        df: DataFrame containing ``state_column``, ``score_column`` and ``scaled_score_column``.
        index: zero-based sample number, shown one-based in titles and file name.
        output_directory: directory receiving the PNG file.
    """
    state_palette = {"Pathogenic": "red", "Benign": "green"}
    panels = (
        (score_column, f'Non-Scaled DMS Scores - Sample {index+1}', 'DMS Score'),
        (scaled_score_column, f'Scaled DMS Scores - Sample {index+1}', 'Scaled DMS Score'),
    )

    # Explicit figure/axes instead of the pyplot state machine.
    fig, axes = plt.subplots(1, 2, figsize=(20, 6))
    for ax, (column, title, x_label) in zip(axes, panels):
        sns.histplot(data=df, x=column, hue=state_column,
                     palette=state_palette,
                     bins=50, alpha=0.6, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Frequency')  # both panels show counts

    # Save the figure; make sure the target directory exists first.
    os.makedirs(output_directory, exist_ok=True)
    fig.savefig(f'{output_directory}/Histogram_Comparison_{index+1}.png')
    plt.close(fig)

# One histogram figure per processed sample, written to the directory of its mutation type.
for sample_list, histogram_directory in (
    (modified_human_DMS_samples_with_subs_list, subs_output_directory),
    (modified_human_DMS_samples_with_indels_list, indels_output_directory),
):
    for i, df in enumerate(sample_list):
        plot_histogram(df, i, histogram_directory)

In [7]:
# Define function to build the automaton for the nullpeptide categories.
def build_nullpeptide_automaton(nullpeptides):
    """
    Builds an Aho-Corasick automaton containing the given nullpeptides.

    Every nullpeptide is stored as both key and value, so a search hit yields
    the matched nullpeptide itself. The automaton is finalized before it is returned.
    """
    automaton = ahocorasick.Automaton()
    for peptide in nullpeptides:
        # key = pattern to look for, value = what a match reports
        automaton.add_word(peptide, peptide)
    automaton.make_automaton()
    return automaton

def read_and_build_automatons(nullpeptides_path, nullpeptide_lengths):
    """
    Reads nullpeptides of specific lengths and builds corresponding automatons.

    For every length ``n`` the file
    'nullpeptides_<n>amino_acids_gencode.v43.pc_translations.txt' is read from
    ``nullpeptides_path`` as a CSV with a 'nullpeptides' column.

    Parameters:
        nullpeptides_path: directory holding the nullpeptide files, with or
            without a trailing path separator.
        nullpeptide_lengths: iterable of nullpeptide lengths to load.

    Returns:
        dict mapping 'automaton_<n>mer' to the automaton built for length ``n``.
        A length whose file cannot be read or processed is reported on stdout
        and left out of the dictionary.
    """
    automatons = {} # Initialize an empty dictionary to store automatons.

    # Loop through each specified nullpeptide length.
    for nullpeptide_length in nullpeptide_lengths:
        try:
            # os.path.join instead of string concatenation: the former silently
            # produced a wrong path when the directory had no trailing separator.
            file_path = os.path.join(
                nullpeptides_path,
                f"nullpeptides_{nullpeptide_length}amino_acids_gencode.v43.pc_translations.txt",
            )

            # Read the nullpeptide sequences from the file into a pandas DataFrame.
            nullpeptides_df = pd.read_csv(file_path)

            # Convert the 'nullpeptides' column of the DataFrame into a list.
            nullpeptides = nullpeptides_df['nullpeptides'].tolist()

            # Build an automaton for the current list of nullpeptides.
            automatons[f'automaton_{nullpeptide_length}mer'] = build_nullpeptide_automaton(nullpeptides)

        # Best effort: report the failure and continue with the next length.
        except Exception as e:
            print(f"Error processing nullpeptides of length {nullpeptide_length}: {e}")

    return automatons

nullpeptide_lengths = [5, 6] # Specify nullpeptide length.

# Create the nullpeptides automatons.
automatons = read_and_build_automatons(nullpeptides_path, nullpeptide_lengths)

# read_and_build_automatons only prints a message when a file cannot be loaded.
# Stop here in that case: otherwise the failure only shows up much later as a
# missing '<n>mer_Nullpeptides' column.
missing_automatons = [
    f'automaton_{length}mer'
    for length in nullpeptide_lengths
    if f'automaton_{length}mer' not in automatons
]
if missing_automatons:
    raise RuntimeError(f"Could not build the nullpeptide automatons: {', '.join(missing_automatons)}")

In [8]:
# Define functions to search the presence of each nullpeptide in the mutated sequences of the DMS experiments.
def find_nullpeptides(sequence, nullpeptide_automaton):
    """
    Returns every nullpeptide occurrence found in ``sequence`` by the automaton.

    Each hit is reported as a tuple (start_index, end_index, pattern), where
    both indices are inclusive positions within ``sequence``. The automaton
    reports the end position of a hit; the start is derived from the pattern length.
    """
    return [
        (end_index - len(pattern) + 1, end_index, pattern)
        for end_index, pattern in nullpeptide_automaton.iter(sequence)
    ]

def process_DMS_sample(DMS_sample, nullpeptide_automaton, automaton_key):
    """
    Searches DMS samples for present nullpeptides in mutated sequences.

    Two columns are added to ``DMS_sample`` (in place), named after the second
    '_'-separated part of ``automaton_key`` (e.g. '5mer' for 'automaton_5mer'):
      * '<length>_Nullpeptides': matched nullpeptides joined with ', '
        ('' when the sequence contains none);
      * '<length>_Nullpeptides_Counts': number of matches.

    The columns are built in row order and assigned as a whole. Writing cell by
    cell through the index label would overwrite the wrong rows whenever index
    labels are not unique, and is considerably slower.

    NOTE: this definition reuses the name of the CSV-loading function defined
    in an earlier cell and replaces it in the notebook namespace. Re-run that
    earlier cell before calling process_directory again.

    Parameters:
        DMS_sample: DataFrame with a 'mutated_sequence' column.
        nullpeptide_automaton: automaton passed on to find_nullpeptides.
        automaton_key: key of the form 'automaton_<length>'.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Determine the nullpeptide length from the automaton key
    length = automaton_key.split('_')[1]

    # Column names for nullpeptides and their counts
    found_nullpeptides_col = f'{length}_Nullpeptides'
    found_nullpeptides_counts_col = f'{length}_Nullpeptides_Counts'

    joined_nullpeptides = []
    nullpeptide_counts = []
    for sequence in tqdm(DMS_sample['mutated_sequence'], total=len(DMS_sample), desc=f"Processing for {length}"):
        matches = find_nullpeptides(sequence, nullpeptide_automaton)
        # match = (start_index, end_index, pattern); an empty match list joins to ''
        joined_nullpeptides.append(', '.join(match[2] for match in matches))
        nullpeptide_counts.append(len(matches))

    DMS_sample[found_nullpeptides_col] = joined_nullpeptides
    # Explicit integer dtype so that an empty sample still gets an integer column.
    DMS_sample[found_nullpeptides_counts_col] = np.array(nullpeptide_counts, dtype=np.int64)

    return DMS_sample

def thorough_nullpeptide_search(samples_list, automatons):
    """
    Merges the samples of one DMS experiment category and annotates them with nullpeptide hits.

    The samples are concatenated, sorted by 'State' and 'DMS_score' (both
    descending) and searched once per automaton. Two summary columns are added:
    'Total_Nullpeptides' (all non-empty per-length hit strings joined with ', ')
    and 'Total_Nullpeptide_Counts' (sum of the per-length counts).
    """
    def join_non_empty(row):
        # Skip the empty strings of lengths without any hit.
        return ', '.join(value for value in row if value)

    # Concatenate and sort the samples DataFrame
    combined_df = pd.concat(samples_list, ignore_index=True)
    combined_df = combined_df.sort_values(by=['State', 'DMS_score'], ascending=[False, False])

    # One search pass per nullpeptide category
    for key, automaton in automatons.items():
        combined_df = process_DMS_sample(combined_df, automaton, key)

    # Column prefixes ('5mer', '6mer', ...) derived from the automaton keys
    length_labels = [key.split("_")[1] for key in automatons.keys()]

    sequence_columns = [f'{label}_Nullpeptides' for label in length_labels]
    joined_hits = combined_df[sequence_columns].apply(join_non_empty, axis=1)
    combined_df['Total_Nullpeptides'] = joined_hits.str.strip(', ')

    count_columns = [f'{label}_Nullpeptides_Counts' for label in length_labels]
    combined_df['Total_Nullpeptide_Counts'] = combined_df[count_columns].sum(axis=1)

    return combined_df


In [9]:
# Search the substitution and the indel samples for nullpeptides.
total_human_DMS_samples_with_subs_df = thorough_nullpeptide_search(modified_human_DMS_samples_with_subs_list, automatons)
total_human_DMS_samples_with_indels_df = thorough_nullpeptide_search(modified_human_DMS_samples_with_indels_list, automatons)

# Stack both categories, drop the raw score and renumber the rows.
total_human_DMS_samples_df = (
    pd.concat([total_human_DMS_samples_with_subs_df, total_human_DMS_samples_with_indels_df])
    .drop(columns='DMS_score')
    .reset_index(drop=True)
)

Processing for 5mer: 100%|██████████| 485830/485830 [00:15<00:00, 31045.00it/s]
Processing for 6mer: 100%|██████████| 485830/485830 [00:27<00:00, 17859.15it/s]
Processing for 5mer: 100%|██████████| 6967/6967 [00:00<00:00, 35730.58it/s]
Processing for 6mer: 100%|██████████| 6967/6967 [00:00<00:00, 21518.36it/s]


In [10]:
# Try different transformations on the data to find the most appropriate one.
def log_transform(scores):
    """
    Applies a sign-preserving logarithmic transform: sign(x) * ln(1 + |x|).

    Positive scores map to ln(1 + x), negative scores to -ln(1 - x) and zero
    to zero, so the transform is defined for every real score and symmetric
    around zero.

    Parameters:
        scores: a single number or an array-like of numbers (NumPy array, pandas Series).

    Returns:
        The transformed value(s), with the same shape as the input.
    """
    return np.sign(scores) * np.log1p(np.abs(scores))

# Instantiate the two scikit-learn transformers that are compared below:
# a MinMax scaler onto [0, 1] and a Yeo-Johnson power transformer.
scaler = MinMaxScaler(feature_range=(0,1))
pt = PowerTransformer(method='yeo-johnson')

# Single-column frame with the scaled scores, as expected by scikit-learn.
all_scores = total_human_DMS_samples_df[['Scaled_DMS_score']]

# Add one column per candidate transformation (flattened to one dimension).
total_human_DMS_samples_df['MinMax_DMS_score'] = np.ravel(scaler.fit_transform(all_scores))
total_human_DMS_samples_df['YeoJohnson_DMS_score'] = np.ravel(pt.fit_transform(all_scores))
total_human_DMS_samples_df['Log_DMS_score'] = total_human_DMS_samples_df['Scaled_DMS_score'].map(log_transform)



# Plot the distribution of the scaled scores and of each candidate
# transformation in a 2x2 grid, colored by state.
state_palette = {"Pathogenic": "red", "Benign": "green"}

# Position of the dashed reference line in the MinMax panel.
# NOTE(review): 0.385 is presumably where Scaled_DMS_score == 0 (the wild type
# score) lands after MinMax scaling of this data set; confirm, or derive it
# from the fitted scaler.
minmax_reference_line = 0.385

# (axes position, column, title, x label, reference line position)
score_panels = [
    ((0, 0), "MinMax_DMS_score", "MinMax DMS Score Distribution", "MinMax DMS Score", minmax_reference_line),
    ((0, 1), "Scaled_DMS_score", "Distribution of Scaled DMS Scores", "Scaled DMS Score", 0),
    ((1, 0), "Log_DMS_score", "Log-transformed DMS Score Distribution", "Log DMS Score", 0),
    ((1, 1), "YeoJohnson_DMS_score", "Yeo-Johnson DMS Score Distribution", "Yeo-Johnson DMS Score", 0),
]

fig, axs = plt.subplots(2, 2, figsize=(20, 10))

for panel_position, panel_column, panel_title, panel_xlabel, reference_x in score_panels:
    panel_ax = axs[panel_position]
    sns.histplot(data=total_human_DMS_samples_df, x=panel_column, hue="State", palette=state_palette, bins=100, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel("Frequency")  # histplot shows counts by default, not densities
    panel_ax.axvline(x=reference_x, color='k', linestyle='--')  # dashed reference line

# Remove the top/right spines BEFORE saving so that the saved file matches the displayed figure.
sns.despine(fig=fig)
plt.tight_layout()
plt.savefig(f'{plots_directory}different_scaling_distributions.png')
plt.show()

# Remove the exploratory transformation columns and put the remaining columns
# into their final order.
exploratory_score_columns = ['Log_DMS_score', 'MinMax_DMS_score', 'YeoJohnson_DMS_score']
final_column_order = [
    'mutated_sequence',
    'Scaled_DMS_score',
    'State',
    '5mer_Nullpeptides',
    '5mer_Nullpeptides_Counts',
    '6mer_Nullpeptides',
    '6mer_Nullpeptides_Counts',
    'Total_Nullpeptides',
    'Total_Nullpeptide_Counts',
]
total_human_DMS_samples_df = total_human_DMS_samples_df.drop(columns=exploratory_score_columns)[final_column_order]


In [None]:
# Define a function that plots the results as bar plots, reusable for the different nullpeptide lengths.
def barplot_nullpeptides(DMS_df, nullpeptide_column, output_filename):
    """
    Plots, for each state, the proportion of sequences that contain at least one nullpeptide.

    A sequence counts as containing a nullpeptide when ``nullpeptide_column``
    is greater than 0. Proportions are relative to the number of non-missing
    'mutated_sequence' entries of the state. The bar plot is saved to
    ``output_filename`` and the figure is closed.

    Returns:
        (proportion_benign, proportion_pathogenic)
    """
    contains_nullpeptide = DMS_df[nullpeptide_column] > 0

    # State -> proportion, in plotting order (Benign first, then Pathogenic).
    proportion_by_state = {}
    for state_label in ('Benign', 'Pathogenic'):
        in_state = DMS_df['State'] == state_label
        total_count = DMS_df.loc[in_state, 'mutated_sequence'].count()
        with_nullpeptide_count = DMS_df.loc[in_state & contains_nullpeptide, 'mutated_sequence'].count()
        proportion_by_state[state_label] = with_nullpeptide_count / total_count

    fig, axes = plt.subplots(1, 1, figsize=(6, 4))

    axes.bar(list(proportion_by_state), list(proportion_by_state.values()), color=['blue', 'red'])
    axes.set_ylabel('Proportion with nullpeptides', labelpad=15)
    axes.grid(axis='both', linestyle='-', alpha=0.7)

    plt.tight_layout()
    plt.savefig(output_filename)
    plt.close()

    return proportion_by_state['Benign'], proportion_by_state['Pathogenic']

# Proportion of sequences carrying at least one nullpeptide, per nullpeptide
# length, computed over all human DMS samples (substitutions and indels).
DMS_barplot_5mers = barplot_nullpeptides(
    DMS_df=total_human_DMS_samples_df,
    nullpeptide_column='5mer_Nullpeptides_Counts',
    output_filename=f'{plots_directory}5mer_barplot.png',
)
DMS_barplot_6mers = barplot_nullpeptides(
    DMS_df=total_human_DMS_samples_df,
    nullpeptide_column='6mer_Nullpeptides_Counts',
    output_filename=f'{plots_directory}6mer_barplot.png',
)

In [None]:
# Consolidate the extraction and plotting of nullpeptides counts into a single function
def process_and_plot_nullpeptides(dataframe, state, nullpeptide_lengths, plots_directory):
    """
    Plots, for one state, how many nullpeptides the sequences contain, as proportions per count.

    For every length ``n`` in ``nullpeptide_lengths`` the column
    '<n>mer_Nullpeptides_Counts' is binned into 0, 1, ..., max_count - 1 and a
    final '<max_count> or more' bin, where max_count is ``n`` for n <= 5 and
    ``n + 1`` otherwise. One bar plot per length is saved to
    '<plots_directory>/<state lower-cased>_nullpeptide_proportion_<n>mer.png',
    shown and then closed.

    Parameters:
        dataframe: DataFrame with a 'State' column and the count columns.
        state: value of 'State' to plot (e.g. 'Pathogenic').
        nullpeptide_lengths: iterable of integer nullpeptide lengths.
        plots_directory: existing directory receiving the PNG files.
    """
    state_df = dataframe[dataframe['State'] == state]

    for length in nullpeptide_lengths:
        column_name = f'{length}mer_Nullpeptides_Counts'
        per_sequence_counts = state_df[column_name].to_numpy(dtype=int)

        # Define the maximum count for binning based on nullpeptide length
        max_count = length if length <= 5 else length + 1

        # Clip at max_count so that the last bin groups all counts >= max_count.
        # Counting the clipped values works for any data; explicit histogram
        # edges ending in max() + 1 were not monotonic (and raised) whenever
        # the largest observed count was below max_count.
        sequences_per_bin = np.bincount(np.minimum(per_sequence_counts, max_count), minlength=max_count + 1)
        total_sequences = sequences_per_bin.sum()
        if total_sequences:
            proportions = sequences_per_bin / total_sequences
        else:
            # No sequence in this state: plot empty bars instead of dividing by zero.
            proportions = np.zeros(max_count + 1)

        # Set bin labels, with the last label being 'max_count or more'.
        bin_labels = [str(count) for count in range(max_count)] + [f'{max_count} or more']

        # Plotting
        fig, ax = plt.subplots()
        ax.bar(bin_labels, proportions, color='grey')
        ax.set_title(f'Proportion of {length}mer Nullpeptides in {state} Data')
        ax.set_xlabel('Number of Nullpeptides')
        ax.set_ylabel('Proportion')
        ax.set_xticks(range(len(bin_labels)))  # Set tick positions
        ax.set_xticklabels(bin_labels, rotation=45)
        ax.grid(True, which='both', axis='y', linestyle='--', alpha=0.7)

        fig.tight_layout()
        # os.path.join avoids a doubled separator when plots_directory ends with '/'.
        output_filepath = os.path.join(plots_directory, f'{state.lower()}_nullpeptide_proportion_{length}mer.png')
        fig.savefig(output_filepath)
        plt.show()  # plt.show takes no figure argument
        plt.close(fig)  # release the figure; one is created per length

# Nullpeptide lengths whose count distributions are plotted.
nullpeptide_lengths = [5, 6]

# Count distributions for the pathogenic sequences only.
process_and_plot_nullpeptides(total_human_DMS_samples_df, 'Pathogenic', nullpeptide_lengths, plots_directory)

In [None]:
# Split the scaled scores and the total nullpeptide counts by state.
state_values = total_human_DMS_samples_df["State"]
pathogenic_rows = state_values == "Pathogenic"
benign_rows = state_values == "Benign"

pathogenic_scores = total_human_DMS_samples_df.loc[pathogenic_rows, "Scaled_DMS_score"]
benign_scores = total_human_DMS_samples_df.loc[benign_rows, "Scaled_DMS_score"]
pathogenic_counts = total_human_DMS_samples_df.loc[pathogenic_rows, "Total_Nullpeptide_Counts"]
benign_counts = total_human_DMS_samples_df.loc[benign_rows, "Total_Nullpeptide_Counts"]

# Shapiro-Wilk normality test on the scaled scores of each state.
pathogenic_normality = stats.shapiro(pathogenic_scores)
benign_normality = stats.shapiro(benign_scores)

# Report the normality results, one paragraph per group.
for group_header, normality_result in (
    ("Pathogenic group:", pathogenic_normality),
    ("Benign group:", benign_normality),
):
    print(group_header)
    print("Statistic:", normality_result.statistic)
    print("p-value:", normality_result.pvalue)
    print()

# Spearman rank correlation between scaled score and total nullpeptide count, per state.
pathogenic_correlation = stats.spearmanr(pathogenic_scores, pathogenic_counts)
benign_correlation = stats.spearmanr(benign_scores, benign_counts)

# Report the correlation results.
print("Pathogenic group:")
print("Correlation coefficient:", pathogenic_correlation.correlation)
print("p-value:", pathogenic_correlation.pvalue)
print()

print("Benign group:")
print("Correlation coefficient:", benign_correlation.correlation)
print("p-value:", benign_correlation.pvalue)


In [None]:
# Keep only the sequences with at least one nullpeptide, and only the summary columns.
has_any_nullpeptide = total_human_DMS_samples_df['Total_Nullpeptide_Counts'] > 0
per_length_columns = ['mutated_sequence', '5mer_Nullpeptides', '5mer_Nullpeptides_Counts', '6mer_Nullpeptides', '6mer_Nullpeptides_Counts']
filtered_df = total_human_DMS_samples_df[has_any_nullpeptide].drop(columns=per_length_columns)