Test how well the Levenshtein distance to a reference sequence (and, when the sequences are aligned to the reference, the BLOSUM62 substitution score) predicts fitness on the relevant datasets:

In [1]:
# Import required modules
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.SubsMat.MatrixInfo import blosum62
from Levenshtein import distance as levenshtein_distance
from scipy.stats import spearmanr
from sklearn.metrics import ndcg_score
import math

# Mirror every BLOSUM62 entry so that a residue pair can be looked up in
# either order: each score is stored under both (aa1, aa2) and (aa2, aa1).
# NOTE(review): `blosum62` comes from `Bio.SubsMat.MatrixInfo`, which is
# presumably unavailable in recent Biopython releases - confirm the pinned
# Biopython version before re-running this notebook.
SYMMETRIC_BLOSUM = {
    pair: score
    for (aa1, aa2), score in blosum62.items()
    for pair in ((aa1, aa2), (aa2, aa1))
}



# General Functions

In [2]:
def evaluate_task(task_filename, reference_seq,
                  output_dir="/home/Lxc/FLIP_results/BLOSUM62"):
    """Score the test split of one task file against a reference sequence.

    The task CSV must provide the columns ``set``, ``sequence`` and
    ``target``; only rows whose ``set`` equals ``"test"`` are evaluated.

    Two baselines are computed for every test sequence:

    * the Levenshtein distance to ``reference_seq``;
    * the summed BLOSUM62 score against ``reference_seq`` - only when every
      test sequence has the same length as the reference. Otherwise
      ``'seqs not aligned'`` is printed and the BLOSUM part is skipped.

    When BLOSUM scores are available, the predictions and labels are written
    to ``<output_dir>/<prefix>.csv`` (``prefix`` is the part of the task file
    name before the first underscore) and the NDCG over the top 1% of the
    ranking is printed, rounded to two decimals.

    Parameters
    ----------
    task_filename : str
        Path to the task CSV file.
    reference_seq : str
        Reference sequence that every test sequence is compared to.
    output_dir : str, optional
        Directory that receives the BLOSUM prediction CSV. It is created if
        it does not exist. Defaults to the location originally hard-coded in
        this notebook.

    Returns
    -------
    tuple
        ``(l_rho, b_rho)``: Spearman rho between the negated Levenshtein
        distance and the target, and between the BLOSUM62 score and the
        target. ``b_rho`` is ``None`` when the sequences are not aligned.
    """
    # Load the task file and filter to just the test data. Columns are
    # accessed by key because attribute access silently breaks whenever a
    # column name collides with a DataFrame attribute.
    task_df = pd.read_csv(task_filename)
    task_df = task_df.loc[task_df["set"] == "test"].copy()

    all_seqs = task_df["sequence"].tolist()
    targets = task_df["target"].values

    # Levenshtein distance between each sequence and the reference
    # (computed exactly once per sequence).
    levenshteins = np.array([levenshtein_distance(reference_seq, new_seq)
                             for new_seq in all_seqs])

    # Negative levenshtein because we expect a smaller distance to be
    # correlated to larger fitness.
    l_rho, _ = spearmanr(-levenshteins, targets)

    # BLOSUM scoring is position-wise, so it is only defined when every
    # sequence has the same length as the reference.
    refseq_len = len(reference_seq)
    if not all(len(new_seq) == refseq_len for new_seq in all_seqs):
        print('seqs not aligned')
        return l_rho, None

    # Float dtype keeps the values (and the CSV written below) in the same
    # format as the float buffer that was used previously.
    blosum_scores = np.array(
        [sum(SYMMETRIC_BLOSUM[(aa1, aa2)]
             for aa1, aa2 in zip(reference_seq, new_seq))
         for new_seq in all_seqs],
        dtype=float,
    )
    b_rho, _ = spearmanr(blosum_scores, targets)

    data = pd.DataFrame({'Predicted': blosum_scores, 'Labels': targets})

    # Output name = text before the first underscore of the task file name.
    # The extension is stripped first so that a name without any underscore
    # does not end up with a doubled ".csv.csv" suffix.
    # NOTE(review): task files sharing the same prefix (e.g. several
    # "SpCas9_*.csv" files) overwrite the same output file - confirm whether
    # this is intended.
    stem = os.path.splitext(os.path.basename(task_filename))[0]
    filename = stem.split('_')[0] + '.csv'

    # Save the predictions to a CSV file, creating the folder if needed.
    os.makedirs(output_dir, exist_ok=True)
    data.to_csv(os.path.join(output_dir, filename))

    # NDCG over the top 1% of the ranking. Clamp to at least one item:
    # with fewer than 100 rows the floor would be 0, which zeroes every
    # discount term and makes the score a meaningless 0.
    k = max(1, math.floor(len(data) * 0.01))
    ndcg = ndcg_score([data['Labels'].values], [data['Predicted'].values], k=k)
    print(round(ndcg, 2))

    return l_rho, b_rho

def evaluate_tasks(refseq_fileloc, taskfolder, task_to_file_dict):
    """Evaluate several task files against one reference sequence.

    Parameters
    ----------
    refseq_fileloc : str
        FASTA file; its first record is used as the reference sequence.
    taskfolder : str
        Folder that contains the task files.
    task_to_file_dict : dict
        Maps a task name to the file name of that task inside ``taskfolder``.

    Returns
    -------
    list of list
        A header row followed by one ``[task name, Levenshtein rho,
        BLOSUM62 rho]`` row per task, in the iteration order of the dict.
    """
    # Only the first FASTA record is read.
    first_record = next(SeqIO.parse(refseq_fileloc, "fasta"))
    reference_seq = str(first_record.seq)

    header = ["Task", "Levenshtein Rho", "BLOSUM62 Rho"]
    rows = [
        [taskname, *evaluate_task(os.path.join(taskfolder, taskfile),
                                  reference_seq)]
        for taskname, taskfile in task_to_file_dict.items()
    ]
    return [header] + rows

# RhlA

In [11]:
def levenshtein_to_fitness_rhla():
    """Run the Levenshtein / BLOSUM62 baselines on the RhlA task.

    Returns the result table built by ``evaluate_tasks`` (header row plus
    one row per task).
    """
    # Entries that are currently disabled:
    #   "design": "design_task_regression.csv"
    #   "design_reversed": "design_task_reversed_regression.csv"
    #   "natural1": "natural_task_1_regression.csv"
    #   "natural2": "natural_task_2_regression.csv "
    tasks = {'1_vs_rest_sum5a': '1_vs_rest_sum5a.csv'}

    return evaluate_tasks(
        refseq_fileloc="/home/Lxc/FLIP/tasks//RhlA//Rhla.fasta",
        taskfolder="//home/Lxc/FLIP/tasks/RhlA/tasks",
        task_to_file_dict=tasks,
    )

levenshtein_to_fitness_rhla()

[['Task', 'Levenshtein Rho', 'BLOSUM62 Rho'],
 ['1_vs_rest_sum5a', -0.4677245318296993, -0.3373569334669312]]

# AAV

In [3]:
def levenshtein_to_fitness_aav():
    """Run the Levenshtein / BLOSUM62 baselines on the AAV task.

    Returns the result table built by ``evaluate_tasks`` (header row plus
    one row per task).
    """
    # Entries that are currently disabled:
    #   "design": "design_task_regression.csv"
    #   "design_reversed": "design_task_reversed_regression.csv"
    #   "natural1": "natural_task_1_regression.csv"
    #   "natural2": "natural_task_2_regression.csv"
    tasks = {'test': 'sampled.csv'}

    return evaluate_tasks(
        refseq_fileloc="/home/Lxc/FLIP/tasks/avv/P03135.fasta",
        taskfolder="/home/Lxc/FLIP/tasks/avv/tasks",
        task_to_file_dict=tasks,
    )

levenshtein_to_fitness_aav()

seqs not aligned


[['Task', 'Levenshtein Rho', 'BLOSUM62 Rho'],
 ['test', -0.11424957109787072, None]]

# Cas

In [4]:
def levenshtein_to_fitness_cas():
    """Run the Levenshtein / BLOSUM62 baselines on the two Cas tasks.

    Returns the result table built by ``evaluate_tasks`` (header row plus
    one row for each of the "neg" and "pos" tasks).
    """
    tasks = {
        "neg": "pi_domain_log_negative_selection_regression.csv",
        "pos": "pi_domain_log_positive_selection_regression.csv",
    }

    return evaluate_tasks(
        refseq_fileloc="../tasks/cas/cas9_sequence.fasta",
        taskfolder="../tasks/cas/tasks/",
        task_to_file_dict=tasks,
    )

levenshtein_to_fitness_cas()

[['Task', 'Levenshtein Rho', 'BLOSUM62 Rho'],
 ['neg', -0.010226623065258516, None],
 ['pos', 0.13659949947831676, None]]

# GB1 (note: the cells below evaluate SpCas9 / RhlA files, not GB1 data)

In [17]:
def levenshtein_to_fitness_gb1():
    """Run the Levenshtein / BLOSUM62 baselines on four SpCas9 task files.

    Despite the ``gb1`` in the name, the reference FASTA and all task files
    configured here live under the SpCas9 task folder.

    Returns the result table built by ``evaluate_tasks`` (header row plus
    one row per task).
    """
    # Entries that are currently disabled:
    #   'test1': '0_vs_rest_sum.csv'
    #   'test2': '1_vs_rest_sum.csv'
    #   'test3': '2_vs_rest_sum.csv'
    #   'test4': '3_vs_rest_sum.csv'
    #   'test5': '0_vs_rest_sum5a.csv'
    #   'test6': '1_vs_rest_sum5a.csv'
    #   'test7': '2_vs_rest_sum5a.csv'
    #   'test8': '3_vs_rest_sum5a.csv'
    tasks = {
        "task_1": "SpCas9_mean-0vsrest-df1.csv",
        "task_2": "SpCas9_mean-1vsrest-df1.csv",
        "task_3": "SpCas9_mean-2vsrest-df1.csv",
        "task_4": "SpCas9_mean-3vsrest-df1.csv",
    }

    return evaluate_tasks(
        refseq_fileloc="/home/Lxc/FLIP/tasks/SpCas9/SpCas9.fasta",
        taskfolder="/home/Lxc/FLIP/tasks/SpCas9/tasks",
        task_to_file_dict=tasks,
    )

levenshtein_to_fitness_gb1()

0.78
0.82
0.57
0.58


[['Task', 'Levenshtein Rho', 'BLOSUM62 Rho'],
 ['task_1', 0.3047852087567866, 0.2701889946573201],
 ['task_2', 0.28229457346871134, 0.24700573501708337],
 ['task_3', 0.21744875789758747, 0.17983746402576165],
 ['task_4', 0.1312655379831337, 0.08840527913469598]]

In [22]:
def levenshtein_to_fitness_gb1():
    """Run the Levenshtein / BLOSUM62 baselines on a single task file.

    This re-defines ``levenshtein_to_fitness_gb1`` and therefore replaces the
    definition from the earlier cell once it is executed. The reference FASTA
    and the task folder are the RhlA ones, while the task file name starts
    with ``SpCas9``.
    NOTE(review): the RhlA / SpCas9 combination and the ``gb1`` name look
    inconsistent - confirm which dataset this cell is meant to evaluate.

    Returns the result table built by ``evaluate_tasks`` (header row plus
    one row for the task).
    """
    # Entries that are currently disabled:
    #   "task_1": "four_mutations_task_1.csv"
    #   "task_2": "four_mutations_task_2.csv"
    #   "task_3": "four_mutations_task_3.csv"
    #   "task_4": "four_mutations_task_4.csv"
    tasks = {'test1': 'SpCas9_mean-0vsrest-df1.csv'}

    return evaluate_tasks(
        refseq_fileloc="/home/Lxc/FLIP/tasks/RhlA/Rhla.fasta",
        taskfolder="/home/Lxc/FLIP/tasks/RhlA/tasks",
        task_to_file_dict=tasks,
    )

levenshtein_to_fitness_gb1()

0.83


[['Task', 'Levenshtein Rho', 'BLOSUM62 Rho'],
 ['test1', 0.30827452339378897, 0.2737801574576318]]