In [1]:
import pandas as pd
from scipy.stats import pearsonr
import math


In [2]:


def read_and_preprocess(file_path, true_scores_path="curated_dataset/conservation_scores.csv"):
    """Load predicted and reference conservation scores and give both a join key.

    Parameters
    ----------
    file_path : str or path-like
        CSV of predictions. Must contain a ``header`` column from which the
        sequence identifier is extracted.
    true_scores_path : str or path-like, optional
        CSV of reference scores with one row per score and the columns
        ``sequence id`` and ``conservation score``. Defaults to the curated
        dataset location this notebook has always used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``predicted_df`` is the prediction table plus a ``sequence_id`` column.
        ``true_df`` has one row per ``sequence id`` whose ``conservation score``
        is the space-joined string of that sequence's scores (in file order),
        plus the same ``sequence_id`` column.
    """
    predicted_df = pd.read_csv(file_path)
    # Kept as-is: the notebook's cell output shows this dump of the raw table.
    print("predicted_df", predicted_df)

    true_rows = pd.read_csv(true_scores_path)

    # Scores are stringified so they can be joined per sequence; a missing score
    # becomes the token 'nan', which parses back to float NaN downstream.
    true_rows = true_rows.assign(
        **{'conservation score': true_rows['conservation score'].astype(str)}
    )
    true_df = (
        true_rows.groupby('sequence id')['conservation score']
        .agg(' '.join)
        .reset_index()
    )

    # Common part of the sequence identifiers, e.g. "A0A1Y2ANA8.1/421-668".
    # Rows whose text does not match the pattern get NaN as their key.
    id_pattern = r'([A-Za-z0-9]+\.[0-9]+/[0-9]+-[0-9]+)'
    predicted_df['sequence_id'] = predicted_df['header'].str.extract(id_pattern, expand=False)
    true_df['sequence_id'] = true_df['sequence id'].str.extract(id_pattern, expand=False)

    return predicted_df, true_df

def _parse_scores(raw):
    """Turn a whitespace-separated score string into a list of floats."""
    # str() lets a missing cell (float NaN) parse to [nan], so the row is
    # dropped by the finiteness filter instead of crashing on .split().
    return [float(token) for token in str(raw).split()]


def _all_finite(values):
    """Return True when no value in the list is NaN or +/-infinity."""
    return all(math.isfinite(value) for value in values)


def calculate_pearson(predicted_df, true_df):
    """Pearson correlation between predicted and reference per-position scores.

    Parameters
    ----------
    predicted_df : pandas.DataFrame
        Needs a ``sequence_id`` column and a ``conservation`` column holding
        whitespace-separated scores.
    true_df : pandas.DataFrame
        Needs a ``sequence_id`` column and a ``conservation score`` column
        holding whitespace-separated scores.

    Returns
    -------
    float
        Pearson correlation coefficient computed over the scores of all
        sequences present in both frames, concatenated sequence by sequence.

    Raises
    ------
    ValueError
        If the frames share no ``sequence_id``, if a shared sequence has a
        different number of predicted and reference scores, or (from
        ``pearsonr``) if fewer than two paired values remain.
    """
    # pandas' merge pairs NaN keys with each other, so rows whose identifier
    # could not be extracted would otherwise be joined to unrelated rows.
    predicted_keyed = predicted_df.dropna(subset=['sequence_id'])
    true_keyed = true_df.dropna(subset=['sequence_id'])

    # Merge the two frames on the 'sequence_id' column.
    combined_df = pd.merge(predicted_keyed, true_keyed, how='inner', on='sequence_id')
    if combined_df.empty:
        raise ValueError("no sequence_id is shared between predicted_df and true_df")

    # Convert 'conservation' and 'conservation score' values to lists of floats.
    predicted_scores = combined_df['conservation'].apply(_parse_scores)
    true_scores = combined_df['conservation score'].apply(_parse_scores)

    # Drop sequences containing infinite values or NaNs on either side.
    finite_rows = (
        predicted_scores.apply(_all_finite) & true_scores.apply(_all_finite)
    ).astype(bool)
    predicted_scores = predicted_scores[finite_rows]
    true_scores = true_scores[finite_rows]
    sequence_ids = combined_df.loc[finite_rows, 'sequence_id']

    # The correlation pairs values by position after flattening, so every
    # sequence must contribute the same number of values on both sides;
    # otherwise all later positions would be silently shifted.
    mismatched = [
        seq_id
        for seq_id, predicted, true in zip(sequence_ids, predicted_scores, true_scores)
        if len(predicted) != len(true)
    ]
    if mismatched:
        raise ValueError(
            "{} sequence(s) have a different number of predicted and true scores, e.g. {}".format(
                len(mismatched), mismatched[:5]
            )
        )

    # Flatten to plain float lists so pearsonr receives numeric input rather
    # than the object-dtype Series that Series.explode() produces.
    predicted_flat = [value for row in predicted_scores for value in row]
    true_flat = [value for row in true_scores for value in row]

    pearson_coefficient, _ = pearsonr(predicted_flat, true_flat)
    return pearson_coefficient

In [4]:
# Load the predictions stored in this CSV together with the reference scores,
# then report the Pearson correlation between the two.
file_name = "output_results_curated_8M.csv"
predicted_df, true_df = read_and_preprocess(file_name)
pearson_coefficient = calculate_pearson(predicted_df, true_df)
print(pearson_coefficient)

predicted_df                                                  header  \
0      A0A1Y2ANA8.1/421-668 Pfam=PF00003.25 type=domain   
1           V3ZBP4.1/12-278 Pfam=PF00003.25 type=domain   
2      A0A226EIY0.1/621-830 Pfam=PF00003.25 type=domain   
3      A0A6J1SJU2.1/386-517 Pfam=PF00004.32 type=domain   
4          Q54ST1.1/607-740 Pfam=PF00004.32 type=domain   
...                                                 ...   
35866      A0A5B9Q753.1/5-47 Pfam=PF20607.1 type=family   
35867      A0A3M1R399.1/5-47 Pfam=PF20607.1 type=family   
35868     A0A7K3NCN5.1/12-97 Pfam=PF20619.1 type=family   
35869      A0A1M6TTR7.1/1-91 Pfam=PF20619.1 type=family   
35870     A0A5J5ICB5.1/11-99 Pfam=PF20619.1 type=family   

                                                sequence  \
0      KWTDIPAIIIGGISVVGIIITLVVFFITVKNRNALVIRRSSPLFLY...   
1      AVMSWIIGCVIALVSLVYLVLNIKLRNTRLIKMSSPNLNCLVASGG...   
2      YLNPLAMAAMIFSGFGILLALFVLKVFWRYNDTPIIKAAGRELSYV...   
3      VLLFGPSGTGKTMLAHALAQDSAASVHTLIG