In [7]:
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from log_roc import calculate_roc_from_energies
from sklearn.metrics import auc

def log_auc(x, y):
    """Return the area under the curve of ``y`` against the natural log of ``x``.

    ``x`` is transformed with ``np.log`` and passed, together with ``y``, to
    ``auc`` (imported from ``sklearn.metrics`` at the top of this cell).
    """
    return auc(np.log(x), y)


# Glob pattern for the "*_combind_*" score CSV files under rank_csv/.
protein_path_pattern = "rank_csv/*_combind_*.csv"
# glob.glob returns matches in arbitrary, filesystem-dependent order; sort them
# so the files are processed in the same order on every run and machine.
protein_path_list = sorted(glob.glob(protein_path_pattern))

def calculate_auc(merged_scores):
    """Run ``calculate_roc_from_energies`` on a merged score table.

    ``merged_scores`` must provide a ``"score"`` column and a
    ``"standard_value"`` column. Rows containing any NaN are dropped, the rest
    are ordered by descending score, and a row is labelled 1 when its
    ``standard_value`` is greater than 0 and 0 otherwise.

    Returns whatever ``calculate_roc_from_energies`` returns for those labels
    and scores when called with ``lower_is_better=False``.
    """
    # NOTE(review): dropna() without a subset discards a row if *any* column is
    # NaN, not only "score"/"standard_value" -- confirm that is intended.
    ranked = merged_scores.dropna().sort_values(by="score", ascending=False)
    # Boolean mask -> 0/1 integer labels.
    labels = (ranked["standard_value"] > 0).astype(int).to_numpy()
    predictions = ranked["score"].to_numpy()
    return calculate_roc_from_energies(labels, predictions, lower_is_better=False)
# Maps protein name -> {"combind_log_auc": ..., "glide_log_auc": ...}.
protein_log_auc = {}
for protein_path in protein_path_list:
    # The protein name is the part of the file name before the first "_".
    # os.path.basename is used rather than splitting on "/" so that paths
    # returned with the platform's own separator (e.g. "\\" on Windows) still
    # yield the file name instead of the whole path.
    protein_name = os.path.basename(protein_path).split("_")[0]

    # The glide file has the same path with "_combind_" swapped for "_glide_".
    combind_scores = pd.read_csv(protein_path)
    glide_scores = pd.read_csv(protein_path.replace("_combind_", "_glide_"))
    # Name column 0 "ID" and column 1 "score" (both files must have exactly
    # two columns, otherwise this assignment raises).
    combind_scores.columns = ["ID", "score"]
    glide_scores.columns = ["ID", "score"]
    # Glide energy terms are negative; flip the sign so that a larger score
    # ranks first, which is what calculate_auc assumes (descending sort,
    # lower_is_better=False).
    glide_scores["score"] = -glide_scores["score"]

    # Reference values live in ../../benchmark_set_split/{protein_name}/library.csv
    library_path = f"../../benchmark_set_split/{protein_name}/library.csv"
    library_df = pd.read_csv(library_path)
    # Replace a missing standard_value with -1; calculate_auc labels a row as
    # positive only when standard_value > 0, so these rows become negatives.
    library_df["standard_value"] = library_df["standard_value"].fillna(-1)

    # Attach the library columns to each score table (left join on ID, so every
    # scored row is kept even if it has no match in the library).
    combind_scores = pd.merge(combind_scores, library_df, on="ID", how="left")
    glide_scores = pd.merge(glide_scores, library_df, on="ID", how="left")

    # calculate_auc's result is indexed by key; only "log_auc" is kept.
    combind_log_auc = calculate_auc(combind_scores)["log_auc"]
    glide_log_auc = calculate_auc(glide_scores)["log_auc"]

    protein_log_auc[protein_name] = {
        "combind_log_auc": combind_log_auc,
        "glide_log_auc": glide_log_auc,
    }
    


In [8]:
import pandas as pd
# Turn the nested results dict into a table. With a dict of dicts, the outer
# keys become columns and the inner keys become the row index.
protein_log_auc = pd.DataFrame(data=protein_log_auc)
# Write the table, including its index, to a CSV in the working directory.
protein_log_auc.to_csv(path_or_buf="protein_log_auc_fragment_stats_prev.csv")
