In [1]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:

def read_table(path):
    df = pd.DataFrame(columns=['model', 'lDDToligo'])
    with open(path, "r") as f:
        lines = f.read().splitlines()
    for line in lines:
        cols = line.split()
        # if line[0] is a string digit
        if not (line[0]).isdigit():
            continue
        if len(cols) < 10:
            continue
        model_name = cols[1]
        try:
            lDDToligo = float(cols[16])
        except ValueError:
            continue
        df = df.append({'model': model_name, 'lDDToligo': lDDToligo}, ignore_index=True)
    return df



- Read the true and predict score tables and combine them into one table.
- The output table are saved in 'casp15_qa_comparison'

In [3]:
true_dir = "casp15_true_results"
pred_dir = "casp15_qa_results"
save_dir = 'casp15_qa_comparison'
no_qa_results = []
for table in os.listdir(true_dir):
    if not table.endswith(".txt"):
        continue
    target_name = table.split(".")[0]
    true_df = read_table(os.path.join(true_dir, table))
    pred_table_path = os.path.join(pred_dir, target_name,f"qa.csv")
    if not os.path.exists(pred_table_path):
        no_qa_results.append(pred_table_path)
        continue
    pred_table = pd.read_csv(pred_table_path)
    if pred_table.shape[0] < 100:
        # remove targets that have < 100 predicted models
        continue
    # rename column names
    pred_table.rename(columns={"model_name": "model","lddt":"lDDToligo"}, inplace=True)
    # remove a string "_merged" from the column "model"
    pred_table["model"] = pred_table["model"].str.replace("_merged","")

    shared_df = true_df.merge(pred_table, on='model', how='inner', suffixes=('_True', '_EnQA-MSA'))
    # plot scatter plot for column "lDDToligo_True" and "lDDToligo_EnQA-MSA" and regression line using sns
    sns.scatterplot(x="lDDToligo_True", y="lDDToligo_EnQA-MSA",  data=shared_df)
    sns.regplot(x="lDDToligo_True", y="lDDToligo_EnQA-MSA", data=shared_df)
    plt.savefig(os.path.join(save_dir, f"{target_name}.png"))
    plt.clf()
    
    shared_df.to_csv(os.path.join(save_dir, f"{target_name}.csv"), index=False)


<Figure size 640x480 with 0 Axes>

In [4]:
# the following targets are not in the EnQA-MSA results because they are too long to exceed the memory limit.
no_qa_results

['casp15_qa_results/H1157/qa.csv',
 'casp15_qa_results/H1137/qa.csv',
 'casp15_qa_results/H1168/qa.csv',
 'casp15_qa_results/H1111/qa.csv',
 'casp15_qa_results/H1168v1/qa.csv',
 'casp15_qa_results/H1114/qa.csv',
 'casp15_qa_results/H1114v2/qa.csv',
 'casp15_qa_results/H1185/qa.csv',
 'casp15_qa_results/H1171/qa.csv',
 'casp15_qa_results/T1115o/qa.csv',
 'casp15_qa_results/H1134/qa.csv']

In [5]:
def calc_correlation(df):
    # select the two columns we want to calculate correlation for
    selected_columns = df[["lDDToligo_True", "lDDToligo_EnQA-MSA"]]
    # calculate the correlation matrix
    corr_matrix = selected_columns.corr()
    # return the correlation value for the two columns
    return corr_matrix.loc["lDDToligo_True", "lDDToligo_EnQA-MSA"]  

def calc_loss(df):
    # sort the dataframe by predicted quality score in descending order
    first_ranked_model_true_score = df.sort_values("lDDToligo_EnQA-MSA", ascending=False).iloc[0]["lDDToligo_True"]
    top_model_true_score = df.sort_values("lDDToligo_True", ascending=False).iloc[0]["lDDToligo_True"]
    # calculate the loss
    loss = top_model_true_score - first_ranked_model_true_score
    return loss

In [6]:
stats = pd.DataFrame(columns=["target", "correlation", "loss"])
for table_path in glob.glob(save_dir+"/*.csv"):
    table_name = table_path.split("/")[-1].split(".")[0]
    table = pd.read_csv(table_path)
    correlation = calc_correlation(table)
    loss = calc_loss(table)
    stats = stats.append({"target": table_name, "correlation": correlation, "loss": loss}, ignore_index=True)
print(stats)
stats.to_csv("casp15_multimer_EnQA-MSA.csv", index=False,float_format='%.3f')
    

     target  correlation   loss
0    T1132o     0.643353  0.004
1    T1124o     0.410851  0.002
2     H1141     0.416283  0.095
3     H1144     0.263351  0.009
4    T1109o     0.344015  0.062
5    T1127o     0.476944  0.039
6     H1140     0.205527  0.000
7     H1167     0.301865  0.056
8     H1106    -0.139552  0.496
9    T1110o     0.474538  0.014
10   T1113o     0.654506  0.072
11    H1151    -0.067309  0.058
12  H1166v1     0.259850  0.036
13  H1167v1     0.301737  0.056
14   T1121o     0.365142  0.019
15    H1166     0.259852  0.036
16    H1142     0.185464  0.437
17    H1143     0.549620  0.010
18   T1123o     0.148827  0.194
