In [10]:
import glob
import pandas as pd
import numpy as np

In [11]:

def _read_bfactor(line: str) -> float:
    """Return the B-factor field (pLDDT in AlphaFold models) of one ATOM record."""
    # PDB is a fixed-width format: tempFactor occupies columns 61-66. Slicing
    # stays correct when adjacent fields touch (e.g. "1.00100.00"), where a
    # whitespace split would fuse occupancy and B-factor into a single token.
    try:
        return float(line[60:66])
    except ValueError:
        # Short / non-standard record: fall back to the previous heuristic of
        # taking the second-to-last whitespace-separated token.
        return float(line.split()[-2])


def get_confidences(pdb_files: list) -> list:
    """
    Read AlphaFold PDB files and summarise the per-atom confidence of each one.

    The confidence is read from the B-factor field of every line starting with
    "ATOM"; the statistics are therefore computed over atoms, not residues.

    @param pdb_files: list of paths to AlphaFold PDB files (e.g. from glob)
    @return: confidence_info list with one dictionary per file, holding the
             keys "mean" and "std" (of the per-atom scores) and "filename".
             Files without any ATOM record get NaN for "mean" and "std".
    """
    confidence_info = []
    total = len(pdb_files)
    for counter, pdb_file in enumerate(pdb_files, start=1):
        print(f"At {counter}/{total}")
        # The context manager closes the file even if parsing raises.
        with open(pdb_file) as handle:
            confidence_scores = [
                _read_bfactor(line) for line in handle if line.startswith("ATOM")
            ]
        if confidence_scores:
            mean_score = np.mean(confidence_scores)
            std_score = np.std(confidence_scores)
        else:
            # Same NaN result numpy would give for an empty list, without the
            # "mean of empty slice" RuntimeWarning.
            mean_score = std_score = np.nan
        confidence_info.append(
            {"mean": mean_score, "std": std_score, "filename": pdb_file}
        )
    return confidence_info
                                          
                                      
def filter_confidences(pdb_files: list, min_confidence: float = 85.0) -> pd.DataFrame:
    """
    Keep only the AlphaFold structures whose mean confidence exceeds a cutoff.

    @param pdb_files: glob list of AlphaFold PDBs
    @param min_confidence: cutoff on the per-file mean confidence; a file is
                           kept only when its mean is strictly greater than
                           this value (default 85.0)
    @return good_confidence_df: dataframe with columns "mean", "std" and
                                "filename" for the PDBs above the cutoff
    """
    columns = ["mean", "std", "filename"]
    confidence_list = get_confidences(pdb_files)
    if not confidence_list:
        # An empty input (e.g. a glob pattern that matched nothing) would
        # otherwise produce a column-less frame and a KeyError on "mean".
        return pd.DataFrame(columns=columns)
    confidence_dataframe = pd.DataFrame(confidence_list, columns=columns)
    # NaN means (files without ATOM records) compare False and are dropped.
    confidence_filter = confidence_dataframe["mean"] > min_confidence
    good_confidence_df = confidence_dataframe[confidence_filter]
    return good_confidence_df



## KIBA dataset

In [5]:
# Sort the matches: glob returns paths in arbitrary order, and the row order of
# the exported CSV follows the order of this list.
pdbs = sorted(glob.glob("/data/kiba/alphafold2/*.pdb"))

In [7]:
# Compute per-file confidence statistics and keep the files passing the filter.
# Prints one "At i/N" progress line per file.
# NOTE(review): the function-definition cell is recorded as In [11], i.e. it was
# executed after this cell (In [7]); re-run from a fresh kernel to make sure the
# saved outputs match the current definitions.
confidence_df = filter_confidences(pdbs)

At 1/228
At 2/228
At 3/228
At 4/228
At 5/228
At 6/228
At 7/228
At 8/228
At 9/228
At 10/228
At 11/228
At 12/228
At 13/228
At 14/228
At 15/228
At 16/228
At 17/228
At 18/228
At 19/228
At 20/228
At 21/228
At 22/228
At 23/228
At 24/228
At 25/228
At 26/228
At 27/228
At 28/228
At 29/228
At 30/228
At 31/228
At 32/228
At 33/228
At 34/228
At 35/228
At 36/228
At 37/228
At 38/228
At 39/228
At 40/228
At 41/228
At 42/228
At 43/228
At 44/228
At 45/228
At 46/228
At 47/228
At 48/228
At 49/228
At 50/228
At 51/228
At 52/228
At 53/228
At 54/228
At 55/228
At 56/228
At 57/228
At 58/228
At 59/228
At 60/228
At 61/228
At 62/228
At 63/228
At 64/228
At 65/228
At 66/228
At 67/228
At 68/228
At 69/228
At 70/228
At 71/228
At 72/228
At 73/228
At 74/228
At 75/228
At 76/228
At 77/228
At 78/228
At 79/228
At 80/228
At 81/228
At 82/228
At 83/228
At 84/228
At 85/228
At 86/228
At 87/228
At 88/228
At 89/228
At 90/228
At 91/228
At 92/228
At 93/228
At 94/228
At 95/228
At 96/228
At 97/228
At 98/228
At 99/228
At 100/228
At 101/2

In [10]:
# Export the filtered table to CSV.
# NOTE(review): the leading "/" makes this an absolute path at the filesystem
# root, whereas the Davis export uses a relative path — confirm this is intended.
confidence_df.to_csv("/confidences_kiba.csv")

In [8]:
# Average of the per-file mean confidence over the rows kept by the filter.
# NOTE(review): the recorded output (78.85) is below the 85.0 cutoff applied in
# filter_confidences, so it cannot come from the current filter — re-run.
confidence_df["mean"].mean()

78.85069976751063

In [9]:
# Average of the per-file confidence standard deviation over the kept rows.
confidence_df["std"].mean()

21.520113334127863

## Davis dataset

In [3]:
# Sort the matches: glob returns paths in arbitrary order, and the row order of
# the exported CSV follows the order of this list.
pdbs = sorted(glob.glob("/data/davis/alphafold2_structures/*.pdb"))

In [12]:
# Compute per-file confidence statistics and keep the files passing the filter.
# Prints one "At i/N" progress line per file.
# NOTE(review): this rebinds confidence_df, which the KIBA section also uses;
# cells run out of order will silently read the other dataset's table.
confidence_df = filter_confidences(pdbs)


At 1/332
At 2/332
At 3/332
At 4/332
At 5/332
At 6/332
At 7/332
At 8/332
At 9/332
At 10/332
At 11/332
At 12/332
At 13/332
At 14/332
At 15/332
At 16/332
At 17/332
At 18/332
At 19/332
At 20/332
At 21/332
At 22/332
At 23/332
At 24/332
At 25/332
At 26/332
At 27/332
At 28/332
At 29/332
At 30/332
At 31/332
At 32/332
At 33/332
At 34/332
At 35/332
At 36/332
At 37/332
At 38/332
At 39/332
At 40/332
At 41/332
At 42/332
At 43/332
At 44/332
At 45/332
At 46/332
At 47/332
At 48/332
At 49/332
At 50/332
At 51/332
At 52/332
At 53/332
At 54/332
At 55/332
At 56/332
At 57/332
At 58/332
At 59/332
At 60/332
At 61/332
At 62/332
At 63/332
At 64/332
At 65/332
At 66/332
At 67/332
At 68/332
At 69/332
At 70/332
At 71/332
At 72/332
At 73/332
At 74/332
At 75/332
At 76/332
At 77/332
At 78/332
At 79/332
At 80/332
At 81/332
At 82/332
At 83/332
At 84/332
At 85/332
At 86/332
At 87/332
At 88/332
At 89/332
At 90/332
At 91/332
At 92/332
At 93/332
At 94/332
At 95/332
At 96/332
At 97/332
At 98/332
At 99/332
At 100/332
At 101/3

In [13]:
# Average of the per-file mean confidence over the rows kept by the filter.
confidence_df["mean"].mean()

88.8159472339142

In [7]:
# Average of the per-file confidence standard deviation over the kept rows.
# NOTE(review): this cell's recorded execution count (In [7]) predates the
# Davis filter cell (In [12]), so the saved output may reflect an earlier
# confidence_df — re-run to confirm.
confidence_df["std"].mean()

22.348975797093374

In [16]:
# Export the filtered table to CSV, relative to the current working directory.
# The DataFrame index is written as the first column (to_csv default).
confidence_df.to_csv("confidences_davis.csv")