In [1]:
import pandas as pd 
import numpy as np

Here, we attempt to run the following protein models on CAMEO and CASP14 datasets to get results that can be processed into TM Scores:

* ESMFold (validate paper results )
* ProtT5 
* ProstT5
* Ankh
* Seqvec

We already have rules of :

* AlphaFold2 (single)
* AlphaFold2 (full)

Conda environments for running each of the models are saved


# CASP14 sequence data preprocessing

process the raw sequence data into a dataframe with columns:

tarid, sequence, types (found from the other dataset)

The file casp14_targetlist_seqs.txt was downloaded from https://predictioncenter.org/download_area/CASP14/sequences/

The file casp14_targetlist.txt was downloaded from https://predictioncenter.org/casp14/targetlist.cgi

First extract the sequences and tar_ids

In [None]:
filename = 'casp14_targetlist_seqs.txt'
tar_ids = []
seqs_list = []


with open(filename, 'r') as file:
    lines = file.readlines()
    for index, line in enumerate(lines):
       
        if index % 3 == 0: 
            tarid = line.split(">")[1]#get rid of t1024
            tarid = tarid.split(" ")[0]#get rid of t1024
            tar_ids.append(tarid)
        if (index-1) % 3 == 0:
            seqs_list.append(line)  #this is the seq line


create a dataset from the lists and combine

In [None]:
print(len(tar_ids), len(seqs_list))

In [None]:
casp14dataset = pd.DataFrame({"tarid":tar_ids, "sequence":seqs_list})
casp14dataset.head(n=190)

read in the more info dataset and merge columns

In [None]:
more_data = pd.read_csv("./casp14_targetlist.csv", sep =";")
more_data.head()

perform the inner join with the 

In [None]:
CASP_df = pd.merge(more_data, casp14dataset, how='inner', left_on='Target', right_on='tarid')
CASP_df.head()

save the dataset

In [None]:
CASP_df.to_csv("./CASP_dataset.csv")


# Reading in CASP sequence data as sample 

Note that esmfold has a sequence input limit of 1022

In [2]:
data = pd.read_csv("./CASP_dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Target,Type,Res,Oligo.State,Entry Date,Server Exp.,QA Exp.,Human Exp.,Cancellation Date,Description,tarid,sequence
0,0,T1024,All groups,408,A1,2020-05-18,2020-05-21,m1:2020-05-25 m2:2020-05-27,2020-06-08,-,LmrP 6t1z,T1024,MKEFWNLDKNLQLRLGIVFLGAFSYGTVFSSMTIYYNQYLGSAITG...
1,1,T1025,Server only,268,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,AtmM 6uv6,T1025,MTDISQMYDQLSDPFAGLGAGNIHLGYFDGPDDAATLAEAADRLTD...
2,2,T1026,All groups,172,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,FBNSV 6s44,T1026,MVSNWNWSGKKGRRTPRRGYTRPFKSAVPTTRVVVHQSAVLKKDDV...
3,3,T1027,All groups,168,A1,2020-05-20,2020-05-23,m1:2020-05-27 m2:2020-05-29,2020-06-10,-,GLuc 7d2o,T1027,KPTENNEDFNIVAVASNFATTDLDADRGKLPGKKLPLEVLKEMEAN...
4,4,T1028,Server only,316,A1,2020-05-21,2020-05-24,m1:2020-05-28 m2:2020-05-30,2020-06-11,-,CalU17 6vqp,T1028,MARIGDLDAARPAPEAVPGDMVRIPGGTFLQGSPERTLDWLDREGQ...


Get the largest sequence length

In [3]:
lengths = []
for seq in data["sequence"]:
    lengths.append(len(seq))

In [4]:
print(max(lengths))

2181


## attempt to get real pdbs for all them to evaluate generated ones against

These are the experimental pdbs


In [5]:
import requests

# writing a function to extract pdbs
def pdbDownload(pdb_id = "6vr4", path = "."):
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    r = requests.get(url)
    with open(f"{path}/{pdb_id}.pdb", "wb") as f:
        f.write(r.content)

In [6]:
pdbDownload("6vr4", "experimental_pdbs")

We can find pdbs in the description of the dataset

In [7]:
main = pd.read_csv("CASP_dataset.csv")
main.head(n=2)

Unnamed: 0.1,Unnamed: 0,Target,Type,Res,Oligo.State,Entry Date,Server Exp.,QA Exp.,Human Exp.,Cancellation Date,Description,tarid,sequence
0,0,T1024,All groups,408,A1,2020-05-18,2020-05-21,m1:2020-05-25 m2:2020-05-27,2020-06-08,-,LmrP 6t1z,T1024,MKEFWNLDKNLQLRLGIVFLGAFSYGTVFSSMTIYYNQYLGSAITG...
1,1,T1025,Server only,268,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,AtmM 6uv6,T1025,MTDISQMYDQLSDPFAGLGAGNIHLGYFDGPDDAATLAEAADRLTD...


Attempt to extract pdbs, count number of failures, get list of failed targets

In [17]:
failed_targets_codes = []


for index,row in main.iterrows():
    target = row["Target"]
    string = row["Description"]
    pdb_id = string.split("<")[0]
    pdb_id = string.split()
    pdb_id = pdb_id[len(pdb_id)-1]
    try: 
        pdbDownload(pdb_id, "experimental_pdbs")
    except: 
        failed_targets_codes.append(target)
        print("failed for target id", target)

failed for target id T1048
failed for target id T1051
failed for target id T1059
failed for target id T1062
failed for target id T1063
failed for target id T1066s1
failed for target id T1066s2
failed for target id T1069s1
failed for target id T1069s2
failed for target id T1071
failed for target id T1072s1
failed for target id T1072s2
failed for target id T1075
failed for target id T1077
failed for target id T1088
failed for target id T1098


In [18]:
print(len(failed_targets_codes))

16


# 45 proteins could not be extracted this way?


For now, run results for all models on these proteins, find the file names and the corresponding tar ids where it worked