In [2]:
import pandas as pd 
import numpy as np

Here, we attempt to run the following protein models on CAMEO and CASP14 datasets to get results that can be processed into TM Scores:

* ESMFold (validate paper results )
* ProtT5 
* ProstT5
* Ankh
* Seqvec

We already have rules of :

* AlphaFold2 (single)
* AlphaFold2 (full)

Conda environments for running each of the models are saved


# CASP14 sequence data preprocessing

process the raw sequence data into a dataframe with columns:

tarid, sequence, types (found from the other dataset)

The file casp14_targetlist_seqs.txt was downloaded from https://predictioncenter.org/download_area/CASP14/sequences/

The file casp14_targetlist.txt was downloaded from https://predictioncenter.org/casp14/targetlist.cgi

First extract the sequences and tar_ids

In [20]:
filename = 'casp14_targetlist_seqs.txt'
tar_ids = []
seqs_list = []


with open(filename, 'r') as file:
    lines = file.readlines()
    for index, line in enumerate(lines):
       
        if index % 3 == 0: 
            tarid = line.split(">")[1]#get rid of t1024
            tarid = tarid.split(" ")[0]#get rid of t1024
            tar_ids.append(tarid)
        if (index-1) % 3 == 0:
            seqs_list.append(line)  #this is the seq line


create a dataset from the lists and combine

In [21]:
print(len(tar_ids), len(seqs_list))

85 85


In [26]:
casp14dataset = pd.DataFrame({"tarid":tar_ids, "sequence":seqs_list})
casp14dataset.head(n=190)

Unnamed: 0,tarid,sequence
0,T1024,MKEFWNLDKNLQLRLGIVFLGAFSYGTVFSSMTIYYNQYLGSAITG...
1,T1025,MTDISQMYDQLSDPFAGLGAGNIHLGYFDGPDDAATLAEAADRLTD...
2,T1026,MVSNWNWSGKKGRRTPRRGYTRPFKSAVPTTRVVVHQSAVLKKDDV...
3,T1027,KPTENNEDFNIVAVASNFATTDLDADRGKLPGKKLPLEVLKEMEAN...
4,T1028,MARIGDLDAARPAPEAVPGDMVRIPGGTFLQGSPERTLDWLDREGQ...
...,...,...
80,T1096,MDILENYVSFDEQARDINIAFDKLFGRDDISHMNNFSINKRSYYNC...
81,T1098,MARGSGAGGGGGGGGGGLELSVGVGGGGGARGGGGGEAAAAVETAA...
82,T1099,MDINASRALANVYDLPDDFFPKIDDLVRDAKDALEPYWKSDSIKKH...
83,T1100,MKLTPQIVLIVVVASLVPLSVLGYLTIAGMTSSAEEAKQGVTTVSQ...


read in the more info dataset and merge columns

In [29]:
more_data = pd.read_csv("./casp14_targetlist.csv", sep =";")
more_data.head()

Unnamed: 0,Target,Type,Res,Oligo.State,Entry Date,Server Exp.,QA Exp.,Human Exp.,Cancellation Date,Description
0,T1024,All groups,408,A1,2020-05-18,2020-05-21,m1:2020-05-25 m2:2020-05-27,2020-06-08,-,LmrP 6t1z
1,T1025,Server only,268,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,AtmM 6uv6
2,T1026,All groups,172,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,FBNSV 6s44
3,T1027,All groups,168,A1,2020-05-20,2020-05-23,m1:2020-05-27 m2:2020-05-29,2020-06-10,-,GLuc 7d2o
4,T1028,Server only,316,A1,2020-05-21,2020-05-24,m1:2020-05-28 m2:2020-05-30,2020-06-11,-,CalU17 6vqp


perform the inner join with the 

In [31]:
CASP_df = pd.merge(more_data, casp14dataset, how='inner', left_on='Target', right_on='tarid')
CASP_df.head()

Unnamed: 0,Target,Type,Res,Oligo.State,Entry Date,Server Exp.,QA Exp.,Human Exp.,Cancellation Date,Description,tarid,sequence
0,T1024,All groups,408,A1,2020-05-18,2020-05-21,m1:2020-05-25 m2:2020-05-27,2020-06-08,-,LmrP 6t1z,T1024,MKEFWNLDKNLQLRLGIVFLGAFSYGTVFSSMTIYYNQYLGSAITG...
1,T1025,Server only,268,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,AtmM 6uv6,T1025,MTDISQMYDQLSDPFAGLGAGNIHLGYFDGPDDAATLAEAADRLTD...
2,T1026,All groups,172,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,FBNSV 6s44,T1026,MVSNWNWSGKKGRRTPRRGYTRPFKSAVPTTRVVVHQSAVLKKDDV...
3,T1027,All groups,168,A1,2020-05-20,2020-05-23,m1:2020-05-27 m2:2020-05-29,2020-06-10,-,GLuc 7d2o,T1027,KPTENNEDFNIVAVASNFATTDLDADRGKLPGKKLPLEVLKEMEAN...
4,T1028,Server only,316,A1,2020-05-21,2020-05-24,m1:2020-05-28 m2:2020-05-30,2020-06-11,-,CalU17 6vqp,T1028,MARIGDLDAARPAPEAVPGDMVRIPGGTFLQGSPERTLDWLDREGQ...


save the dataset

In [35]:
CASP_df.to_csv("./CASP_dataset.csv")


# Reading in CASP sequence data as sample 

Note that esmfold has a sequence input limit of 1022

In [2]:
data = pd.read_csv("./CASP_dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Target,Type,Res,Oligo.State,Entry Date,Server Exp.,QA Exp.,Human Exp.,Cancellation Date,Description,tarid,sequence
0,0,T1024,All groups,408,A1,2020-05-18,2020-05-21,m1:2020-05-25 m2:2020-05-27,2020-06-08,-,LmrP 6t1z,T1024,MKEFWNLDKNLQLRLGIVFLGAFSYGTVFSSMTIYYNQYLGSAITG...
1,1,T1025,Server only,268,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,AtmM 6uv6,T1025,MTDISQMYDQLSDPFAGLGAGNIHLGYFDGPDDAATLAEAADRLTD...
2,2,T1026,All groups,172,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,FBNSV 6s44,T1026,MVSNWNWSGKKGRRTPRRGYTRPFKSAVPTTRVVVHQSAVLKKDDV...
3,3,T1027,All groups,168,A1,2020-05-20,2020-05-23,m1:2020-05-27 m2:2020-05-29,2020-06-10,-,GLuc 7d2o,T1027,KPTENNEDFNIVAVASNFATTDLDADRGKLPGKKLPLEVLKEMEAN...
4,4,T1028,Server only,316,A1,2020-05-21,2020-05-24,m1:2020-05-28 m2:2020-05-30,2020-06-11,-,CalU17 6vqp,T1028,MARIGDLDAARPAPEAVPGDMVRIPGGTFLQGSPERTLDWLDREGQ...


Get the largest sequence length

In [3]:
lengths = []
for seq in data["sequence"]:
    lengths.append(len(seq))

In [None]:
print(max(lengths))

2181


: 

## attempt to get real pdbs for all them to evaluate generated ones against

These are the experimental pdbs


In [3]:
import requests

# writing a function to extract pdbs
def pdbDownload(pdb_id = "6vr4", path = "."):
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    r = requests.get(url)
    with open(f"{path}/{pdb_id}.pdb", "wb") as f:
        f.write(r.content)

In [4]:
pdbDownload("6vr4", "experimental_pdbs")

We can find pdbs in the description of the dataset

In [5]:
main = pd.read_csv("CASP_dataset.csv")
main.head(n=2)

Unnamed: 0.1,Unnamed: 0,Target,Type,Res,Oligo.State,Entry Date,Server Exp.,QA Exp.,Human Exp.,Cancellation Date,Description,tarid,sequence
0,0,T1024,All groups,408,A1,2020-05-18,2020-05-21,m1:2020-05-25 m2:2020-05-27,2020-06-08,-,LmrP 6t1z,T1024,MKEFWNLDKNLQLRLGIVFLGAFSYGTVFSSMTIYYNQYLGSAITG...
1,1,T1025,Server only,268,A1,2020-05-19,2020-05-22,m1:2020-05-26 m2:2020-05-28,2020-06-09,-,AtmM 6uv6,T1025,MTDISQMYDQLSDPFAGLGAGNIHLGYFDGPDDAATLAEAADRLTD...


Get list of pdb codes and check length. Turns out some targets have no valid pdbs, so we need to find a list that do

In [6]:
pdb_codes = []
for string in main["Description"]:
    pdb_id = string.split("<")[0]
    pdb_id = string.split()
    pdb_id = pdb_id[len(pdb_id)-1]
    print(pdb_id)

6t1z
6uv6
6s44
7d2o
6vqp
6uf2
6poo
6vr4
6n64
6vr4
6tmm
6vr4
6vn1
6vr4
6ya2
6vr4
6vr4
6vr4
6vr4
6vr4
6xod
6xod
6px4
6px4
7bgl
7bgl
structure.</font></em>
6y4f
A7LXT1
structure.</font></em>
Q858F5.1
7m7a
6v4v
6zyc
6yj1
7m6b
7abw
structure.</font></em>
7qg9
7qg9
7zhj
structure.</font></em>
structure.</font></em>
7jtl
7m5f
7m5f
structure.</font></em>
structure.</font></em>
Bd3741
LmazxJ3-JDBD
structure.</font></em>
structure.</font></em>
6vr4
7rej
structure.</font></em>
structure.</font></em>
available.</font></em>
Bd2625
7oc9
structure.</font></em>
6xn8
N1077.</font></em>
7cwp
Bd0655
8onb
7cn6
Nitro
Meio
Bd2374_c
Bd2374_c
Bd2374_n
Tuna
N1088.</font></em>
7mhu
7k7w
7w6b
7um1
7um1
7um1
7um1
7um1
structure.</font></em>
6ygh
AAB89755.1
ASCC1


Attempt to extract pdbs, count number of failures

In [7]:
pdb_codes = []
for string in main["Description"]:
    pdb_id = string.split("<")[0]
    pdb_id = string.split()
    pdb_id = pdb_id[len(pdb_id)-1]
    try: 
        pdbDownload(pdb_id, "experimental_pdbs")
    except: 
        print("failed for pdb id", pdb_id)

failed for pdb id structure.</font></em>
failed for pdb id structure.</font></em>
failed for pdb id structure.</font></em>
failed for pdb id structure.</font></em>
failed for pdb id structure.</font></em>
failed for pdb id 7m5f
failed for pdb id structure.</font></em>
failed for pdb id structure.</font></em>
failed for pdb id Bd3741
failed for pdb id LmazxJ3-JDBD
failed for pdb id structure.</font></em>
failed for pdb id structure.</font></em>
failed for pdb id 6vr4
failed for pdb id 7rej
failed for pdb id structure.</font></em>
failed for pdb id structure.</font></em>
failed for pdb id available.</font></em>
failed for pdb id Bd2625
failed for pdb id 7oc9
failed for pdb id structure.</font></em>
failed for pdb id 6xn8
failed for pdb id N1077.</font></em>
failed for pdb id 7cwp
failed for pdb id Bd0655
failed for pdb id 8onb
failed for pdb id 7cn6
failed for pdb id Nitro
failed for pdb id Meio
failed for pdb id Bd2374_c
failed for pdb id Bd2374_c
failed for pdb id Bd2374_n
failed for p

# 45 proteins could not be extracted this way?


For now, run results for all models on these proteins