## Data exploration
Let's find the alpha-fold structures corresponding to our PDB list.

In [4]:
# Imports
import pandas as pd
import numpy as np
import warnings
from helpers import read_full_list
def display_all(df): 
    with warnings.catch_warnings(): 
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None): display(df)

In [2]:
# Load data
metadata = pd.read_parquet('metadata.parquet')
index = pd.read_parquet('index.parquet')
full_list_df = read_full_list('masif/data/masif_ppi_search/lists/full_list.txt')
print(f'Length metadata: {len(metadata)}')
print(f'Length index: {len(index)}')
print(f'Length full list: {len(full_list_df)}')

Length metadata: 2319564
Length index: 2319564
Length full list: 5902


In [5]:
# Display first rows
display_all(metadata.head(1))
display_all(index.head(1))
display_all(full_list_df.head(1))

Unnamed: 0,id,entry_id,method,date,release_date,resolution,label,probability,chain1_id,chain2_id,assembly,assembly_details,oligomeric_details,oligomeric_count,biol_details,complex_type,chain_1,asym_id_1,chain_2,asym_id_2,length1,length2,length_resolved_1,length_resolved_2,number_of_components_1,number_of_components_2,link_density,planarity,max_var_1,max_var_2,num_atom_types,n_residue_pairs,n_residues,buried_sasa,intermolecular_contacts,charged_charged_contacts,charged_polar_contacts,charged_apolar_contacts,polar_polar_contacts,apolar_polar_contacts,apolar_apolar_contacts,interface_atom_gaps_4A,missing_interface_residues_4A,interface_atom_gaps_8A,missing_interface_residues_8A,entity_id_R,entity_id_L,pdb_strand_id_R,pdb_strand_id_L,ECOD_names_R,ECOD_names_L
0,5oge__E1_P40107--5oge__A1_P40107,5oge,X-RAY DIFFRACTION,2017-07-12,2017-11-22,3.220703,BIO,0.560059,R,L,1,software_defined_assembly,dimeric,2,,homomer,E1,E,A1,A,304,299,304,299,1,1,0.160034,1.810547,0.43042,0.436768,4,135,48,1182.0,29,0,0,0,4,9,16,0,0,11,0,1,1,E,A,"F_UNCLASSIFIED,F_UNCLASSIFIED","F_UNCLASSIFIED,F_UNCLASSIFIED"


Unnamed: 0,split,id,pdb_id,cluster_id,cluster_id_R,cluster_id_L,pinder_s,pinder_xl,pinder_af2,pinder_af2_hard,uniprot_R,uniprot_L,holo_R_pdb,holo_L_pdb,predicted_R_pdb,predicted_L_pdb,apo_R_pdb,apo_L_pdb,apo_R_pdbs,apo_L_pdbs,holo_R,holo_L,predicted_R,predicted_L,apo_R,apo_L,apo_R_quality,apo_L_quality,chain1_neff,chain2_neff,chain_R,chain_L,contains_antibody,contains_antigen,contains_enzyme
0,test,5oge__E1_P40107--5oge__A1_P40107,5oge,cluster_2035_2035,cluster_2035,cluster_2035,False,True,False,False,P40107,P40107,5oge__E1_P40107-R.pdb,5oge__A1_P40107-L.pdb,af__P40107.pdb,af__P40107.pdb,,,,,True,True,True,True,False,False,,,58.1875,58.1875,E1,A1,False,False,False


Unnamed: 0,entry_id,asym_id_1,asym_id_2
0,1a0g,A,B


In [6]:
# Merge dataframes on appropriate keys
appropiate_keys = ['entry_id', 'asym_id_1', 'asym_id_2']
temp_metadata = pd.merge(metadata, index, on='id', how='inner')
complete_metadata = pd.merge(temp_metadata, full_list_df, on=appropiate_keys, how='inner')
print(f'Length complete metadata: {len(complete_metadata)}')
display_all(complete_metadata.head(3))

Length complete metadata: 5375


Unnamed: 0,id,entry_id,method,date,release_date,resolution,label,probability,chain1_id,chain2_id,assembly,assembly_details,oligomeric_details,oligomeric_count,biol_details,complex_type,chain_1,asym_id_1,chain_2,asym_id_2,length1,length2,length_resolved_1,length_resolved_2,number_of_components_1,number_of_components_2,link_density,planarity,max_var_1,max_var_2,num_atom_types,n_residue_pairs,n_residues,buried_sasa,intermolecular_contacts,charged_charged_contacts,charged_polar_contacts,charged_apolar_contacts,polar_polar_contacts,apolar_polar_contacts,apolar_apolar_contacts,interface_atom_gaps_4A,missing_interface_residues_4A,interface_atom_gaps_8A,missing_interface_residues_8A,entity_id_R,entity_id_L,pdb_strand_id_R,pdb_strand_id_L,ECOD_names_R,ECOD_names_L,split,pdb_id,cluster_id,cluster_id_R,cluster_id_L,pinder_s,pinder_xl,pinder_af2,pinder_af2_hard,uniprot_R,uniprot_L,holo_R_pdb,holo_L_pdb,predicted_R_pdb,predicted_L_pdb,apo_R_pdb,apo_L_pdb,apo_R_pdbs,apo_L_pdbs,holo_R,holo_L,predicted_R,predicted_L,apo_R,apo_L,apo_R_quality,apo_L_quality,chain1_neff,chain2_neff,chain_R,chain_L,contains_antibody,contains_antigen,contains_enzyme
0,3fil__A1_P19909--3fil__B1_P19909,3fil,X-RAY DIFFRACTION,2008-12-12,2009-08-18,0.879883,BIO,0.647949,R,L,1,author_and_software_defined_assembly,dimeric,2,?,homomer,A1,A,B1,B,56,56,56,56,1,1,0.27002,2.541016,0.576172,0.594238,4,175,63,987.0,39,1,2,6,2,8,20,0,0,0,0,1,2,A,B,PF01378,PF01378,test,3fil,cluster_1361_1361,cluster_1361,cluster_1361,False,True,False,False,P19909,P19909,3fil__A1_P19909-R.pdb,3fil__B1_P19909-L.pdb,af__P19909.pdb,af__P19909.pdb,2gi9__A1_P19909.pdb,5bmi__A1_P19909.pdb,2gi9__A1_P19909.pdb;5bmg__A1_P19909.pdb;6nl9__A1_P19909.pdb;5bmh__A1_P19909.pdb;4ozb__A1_P19909.pdb;5bmi__A1_P19909.pdb;4ozc__A1_P19909.pdb;4wh4__A1_P19909.pdb;2on8__A1_P19909.pdb;5hg2__A1_P19909.pdb;6nl6__A1_P19909.pdb;5hi1__A1_P19909.pdb;2qmt__A1_P19909.pdb;2onq__A1_P19909.pdb;5hfy__A1_P19909.pdb;6nlb__A1_P19909.pdb;8dij__A1_P19909.pdb;1p7e__A1_P19909.pdb;6cpz__A1_P19909.pdb;1p7f__A1_P19909.pdb;2n9k__A1_P19909.pdb;2kq4__A1_P19909.pdb;6c9o__A1_P19909.pdb;7da8__A1_P19909.pdb;6cte__A1_P19909.pdb;2igg__A1_P19909.pdb;6kmc__A1_P19909.pdb;6che__A1_P19909.pdb;2lum__A1_P19909.pdb;6nl7__A1_P19909.pdb;1fcl__A1_P19909.pdb;2n9l__A1_P19909.pdb;1gb4__A1_P19909.pdb;1fd6__A1_P19909.pdb;5ub0__A1_P19909.pdb;2jsv__A1_P19909.pdb;4oza__A1_P19909.pdb;6njf__A1_P19909.pdb;5ubs__A1_P19909.pdb;2ju6__A1_P19909.pdb;5uce__A1_P19909.pdb;5ucf__A1_P19909.pdb;1zxh__A1_P19909.pdb,5bmi__A1_P19909.pdb;6nl9__A1_P19909.pdb;5hi1__A1_P19909.pdb;5bmg__A1_P19909.pdb;2gi9__A1_P19909.pdb;2qmt__A1_P19909.pdb;6nlb__A1_P19909.pdb;6cpz__A1_P19909.pdb;5hg2__A1_P19909.pdb;2on8__A1_P19909.pdb;4ozc__A1_P19909.pdb;6c9o__A1_P19909.pdb;5bmh__A1_P19909.pdb;2onq__A1_P19909.pdb;6cte__A1_P19909.pdb;6nl6__A1_P19909.pdb;4ozb__A1_P19909.pdb;1p7e__A1_P19909.pdb;7da8__A1_P19909.pdb;4wh4__A1_P19909.pdb;6che__A1_P19909.pdb;1p7f__A1_P19909.pdb;6nl7__A1_P19909.pdb;2kq4__A1_P19909.pdb;8dij__A1_P19909.pdb;5hfy__A1_P19909.pdb;2igg__A1_P19909.pdb;2n9k__A1_P19909.pdb;2lum__A1_P19909.pdb;5ub0__A1_P19909.pdb;6kmc__A1_P19909.pdb;1fd6__A1_P19909.pdb;2jsv__A1_P19909.pdb;1fcl__A1_P19909.pdb;1gb4__A1_P19909.pdb;2n9l__A1_P19909.pdb;6njf__A1_P19909.pdb;4oza__A1_P19909.pdb;5ubs__A1_P19909.pdb;2ju6__A1_P19909.pdb;5ucf__A1_P19909.pdb;5uce__A1_P19909.pdb;1zxh__A1_P19909.pdb,True,True,True,True,True,True,high,high,15.84375,15.84375,A1,B1,False,False,False
1,2hvg__A1_A5KBL5--2hvg__B1_A5KBL5,2hvg,X-RAY DIFFRACTION,2006-07-28,2006-08-22,2.300781,BIO,0.992188,R,L,1,author_and_software_defined_assembly,dimeric,2,?,homomer,A1,A,B1,B,447,447,447,447,1,1,0.029999,7.089844,0.820801,0.821777,5,1145,322,7468.0,260,33,28,62,24,52,61,0,0,17,0,1,1,A,B,"F_UNCLASSIFIED,PF08328,PF00206,PF08328","F_UNCLASSIFIED,PF08328,PF00206,PF08328",test,2hvg,cluster_4531_4531,cluster_4531,cluster_4531,False,True,False,False,A5KBL5,A5KBL5,2hvg__A1_A5KBL5-R.pdb,2hvg__B1_A5KBL5-L.pdb,af__A5KBL5.pdb,af__A5KBL5.pdb,,,,,True,True,True,True,False,False,,,317.0,317.0,A1,B1,False,False,True
2,3ix1__A1_Q9K9G5--3ix1__B1_Q9K9G5,3ix1,X-RAY DIFFRACTION,2009-09-03,2010-10-13,2.400391,BIO,0.640137,R,L,1,author_defined_assembly,dimeric,2,?,homomer,A1,A,B1,B,301,301,301,301,1,1,0.099976,3.796875,0.52832,0.52832,4,328,89,1847.0,77,5,14,25,2,14,17,0,0,0,0,1,1,A,B,"PF09084,PF09084","PF09084,PF09084",test,3ix1,cluster_32172_32172,cluster_32172,cluster_32172,False,True,False,False,Q9K9G5,Q9K9G5,3ix1__A1_Q9K9G5-R.pdb,3ix1__B1_Q9K9G5-L.pdb,af__Q9K9G5.pdb,af__Q9K9G5.pdb,,,,,True,True,True,True,False,False,,,301.75,301.75,A1,B1,False,False,False


From this dataset, we could get the corresponding `af__*.pdb` files that are already downloaded. 

I guess you then have to associate them (Receptor and Ligend) and pre-process it (rotation and removing extra-parts) to then form the structure than will be fed inside the model. 

In [None]:
# But there are some duplicates, why ??
len(complete_metadata[['entry_id', 'asym_id_1', 'asym_id_2']].drop_duplicates())
len(complete_metadata[complete_metadata[['predicted_R_pdb', 'predicted_L_pdb']].duplicated()])

3107

In [8]:
display_all(complete_metadata[complete_metadata[['predicted_R_pdb', 'predicted_L_pdb']].duplicated()].head(3))

Unnamed: 0,id,entry_id,method,date,release_date,resolution,label,probability,chain1_id,chain2_id,assembly,assembly_details,oligomeric_details,oligomeric_count,biol_details,complex_type,chain_1,asym_id_1,chain_2,asym_id_2,length1,length2,length_resolved_1,length_resolved_2,number_of_components_1,number_of_components_2,link_density,planarity,max_var_1,max_var_2,num_atom_types,n_residue_pairs,n_residues,buried_sasa,intermolecular_contacts,charged_charged_contacts,charged_polar_contacts,charged_apolar_contacts,polar_polar_contacts,apolar_polar_contacts,apolar_apolar_contacts,interface_atom_gaps_4A,missing_interface_residues_4A,interface_atom_gaps_8A,missing_interface_residues_8A,entity_id_R,entity_id_L,pdb_strand_id_R,pdb_strand_id_L,ECOD_names_R,ECOD_names_L,split,pdb_id,cluster_id,cluster_id_R,cluster_id_L,pinder_s,pinder_xl,pinder_af2,pinder_af2_hard,uniprot_R,uniprot_L,holo_R_pdb,holo_L_pdb,predicted_R_pdb,predicted_L_pdb,apo_R_pdb,apo_L_pdb,apo_R_pdbs,apo_L_pdbs,holo_R,holo_L,predicted_R,predicted_L,apo_R,apo_L,apo_R_quality,apo_L_quality,chain1_neff,chain2_neff,chain_R,chain_L,contains_antibody,contains_antigen,contains_enzyme
63,4oyd__A1_P0C6Z1--4oyd__B1_UNDEFINED,4oyd,X-RAY DIFFRACTION,2014-02-11,2014-07-09,1.799805,BIO,0.839844,R,L,1,author_and_software_defined_assembly,dimeric,2,gelfiltration,heteromer,A1,A,B1,B,156,117,156,117,1,1,0.090027,3.902344,0.450928,0.838867,4,304,90,2572.0,79,16,9,18,2,10,24,0,0,0,0,1,2,A,B,PF00452,,test,4oyd,cluster_2306_368,cluster_2306,cluster_368,False,True,False,False,P0C6Z1,UNDEFINED,4oyd__A1_P0C6Z1-R.pdb,4oyd__B1_UNDEFINED-L.pdb,,,,,,,True,True,False,False,False,False,,,10.414062,0.185059,A1,B1,False,False,False
110,1g6u__A1_UNDEFINED--1g6u__B1_UNDEFINED,1g6u,X-RAY DIFFRACTION,2000-11-07,2001-02-21,1.480469,BIO,0.795898,R,L,1,author_and_software_defined_assembly,dimeric,2,?,homomer,A1,A,B1,B,48,48,48,48,1,1,0.130005,3.566406,0.846191,0.856934,3,208,76,2086.0,65,7,2,16,0,4,36,0,0,0,0,1,1,A,B,,,val,1g6u,cluster_34425_34425,cluster_34425,cluster_34425,False,False,False,False,UNDEFINED,UNDEFINED,1g6u__A1_UNDEFINED-R.pdb,1g6u__B1_UNDEFINED-L.pdb,,,,,,,True,True,False,False,False,False,,,0.192017,0.192017,A1,B1,False,False,False
121,3tdm__A1_UNDEFINED--3tdm__B1_UNDEFINED,3tdm,X-RAY DIFFRACTION,2011-08-11,2011-11-16,2.400391,BIO,0.97998,R,L,1,author_and_software_defined_assembly,dimeric,2,?,homomer,A1,A,B1,B,120,120,120,120,1,1,0.059998,5.152344,0.570312,0.560547,4,587,153,3522.0,124,4,11,20,6,27,56,0,0,0,0,1,1,A,B,PF00977,PF00977,val,3tdm,cluster_3830_3830,cluster_3830,cluster_3830,False,False,False,False,UNDEFINED,UNDEFINED,3tdm__A1_UNDEFINED-R.pdb,3tdm__B1_UNDEFINED-L.pdb,,,,,,,True,True,False,False,False,False,,,1057.0,1082.0,A1,B1,False,False,False


### Example for `3fil__A1_P19909--3fil__B1_P19909`
Would like to do an example but we need access to the af files.