In [49]:
import os
import gzip
import shutil
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

from matplotlib.ticker import MultipleLocator, AutoMinorLocator
from Bio.PDB import (
    PDBList,
    PDBIO,
    NeighborSearch,
    calc_angle,
    calc_dihedral,
    PPBuilder,
    is_aa,
)
from Bio.PDB.PDBParser import PDBParser
from Bio.SeqUtils import IUPACData, seq1
from Bio.PDB.PDBIO import Select
from Bio.SeqIO.PdbIO import PdbSeqresIterator

# Data Extraction

We'll use this notebook to create the datasets we are going to use.

All the datasets created here will be saved in `data` directory

In [50]:
path_to_data = Path("../data")  # Access to data folder

### All features

We start by extracting all the features from the table given from the professor

In [51]:
# Combine all PDBs into a single dataframe
ring_path = Path("../features_ring")

dfs = []
for filename in os.listdir(ring_path):
    dfs.append(pd.read_csv(ring_path / filename, sep="\t"))
df = pd.concat(dfs)

df.to_csv(path_to_data / "df.csv", index=False)

df.head()

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_down,t_phi,t_psi,t_ss3,t_a1,t_a2,t_a3,t_a4,t_a5,Interaction
0,1u9c,A,32,,Y,H,0.149,22.0,24.0,-0.971,...,11.0,-1.163,-0.725,H,-1.006,-0.59,1.891,-0.397,0.412,
1,1u9c,A,112,,L,H,0.0,25.0,25.0,-1.135,...,16.0,-1.136,-0.853,H,-0.591,-1.302,-0.733,1.57,-0.146,HBOND
2,1u9c,A,107,,T,H,0.162,11.0,16.0,-1.096,...,21.0,-1.128,-0.789,H,0.26,0.83,3.097,-0.838,1.512,
3,1u9c,A,147,,K,-,0.19,18.0,8.0,-1.795,...,17.0,-1.631,1.771,H,0.945,0.828,1.299,-0.169,0.933,HBOND
4,1u9c,A,109,,Q,H,0.263,13.0,21.0,-1.056,...,25.0,-1.135,-0.671,H,-1.019,-0.987,-1.505,1.266,-0.912,


#### Numerical

In [52]:
df_num = df[
    [
        "s_rsa",
        "s_up",
        "s_down",
        "s_phi",
        "s_psi",
        "s_a1",
        "s_a2",
        "s_a3",
        "s_a4",
        "s_a5",
        "t_rsa",
        "t_up",
        "t_down",
        "t_phi",
        "t_psi",
        "t_a1",
        "t_a2",
        "t_a3",
        "t_a4",
        "t_a5",
        "Interaction",
    ]
]

df_num.head()

Unnamed: 0,s_rsa,s_up,s_down,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,...,t_up,t_down,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,Interaction
0,0.149,22.0,24.0,-0.971,-0.821,0.26,0.83,3.097,-0.838,1.512,...,25.0,11.0,-1.163,-0.725,-1.006,-0.59,1.891,-0.397,0.412,
1,0.0,25.0,25.0,-1.135,-0.671,-1.019,-0.987,-1.505,1.266,-0.912,...,17.0,16.0,-1.136,-0.853,-0.591,-1.302,-0.733,1.57,-0.146,HBOND
2,0.162,11.0,16.0,-1.096,-0.857,-0.032,0.326,2.213,0.908,1.313,...,10.0,21.0,-1.128,-0.789,0.26,0.83,3.097,-0.838,1.512,
3,0.19,18.0,8.0,-1.795,2.558,1.831,-0.561,0.533,-0.277,1.648,...,5.0,17.0,-1.631,1.771,0.945,0.828,1.299,-0.169,0.933,HBOND
4,0.263,13.0,21.0,-1.056,-0.744,0.931,-0.179,-3.005,-0.503,-1.853,...,25.0,25.0,-1.135,-0.671,-1.019,-0.987,-1.505,1.266,-0.912,


In [53]:
# Saving the numerical dataset
df_num.to_csv(path_to_data / "df_num.csv", index=False)

#### Categorical

In [54]:
df_cat = df[
    [
        "s_ch",
        "s_ins",
        "s_resn",
        "s_ss3",
        "s_ss8",
        "t_ch",
        "t_ins",
        "t_resi",
        "t_resn",
        "t_ss3",
        "t_ss8",
        "Interaction",
    ]
]

for f in df_cat.columns:
    df_cat[f] = df_cat[f].astype("category").cat.codes

df_cat.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[f] = df_cat[f].astype("category").cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[f] = df_cat[f].astype("category").cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[f] = df_cat[f].astype("category").cat.codes
A value is trying to be set on a copy of a slice fro

Unnamed: 0,s_ch,s_ins,s_resn,s_ss3,s_ss8,t_ch,t_ins,t_resi,t_resn,t_ss3,t_ss8,Interaction
0,0,0,19,0,4,0,0,55,4,0,4,-1
1,0,0,9,0,4,0,0,136,0,0,4,0
2,0,0,16,0,4,0,0,130,19,0,4,-1
3,0,0,8,0,0,0,0,200,11,0,2,0
4,0,0,13,0,4,0,0,132,9,0,4,-1


In [55]:
# Saving the categorical dataset
df_cat.to_csv(path_to_data / "df_cat.csv", index=False)

In [56]:
df_complete = pd.concat([df_num.drop("Interaction", axis=1), df_cat], axis=1)

df_complete.head()

Unnamed: 0,s_rsa,s_up,s_down,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,...,s_resn,s_ss3,s_ss8,t_ch,t_ins,t_resi,t_resn,t_ss3,t_ss8,Interaction
0,0.149,22.0,24.0,-0.971,-0.821,0.26,0.83,3.097,-0.838,1.512,...,19,0,4,0,0,55,4,0,4,-1
1,0.0,25.0,25.0,-1.135,-0.671,-1.019,-0.987,-1.505,1.266,-0.912,...,9,0,4,0,0,136,0,0,4,0
2,0.162,11.0,16.0,-1.096,-0.857,-0.032,0.326,2.213,0.908,1.313,...,16,0,4,0,0,130,19,0,4,-1
3,0.19,18.0,8.0,-1.795,2.558,1.831,-0.561,0.533,-0.277,1.648,...,8,0,0,0,0,200,11,0,2,0
4,0.263,13.0,21.0,-1.056,-0.744,0.931,-0.179,-3.005,-0.503,-1.853,...,13,0,4,0,0,132,9,0,4,-1


In [57]:
# Saving the dataset
df_complete.to_csv(path_to_data / "df_complete.csv", index=False)

### Contact Map

In [58]:
pdb_id = "1u9c"
chain_id = "A"
path_to_contact_maps = Path("../data/contact_maps")

pdbl = PDBList()

# Retrieve the PDB file and save it in the specified directory
file_path = pdbl.retrieve_pdb_file(pdb_id, file_format="pdb", pdir=path_to_contact_maps)

pdb_file = path_to_contact_maps / f"{pdb_id}.pdb"
os.rename(file_path, pdb_file)

structure = PDBParser(QUIET=True).get_structure(
    pdb_id, path_to_contact_maps / "{}.pdb".format(pdb_id)
)

selected_residues = [
    residue for residue in structure[0][chain_id] if residue.id[0] == " "
]

print(f"len(selected_residues): {len(selected_residues)}")

Downloading PDB structure '1u9c'...
len(selected_residues): 220


In [23]:
def contact_map(pdb_id: str, save_dir: Path):
    # Create the directory if it doesn't exist
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Initialize PDBList
    pdbl = PDBList()

    # Retrieve the PDB file and save it in the specified directory
    file_path = pdbl.retrieve_pdb_file(pdb_id, file_format="pdb", pdir=save_dir)

    # Check if the file is compressed (has a .gz extension)
    if file_path.endswith(".gz"):
        compressed_file = file_path
        pdb_file = os.path.join(save_dir, f"{pdb_id}.pdb")

        # Unzip the file
        with gzip.open(compressed_file, "rb") as f_in:
            with open(pdb_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

        # Optionally, remove the compressed file
        os.remove(compressed_file)
    else:
        # If the file is not compressed, just rename it to the final .pdb file
        pdb_file = os.path.join(save_dir, f"{pdb_id}.pdb")
        os.rename(file_path, pdb_file)

    print(f"{pdb_id} downloaded")

In [24]:
df_complete = pd.read_csv("../data/df.csv")
path_to_contact_maps = Path("../data/contact_maps")

i = 0  # Per il momento vogliamo raccogliere solo 1000 pdb file
for i, pdb_id in enumerate(set(df_complete["pdb_id"])):
    try:
        if i >= 10:  # For now we want a small train set
            break
        contact_map(pdb_id=pdb_id, save_dir=path_to_contact_maps)
    except:
        continue

Downloading PDB structure '6had'...
../data/contact_maps/pdb6had.ent
6had downloaded
Downloading PDB structure '7au1'...
../data/contact_maps/pdb7au1.ent
7au1 downloaded
Downloading PDB structure '7lb1'...
../data/contact_maps/pdb7lb1.ent
7lb1 downloaded
Downloading PDB structure '2fwh'...
../data/contact_maps/pdb2fwh.ent
2fwh downloaded
Downloading PDB structure '5v01'...
../data/contact_maps/pdb5v01.ent
5v01 downloaded
Downloading PDB structure '5ecu'...
../data/contact_maps/pdb5ecu.ent
5ecu downloaded
Downloading PDB structure '6u4z'...
Downloading PDB structure '3wqb'...
../data/contact_maps/pdb3wqb.ent
3wqb downloaded
Downloading PDB structure '8c9t'...
../data/contact_maps/pdb8c9t.ent
8c9t downloaded
Downloading PDB structure '2qnt'...
../data/contact_maps/pdb2qnt.ent
2qnt downloaded


In [None]:
pdb_files_path = Path("../data/pdb_files")

dfs = []
for filename in os.listdir(pdb_files_path):
    dfs.append(pd.read_csv(pdb_files_path / filename, sep="\t"))
df_complete = pd.concat(dfs)
df_complete.head()