In [135]:
import os
import numpy as np
import torch
import pandas as pd
from biopandas.pdb import PandasPdb
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from Bio.PDB import DSSP, HSExposureCB, PPBuilder, is_aa, NeighborSearch
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.SeqUtils import seq1
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB, ComplementNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from timeit import default_timer as timer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, average_precision_score, roc_auc_score


In [110]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
current_directory = os.getcwd()
path_ring = current_directory + "/data/features_ring/"
path_pdb = current_directory + "/data/pdb_files/"
path_zip = current_directory + '/df_data-2.zip'

In [131]:
df = pd.read_pickle(path_zip)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a2,t_a3,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn
0,3m7l,A,123,,R,H,0.032,19.0,10.0,-1.747,...,-0.987,-1.505,1.266,-0.912,VDW,6.722287,LEU,VAL,GLU,TRP
1,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,-0.987,-1.505,1.266,-0.912,HBOND,6.391156,LEU,ALA,VAL,ILE
2,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,-0.987,-1.505,1.266,-0.912,VDW,6.391156,LEU,ALA,VAL,ILE
4,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.590,1.891,-0.397,0.412,HBOND,5.929388,THR,PRO,SER,TYR
5,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.590,1.891,-0.397,0.412,VDW,5.929388,THR,PRO,SER,TYR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1568,4a6r,B,400,,I,H,0.101,20.0,13.0,-1.236,...,-0.590,1.891,-0.397,0.412,VDW,9.463818,GLU,GLY,GLU,GLU
1569,4a6r,A,382,,F,E,0.000,16.0,23.0,-2.681,...,-0.547,2.131,0.393,0.816,HBOND,5.446033,ALA,THR,HIS,VAL
1570,4a6r,A,266,,G,T,0.000,26.0,24.0,1.706,...,1.652,1.330,1.045,2.064,HBOND,5.035146,PHE,ARG,THR,GLU
1571,4a6r,B,303,,G,-,0.083,14.0,19.0,-1.338,...,-1.302,-0.733,1.570,-0.146,HBOND,5.847882,VAL,LYS,VAL,GLU


In [5]:
dfs = []
for filename in os.listdir(path_ring):
    dfs.append(pd.read_csv(path_ring + filename, sep='\t'))
df = pd.concat(dfs)

df.dropna(inplace=True)
y = df['Interaction'].astype('category')

# Add Feature: CA-CA Distances between source & target residues

In [3]:
from Bio import PDB
import numpy as np
import os
from tqdm import tqdm
from joblib import Parallel, delayed


parser = PDB.PDBParser(QUIET=True)


def get_residue_distance(pdb_id, s_resi, t_resi, s_ch, t_ch, pdb_file_path):
    """Calculates CA-CA distance between two residues in a PDB file"""

    structure = parser.get_structure(pdb_id, pdb_file_path)
    model = structure[0]


    try: # locate source- & target-chains
        s_chain = model[s_ch]
        t_chain = model[t_ch]
    except KeyError:
        raise ValueError(f"Chain {s_ch} or {t_ch} not found in structure {pdb_id}")


    try: # locate source- & target-residues
        s_residue = s_chain[s_resi]
        t_residue = t_chain[t_resi]
    except KeyError:
        raise ValueError(f"Residue {s_resi} or {t_resi} not found in chains {s_ch} or {t_ch}")


    try: # locate alpha carbons
        s_ca = s_residue['CA']
        t_ca = t_residue['CA']
    except KeyError:
        raise ValueError(f"Alpha-carbon not found in residue {s_resi} or {t_resi}")
    

    s_ca_coord = s_ca.get_coord()
    t_ca_coord = t_ca.get_coord()

    distance = np.linalg.norm(s_ca_coord - t_ca_coord)

    return distance


def process_row(index, row, pdb_directory):

    pdb_id = row['pdb_id']
    s_resi = row['s_resi']
    t_resi = row['t_resi']
    s_ch = row['s_ch']
    t_ch = row['t_ch']

    pdb_file_path = os.path.join(pdb_directory, f'{pdb_id}.pdb')

    if not os.path.isfile(pdb_file_path):
        print(f"File {pdb_file_path} does not exist.")
        return None

    try:
        distance = get_residue_distance(pdb_id, s_resi, t_resi, s_ch, t_ch, pdb_file_path)
        return distance
    except Exception as e:
        print(f"Error processing {pdb_id} (row {index}): {e}")
        return None


def calculate_distances_parallel(df, pdb_directory, n_jobs=-1):
    # Use Parallel to process each row in the dataframe in parallel
    ca_distances = Parallel(n_jobs=n_jobs)(
        delayed(process_row)(index, row, pdb_directory) for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Rows")
    )
    
    # Add the result to the dataframe
    df['CA_CA_distance'] = ca_distances
    return df

if not os.path.exists(current_directory + '/data/df_data.pkl'):
    pdb_directory = path_pdb
    df = calculate_distances_parallel(df, pdb_directory, n_jobs=-1)
else:
    df = pd.read_pickle(current_directory + '/data/df_data.pkl')

# Add Feature: Sequence Neighbors (left/right) Aminoacid Type 

In [7]:
from Bio import PDB
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
import os

def compute_residue_names(pdb_id, s_ch, s_resi, t_ch, t_resi, path_pdb):
    pdb_file = path_pdb + f"{pdb_id}.pdb"
    if not os.path.isfile(pdb_file):
        return None, None, None, None
    
    structure = PDB.PDBParser(QUIET=True).get_structure(pdb_id, pdb_file)

    s_resn_prev, s_resn_next, t_resn_prev, t_resn_next = None, None, None, None
    for model in structure:
        for chain in model:
            for residue in chain:
                if chain.id == s_ch and residue.id[1] == s_resi - 1:
                    s_resn_prev = residue.resname
                if chain.id == s_ch and residue.id[1] == s_resi + 1:
                    s_resn_next = residue.resname
                if chain.id == t_ch and residue.id[1] == t_resi - 1:
                    t_resn_prev = residue.resname
                if chain.id == t_ch and residue.id[1] == t_resi + 1:
                    t_resn_next = residue.resname

    return s_resn_prev, s_resn_next, t_resn_prev, t_resn_next


def process_row(row, path_pdb):
    pdb_id = row['pdb_id']
    s_ch = row['s_ch']
    t_ch = row['t_ch']
    s_resi = row['s_resi']
    t_resi = row['t_resi']

    return compute_residue_names(pdb_id, s_ch, s_resi, t_ch, t_resi, path_pdb)


def process_dataset(df, path_pdb):

    results = Parallel(n_jobs=-1)(
        delayed(process_row)(row, path_pdb) for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Dataset")
    )

    # Extract the results and add them as new columns
    prev_s_resn_list, next_s_resn_list, prev_t_resn_list, next_t_resn_list = zip(*results)

    df['prev_s_resn'] = prev_s_resn_list
    df['next_s_resn'] = next_s_resn_list
    df['prev_t_resn'] = prev_t_resn_list
    df['next_t_resn'] = next_t_resn_list

    return df


if not os.path.exists(current_directory + '/data/df_data.pkl'):
    df = process_dataset(df, path_pdb)
else:
    df = pd.read_pickle(current_directory + '/data/df_data.pkl')


# Add Feature: Neighbors in 3D Space with sequence_separtion=6 

In [6]:
import os
import numpy as np
import pandas as pd
from Bio import PDB
from tqdm import tqdm
from Bio.PDB import PDBParser, NeighborSearch
from joblib import Parallel, delayed

three_to_one_letter = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
    'GLU': 'E', 'GLN': 'Q', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
    'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
    'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}

parser = PDB.PDBParser(QUIET=True)

def compute_residue_names(pdb_id, s_ch, s_resi, t_ch, t_resi):

    pdb_file = os.path.join(path_pdb, f'{pdb_id}.pdb')
    structure = parser.get_structure(pdb_id, pdb_file)
    
    source_residue = structure[0][s_ch][s_resi]
    target_residue = structure[0][t_ch][t_resi]

    all_atoms = list(structure.get_atoms())  # Search across the entire structure
    ns = NeighborSearch(all_atoms)

    # Residues within 8.0 Å distance
    contacts = ns.search_all(8.0, level="R")

    # Exclude contacts with sequence separation <= 6
    filtered_contacts = []
    for res1, res2 in contacts:

        is_amino_acid_1 = is_aa(res1)
        is_amino_acid_2 = is_aa(res2)

        if is_amino_acid_1 and is_amino_acid_2:
            res1_id = res1.get_id()[1]
            res2_id = res2.get_id()[1]

            if abs(res1_id - res2_id) > 6:
                filtered_contacts.append((res1, res2))

    # Extract coordinates
    source_coords = np.array([atom.coord for atom in source_residue.get_atoms()])
    target_coords = np.array([atom.coord for atom in target_residue.get_atoms()])

    min_distance_s, min_distance_t = float('inf'), float('inf')
    s_resn_neighbour, t_resn_neighbour = None, None

    # Process filtered contacts to find the closest residues
    for residue1, residue2 in filtered_contacts:
        if residue1 == target_residue or residue2 == target_residue:
            other_residue = residue2 if residue1 == target_residue else residue1
            other_coords = np.array([atom.coord for atom in other_residue.get_atoms()])
            distances = np.linalg.norm(target_coords[:, np.newaxis] - other_coords, axis=-1)
            min_distance = np.min(distances)

            if min_distance < min_distance_t:
                min_distance_t = min_distance
                t_resn_neighbour = other_residue

        if residue1 == source_residue or residue2 == source_residue:
            other_residue = residue2 if residue1 == source_residue else residue1
            other_coords = np.array([atom.coord for atom in other_residue.get_atoms()])
            distances = np.linalg.norm(source_coords[:, np.newaxis] - other_coords, axis=-1)
            min_distance = np.min(distances)

            if min_distance < min_distance_s:
                min_distance_s = min_distance
                s_resn_neighbour = other_residue

    return (s_resn_neighbour.get_resname(),
            t_resn_neighbour.get_resname())


def process_row(row):
    pdb_id = row['pdb_id']
    s_ch = row['s_ch']
    t_ch = row['t_ch']
    s_resi = row['s_resi']
    t_resi = row['t_resi']

    s_resn_neighbour, t_resn_neighbour = compute_residue_names(pdb_id, s_ch, s_resi, t_ch, t_resi)
    
    s_resn_neighbour = three_to_one_letter.get(s_resn_neighbour, s_resn_neighbour)
    t_resn_neighbour = three_to_one_letter.get(t_resn_neighbour, t_resn_neighbour)
    
    return s_resn_neighbour, t_resn_neighbour


def process_dataset(df, n_jobs=-1):
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_row)(row) for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows")
    )

    s_resn_neighbours_list, t_resn_neighbours_list = zip(*results)
    df['s_resn_neighbour'] = s_resn_neighbours_list
    df['t_resn_neighbour'] = t_resn_neighbours_list

    return df


df = process_dataset(df)


Processing rows: 100%|██████████| 100/100 [00:06<00:00, 14.69it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['s_resn_neighbour'] = s_resn_neighbours_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['t_resn_neighbour'] = t_resn_neighbours_list


In [132]:

df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a2,t_a3,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn
0,3m7l,A,123,,R,H,0.032,19.0,10.0,-1.747,...,-0.987,-1.505,1.266,-0.912,VDW,6.722287,LEU,VAL,GLU,TRP
1,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,-0.987,-1.505,1.266,-0.912,HBOND,6.391156,LEU,ALA,VAL,ILE
2,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,-0.987,-1.505,1.266,-0.912,VDW,6.391156,LEU,ALA,VAL,ILE
4,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.590,1.891,-0.397,0.412,HBOND,5.929388,THR,PRO,SER,TYR
5,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.590,1.891,-0.397,0.412,VDW,5.929388,THR,PRO,SER,TYR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1568,4a6r,B,400,,I,H,0.101,20.0,13.0,-1.236,...,-0.590,1.891,-0.397,0.412,VDW,9.463818,GLU,GLY,GLU,GLU
1569,4a6r,A,382,,F,E,0.000,16.0,23.0,-2.681,...,-0.547,2.131,0.393,0.816,HBOND,5.446033,ALA,THR,HIS,VAL
1570,4a6r,A,266,,G,T,0.000,26.0,24.0,1.706,...,1.652,1.330,1.045,2.064,HBOND,5.035146,PHE,ARG,THR,GLU
1571,4a6r,B,303,,G,-,0.083,14.0,19.0,-1.338,...,-1.302,-0.733,1.570,-0.146,HBOND,5.847882,VAL,LYS,VAL,GLU


# Add Feature: Lenght of a sequence

In [136]:

from Bio import PDB
from tqdm import tqdm
from Bio.PDB import PDBParser, NeighborSearch
from joblib import Parallel, delayed


def calc_chain_lenght(pdb_id, s_ch, t_ch):
    pdb_file = path_pdb + f"{pdb_id}.pdb"
    if not os.path.isfile(pdb_file):
        return None, None
    
    structure = PDB.PDBParser(QUIET=True).get_structure(pdb_id, pdb_file)
    try:
        if s_ch == t_ch:
            selected_residues = [residue for residue in structure[0][s_ch] if residue.id[0] == " "]
            seq_len = len(selected_residues)
            return [seq_len, seq_len]
        
        else:
            s_selected_residues = [residue for residue in structure[0][s_ch] if residue.id[0] == " "]
            t_selected_residues = [residue for residue in structure[0][t_ch] if residue.id[0] == " "]
            seq_len_s_ch = len(s_selected_residues)
            seq_len_t_ch = len(t_selected_residues)
            return [seq_len_s_ch, seq_len_t_ch]
    except Exception as e:
        print(f"Error processing {pdb_id}, chain {s_ch} or chain {t_ch}: {e}")
        return None

def add_chain_lenghts(df):
    df['s_ch_seq_len'] = None
    df['t_ch_seq_len'] = None
    for idx, row in df.iterrows():
        pdb_id = row['pdb_id']
        s_chain_id = row['s_ch']
        t_chain_id = row['t_ch']
        
        # Calculate the chain lengths
        length_chain = calc_chain_lenght(pdb_id, s_chain_id, t_chain_id)
        
        # Update the DataFrame
        df.at[idx, 's_ch_seq_len'] = length_chain[0]
        df.at[idx, 't_ch_seq_len'] = length_chain[1]
    
    return df


In [144]:
df_mini = df.head(2000)
df_mini

add_chain_lenghts(df_mini)
df_mini

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['s_ch_seq_len'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['t_ch_seq_len'] = None


Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn,s_ch_seq_len,t_ch_seq_len
0,3m7l,A,123,,R,H,0.032,19.0,10.0,-1.747,...,1.266,-0.912,VDW,6.722287,LEU,VAL,GLU,TRP,300,300
1,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,1.266,-0.912,HBOND,6.391156,LEU,ALA,VAL,ILE,142,142
2,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,1.266,-0.912,VDW,6.391156,LEU,ALA,VAL,ILE,300,300
4,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.397,0.412,HBOND,5.929388,THR,PRO,SER,TYR,166,166
5,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.397,0.412,VDW,5.929388,THR,PRO,SER,TYR,142,142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,4muv,B,279,,S,-,0.246,5.0,28.0,-2.152,...,0.908,1.313,HBOND,5.752223,GLY,VAL,ALA,THR,139,139
198,4muv,A,247,,I,H,0.006,20.0,12.0,-1.201,...,1.266,-0.912,VDW,7.357159,GLU,VAL,MET,CYS,142,142
201,4muv,A,343,,K,H,0.571,5.0,12.0,-1.215,...,0.113,-0.837,HBOND,6.086410,ARG,THR,LEU,ARG,142,142
202,4muv,A,344,,T,H,0.134,12.0,9.0,-1.088,...,0.113,-0.837,VDW,5.012308,LYS,ALA,LEU,ARG,142,142


In [147]:
df_mini[(df_mini['pdb_id'] == '4muv') & (df_mini['s_ch'] == 'A') & (df_mini['t_ch'] == 'B')]

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn,s_ch_seq_len,t_ch_seq_len
11,4muv,A,254,,R,E,0.141,17.0,24.0,-2.44,...,-0.259,-3.242,VDW,6.749897,ALA,THR,GLY,PHE,142,139
59,4muv,A,218,,V,H,0.148,15.0,18.0,-1.091,...,1.266,-0.912,VDW,6.202717,GLU,ARG,GLN,VAL,142,139


# Data Transformation

In [112]:
# Convert new feature columns so that they are consistent with the existing ones
three_to_one_letter = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
    'GLU': 'E', 'GLN': 'Q', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
    'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
    'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
}


df['prev_s_resn'] = df['prev_s_resn'].replace(three_to_one_letter)
df['next_s_resn'] = df['next_s_resn'].replace(three_to_one_letter)
df['prev_t_resn'] = df['prev_t_resn'].replace(three_to_one_letter)
df['next_t_resn'] = df['next_t_resn'].replace(three_to_one_letter)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a2,t_a3,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn
0,3m7l,A,123,,R,H,0.032,19.0,10.0,-1.747,...,-0.987,-1.505,1.266,-0.912,VDW,6.722287,L,V,E,W
1,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,-0.987,-1.505,1.266,-0.912,HBOND,6.391156,L,A,V,I
2,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,-0.987,-1.505,1.266,-0.912,VDW,6.391156,L,A,V,I
4,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.590,1.891,-0.397,0.412,HBOND,5.929388,T,P,S,Y
5,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.590,1.891,-0.397,0.412,VDW,5.929388,T,P,S,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1568,4a6r,B,400,,I,H,0.101,20.0,13.0,-1.236,...,-0.590,1.891,-0.397,0.412,VDW,9.463818,E,G,E,E
1569,4a6r,A,382,,F,E,0.000,16.0,23.0,-2.681,...,-0.547,2.131,0.393,0.816,HBOND,5.446033,A,T,H,V
1570,4a6r,A,266,,G,T,0.000,26.0,24.0,1.706,...,1.652,1.330,1.045,2.064,HBOND,5.035146,F,R,T,E
1571,4a6r,B,303,,G,-,0.083,14.0,19.0,-1.338,...,-1.302,-0.733,1.570,-0.146,HBOND,5.847882,V,K,V,E


In [113]:
#Dropping NAs after adding features
df.dropna(inplace=True)
print(f'Are there still missing values? : {df.isnull().any().any()}') #False 

Are there still missing values? : False


In [114]:
#If we want to delete rows with HOH etc

print(f'unique values for next_s_resn column are: {df['next_s_resn'].unique()}')
print(f'unique values for prev_s_resn column are: {df['prev_s_resn'].unique()}')
print(f'unique values for next_t_resn column are: {df['next_t_resn'].unique()}')
print(f'unique values for prev_t_resn column are: {df['prev_t_resn'].unique()}')

df = df[~((df['next_s_resn'] == "HOH") | (df['next_s_resn'] == "OCS") | (df['prev_s_resn'] == "HOH") | (df['next_t_resn'] == "CSO") | (df['next_t_resn'] == "HOH") | (df['prev_t_resn'] == "HOH") )]
df 

unique values for next_s_resn column are: ['V' 'A' 'P' 'F' 'E' 'L' 'T' 'H' 'G' 'Y' 'D' 'Q' 'I' 'S' 'K' 'R' 'W' 'C'
 'N' 'M' 'HOH' 'OCS']
unique values for prev_s_resn column are: ['L' 'T' 'H' 'Y' 'I' 'N' 'E' 'G' 'A' 'W' 'P' 'S' 'F' 'V' 'R' 'D' 'Q' 'K'
 'C' 'M' 'HOH']
unique values for next_t_resn column are: ['W' 'I' 'Y' 'G' 'V' 'L' 'S' 'Q' 'E' 'A' 'N' 'T' 'F' 'P' 'R' 'H' 'K' 'C'
 'D' 'M' 'CSO' 'HOH']
unique values for prev_t_resn column are: ['E' 'V' 'S' 'Y' 'A' 'T' 'P' 'L' 'F' 'I' 'N' 'R' 'G' 'D' 'Q' 'W' 'K' 'C'
 'H' 'M' 'HOH']


Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_a2,t_a3,t_a4,t_a5,Interaction,CA_CA_distance,prev_s_resn,next_s_resn,prev_t_resn,next_t_resn
0,3m7l,A,123,,R,H,0.032,19.0,10.0,-1.747,...,-0.987,-1.505,1.266,-0.912,VDW,6.722287,L,V,E,W
1,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,-0.987,-1.505,1.266,-0.912,HBOND,6.391156,L,A,V,I
2,3m7l,A,104,,I,H,0.485,9.0,11.0,-1.124,...,-0.987,-1.505,1.266,-0.912,VDW,6.391156,L,A,V,I
4,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.590,1.891,-0.397,0.412,HBOND,5.929388,T,P,S,Y
5,3m7l,A,139,,H,-,0.049,20.0,18.0,-2.085,...,-0.590,1.891,-0.397,0.412,VDW,5.929388,T,P,S,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1568,4a6r,B,400,,I,H,0.101,20.0,13.0,-1.236,...,-0.590,1.891,-0.397,0.412,VDW,9.463818,E,G,E,E
1569,4a6r,A,382,,F,E,0.000,16.0,23.0,-2.681,...,-0.547,2.131,0.393,0.816,HBOND,5.446033,A,T,H,V
1570,4a6r,A,266,,G,T,0.000,26.0,24.0,1.706,...,1.652,1.330,1.045,2.064,HBOND,5.035146,F,R,T,E
1571,4a6r,B,303,,G,-,0.083,14.0,19.0,-1.338,...,-1.302,-0.733,1.570,-0.146,HBOND,5.847882,V,K,V,E


In [115]:

label_encoder = LabelEncoder()

def encode_object_columns(df):
    label_encoder = LabelEncoder()
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = df[column].astype(str)
            df[column] = label_encoder.fit_transform(df[column])
    return df

df = encode_object_columns(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = label_encoder.fit_transform(df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

In [116]:
X = df.drop(columns=['Interaction'])
y = df['Interaction']

y = to_categorical(y, num_classes=10)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Sample a subset of the data
sample_size = 50000
df_sample = df.sample(n=sample_size, random_state=42)

X_sample = df_sample.drop(columns=['Interaction'])
y_sample = df_sample['Interaction']

# Convert labels to one-hot encoding if necessary and then back to labels for Random Forest
y_sample = pd.get_dummies(y_sample)  # One-hot encode if needed
y_sample_labels = y_sample.values.argmax(axis=1)  # Convert back to labels

rf = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=42)  # Reduced parameters
rf.fit(X_sample, y_sample_labels)

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 8))
plt.title("Feature Importance")
plt.bar(range(X_sample.shape[1]), importances[indices], align="center")
plt.xticks(range(X_sample.shape[1]), X_sample.columns[indices], rotation=90)
plt.xlim([-1, X_sample.shape[1]])
plt.show()


In [None]:
X = df.drop(columns=['Interaction', 't_ss3', "s_ss3", "t_ins", "s_ins" ])
y = df['Interaction']

y = to_categorical(y, num_classes=10) # One-hot encode the labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handle Imbalanced Data

In [None]:
from sklearn.utils.class_weight import compute_class_weight

y_train_labels = np.argmax(y_train, axis=1)
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_labels), y=y_train_labels)
class_weight_dict = dict(enumerate(class_weights))

# Model Training

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

model = Sequential()

# Input layer and first hidden layer
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Second hidden layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

# Third hidden layer
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))

# Fourth hidden layer
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.3))

# Output layer
model.add(Dense(10, activation='softmax'))

optimizer = Adam(learning_rate=0.0001)

model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Implement early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define K-Fold cross-validator
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Store scores for each fold
fold_accuracies = []

# K-Fold Cross Validation
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Create a new model instance for each fold
    optimizer = Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Train the model
    history = model.fit(X_train_fold, y_train_fold,
                        validation_data=(X_val_fold, y_val_fold),
                        epochs=50,
                        batch_size=32,
                        class_weight=class_weight_dict,
                        callbacks=[early_stopping],
                        verbose=0)  # Set verbose to 1 for detailed output

   # Predict the labels for the validation fold
    y_val_pred = np.argmax(model.predict(X_val_fold), axis=1)
    y_val_true = np.argmax(y_val_fold, axis=1)

    # Compute balanced accuracy for the current fold
    balanced_acc = balanced_accuracy_score(y_val_true, y_val_pred)
    print(f"Balanced accuracy for current fold: {balanced_acc:.4f}")

    fold_accuracies.append(balanced_acc)

# Calculate and print the average balanced accuracy across all folds
average_balanced_accuracy = np.mean(fold_accuracies)
print(f"Average balanced accuracy across all folds: {average_balanced_accuracy:.4f}")


# Predict

In [None]:
y_pred_proba = model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

y_true = np.argmax(y_test, axis=1)

mcc = matthews_corrcoef(y_true, y_pred)
balanced_acc = balanced_accuracy_score(y_true, y_pred)
avg_precision = average_precision_score(y_test, y_pred_proba, average='macro')

print(f"Test accuracy: {test_acc:.4f}")
print(f"Matthews Correlation Coefficient: {mcc:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"Average Precision Score: {avg_precision:.4f}")
