In [None]:
from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley
from Bio.PDB.Polypeptide import is_aa
import numpy as np

# Cut-offs used by this cell and by the neighbour search further down.
sasa_threshold = 10.0  # residue SASA (Å²) above which a residue counts as surface
adjacency_threshold = 5.0  # residue-center distance (Å) below which residues are adjacent

# Load the structure from disk (QUIET=True suppresses parser warnings).
parser = PDBParser(QUIET=True)
structure = parser.get_structure("AF-P84085-F1", "files/AF-P84085-F1.pdb")

# Shrake-Rupley at level "A" stores a `sasa` attribute on each atom.
sr = ShrakeRupley()
sr.compute(structure, level="A")

# Walk the first model and keep every amino-acid residue whose summed
# per-atom SASA exceeds the surface cut-off.
surface_residues = []
for chain in structure[0]:
    for residue in chain:
        if is_aa(residue):
            residue_sasa = sum(atom.sasa for atom in residue if hasattr(atom, "sasa"))
            if residue_sasa > sasa_threshold:
                surface_residues.append(residue)
# Geometric center of a residue (unweighted mean of its atom coordinates)
def get_residue_center(residue):
    """Return the mean coordinate of all atoms in *residue*.

    Every atom contributes equally (no mass weighting). A residue that
    yields no atoms produces the origin ``[0.0, 0.0, 0.0]``.
    """
    coords = [atom.coord for atom in residue.get_atoms()]
    if not coords:
        return np.zeros(3)
    return np.mean(coords, axis=0)

# Function to find adjacent residues to a given residue list
def find_adjacent_residues(residues, structure, threshold):
    """Return the amino-acid residues whose center lies near any query residue.

    Args:
        residues: iterable of query residues.
        structure: structure whose first model (``structure[0]``) is searched.
        threshold: a candidate is adjacent when the distance between its
            center and a query residue's center (as computed by
            ``get_residue_center``) is strictly less than this value.

    Returns:
        A set of residues from the first model. Only the query residue
        itself is skipped while it is being processed, so a query residue
        can still appear in the result when it is close to another query
        residue.
    """
    residues = list(residues)
    if not residues:
        return set()

    # Candidate centers depend only on the structure, so compute them once
    # instead of once per query residue.
    candidates = [
        (res, get_residue_center(res))
        for chain in structure[0]
        for res in chain
        if is_aa(res)
    ]

    adjacent_residues = set()
    for residue in residues:
        residue_center = get_residue_center(residue)
        for res, res_center in candidates:
            if res == residue:  # never report a residue as its own neighbour
                continue
            if np.linalg.norm(residue_center - res_center) < threshold:
                adjacent_residues.add(res)
    return adjacent_residues

# Step 5: first-level neighbours = residues close to any surface residue
first_level_adjacent = find_adjacent_residues(surface_residues, structure, adjacency_threshold)

# Step 6: second-level neighbours = residues close to any first-level neighbour
second_level_adjacent = find_adjacent_residues(first_level_adjacent, structure, adjacency_threshold)

# Step 7: report each group with a heading, a count and one line per residue
for _heading, _group in (
    ("\nResidues at or near the surface:", surface_residues),
    ("\nFirst-level adjacent residues to surface residues:", first_level_adjacent),
    ("\nSecond-level adjacent residues to first-level adjacent residues:", second_level_adjacent),
):
    print(_heading)
    print(f"Total: {len(_group)}")
    for res in _group:
        print(f"Chain: {res.get_parent().id}, Residue: {res.get_resname()} {res.id[1]}")


In [None]:
import pandas as pd
from collections import Counter
from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley
from Bio.PDB.Polypeptide import is_aa
import numpy as np
from itertools import combinations
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import networkx as nx
import matplotlib.pyplot as plt
import csv
import numpy as np

# Interaction criteria, keyed by interaction name.
# Each entry gives the atom-type code required on each side of an atom pair
# ("atomic_type1" / "atomic_type2") and a distance window; check_interactions()
# accepts a pair when min_dist <= distance <= max_dist (both ends inclusive).
# check_interactions() compares atomic_type1 with the atom of the first residue
# and atomic_type2 with the atom of the second residue, so the asymmetric
# entries (H_BOND, SALT_BRIDGE) only match in the orientation listed here.
# NOTE(review): distances are presumably in Å (PDB coordinate units) - confirm.
interaction_criteria = {
    "ARM_STACK": {"atomic_type1": "ARM", "atomic_type2": "ARM", "min_dist": 1.5, "max_dist": 3.5},
    "H_BOND": {"atomic_type1": "ACP", "atomic_type2": "DON", "min_dist": 2.0, "max_dist": 3.0},
    "HYDROPHOBIC": {"atomic_type1": "HPB", "atomic_type2": "HPB", "min_dist": 2.0, "max_dist": 3.8},
    "REPULSIVE_POS": {"atomic_type1": "POS", "atomic_type2": "POS", "min_dist": 2.0, "max_dist": 6.0},
    "REPULSIVE_NEG": {"atomic_type1": "NEG", "atomic_type2": "NEG", "min_dist": 2.0, "max_dist": 6.0},
    "SALT_BRIDGE": {"atomic_type1": "POS", "atomic_type2": "NEG", "min_dist": 2.0, "max_dist": 6.0},
}

# Display names for the short atom-type codes (ACP, DON, POS, NEG, HPB, ARM)
# and for interaction names.
# NOTE(review): the interaction keys do not line up exactly with
# interaction_criteria: "REPULSIVE" appears here while the criteria use
# "REPULSIVE_POS" / "REPULSIVE_NEG", and "SS_BRIDGE" has no criteria entry.
# This mapping is not referenced by the code visible in this notebook section.
type_full_names = {
    "ACP": "Acceptor",
    "DON": "Donor",
    "POS": "Positive",
    "NEG": "Negative",
    "HPB": "Hydrophobic",
    "ARM": "Aromatic",
    "HYDROPHOBIC": "Hydrophobic",
    "SALT_BRIDGE": "Salt bridge",
    "ARM_STACK": "Aromatic",
    "H_BOND": "Hydrogen bond",
    "REPULSIVE": "Repulsive",
    "SS_BRIDGE": "Disulfide Bridge"
}

# Amino acid to atom types mapping:
#   three-letter residue name -> {atom name -> atom-type code(s)}
# A value is None (atom carries no type), a single code such as "HPB", or
# several codes joined by commas such as "POS,DON"; multi-code values have to
# be split on "," before being compared with a single code.
# Twenty residue names are listed; callers look residues up with
# .get(name, {}), so any other residue name contributes no typed atoms.
amino_acid_atoms = {
    "ALA": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB"},
    "ARG": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB", "CD": None, 
            "NE": "POS,DON", "CZ": "POS", "NH1": "POS,DON", "NH2": "POS,DON"},
    "ASN": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": None, 
            "OD1": "ACP", "ND2": "DON"},
    "ASP": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": None, 
            "OD1": "NEG,ACP", "OD2": "NEG,ACP"},
    "CYS": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "SG": "DON,ACP"},
    "GLN": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB", 
            "CD": None, "OE1": "ACP", "NE2": "DON"},
    "GLU": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB", 
            "CD": None, "OE1": "NEG,ACP", "OE2": "NEG,ACP"},
    "GLY": {"N": "DON", "CA": None, "C": None, "O": "ACP"},
    "HIS": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "ARM", 
            "ND1": "ARM,POS", "CD2": "ARM", "CE1": "ARM", "NE2": "ARM,POS"},
    "ILE": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG1": "HPB", 
            "CG2": "HPB", "CD1": "HPB"},
    "LEU": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB", 
            "CD1": "HPB", "CD2": "HPB"},
    "LYS": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB", 
            "CD": "HPB", "CE": None, "NZ": "POS,DON"},
    "MET": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB", 
            "SD": "ACP", "CE": "HPB"},
    "PHE": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB,ARM", 
            "CD1": "HPB,ARM", "CD2": "HPB,ARM", "CE1": "HPB,ARM", "CE2": "HPB,ARM", 
            "CZ": "HPB,ARM"},
    "PRO": {"N": None, "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB", 
            "CD": None},
    "SER": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": None, "OG": "DON,ACP"},
    "THR": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": None, "OG1": "DON,ACP", 
            "CG2": "HPB"},
    "TRP": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB,ARM", 
            "CD1": "ARM", "CD2": "HPB,ARM", "NE1": "ARM,DON", "CE2": "ARM", 
            "CE3": "HPB,ARM", "CZ2": "HPB,ARM", "CZ3": "HPB,ARM", "CH2": "HPB,ARM"},
    "TYR": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", "CG": "HPB,ARM", 
            "CD1": "HPB,ARM", "CD2": "HPB,ARM", "CE1": "HPB,ARM", "CE2": "HPB,ARM", 
            "CZ": "ARM", "OH": "DON,ACP"},
    "VAL": {"N": "DON", "CA": None, "C": None, "O": "ACP", "CB": "HPB", 
            "CG1": "HPB", "CG2": "HPB"},
}


In [None]:
# Centroid of a residue: the plain average of its atom positions
def get_residue_center(residue):
    """Average the coordinates of the atoms in *residue*.

    The average is unweighted. When the residue has no atoms the origin
    ``[0.0, 0.0, 0.0]`` is returned instead.
    """
    positions = [member.coord for member in residue.get_atoms()]
    origin = np.array([0.0, 0.0, 0.0])
    return np.mean(positions, axis=0) if positions else origin


# Function to find adjacent residues to a given residue list
def find_adjacent_residues(residues, structure, threshold):
    """Collect amino-acid residues located close to any of *residues*.

    Args:
        residues: iterable of query residues.
        structure: structure to search; only its first model
            (``structure[0]``) is examined.
        threshold: maximum center-to-center distance (exclusive). Centers
            come from ``get_residue_center``.

    Returns:
        A set of residues from the first model. A query residue is excluded
        only from its own search, so it may be returned as the neighbour of
        a different query residue.
    """
    queries = list(residues)
    if not queries:
        return set()

    # The center of a candidate does not depend on the query residue, so it
    # is computed once here rather than inside the query loop.
    candidate_centers = []
    for chain in structure[0]:
        for res in chain:
            if is_aa(res):
                candidate_centers.append((res, get_residue_center(res)))

    adjacent_residues = set()
    for residue in queries:
        residue_center = get_residue_center(residue)
        for res, res_center in candidate_centers:
            if res == residue:  # skip the query residue itself
                continue
            distance = np.linalg.norm(residue_center - res_center)
            if distance < threshold:
                adjacent_residues.add(res)
    return adjacent_residues


def parse_pdb(pdb_file, *, sasa_threshold=10.0, adjacency_threshold=5.0):
    """Parse a PDB file and classify residues of its first model by exposure.

    Args:
        pdb_file: path of the PDB file; also used as the structure id.
        sasa_threshold: a residue is a surface residue when the summed SASA
            of its atoms is strictly greater than this value (Å²).
        adjacency_threshold: center-to-center cut-off (Å) passed to
            ``find_adjacent_residues``.

    Returns:
        ``(surface_residues, first_level_adjacent, second_level_adjacent)``:
        a list of surface residues, the set of residues adjacent to them, and
        the set of residues adjacent to that first-level set. The sets are not
        disjoint from the earlier levels; ``find_adjacent_residues`` only
        skips a residue while searching around that same residue.
    """
    # Step 1: Parse the PDB structure
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_file, pdb_file)
    model = structure[0]  # everything below analyses the first model only

    # Step 2: Compute per-atom SASA. The calculation is restricted to the
    # model that is analysed: ShrakeRupley uses every atom of the entity it is
    # given, so passing the whole structure would let atoms of other models
    # (e.g. NMR ensembles) occlude this one.
    sr = ShrakeRupley()
    sr.compute(model, level="A")

    # Step 3: Identify surface residues
    surface_residues = []
    for chain in model:
        for residue in chain:
            if not is_aa(residue):  # Only consider amino acids
                continue

            # Sum the SASA for each atom in the residue
            residue_sasa = sum(atom.sasa for atom in residue if hasattr(atom, 'sasa'))

            # Collect residues with SASA above the threshold
            if residue_sasa > sasa_threshold:
                surface_residues.append(residue)

    # Step 4: Find adjacent residues to surface residues (first-level neighbors)
    first_level_adjacent = find_adjacent_residues(surface_residues, structure, adjacency_threshold)

    # Step 5: Find adjacent residues to the first-level adjacent residues (second-level neighbors)
    second_level_adjacent = find_adjacent_residues(first_level_adjacent, structure, adjacency_threshold)

    return surface_residues, first_level_adjacent, second_level_adjacent

In [None]:
# Euclidean distance between two atoms
def calculate_distance(atom1, atom2):
    """Return the straight-line distance between the ``coord`` of two atoms."""
    delta = atom1.coord - atom2.coord
    squared = np.square(delta)
    return np.sqrt(squared.sum())

In [None]:
# Step 7: Identify interactions between surface residues, adjacent residues, and adjacent-adjacent residues
def _typed_atoms(residue):
    """Return ``[(atom, [type codes])]`` for the atoms of *residue* that have a type."""
    type_by_atom_name = amino_acid_atoms.get(residue.get_resname(), {})
    typed = []
    for atom in residue:
        codes = type_by_atom_name.get(atom.get_name())
        if codes:  # skips unknown atoms and atoms mapped to None
            typed.append((atom, codes.split(",")))
    return typed


def _pairwise_interactions(first_residues, second_residues, interaction_criteria):
    """Return ``(first_res, second_res, interaction, dist)`` for each qualifying atom pair."""
    # Index the criteria by ordered (type1, type2) pair; each bucket keeps the
    # criteria in their original dictionary order.
    rules_by_types = {}
    for name, criteria in interaction_criteria.items():
        key = (criteria["atomic_type1"], criteria["atomic_type2"])
        rules_by_types.setdefault(key, []).append(
            (name, criteria["min_dist"], criteria["max_dist"])
        )

    # Atom typing of the second group does not depend on the first residue.
    second_typed = [(res, _typed_atoms(res)) for res in second_residues]

    found = []
    for first_res in first_residues:
        first_atoms = _typed_atoms(first_res)
        if not first_atoms:
            continue
        for second_res, second_atoms in second_typed:
            for atom_a, codes_a in first_atoms:
                for atom_b, codes_b in second_atoms:
                    matching_rules = [
                        rule
                        for code_a in codes_a
                        for code_b in codes_b
                        for rule in rules_by_types.get((code_a, code_b), ())
                    ]
                    if not matching_rules:
                        continue
                    # One distance per atom pair, shared by all matching rules.
                    dist = calculate_distance(atom_a, atom_b)
                    for name, min_dist, max_dist in matching_rules:
                        if min_dist <= dist <= max_dist:
                            found.append((first_res, second_res, name, dist))
    return found


def check_interactions(surface_residues, adjacent_residues, adjacent_adjacent_residues, interaction_criteria):
    """Find atom-level interactions between consecutive residue layers.

    Args:
        surface_residues: residues forming the first layer.
        adjacent_residues: residues forming the second layer.
        adjacent_adjacent_residues: residues forming the third layer.
        interaction_criteria: mapping of interaction name to a dict with
            ``atomic_type1``, ``atomic_type2``, ``min_dist`` and ``max_dist``.

    Returns:
        ``(interactions1, interactions2)``. ``interactions1`` holds
        ``(surface_res, adjacent_res, interaction, dist)`` tuples and
        ``interactions2`` holds
        ``(adjacent_res, adjacent_adjacent_res, interaction, dist)`` tuples,
        one per atom pair and criterion whose types match and whose distance
        lies in ``[min_dist, max_dist]``.

    NOTE(review): matching is orientation-sensitive (``atomic_type1`` is
    compared with the first layer's atom, ``atomic_type2`` with the second
    layer's), and a residue that is present in both layers is also compared
    with itself. Both behaviours are kept as they were; confirm they are
    intended.
    """
    interactions1 = _pairwise_interactions(surface_residues, adjacent_residues, interaction_criteria)
    interactions2 = _pairwise_interactions(adjacent_residues, adjacent_adjacent_residues, interaction_criteria)
    return interactions1, interactions2

In [None]:
# Function to generate features for a residue
def generate_features(residue, interaction_type):
    """Build the 0/1 feature record of *residue* for one detected interaction.

    Args:
        residue: residue taking part in the interaction; its name is looked
            up in ``amino_acid_atoms``.
        interaction_type: name of the interaction (e.g. ``"H_BOND"``).

    Returns:
        A dict with ``RES_NAME``, ``RES_ID`` and one 0/1 flag per atom-type
        and interaction category. The DON/POS/NEG/HPB/ARM flags require both
        a relevant interaction type and that the residue owns an atom
        carrying that type code; ACP depends on the interaction type only.
    """
    res_name = residue.get_resname()
    residue_atoms = amino_acid_atoms.get(res_name, {})

    # Table values may hold several comma-separated codes ("NEG,ACP",
    # "POS,DON", "HPB,ARM"). Comparing the whole string with a single code
    # never matched those entries, which left NEG unset for every residue,
    # POS unset for LYS/HIS and ARM unset for PHE; split before testing.
    residue_types = {
        code
        for atom_type in residue_atoms.values()
        if atom_type
        for code in atom_type.split(",")
    }

    def flag(condition):
        return 1 if condition else 0

    return {
        "RES_NAME": res_name,
        "RES_ID": residue.id[1],
        "ACP": flag(interaction_type in ["H_BOND", "SALT_BRIDGE", "SS_BRIDGE"]),  # ACP related
        "DON": flag("DON" in residue_types and interaction_type in ["H_BOND", "SS_BRIDGE"]),  # Donor atom
        "POS": flag("POS" in residue_types and interaction_type in ["SALT_BRIDGE", "REPULSIVE_POS"]),  # Positive charge
        "NEG": flag("NEG" in residue_types and interaction_type in ["SALT_BRIDGE", "REPULSIVE_NEG"]),  # Negative charge
        "HPB": flag("HPB" in residue_types and interaction_type == "HYDROPHOBIC"),  # Hydrophobic interaction
        "ARM": flag("ARM" in residue_types and interaction_type in ["ARM_STACK"]),  # Aromatic stacking
        "HYDROPHOBIC": flag(res_name in ["ALA", "ILE", "LEU", "MET", "PHE", "TRP", "VAL"]),  # Hydrophobic residue
        "SALT_BRIDGE": flag(interaction_type == "SALT_BRIDGE"),  # Salt bridge
        "ARM_STACK": flag(interaction_type == "ARM_STACK"),  # Aromatic stacking
        "H_BOND": flag(interaction_type == "H_BOND"),  # Hydrogen bond
        "REPULSIVE_POS": flag(interaction_type == "REPULSIVE_POS"),  # Repulsive interaction between positive charges
        "REPULSIVE_NEG": flag(interaction_type == "REPULSIVE_NEG"),  # Repulsive interaction between negative charges
        "SS_BRIDGE": flag(interaction_type == "SS_BRIDGE"),  # Disulfide bridge
    }

In [None]:
import csv
import pandas as pd

# Atom-type feature columns, in output order
atom_type_categories = ["ACP", "DON", "POS", "NEG", "HPB", "ARM"]

# Interaction feature columns, in output order
interaction_categories = [
    "HYDROPHOBIC", "SALT_BRIDGE", "ARM_STACK",
    "H_BOND", "REPULSIVE_POS", "REPULSIVE_NEG", "SS_BRIDGE",
]


# Column-wise totals over a list of feature records
def sum_features(features_list):
    """Add up the category columns of every record in *features_list*.

    Only keys listed in ``atom_type_categories`` and
    ``interaction_categories`` are totalled; other keys in a record are
    ignored. The result always contains every category (0 when absent), in
    category order.
    """
    totals = dict.fromkeys(atom_type_categories + interaction_categories, 0)
    for record in features_list:
        for name in totals:
            if name in record:
                totals[name] += record[name]
    return totals

# Load the CSV file containing protein pairs and their class labels
df_pairs = pd.read_csv('demo2.csv')  # Make sure to specify the correct path

# CSV header: the 13 category columns repeated for six residue groups, in this
# order: P1 surface, P1 adjacent, P1 adjacent-adjacent, P2 surface,
# P2 adjacent, P2 adjacent-adjacent, followed by the class label.
# NOTE(review): the six groups share identical column names, so columns can
# only be told apart by position; the names are kept unchanged so existing
# readers of the file keep working.
header = (atom_type_categories + interaction_categories) * 6 + ['class_label']

# Per-file feature vectors, so a protein that appears in several pairs is
# parsed and analysed only once.
_protein_feature_cache = {}


def _protein_feature_values(pdb_name):
    """Return the surface + adjacent + adjacent-adjacent feature sums for one PDB file."""
    if pdb_name not in _protein_feature_cache:
        surface, adjacent, adjacent_adjacent = parse_pdb('files/' + pdb_name)
        interactions1, interactions2 = check_interactions(
            surface, adjacent, adjacent_adjacent, interaction_criteria
        )

        # interactions1 pairs a surface residue with an adjacent residue;
        # interactions2 pairs an adjacent residue with an adjacent-adjacent one,
        # and only its second member contributes features.
        surface_features = [
            generate_features(first, interaction) for first, _, interaction, _ in interactions1
        ]
        adjacent_features = [
            generate_features(second, interaction) for _, second, interaction, _ in interactions1
        ]
        adjacent_adjacent_features = [
            generate_features(second, interaction) for _, second, interaction, _ in interactions2
        ]

        _protein_feature_cache[pdb_name] = (
            list(sum_features(surface_features).values())
            + list(sum_features(adjacent_features).values())
            + list(sum_features(adjacent_adjacent_features).values())
        )
    return _protein_feature_cache[pdb_name]


# Initialize a list to store each row of the final output
output_data = []

# Build one output row per protein pair: P1 features, P2 features, class label
for index, row in df_pairs.iterrows():
    protein1 = row['P1']  # Extract the first protein
    protein2 = row['P2']  # Extract the second protein
    class_label = row['class label']  # Extract the class label

    # List concatenation builds a new list, so cached vectors are never mutated.
    combined_data = (_protein_feature_values(protein1) +
                     _protein_feature_values(protein2) +
                     [class_label])

    # Append the combined data to the output data list
    output_data.append(combined_data)

# Write the data to a CSV file with side-by-side columns
with open('residue_interactions_side_by_side2.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(header)

    # Write the combined data for each pair
    writer.writerows(output_data)

print("Data written to 'residue_interactions_side_by_side2.csv'")
