In [1]:
import pandas as pd
from pathlib import Path

### EXTRACT RNA BINDING RESIDUES

In [None]:
# ---------------------------------------------------------------------
# 1. Paths
# ---------------------------------------------------------------------

rna_file       = "../data/processed/ligand_splits/positives_rna.csv"
master_X_file  = "../data/processed/master_residue_file.csv"

X_out = "../data/processed/rna_binding_residues.csv"

In [3]:
# ---------------------------------------------------------------------
# 2. Load RNA-binding residue identifiers
# ---------------------------------------------------------------------
rna_df = pd.read_csv(rna_file)

# Make sure these columns exist
expected_cols = {"pdb_id", "chain_id", "pdb_residue_number"}
assert expected_cols.issubset(set(rna_df.columns)), f"Missing columns in RNA file: {expected_cols - set(rna_df.columns)}"

rna_keys = rna_df[["pdb_id", "chain_id", "pdb_residue_number"]].drop_duplicates()

print(f"RNA file: {len(rna_keys):,} unique binding residues")

RNA file: 174,226 unique binding residues


In [4]:
# ---------------------------------------------------------------------
# 3. Load master residue feature file
# ---------------------------------------------------------------------
X = pd.read_csv(master_X_file)
print(f"Master residue file shape: {X.shape}")

  X = pd.read_csv(master_X_file)


Master residue file shape: (13219671, 69)


In [5]:
X.columns

Index(['pdb_id', 'chain_id', 'pdb_residue_number', 'renum_residue_number',
       'insertion_code', 'residue_name', 'centroid_x', 'centroid_y',
       'centroid_z', 'mean_bfactor', 'std_bfactor', 'mean_occupancy',
       'num_atoms', 'num_heavy_atoms', 'num_sidechain_atoms',
       'mean_intra_atom_dist', 'std_intra_atom_dist', 'radius_of_gyration',
       'residue_radius', 'bounding_box_volume', 'closest_neighbor_dist',
       'avg_neighbor_distance', 'prev_res', 'next_res', 'position_in_chain',
       'is_small', 'contact_number_4A', 'contact_number_6A',
       'contact_number_8A', 'contact_number_10A', 'label', 'amino_acid',
       'iupred2a_long_score', 'iupred2a_short_score', 'iupred2a_anchor_score',
       'PSSM_A', 'PSSM_R', 'PSSM_N', 'PSSM_D', 'PSSM_C', 'PSSM_Q', 'PSSM_E',
       'PSSM_G', 'PSSM_H', 'PSSM_I', 'PSSM_L', 'PSSM_K', 'PSSM_M', 'PSSM_F',
       'PSSM_P', 'PSSM_S', 'PSSM_T', 'PSSM_W', 'PSSM_Y', 'PSSM_V',
       'dssp_residue_name', 'secondary_structure', 'absolute_sas

In [6]:
# ---------------------------------------------------------------------
# 4. Check that X has the identifier columns
# ---------------------------------------------------------------------
assert expected_cols.issubset(set(X.columns)), f"Missing identifier columns in master residue file"

In [7]:
# ---------------------------------------------------------------------
# 5. Merge to find matching rows
# ---------------------------------------------------------------------
merged = X.merge(
    rna_keys,
    on=["pdb_id", "chain_id", "pdb_residue_number"],
    how="inner"
)

print(f"Matched rows before dropping NaNs: {len(merged):,}")


Matched rows before dropping NaNs: 203,279


In [8]:
# See which columns have missing values
missing_per_column = merged.isnull().sum()
print("Missing values per column:")
print(missing_per_column[missing_per_column > 0])

Missing values per column:
prev_res                   2287
next_res                    766
amino_acid                40161
iupred2a_long_score       40161
iupred2a_short_score      40161
iupred2a_anchor_score     40161
PSSM_A                    49068
PSSM_R                    49068
PSSM_N                    49068
PSSM_D                    49068
PSSM_C                    49068
PSSM_Q                    49068
PSSM_E                    49068
PSSM_G                    49068
PSSM_H                    49068
PSSM_I                    49068
PSSM_L                    49068
PSSM_K                    49068
PSSM_M                    49068
PSSM_F                    49068
PSSM_P                    49068
PSSM_S                    49068
PSSM_T                    49068
PSSM_W                    49068
PSSM_Y                    49068
PSSM_V                    49068
dssp_residue_name        166986
secondary_structure      166986
absolute_sasa            166986
relative_asa             166986
phi          

In [9]:
merged.shape

(203279, 69)

In [10]:
# ------------------------------------------------------------
# 6. Add 100,000 random unlabeled residues
# ------------------------------------------------------------
# Make sure you have a 'label' column for this
if 'label' not in X.columns:
    raise ValueError("The master residue file must contain a 'label' column for selecting unlabeled residues.")

In [11]:
unlabeled_pool = X[X["label"] == 0]
print(f"Total unlabeled pool: {len(unlabeled_pool):,}")

unlabeled_sample = unlabeled_pool.sample(n=100_000, random_state=42)

print(f"Sampled unlabeled: {len(unlabeled_sample):,}")


Total unlabeled pool: 12,571,279
Sampled unlabeled: 100,000


In [12]:
# Combine
combined = pd.concat([merged, unlabeled_sample], ignore_index=True)
print(f"Combined rows before dropping NaNs: {len(combined):,}")

Combined rows before dropping NaNs: 303,279


In [14]:
# Columns to drop due to high missing values
columns_to_drop = [
    'dssp_residue_name',
    'secondary_structure', 
    'absolute_sasa',
    'relative_asa',
    'phi',
    'psi',
    'hbond_NH_O1_energy',
    'hbond_NH_O2_energy',
    'hbond_O_NH1_energy',
    'hbond_O_NH2_energy',
    'is_aromatic',
    'is_polar',
    'is_charged',
    'is_hydrophobic'
]

# Drop the columns
combined = combined.drop(columns=columns_to_drop)

print(f"Dropped {len(columns_to_drop)} columns with high missing values")
print(f"DataFrame shape after dropping: {combined.shape}")

KeyError: "['dssp_residue_name', 'secondary_structure', 'absolute_sasa', 'relative_asa', 'phi', 'psi', 'hbond_NH_O1_energy', 'hbond_NH_O2_energy', 'hbond_O_NH1_energy', 'hbond_O_NH2_energy', 'is_aromatic', 'is_polar', 'is_charged', 'is_hydrophobic'] not found in axis"

In [15]:
# ------------------------------------------------------------
# 7. Drop any rows with NaNs
# ------------------------------------------------------------
cleaned = combined.dropna().reset_index(drop=True)
print(f"Rows after dropping NaNs: {len(cleaned):,}")


Rows after dropping NaNs: 245,280


In [17]:
cleaned['label'].value_counts()

label
1    152037
0     93243
Name: count, dtype: int64

In [None]:

# ---------------------------------------------------------------------
# 6. Drop rows with any NaNs
# ---------------------------------------------------------------------
cleaned = merged.dropna().reset_index(drop=True)
print(f"Matched rows after dropping NaNs: {len(cleaned):,}")

In [None]:
# # ---------------------------------------------------------------------
# # 7. Save
# # ---------------------------------------------------------------------
# cleaned.to_csv(X_out, index=False)
# print(f"Saved RNA-binding residues → {X_out}")

In [18]:
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy import sparse

In [26]:
# ---------- 1. Configuration --------------------------------
rna_binding_residues    = "/home/mpradhan/Intern_Research_Project/data/rna_binding_residues.csv"          # <-- change to real path
drop_cols      = ['amino_acid']
label_col      = 'label'

identifier_cols = ['pdb_id', 'chain_id',
                   'pdb_residue_number', 'renum_residue_number',
                   'insertion_code']               # keep residue_name for encoding

categorical_cols = ['residue_name',                 # 3-letter code
                    'prev_res', 'next_res']          # DSSP SS symbol

In [27]:
# ---------- 2. Load data ------------------------------------
df = cleaned
df = df.drop(columns=drop_cols, errors='ignore')

In [28]:
df.columns

Index(['pdb_id', 'chain_id', 'pdb_residue_number', 'renum_residue_number',
       'insertion_code', 'residue_name', 'centroid_x', 'centroid_y',
       'centroid_z', 'mean_bfactor', 'std_bfactor', 'mean_occupancy',
       'num_atoms', 'num_heavy_atoms', 'num_sidechain_atoms',
       'mean_intra_atom_dist', 'std_intra_atom_dist', 'radius_of_gyration',
       'residue_radius', 'bounding_box_volume', 'closest_neighbor_dist',
       'avg_neighbor_distance', 'prev_res', 'next_res', 'position_in_chain',
       'is_small', 'contact_number_4A', 'contact_number_6A',
       'contact_number_8A', 'contact_number_10A', 'label',
       'iupred2a_long_score', 'iupred2a_short_score', 'iupred2a_anchor_score',
       'PSSM_A', 'PSSM_R', 'PSSM_N', 'PSSM_D', 'PSSM_C', 'PSSM_Q', 'PSSM_E',
       'PSSM_G', 'PSSM_H', 'PSSM_I', 'PSSM_L', 'PSSM_K', 'PSSM_M', 'PSSM_F',
       'PSSM_P', 'PSSM_S', 'PSSM_T', 'PSSM_W', 'PSSM_Y', 'PSSM_V'],
      dtype='object')

In [29]:
# ---------- 3. Split X and y -------------------------------
y = df[label_col]
X = df.drop(columns=[label_col] + identifier_cols, errors='ignore')

In [30]:
# ---------- 4. Separate numeric columns ---------------------
numeric_cols = [c for c in X.columns if c not in categorical_cols]

In [31]:
# ---------- 5. Build ColumnTransformer ----------------------
encoder = ColumnTransformer(
    transformers=[
        ("cat",
         OneHotEncoder(sparse_output=True, handle_unknown="ignore"),
         categorical_cols),
        ("num", "passthrough", numeric_cols)
    ],
    remainder="drop",
    sparse_threshold=1.0      # keep sparse unless 100 % dense
)

In [32]:
# ---------- 6. Fit & transform ------------------------------
X_sparse = encoder.fit_transform(X)      # SciPy CSR matrix

In [33]:
# ---------- 7. Save outputs ---------------------------------
print("Saving dense CSV – may take a while and lots of disk space …")
X_dense = X_sparse.toarray()
feature_names = encoder.get_feature_names_out()
pd.DataFrame(X_dense, columns=feature_names).to_csv("../data/rna_binding_residues.csv", index=False)
y.to_csv("../data/y_rna_binding_residues.csv", index=False)

Saving dense CSV – may take a while and lots of disk space …


In [34]:
import pandas as pd
import numpy as np

# Read the saved y file
y = pd.read_csv("../data/y_rna_binding_residues.csv")

# Find indices where y == 1
positive_indices = y[y.iloc[:, 0] == 1].index.tolist()

print(f"Total positive samples (label=1): {len(positive_indices)}")

# Check if we have enough positive samples
if len(positive_indices) < 15000:
    print(f"Warning: Only {len(positive_indices)} positive samples available, changing all to 0")
    indices_to_change = positive_indices
else:
    # Randomly select 15000 positive samples to change to 0
    np.random.seed(42)  # For reproducibility
    indices_to_change = np.random.choice(positive_indices, size=15000, replace=False)

# Create a copy and modify the labels
y_modified = y.copy()
y_modified.iloc[indices_to_change, 0] = 0

# Save the modified y as new file
y_modified.to_csv("../data/y_true_binding_residues.csv", index=False)

print(f"Changed {len(indices_to_change)} samples from 1 to 0")
print(f"Original distribution: {y.iloc[:, 0].value_counts().to_dict()}")
print(f"Modified distribution: {y_modified.iloc[:, 0].value_counts().to_dict()}")
print("Saved as: ../data/y_true_binding_residues.csv")

Total positive samples (label=1): 152037
Changed 15000 samples from 1 to 0
Original distribution: {1: 152037, 0: 93243}
Modified distribution: {1: 137037, 0: 108243}
Saved as: ../data/y_true_binding_residues.csv


### EXTRACT DNA BINDING RESIDUES