In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy import sparse

In [2]:
# ---- Configuration ------------------------------------------
# Input CSV, resolved relative to the notebook's working directory.
balanced_path = "../balanced_master_residue_file.csv"

# Columns dropped from the frame right after loading.
# NOTE(review): 'amino_acid' is also listed in possible_res_cols below, so once
# it has been dropped it can no longer be picked as the residue column — confirm
# this is intended.
drop_cols = ["amino_acid", "dssp_residue_name"]

# Name of the target column.
label_col = "label"

# Candidate names for the residue identity column (one-letter or three-letter),
# listed in priority order.
possible_res_cols = ["residue_name", "amino_acid", "aa_name"]

# Identifier columns that are removed from the feature matrix.
identifier_cols = [
    "pdb_id",
    "chain_id",
    "pdb_residue_number",
    "renum_residue_number",
    "insertion_code",
]

In [3]:

# Additional categorical features; each one is one-hot encoded only if the
# loaded frame actually contains it.
extra_cat_cols = [
    "prev_res",
    "next_res",
    "secondary_structure",
]

In [19]:
# 1 ▸ load & basic clean
# Read the balanced residue table exactly once and drop the unwanted columns in
# the same step. Re-reading the CSV after this point would bring the dropped
# columns back, so `df` must not be reassigned from disk further down.
# errors="ignore": columns listed in drop_cols that are absent are skipped
# instead of raising.
df = pd.read_csv(balanced_path).drop(columns=drop_cols, errors="ignore")


In [20]:
# ---------- 3. Detect residue identity column ---------------
# Pick the first candidate name (in the order given by possible_res_cols) that
# exists as a column of df; None when no candidate is present.
present_res_col = next(
    filter(lambda name: name in df.columns, possible_res_cols),
    None,
)

# Without a residue identity column there is nothing to encode, so stop here.
if present_res_col is None:
    raise ValueError("No residue identity column found: tried 'residue_name', 'amino_acid', or 'aa_name'.")

In [21]:
# Categorical inputs: the detected residue column first, followed by whichever
# of the optional categorical columns exist in df.
categorical_cols = [
    present_res_col,
    *(name for name in extra_cat_cols if name in df.columns),
]

# ---------- 4. Prepare X and y ------------------------------
# Target vector.
y = df[label_col]

# Feature frame: everything except the label and the identifier columns.
# The residue identity column is deliberately kept. errors="ignore" lets
# identifier columns that are absent from df be skipped silently.
X = df.drop(columns=[label_col, *identifier_cols], errors="ignore")

In [22]:

# Every column of X that is not treated as categorical is passed through
# unchanged; the original column order is preserved.
numeric_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()

# ---------- 5. ColumnTransformer -----------------------------
# "cat": one-hot encode the categorical columns; categories not seen during
#        fit are ignored at transform time rather than raising.
# "num": forward the remaining columns as they are.
# remainder="drop" discards anything not listed in either group, and
# sparse_threshold=1.0 asks for sparse stacked output.
encoder = ColumnTransformer(
    [
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=True),
            categorical_cols,
        ),
        ("num", "passthrough", numeric_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0,
)

In [23]:

# ---------- 6. Encode ----------------------------------------
# Fit the ColumnTransformer on X and transform X in the same call. After this
# line `encoder` holds the fitted state that is persisted further down.
X_sparse = encoder.fit_transform(X)  # expected to be sparse (sparse_output=True, sparse_threshold=1.0 on the encoder)

In [None]:
# ---------- 7. Save outputs ----------------------------------
# Encoded features go to a SciPy .npz file and the labels to a CSV without the
# index column, both under ../data.
sparse.save_npz(file="../data/X_balanced_sparse.npz", matrix=X_sparse)
y.to_csv(path_or_buf="../data/y_balanced.csv", index=False)

# The fitted transformer is pickled with joblib, next to the notebook's working
# directory (note: not under ../data like the two files above).
joblib.dump(value=encoder, filename="onehot_encoder_balanced.pkl")

['onehot_encoder_balanced.pkl']

In [25]:
# Report what was written, including the shape of the encoded matrix.
print(
    "✓ Encoding complete",
    f"  → Sparse feature matrix: X_balanced_sparse.npz  (shape: {X_sparse.shape})",
    "  → Labels:                 y_balanced.csv",
    "  → Encoder saved to:       onehot_encoder_balanced.pkl",
    sep="\n",
)

✓ Encoding complete
  → Sparse feature matrix: X_balanced_sparse.npz  (shape: (1296468, 128))
  → Labels:                 y_balanced.csv
  → Encoder saved to:       onehot_encoder_balanced.pkl


In [26]:
# Output column labels reported by the fitted transformer; used as the header
# of the dense export below.
encoded_feature_names = encoder.get_feature_names_out()

# Materialise the sparse matrix as a regular array.
# NOTE: this allocates every cell of the matrix, so it needs far more memory
# than the sparse form.
X_dense = X_sparse.toarray()

# Wrap the array in a labelled DataFrame and write it out without the index.
X_df = pd.DataFrame(data=X_dense, columns=encoded_feature_names)
X_df.to_csv(path_or_buf="../data/X_balanced_dense.csv", index=False)

print("✓ Dense version of X saved as X_balanced_dense.csv")

✓ Dense version of X saved as X_balanced_dense.csv
