In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy import sparse

In [None]:
# ---------- 1. Configuration --------------------------------
master_path    = "../data/master_residue_file.csv"          # <-- change to real path
drop_cols      = ['amino_acid', 'dssp_residue_name']
label_col      = 'label'

identifier_cols = ['pdb_id', 'chain_id',
                   'pdb_residue_number', 'renum_residue_number',
                   'insertion_code']               # keep residue_name for encoding

categorical_cols = ['residue_name',                 # 3-letter code
                    'prev_res', 'next_res',         # neighbour 3-letter codes
                    'secondary_structure']          # DSSP SS symbol

In [3]:
# ---------- 2. Load data ------------------------------------
df = pd.read_csv(master_path)
df = df.drop(columns=drop_cols, errors='ignore')

  df = pd.read_csv(master_path)


In [4]:
# ---------- 3. Split X and y -------------------------------
y = df[label_col]
X = df.drop(columns=[label_col] + identifier_cols, errors='ignore')

In [5]:
# ---------- 4. Separate numeric columns ---------------------
numeric_cols = [c for c in X.columns if c not in categorical_cols]

In [6]:
# ---------- 5. Build ColumnTransformer ----------------------
encoder = ColumnTransformer(
    transformers=[
        ("cat",
         OneHotEncoder(sparse_output=True, handle_unknown="ignore"),
         categorical_cols),
        ("num", "passthrough", numeric_cols)
    ],
    remainder="drop",
    sparse_threshold=1.0      # keep sparse unless 100 % dense
)

In [7]:
# ---------- 6. Fit & transform ------------------------------
X_sparse = encoder.fit_transform(X)      # SciPy CSR matrix

In [None]:
# ---------- 7. Save outputs ---------------------------------
sparse.save_npz("../data/X_master_sparse.npz", X_sparse)
y.to_csv("../data/y_master.csv", index=False)
joblib.dump(encoder, "../data/onehot_encoder_master.pkl")

['../onehot_encoder_master.pkl']

In [10]:
print("Saving dense CSV – may take a while and lots of disk space …")
X_dense = X_sparse.toarray()
feature_names = encoder.get_feature_names_out()
pd.DataFrame(X_dense, columns=feature_names).to_csv("../data/X_master_dense.csv", index=False)

Saving dense CSV – may take a while and lots of disk space …
