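This is an exploratory data analysis notebook for modeling: it extracts structural features from predicted PDB structures and merges them into the eSol training dataset.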

Nathan Lanclos
12/12/2024

In [30]:
import pickle

# Read the pickled eSol test and train splits from disk.
# NOTE: pickle.load can execute arbitrary code, so only use it on trusted files.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

test_df = _read_pickle('../GATSol/dataset/eSol_test.pkl')
train_df = _read_pickle('../GATSol/dataset/eSol_train.pkl')


In [25]:
from structure_feature_engine import PDBFeatureExtractor

import os

# Directory with PDB files.
# A leading "~" is not expanded by Python's file APIs, so resolve it to the
# user's home directory here instead of relying on the extractor to do it.
pdb_folder = os.path.expanduser(
    "~/PH245_Project_Local_Only/PH245_PROJECT_STRUCTURES/Structures/eSol_train_predicted_structures"
)

# Extract features from all PDB files in the folder
# (the result is expected to be a DataFrame that includes a `pdb_file` column,
# as shown in the output below).
df = PDBFeatureExtractor.extract_features_from_folder(pdb_folder)

# Display the first few rows
df.head()

# Optional: save results to a CSV file
# df.to_csv("pdb_features_with_proportions.csv", index=False)


Unnamed: 0,num_atoms,total_mass,center_of_mass,num_residues,num_chains,radius_of_gyration,bounding_box_volume,principal_axes_sum,aspect_ratio,b_factors_mean,...,solvent_exposed_fraction,N_atom_type_proportion,C_atom_type_proportion,O_atom_type_proportion,S_atom_type_proportion,polar_exposed_residue_proportion,nonpolar_exposed_residue_proportion,positive_exposed_residue_proportion,negative_exposed_residue_proportion,pdb_file
0,801,10622.81,"[2.067508, 3.6856148, -1.8086874]",105,1,19.371229,151326.71875,375.713603,11.367589,0.658627,...,0.975,0.177278,0.627965,0.187266,0.007491,0.19802,0.514851,0.158416,0.09901,yegR_1.pdb
1,611,8095.33,"[0.44703537, 0.5240049, -1.2193217]",76,1,16.24313,75647.078125,264.271767,4.223266,0.335843,...,0.918919,0.168576,0.644845,0.176759,0.00982,0.222222,0.569444,0.083333,0.083333,ymcE_1.pdb
2,2213,29102.27,"[2.2136195, 5.421145, -2.3721812]",289,1,23.921539,308682.6875,572.498778,5.492515,0.891035,...,0.966387,0.186173,0.628559,0.183461,0.001808,0.212014,0.480565,0.204947,0.091873,amiA_1.pdb
3,2446,32192.57,"[0.4596787, -0.43476364, -0.25820008]",313,1,22.425901,183335.0625,503.126767,5.586904,0.944914,...,0.993056,0.184791,0.634096,0.177433,0.003679,0.14658,0.570033,0.159609,0.107492,ribF_1.pdb
4,1282,16982.57,"[-0.73663634, 0.26377583, -3.0664878]",166,1,25.638334,238596.046875,657.83733,8.256398,0.619048,...,1.0,0.174727,0.621685,0.198908,0.00468,0.240964,0.475904,0.126506,0.138554,ydaW_1.pdb


In [26]:
# Show the column names of the extracted structure-feature frame `df`.
df.columns

Index(['num_atoms', 'total_mass', 'center_of_mass', 'num_residues',
       'num_chains', 'radius_of_gyration', 'bounding_box_volume',
       'principal_axes_sum', 'aspect_ratio', 'b_factors_mean',
       'b_factors_variance', 'b_factors_min', 'b_factors_max',
       'mean_contacts_per_residue', 'solvent_exposed_fraction',
       'N_atom_type_proportion', 'C_atom_type_proportion',
       'O_atom_type_proportion', 'S_atom_type_proportion',
       'polar_exposed_residue_proportion',
       'nonpolar_exposed_residue_proportion',
       'positive_exposed_residue_proportion',
       'negative_exposed_residue_proportion', 'pdb_file'],
      dtype='object')

In [27]:
# Step 1: Reduce each `pdb_file` entry to the text before its first underscore
# (e.g. "yegR_1.pdb" -> "yegR") so it can be matched against gene names.
gene_ids = df["pdb_file"].str.split("_", n=1).str.get(0)
df["pdb_file"] = gene_ids

df.head()

Unnamed: 0,num_atoms,total_mass,center_of_mass,num_residues,num_chains,radius_of_gyration,bounding_box_volume,principal_axes_sum,aspect_ratio,b_factors_mean,...,solvent_exposed_fraction,N_atom_type_proportion,C_atom_type_proportion,O_atom_type_proportion,S_atom_type_proportion,polar_exposed_residue_proportion,nonpolar_exposed_residue_proportion,positive_exposed_residue_proportion,negative_exposed_residue_proportion,pdb_file
0,801,10622.81,"[2.067508, 3.6856148, -1.8086874]",105,1,19.371229,151326.71875,375.713603,11.367589,0.658627,...,0.975,0.177278,0.627965,0.187266,0.007491,0.19802,0.514851,0.158416,0.09901,yegR
1,611,8095.33,"[0.44703537, 0.5240049, -1.2193217]",76,1,16.24313,75647.078125,264.271767,4.223266,0.335843,...,0.918919,0.168576,0.644845,0.176759,0.00982,0.222222,0.569444,0.083333,0.083333,ymcE
2,2213,29102.27,"[2.2136195, 5.421145, -2.3721812]",289,1,23.921539,308682.6875,572.498778,5.492515,0.891035,...,0.966387,0.186173,0.628559,0.183461,0.001808,0.212014,0.480565,0.204947,0.091873,amiA
3,2446,32192.57,"[0.4596787, -0.43476364, -0.25820008]",313,1,22.425901,183335.0625,503.126767,5.586904,0.944914,...,0.993056,0.184791,0.634096,0.177433,0.003679,0.14658,0.570033,0.159609,0.107492,ribF
4,1282,16982.57,"[-0.73663634, 0.26377583, -3.0664878]",166,1,25.638334,238596.046875,657.83733,8.256398,0.619048,...,1.0,0.174727,0.621685,0.198908,0.00468,0.240964,0.475904,0.126506,0.138554,ydaW


In [28]:
import pandas as pd
# Step 2: Join the training rows to their structure features.
# An inner join on train_df["gene"] == df["pdb_file"] keeps only rows that
# have a match on both sides.
# Step 3: `pdb_file` duplicates `gene` after the join, so it is dropped;
# the `gene` column itself is kept.
merged_df = train_df.merge(
    df,
    how="inner",
    left_on="gene",
    right_on="pdb_file",
).drop(columns=["pdb_file"])

# Preview the merged result
merged_df.head()

Unnamed: 0,gene,solubility,sequence,embedding,binary_solubility,molecular_weight,aromaticity,gravy,isoelectric_point,length,...,mean_contacts_per_residue,solvent_exposed_fraction,N_atom_type_proportion,C_atom_type_proportion,O_atom_type_proportion,S_atom_type_proportion,polar_exposed_residue_proportion,nonpolar_exposed_residue_proportion,positive_exposed_residue_proportion,negative_exposed_residue_proportion
0,aaeX,0.34,MSLFPVIVVFGLSFPPIFFELLLSLAIFWLVRRVLVPTGIYDFVWH...,"[[0.025740903, -0.06068451, 0.04562243, -0.098...",0,7846.489,0.223881,1.485075,7.824157,67.0,...,3.358209,0.87234,0.145161,0.704301,0.145161,0.005376,0.101695,0.762712,0.067797,0.033898
1,aas,0.07,MLFSFFRNLCRVLYRVRVTGDTQALKGERVLITPNHVSFIDGILLG...,"[[-0.037975986, -0.046647176, -0.014576814, 0....",0,80699.0443,0.091794,-0.042281,9.312345,719.0,...,3.534075,0.982405,0.174112,0.643862,0.177278,0.004749,0.161744,0.554149,0.147679,0.105485
2,aat,0.08,MRLVQLSRHSIAFPSPEGALREPNGLLALGGDLSPARLLMAYQRGI...,"[[0.037574537, -0.042643685, 0.05279441, -0.02...",0,26618.2104,0.106838,-0.247009,6.755053,234.0,...,3.547009,0.980769,0.180459,0.636412,0.175654,0.007475,0.165179,0.558036,0.147321,0.102679
3,abgA,0.31,MESLNQFVNSLAPKLSHWRRDFHHYAESGWVEFRTATLVAEELHQL...,"[[-0.00765643, -0.13189432, 0.014674729, -0.01...",0,46587.6743,0.075688,-0.095872,5.506925,436.0,...,3.642202,0.995,0.181153,0.623666,0.190302,0.00488,0.200935,0.57243,0.114486,0.095794
4,abgB,0.49,MQEIYRFIDDAIEADRQRYTDIADQIWDHPETRFEEFWSAEHLASA...,"[[0.0036755833, -0.15647556, -0.01813248, 0.04...",0,52193.0071,0.085239,-0.186694,5.437809,481.0,...,3.586279,1.0,0.175014,0.627926,0.192161,0.004899,0.23431,0.535565,0.104603,0.100418


In [29]:
# Step 4: Write the merged frame to a pickle file.
import os
import shutil

output_path = "../GATSol/dataset/eSol_train.pkl"  # Specify your desired file path

# This path is the same file the notebook loads into `train_df`, so writing
# here overwrites the notebook's own input. Keep a one-time copy of whatever is
# currently on disk so the pre-merge data stays recoverable; an existing backup
# is never replaced.
backup_path = output_path + ".orig"
if os.path.exists(output_path) and not os.path.exists(backup_path):
    shutil.copy2(output_path, backup_path)

merged_df.to_pickle(output_path)