In [None]:
import os
import warnings
warnings.filterwarnings("ignore")

import duckdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import DataStructs, Descriptors, rdFingerprintGenerator

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Root folder of the dataset. The LEASH_BIO_DATA_DIR environment variable overrides the
# default local path, so the notebook can run on another machine without editing this cell.
data_dir = os.environ.get("LEASH_BIO_DATA_DIR", "/home/pervinco/Datasets/leash-bio")
save_dir = f"{data_dir}/split_sets"

train_csv = f"{data_dir}/train.csv"
test_csv = f"{data_dir}/test.csv"

train_parquet = f"{data_dir}/train.parquet"
test_parquet = f'{data_dir}/test.parquet'

# Shown as the cell output: a quick check of what the data folder contains.
os.listdir(data_dir)

In [None]:
# Number of rows drawn for each label value (binds = 0 and binds = 1).
samples_per_class = 100000

# Double any single quote so the path stays a valid SQL string literal.
parquet_literal = train_parquet.replace("'", "''")

con = duckdb.connect()
try:
    # Draw the same number of random rows from each class and stack them into one frame.
    # NOTE(review): random() is not seeded here, so each run draws a different sample.
    data = con.query(f"""(SELECT *
                          FROM parquet_scan('{parquet_literal}')
                          WHERE binds = 0
                          ORDER BY random()
                          LIMIT {samples_per_class})
                          UNION ALL
                          (SELECT *
                          FROM parquet_scan('{parquet_literal}')
                          WHERE binds = 1
                          ORDER BY random()
                          LIMIT {samples_per_class})""").df()
finally:
    # Release the connection even when the query or the DataFrame conversion fails.
    con.close()

In [None]:
# Convert each SMILES string into an RDKit molecule object.
smiles_column = data['molecule_smiles']
data['molecule'] = smiles_column.apply(Chem.MolFromSmiles)

In [None]:
# Morgan fingerprint generator (radius 2, 2048 bits) used by compute_fingerprint.
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)


def compute_fingerprint(mol):
    """Return the fingerprint that `fpg` produces for `mol`, or None when `mol` is None."""
    return None if mol is None else fpg.GetFingerprint(mol)


molecule_column = data['molecule']
data['fingerprints'] = molecule_column.apply(compute_fingerprint)

In [None]:
def calculate_features(mol):
    """Compute eight RDKit descriptors for one molecule.

    Args:
        mol: RDKit molecule, or None when the molecule could not be built.

    Returns:
        dict mapping descriptor name to value, with the keys 'MolWeight', 'LogP',
        'HBondDonor', 'HBondAcceptor', 'RotatableBonds', 'TPSA', 'AromaticRings'
        and 'HeteroAtoms' in that order. When `mol` is None every value is NaN,
        so the result still has the same keys and can be filtered with dropna().
    """
    feature_names = (
        'MolWeight', 'LogP', 'HBondDonor', 'HBondAcceptor',
        'RotatableBonds', 'TPSA', 'AromaticRings', 'HeteroAtoms',
    )
    if mol is None:
        # Mirror compute_fingerprint's None handling instead of raising inside RDKit.
        return {name: float('nan') for name in feature_names}

    features = {}
    features['MolWeight'] = Descriptors.ExactMolWt(mol)
    features['LogP'] = Descriptors.MolLogP(mol)
    features['HBondDonor'] = Descriptors.NumHDonors(mol)
    features['HBondAcceptor'] = Descriptors.NumHAcceptors(mol)
    features['RotatableBonds'] = Descriptors.NumRotatableBonds(mol)
    features['TPSA'] = Descriptors.TPSA(mol)
    features['AromaticRings'] = Descriptors.NumAromaticRings(mol)
    features['HeteroAtoms'] = Descriptors.NumHeteroatoms(mol)

    return features

In [None]:
# Keep the per-molecule descriptor dicts in one column ...
data['features'] = data['molecule'].apply(calculate_features)

# ... and expand them into one float column per descriptor. Building the frame from the
# list of dicts in a single call avoids constructing a separate Series for every row.
features_df = pd.DataFrame(data['features'].tolist(), index=data.index, dtype=float)

# Drop descriptor columns left over from a previous run of this cell so that re-running
# it replaces them instead of appending duplicate column names.
data = pd.concat([data.drop(columns=features_df.columns, errors='ignore'), features_df], axis=1)

In [None]:
scaler = MinMaxScaler()

feature_columns = ['MolWeight', 'LogP', 'HBondDonor', 'HBondAcceptor', 'RotatableBonds', 'TPSA', 'AromaticRings', 'HeteroAtoms']

# NOTE: the descriptor columns are overwritten with their scaled values below, so running
# this cell twice in a row re-fits the scaler on values that are already scaled.
feature_frame = data[feature_columns].astype(float)

# Fit on complete rows only, but transform every row: the result then always has one row
# per row of `data`, and rows with missing descriptors simply stay NaN.
scaler.fit(feature_frame.dropna().values)
normalized_features = scaler.transform(feature_frame.values)

for i, col in enumerate(feature_columns):
    data[col] = normalized_features[:, i]


def combine_features(row):
    """Build the model input vector for one row of `data`.

    Args:
        row: a row holding 'fingerprints' and every column listed in `feature_columns`.

    Returns:
        1-D float32 array with the fingerprint bits followed by the scaled descriptors
        (in `feature_columns` order), or None when the fingerprint is missing or any
        descriptor is NaN.
    """
    fingerprint = row['fingerprints']
    if fingerprint is None:
        return None

    additional_features = row[feature_columns].to_numpy(dtype=np.float32)
    if np.isnan(additional_features).any():
        return None

    # ConvertToNumpyArray resizes the target array to the number of bits and fills it.
    fingerprint_array = np.zeros((0,), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(fingerprint, fingerprint_array)

    # uint8 bits and float32 descriptors concatenate to a numeric float32 vector
    # rather than an object array.
    combined_features = np.concatenate((fingerprint_array, additional_features))

    return combined_features


data['combined_features'] = data.apply(combine_features, axis=1)

In [None]:
# One-hot encode the protein name into uint8 indicator columns named 'protein_<value>'.
onehot_encoder = OneHotEncoder(sparse_output=False)
protein_encoded = onehot_encoder.fit_transform(data['protein_name'].values.reshape(-1, 1)).astype(np.uint8)
protein_columns = onehot_encoder.get_feature_names_out(['protein'])

# Reuse data's index so the column-wise concat lines rows up by label even when `data`
# does not carry a default 0..n-1 index.
protein_encoded_df = pd.DataFrame(protein_encoded, columns=protein_columns, index=data.index)

# Drop indicator columns left over from a previous run of this cell before appending.
data = pd.concat([data.drop(columns=protein_columns, errors='ignore'), protein_encoded_df], axis=1)

In [None]:
# RDKit molecule / fingerprint objects and the long combined feature arrays have no
# round-trippable text form (CSV would store object reprs and abbreviated array strings),
# so they are left out of the export. Every other column is written unchanged.
unserializable_columns = ['molecule', 'fingerprints', 'combined_features']
data.drop(columns=unserializable_columns, errors='ignore').to_csv("./train.csv", index=False)

In [None]:
# Keep only rows that have both a feature vector and a label.
data_clean = data.dropna(subset=['combined_features', 'binds'])

# Stack the per-row vectors into one 2-D matrix and make it explicitly numeric; float32 is
# the dtype the forest converts its input to internally.
X = np.stack(data_clean['combined_features'].values).astype(np.float32, copy=False)
y = data_clean['binds'].values

# NOTE(review): `combined_features` holds only fingerprint bits and scaled descriptors. The
# one-hot protein columns added to `data` are not part of it, so the model never sees which
# protein a row refers to - confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# n_jobs=-1 builds the trees on all available cores; the seed is unchanged.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# Second column of predict_proba, plus hard labels, for the held-out split.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Share of held-out rows whose hard label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

print(f"Validation Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
print(y_pred_proba)

In [None]:
def convert_to_numpy_array(explicit_bit_vect):
    """Copy a fingerprint bit vector into a NumPy array.

    Args:
        explicit_bit_vect: fingerprint as returned by compute_fingerprint, or None.

    Returns:
        float64 NumPy array filled by DataStructs.ConvertToNumpyArray, or None when
        the input is None (compute_fingerprint returns None for a missing molecule).
    """
    if explicit_bit_vect is None:
        return None

    # The one-element placeholder is resized by ConvertToNumpyArray to fit the bit vector.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(explicit_bit_vect, arr)
    return arr


def calculate_features(mol):
    """Compute eight RDKit descriptors for one molecule.

    NOTE: an earlier cell defines a function with the same name; this later definition
    is the one in effect for the cells that follow.

    Args:
        mol: RDKit molecule, or None when the molecule could not be built.

    Returns:
        dict mapping descriptor name to value, with the keys 'MolWeight', 'LogP',
        'HBondDonor', 'HBondAcceptor', 'RotatableBonds', 'TPSA', 'AromaticRings'
        and 'HeteroAtoms' in that order. When `mol` is None every value is NaN,
        so the result still has the same keys and can be filtered with dropna().
    """
    feature_names = (
        'MolWeight', 'LogP', 'HBondDonor', 'HBondAcceptor',
        'RotatableBonds', 'TPSA', 'AromaticRings', 'HeteroAtoms',
    )
    if mol is None:
        # Mirror compute_fingerprint's None handling instead of raising inside RDKit.
        return {name: float('nan') for name in feature_names}

    features = {}
    features['MolWeight'] = Descriptors.ExactMolWt(mol)
    features['LogP'] = Descriptors.MolLogP(mol)
    features['HBondDonor'] = Descriptors.NumHDonors(mol)
    features['HBondAcceptor'] = Descriptors.NumHAcceptors(mol)
    features['RotatableBonds'] = Descriptors.NumRotatableBonds(mol)
    features['TPSA'] = Descriptors.TPSA(mol)
    features['AromaticRings'] = Descriptors.NumAromaticRings(mol)
    features['HeteroAtoms'] = Descriptors.NumHeteroatoms(mol)
    return features


def compute_fingerprint(mol):
    """Return the fingerprint that the module-level `fpg` generator produces for `mol`.

    A None molecule is passed through as None instead of being handed to the generator.
    """
    return None if mol is None else fpg.GetFingerprint(mol)

In [None]:
# Rows of the test CSV processed per iteration, which bounds memory use.
chunk_size = 1000
submission_chunks = []
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

for chunk in pd.read_csv(test_csv, chunksize=chunk_size):
    molecules = chunk['molecule_smiles'].apply(Chem.MolFromSmiles)

    # Rows without a molecule object cannot be featurised. They are left out of the
    # output, as the original dropna() on the combined features intended.
    # NOTE(review): dropped rows mean missing ids in submission.csv - confirm that is acceptable.
    parsed = molecules.notna()
    molecules = molecules[parsed]
    predictions = chunk.loc[parsed, ['id']].copy()
    if predictions.empty:
        continue

    # Build the feature matrix directly as numeric arrays, in the training column order:
    # fingerprint bits first, then the scaled descriptors in `feature_columns` order.
    fingerprint_bits = np.vstack(
        [convert_to_numpy_array(compute_fingerprint(mol)) for mol in molecules]
    )
    descriptors = pd.DataFrame(
        [calculate_features(mol) for mol in molecules], index=predictions.index
    )
    # The scaler was fitted on a plain array, so it is given a plain array here as well.
    normalized_features = scaler.transform(descriptors[feature_columns].to_numpy(dtype=float))

    X_test_final = np.hstack([fingerprint_bits, normalized_features]).astype(np.float32, copy=False)

    # NOTE(review): these are hard class labels written under 'binds_pred'. If the consumer
    # of submission.csv expects probabilities or a different column name, use
    # model.predict_proba(X_test_final)[:, 1] and rename accordingly - confirm.
    y_test_pred = model.predict(X_test_final)
    predictions['binds_pred'] = y_test_pred

    submission_chunks.append(predictions[['id', 'binds_pred']])

submission = pd.concat(submission_chunks, axis=0)
submission.to_csv('./submission.csv', index=False)