In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [None]:
# Load the whole parquet file into memory and preview its first rows.
df = pd.read_parquet("datasets/dgsm/chembl_22_clean_1576904_sorted_std_final.parquet")
df.head()

In [None]:
# Project-local Mol2Vec wrapper (imported here rather than in the top import cell).
from models.mol2vec.mol2vec_encoder import Mol2VecEncoder

# Build an encoder from the model file at this path; the same path is repeated
# later in the notebook as MODEL_PATH.
m2v_encoder = Mol2VecEncoder("models/mol2vec/model_300dim.pkl")

In [None]:
# Take the first row's SMILES value as a smoke-test input. `.iloc[0]` is
# positional, so it does not depend on the frame's index containing label 0
# (the original `df["SMILES"][0]` was a chained, label-based lookup).
smiles_sample = df["SMILES"].iloc[0]

In [None]:
# Smoke test: encode the single sample; the cell displays whatever
# smiles_to_vec returns for it.
m2v_encoder.smiles_to_vec(smiles_sample)

In [None]:
def process_parquet_with_mol2vec(parquet_path, model_path, output_dir, batch_size=1000,
                                 skip_existing=False):
    """
    Process parquet file with mol2vec and save embeddings

    Each row's ``SMILES`` value is passed to ``Mol2VecEncoder.smiles_to_vec``.
    Every non-``None`` result is saved with ``np.save`` to
    ``<output_dir>/<index label>.npy``. Rows for which the encoder returns
    ``None`` or raises are collected and, if there are any, written one per
    line to ``<output_dir>/failed_indices.txt``.

    Args:
        parquet_path (str): Path to parquet file
        model_path (str): Path to mol2vec model
        output_dir (str): Directory to save embeddings (created if missing)
        batch_size (int): Number of rows to process at once; must be >= 1.
            It only controls the granularity of the progress bar, because the
            whole file is loaded into memory before processing starts.
        skip_existing (bool): If True, a row whose ``.npy`` file is already
            present in ``output_dir`` is counted as successful without being
            re-encoded, so an interrupted run can be resumed. The existing
            file is not validated. Defaults to False (always re-encode and
            overwrite), which is the original behaviour.

    Returns:
        None. Progress and the final summary are printed. Any exception raised
        outside the per-row handling (bad arguments, unreadable input, ...)
        is caught and reported with ``print`` rather than propagated.
    """
    try:
        # A negative batch size would make range() below empty and the run
        # would "complete" having silently processed nothing.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Initialize encoder
        print("Initializing Mol2Vec encoder...")
        encoder = Mol2VecEncoder(model_path)

        # Read parquet file
        print("Reading parquet file...")
        df = pd.read_parquet(parquet_path)

        # Fail once up front instead of logging one KeyError per row.
        if 'SMILES' not in df.columns:
            raise ValueError("Input parquet file has no 'SMILES' column")
        smiles_column = df['SMILES']

        total_rows = len(smiles_column)
        print(f"Total rows to process: {total_rows}")

        # Process in batches with tqdm
        failed_indices = []
        successful_embeddings = 0

        for start_idx in tqdm(range(0, total_rows, batch_size),
                              desc="Processing molecules",
                              unit="batch"):
            # Slicing past the end is safe with iloc, so no min() is needed.
            batch = smiles_column.iloc[start_idx:start_idx + batch_size]

            # Iterate (index label, SMILES) pairs directly; iterrows() would
            # build a full row Series per record just to read one column.
            for idx, smiles in batch.items():
                output_path = os.path.join(output_dir, f"{idx}.npy")

                if skip_existing and os.path.exists(output_path):
                    successful_embeddings += 1
                    continue

                try:
                    # Get embedding
                    embedding = encoder.smiles_to_vec(smiles)

                    if embedding is not None:
                        # Save embedding to file
                        np.save(output_path, embedding)
                        successful_embeddings += 1
                    else:
                        failed_indices.append(idx)

                except Exception as e:
                    # tqdm.write keeps the message from breaking the progress bar.
                    tqdm.write(f"Error processing index {idx}: {e}")
                    failed_indices.append(idx)

        # Print summary
        print("\nProcessing complete!")
        print(f"Successfully processed: {successful_embeddings}/{total_rows} molecules")
        print(f"Failed to process: {len(failed_indices)} molecules")

        if failed_indices:
            failed_file = os.path.join(output_dir, "failed_indices.txt")
            with open(failed_file, 'w') as f:
                f.write("\n".join(map(str, failed_indices)))
            print(f"Failed indices saved to: {failed_file}")

    except Exception as e:
        print(f"Error in main processing: {e}")

def verify_embeddings(output_dir, sample_size=5, seed=None):
    """
    Verify the saved embeddings

    Lists the ``.npy`` files in ``output_dir``, loads a random sample of them
    with ``np.load`` and prints each sampled file's name and array shape.

    Args:
        output_dir (str): Directory containing embeddings
        sample_size (int): Maximum number of files to load and report. Capped
            at the number of ``.npy`` files found; values <= 0 report none.
        seed (int | None): Seed for the sampling. With the default ``None``
            the global ``np.random`` state is used, as before. With an
            integer, the same files are picked on every call for an unchanged
            directory (the listing is sorted before sampling).

    Returns:
        None. Results are printed; any exception is caught and reported with
        ``print`` rather than propagated.
    """
    try:
        # Get all npy files; sorted because os.listdir order is arbitrary,
        # which would defeat seeding.
        embedding_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.npy'))
        print(f"\nVerifying {len(embedding_files)} embedding files...")

        # Check a few random files
        sample_count = max(0, min(sample_size, len(embedding_files)))
        if sample_count == 0:
            return

        rng = np.random if seed is None else np.random.RandomState(seed)
        # Sample positions rather than names so the (possibly very long) list
        # of file names is not converted to a NumPy string array.
        sampled_positions = rng.choice(len(embedding_files), sample_count, replace=False)

        for position in sampled_positions:
            file = embedding_files[position]
            file_path = os.path.join(output_dir, file)
            embedding = np.load(file_path)
            print(f"File: {file}, Shape: {embedding.shape}")

    except Exception as e:
        print(f"Error verifying embeddings: {e}")

In [None]:
# Inputs and output location for the batch embedding run below.
PARQUET_FILE = 'datasets/dgsm/chembl_22_clean_1576904_sorted_std_final.parquet'  # same file previewed as `df` earlier in the notebook
MODEL_PATH = 'models/mol2vec/model_300dim.pkl'  # Update with your model path
OUTPUT_DIR = 'storages/mol2vec_dgsm'  # created by process_parquet_with_mol2vec if it does not exist

In [None]:
# Run the full export: writes one `<index>.npy` file per successfully encoded
# row into OUTPUT_DIR and prints a summary when done.
process_parquet_with_mol2vec(PARQUET_FILE, MODEL_PATH, OUTPUT_DIR)