In [None]:
import pandas as pd
import os

# Step 1: Read the CSV file into a DataFrame in chunks
data_chunks = pd.read_csv('moses.csv', chunksize=5000)

# Step 2: Process each chunk and save to separate CSV files
processed_files = []

for i, chunk in enumerate(data_chunks):
    # Drop the second column ('SPLIT')
    chunk.drop(columns=['SPLIT'], inplace=True)
    
    # Remove duplicates and null values
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    
    # Save the processed chunk to a separate CSV file
    processed_file = f'processed_chunk_{i}.csv'
    chunk.to_csv(processed_file, index=False)
    
    processed_files.append(processed_file)

# Step 3: Concatenate the processed chunks into a single DataFrame
final_result = pd.concat([pd.read_csv(file) for file in processed_files], ignore_index=True)

# Step 4: Drop duplicates from the final result
final_result.drop_duplicates(inplace=True)

# Step 5: Save the final result to a CSV file
final_result.to_csv('moses_processed.csv', index=False)

# Step 6: Delete the individual processed chunk files
for file in processed_files:
    os.remove(file)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import os

# Step 1: Read the processed CSV file into a DataFrame in chunks
data_chunks = pd.read_csv('moses_processed.csv', chunksize=5000)

# Step 2: Initialize a counter for tracking the processed chunks
processed_chunks_count = 0

# Step 3: Function to calculate Morgan fingerprints for a chunk and save to CSV
def calculate_fingerprints(chunk, chunk_index):
    # Convert SMILES to RDKit Mol objects
    mols = [Chem.MolFromSmiles(smiles) for smiles in chunk['SMILES']]
    
    # Calculate Morgan fingerprints
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, 2048) for mol in mols]
    
    # Initialize an empty list to store fingerprint data
    fps_data = []
    
    # Iterate over each fingerprint
    for fp in fps:
        # Convert fingerprint object to binary string and split it into individual bits
        bits = [int(bit) for bit in fp.ToBitString()]
        # Append the bits to the fps_data list
        fps_data.append(bits)
    
    # Create a DataFrame with the fingerprint data
    fp_df = pd.DataFrame(fps_data, columns=[f'Bit{i}' for i in range(2048)])
    
    # Concatenate the original data with the fingerprint DataFrame
    result = pd.concat([chunk.reset_index(drop=True), fp_df], axis=1)
    
    # Save the processed chunk with fingerprints to a separate CSV file
    processed_file = f'processed_chunk_with_fingerprints_{chunk_index}.csv'
    result.to_csv(processed_file, index=False)
    
    return processed_file

# Step 4: Process chunks and calculate fingerprints
processed_files = []

for i, chunk in enumerate(data_chunks):
    processed_file = calculate_fingerprints(chunk, i)
    processed_files.append(processed_file)
    processed_chunks_count += 1
    print(f'Processed chunk {processed_chunks_count} saved to {processed_file}')

# Step 5: Concatenate all processed files into one final CSV file
final_result_with_fingerprints = pd.concat([pd.read_csv(file) for file in processed_files], ignore_index=True)
final_csv_file = 'moses_with_fingerprints.csv'
final_result_with_fingerprints.to_csv(final_csv_file, index=False)
print(f'Concatenated all processed files into {final_csv_file}')

# Step 6: Delete the individual processed chunk files
for file in processed_files:
    os.remove(file)
    print(f'Deleted {file}')


In [None]:
import pandas as pd
import os

# List all CSV files in the directory
csv_files = [file for file in os.listdir() if file.startswith('processed_chunk_with_fingerprint')]

# Name for the final merged CSV file
final_csv_file = 'merged_moses_with_fingerprints.csv'

# Check if the final CSV file already exists
if os.path.exists(final_csv_file):
    os.remove(final_csv_file)

# Open the final CSV file in append mode
with open(final_csv_file, 'a') as final_csv:
    # Iterate over each CSV file
    for file in csv_files:
        # Open and read each CSV file chunk by chunk
        for chunk in pd.read_csv(file, chunksize=10000):  # Adjust chunksize as needed
            # Write the chunk to the final CSV file without header after the first chunk
            chunk.to_csv(final_csv, index=False, header=not os.path.exists(final_csv_file))

        # Optionally, delete the individual file to free up memory
        os.remove(file)
        print(f"Processed and removed: {file}")

print("All files merged and saved.")


In [None]:
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from rdkit import Chem
import numpy as np

# Step 1: Read the merged CSV file into a DataFrame
data = pd.read_csv('merged_moses_with_fingerprints.csv')

# Step 2: Extract Morgan fingerprints as features for clustering
features = data.iloc[:, 3:].values

# Step 3: Apply PCA for dimensionality reduction
pca = PCA(n_components=100, random_state=42)  # Reduce to 100 dimensions
features_reduced = pca.fit_transform(features)

# Step 4: Apply MiniBatchKMeans clustering on reduced features
n_clusters = 100
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1000, random_state=42)
cluster_labels = kmeans.fit_predict(features_reduced)

# Step 5: Extract 10 molecules from each centroid
centroids = kmeans.cluster_centers_
cluster_molecules = {}

for i in range(n_clusters):
    # Calculate Euclidean distances from each molecule to the centroid of the cluster
    distances = np.linalg.norm(features_reduced - centroids[i], axis=1)
    
    # Get indices of 10 closest molecules to the centroid
    closest_indices = np.argsort(distances)[:10]
    
    # Extract SMILES and cluster number for closest molecules
    smiles = data.iloc[closest_indices]['SMILES'].values
    cluster_molecules[i] = smiles

# Step 6: Create a DataFrame to store the results
results = []

for cluster, smiles_list in cluster_molecules.items():
    for smiles in smiles_list:
        results.append({'SMILES': smiles, 'Cluster': cluster})

results_df = pd.DataFrame(results)

# Step 7: Save the results to a CSV file
results_df.to_csv('clustered_molecules.csv', index=False)


In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from rdkit import Chem
import numpy as np
import tempfile
import os

# Step 1: Read the merged CSV file into a DataFrame in chunks
chunk_size = 5000
temp_files = []

# Step 2: Apply PCA for dimensionality reduction
n_components = 100
pca = PCA(n_components=n_components, random_state=42)

# Step 3: Apply KMeans clustering
n_clusters = 100
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Step 4: Process data in chunks
for chunk in pd.read_csv('merged_moses_with_fingerprints.csv', chunksize=chunk_size):
    # Extract Morgan fingerprints as features for PCA
    features = chunk.iloc[:, 3:].values
    
    # Apply PCA
    features_reduced = pca.fit_transform(features)
    
    # Apply KMeans clustering
    cluster_labels = kmeans.fit_predict(features_reduced)
    
    # Write intermediate results to disk
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    np.savez(temp_file, features_reduced=features_reduced, cluster_labels=cluster_labels)
    temp_files.append(temp_file.name)

# Step 5: Extract 10 molecules from each cluster
cluster_molecules = {}

for temp_file_name in temp_files:
    with np.load(temp_file_name) as data:
        features_reduced = data['features_reduced']
        cluster_labels = data['cluster_labels']
        
        for i in range(n_clusters):
            cluster_indices = np.where(cluster_labels == i)[0]
            if len(cluster_indices) > 10:
                random_indices = np.random.choice(cluster_indices, 10, replace=False)
                smiles = chunk.iloc[random_indices]['SMILES'].values
                cluster_molecules[i] = smiles.tolist()
            else:
                smiles = chunk.iloc[cluster_indices]['SMILES'].values
                cluster_molecules[i] = smiles.tolist()

# Step 6: Create a DataFrame to store the results
results = []

for cluster, smiles_list in cluster_molecules.items():
    for smiles in smiles_list:
        results.append({'SMILES': smiles, 'Cluster': cluster})

results_df = pd.DataFrame(results)

# Step 7: Save the results to a CSV file
results_df.to_csv('clustered_molecules.csv', index=False)

# Step 8: Cleanup temporary files
for temp_file_name in temp_files:
    os.remove(temp_file_name)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().