# In [None]:
#SVM model to test the performance of machine learning on original and reorganized SMILES strings #
#The code was initially generated with OpenAI's LLM, i.e. "ChatGPT.com" #
#The code was then modified according to the needs of the study #
#Initial code obtained on 12 Dec 2024 #


from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import SVG

def smiles_to_fingerprint(smiles, n_bits=2048, radius=2):
    """
    Converts a SMILES string into a Morgan fingerprint bit array.

    The string is parsed with sanitization switched off and sanitization is
    then attempted as a separate step. Every failure path (unparsable SMILES,
    valence error, or any other exception) returns an all-zero vector of
    length ``n_bits``, so all inputs map to arrays of the same length and
    failed inputs share one representation. Failures raised as exceptions
    are reported with ``print``.

    Args:
        smiles (str): A SMILES string.
        n_bits (int): Number of bits for the fingerprint.
        radius (int): Radius for the Morgan fingerprint.

    Returns:
        np.array: 1-D array of length ``n_bits``; the fingerprint bits on
        success, all zeros on any failure.
    """
    try:
        # Parse without sanitization so the sanitization step below is the
        # one that reports chemistry problems.
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is None:
            # Parser could not build a molecule at all.
            return np.zeros(n_bits)

        # Raises (e.g. AtomValenceException) when the molecule is not
        # chemically consistent.
        Chem.SanitizeMol(mol)

        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        return np.array(fingerprint)
    except Chem.AtomValenceException:
        # Kept as its own branch only for the more specific message; the
        # returned vector is the same zero vector as every other failure.
        print(f"Valence error in SMILES: {smiles}")
        return np.zeros(n_bits)
    except Exception as e:
        # Best-effort: one bad input must not abort featurization of the rest.
        print(f"Error processing SMILES '{smiles}': {e}")
        return np.zeros(n_bits)


# 2. Read SMILES Data from Files
def read_smiles(file_path):
    """Reads SMILES strings from a text file, one entry per line.

    Leading/trailing whitespace is stripped from every line, and lines that
    are empty after stripping are skipped so they are not returned as
    empty SMILES strings.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        list[str]: The non-blank lines of the file, in file order.
    """
    with open(file_path, 'r') as file:
        raw_lines = file.read().splitlines()
    stripped_lines = (line.strip() for line in raw_lines)
    return [line for line in stripped_lines if line]


# Input files, read line by line (update these with your file paths)
valid_smiles_file, invalid_smiles_file = "ValidCBIC.txt", "InvalidCBIC.txt"

# Load both SMILES sets, valid file first
valid_smiles, invalid_smiles = (
    read_smiles(path) for path in (valid_smiles_file, invalid_smiles_file)
)

# One combined sample list; label 1 for entries from the valid file,
# label 0 for entries from the invalid file (same order as smiles_list)
smiles_list = [*valid_smiles, *invalid_smiles]
labels = [1 for _entry in valid_smiles] + [0 for _entry in invalid_smiles]

# Report how many samples fall into each class
class_distribution = Counter(labels)
print(f"Class Distribution: {class_distribution}")

# Featurize every SMILES string; rows follow the order of smiles_list
fingerprints = np.array(list(map(smiles_to_fingerprint, smiles_list)))

# Report dataset sizes
print(f"Total SMILES: {len(smiles_list)}")
print(f"Valid SMILES: {len(valid_smiles)}")
print(f"Invalid SMILES: {len(invalid_smiles)}")

# 3. Train-Test Split
# 20% of the samples are held out, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    fingerprints,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# 4. SVM Model
# RBF-kernel support vector classifier, fitted on the training portion only.
model = SVC(
    kernel='rbf',
    C=1,
    gamma='scale',
    random_state=42,
)
model.fit(X_train, y_train)

# 5. Evaluate the Model
# Predict the held-out samples once and reuse the predictions for both metrics.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1 on the held-out samples
print("Classification Report (Test Set):")
print(classification_report(y_test, y_pred))

# 6. Stratified K-Fold Cross-Validation
# Five folds, each keeping the class ratio of the full dataset.
skf = StratifiedKFold(n_splits=5)
cv_scores = []

# `labels` is a plain list; convert it once so folds can be selected with
# index arrays without rebuilding the array on every iteration.
labels_array = np.array(labels)

for train_index, test_index in skf.split(fingerprints, labels):
    X_train_cv, X_test_cv = fingerprints[train_index], fingerprints[test_index]
    y_train_cv, y_test_cv = labels_array[train_index], labels_array[test_index]

    # Fit a fresh, unfitted classifier with the same hyperparameters as
    # `model`. Refitting `model` itself would leave it trained on whichever
    # fold ran last instead of on the train split it was evaluated with.
    fold_model = SVC(**model.get_params())
    fold_model.fit(X_train_cv, y_train_cv)

    # Evaluate on the held-out fold
    y_pred_cv = fold_model.predict(X_test_cv)
    cv_scores.append(accuracy_score(y_test_cv, y_pred_cv))

# Average accuracy over the folds
mean_cv_accuracy = np.mean(cv_scores)
print(f"Stratified K-Fold Mean Accuracy: {mean_cv_accuracy:.4f}")

# 7. Feature Space Visualization
def visualize_features(fingerprints, labels, output_file="pca_plot.svg"):
    """Visualizes the feature space using a two-component PCA projection.

    Fits PCA on ``fingerprints``, scatter-plots the two resulting components
    coloured by ``labels``, saves the figure as SVG to ``output_file``,
    prints a confirmation message and finally shows the figure.

    Args:
        fingerprints: Feature matrix handed to ``PCA.fit_transform``
            (one row per sample).
        labels: Per-sample values used to colour the points; must have one
            entry per row of ``fingerprints``.
        output_file (str): Path of the SVG file to write.

    Returns:
        None
    """
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(fingerprints)

    # Draw on a dedicated figure instead of pyplot's implicit "current"
    # figure, so repeated calls never overlay points or stack colorbars on
    # a figure left over from an earlier plot.
    fig, ax = plt.subplots()
    points = ax.scatter(reduced[:, 0], reduced[:, 1], c=labels, cmap='coolwarm', alpha=0.7)
    ax.set_title("Feature Space Visualization (PCA)")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    # Pass the scatter explicitly so the colorbar is tied to these points.
    fig.colorbar(points, ax=ax, label="Label (1=Original CBIC, 0=Reorganized CBIC)")

    # Save before showing: after show() the figure may already be closed.
    fig.savefig(output_file, format="svg")
    print(f"PCA plot saved to {output_file}")

    plt.show()


# Plot the full fingerprint set, coloured by class label.
visualize_features(fingerprints, labels, output_file="pca_plotCBIC.svg")
