In [9]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, Lipinski, Fragments
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score
import DataPreprocessing as dprep
from sklearn.impute import SimpleImputer
import DescriptionFeaturesSelection as dsf
from sklearn.neural_network import MLPClassifier

In [7]:
training_data_path = 'training_smiles.csv'
test_data_path = 'test_smiles.csv'

training_data = pd.read_csv(training_data_path, dtype = {'ACTIVE': int})
test_data = pd.read_csv(test_data_path)

In [8]:
def extract_fingerprints(smiles):
    mol = Chem.MolFromSmiles(smiles)

    features = {}

    # Try nBits 2048, 1024, 512, 256
    # Morgan Fingerprint
    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512)
    for i in range(512):
        features[f'fp_{i}'] = morgan_fp[i]

    return features


    
training_features_df = training_data['SMILES'].apply(extract_fingerprints)

training_features_df = training_features_df.apply(pd.Series)

training_data_fingerprint = training_data.join(training_features_df)

training_data_fingerprint.to_csv('training_data_fingerprint.csv', index=False)




In [9]:

def extract_features(smiles):

    mol = Chem.MolFromSmiles(smiles)

    features = {}

    # Basic Properties
    features['num_atoms'] = mol.GetNumAtoms()
    features['num_bonds'] = mol.GetNumBonds()
    features['num_rings'] = mol.GetRingInfo().NumRings()

    # Molecular Descriptors
    for desc_name, desc_func in Descriptors.descList:
        features[desc_name] = desc_func(mol)

    # Lipinski Descriptors
    features['num_rotatable_bonds'] = Lipinski.NumRotatableBonds(mol)
    features['num_aromatic_rings'] = Lipinski.NumAromaticRings(mol)
    features['num_heteroatoms'] = Lipinski.NumHeteroatoms(mol)
    features['num_heavy_atoms'] = Lipinski.HeavyAtomCount(mol)
    features['num_h_donors'] = Lipinski.NumHDonors(mol)
    features['num_h_acceptors'] = Lipinski.NumHAcceptors(mol)
    features['num_aliphatic_rings'] = Lipinski.NumAliphaticRings(mol)
    features['num_saturated_rings'] = Lipinski.NumSaturatedRings(mol)
    features['num_aromatic_heterocycles'] = Lipinski.NumAromaticHeterocycles(mol)
    features['num_aromatic_carbocycles'] = Lipinski.NumAromaticCarbocycles(mol)
    features['num_aliphatic_heterocycles'] = Lipinski.NumAliphaticHeterocycles(mol)
    features['num_aliphatic_carbocycles'] = Lipinski.NumAliphaticCarbocycles(mol)

    # Fragment Descriptors
    for frag_func in dir(Fragments):
        if frag_func.startswith('fr_'):
            features[frag_func] = getattr(Fragments, frag_func)(mol)


    return features

training_features_df = training_data['SMILES'].apply(extract_features)

training_features_df = training_features_df.apply(pd.Series)

training_data_features = training_data.join(training_features_df)

training_data_features.to_csv('training_data_features.csv', index=False)





In [4]:
""" # Load the datasets
fingerprint_df = pd.read_csv('training_data_fingerprint.csv')
other_features_df = pd.read_csv('training_data_207_features.csv')

# Combine datasets
combined_df = pd.concat([fingerprint_df, other_features_df], axis=1) """

training_data = pd.read_csv('csvData/training_merged_fingerprints207.csv')

training_data, column_filter = dprep.create_column_filter(training_data)
training_data, imputation = dprep.create_imputation(training_data)


# Select only numeric columns for imputation
#numeric_cols = combined_df.select_dtypes(include=[np.number]).columns
#numeric_df = combined_df[numeric_cols]

#simple_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#numeric_df_imputed = pd.DataFrame(simple_imputer.fit_transform(numeric_df), columns=numeric_df.columns)


#training_data = pd.concat([combined_df.drop(columns=numeric_cols), numeric_df_imputed], axis=1)


X = training_data.drop(columns=["INDEX", "ACTIVE"])
y = training_data["ACTIVE"]


In [7]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

rf_clf.fit(X, y)

importances = rf_clf.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

feature_importance_df.to_csv('feature_importances.csv', index=False)

print(feature_importance_df.head())


               Feature  Importance
537       BCUT2D_MRLOW    0.009636
512  MaxAbsEStateIndex    0.008735
515     MinEStateIndex    0.008680
514  MinAbsEStateIndex    0.008660
513     MaxEStateIndex    0.008062


In [10]:
def manual_crossValidation(training, training_lables, number_of_features = 50, use_classifier="randomForest", sampling="smote"):
    if use_classifier == "randomForest":
        classifier = RandomForestClassifier(bootstrap=True, max_depth=40, min_samples_leaf=2,min_samples_split=2,n_estimators=200,class_weight="balanced",random_state=42)
    elif use_classifier == "MLP":
        classifier = MLPClassifier(activation = 'relu',alpha= 0.0001, hidden_layer_sizes=(50,))
    else:
        print("Invalid classifier specified:")
        return
    
    X = training.copy()
    y = training_lables.copy()

    # Set up cross-validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    accuracies = []
    auc_scores = []
    confusion_matrices = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        n_best_features = dsf.select_n_best_features_randomForestImportance(X_train, y_train, n_features=number_of_features) ## should be moved into the fold calculation loop
        X_train = X_train[list(n_best_features.keys())]
        X_test = X_test[list(n_best_features.keys())]


        if sampling=="smote":
            # Apply SMOTE to the training set
            smote = SMOTE(sampling_strategy='auto', random_state=42)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        elif sampling=="under":
            X_train_resampled, y_train_resampled = new_balance_labels_down(X_train, y_train)
        else:
            X_train_resampled, y_train_resampled = X_train, y_train

        classifier.fit(X_train_resampled, y_train_resampled)
        y_pred = classifier.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])
        cm = confusion_matrix(y_test, y_pred)

        accuracies.append(accuracy)
        auc_scores.append(auc_score)
        confusion_matrices.append(cm)

    avg_auc_score = np.mean(auc_scores)
    avg_confusion_matrix = np.mean(confusion_matrices, axis=0)
    class_report = classification_report(y_test, y_pred)

    return {
        'average_auc_score': avg_auc_score,
        'confusion_matrix': avg_confusion_matrix,
        'classification_report': class_report
    }

In [11]:
def new_balance_labels_down(X,y):
    df = pd.concat([pd.DataFrame(X), pd.Series(y, name='label')], axis=1)

    original_counts = df['label'].value_counts()
    print("Original Label Counts:")
    print(original_counts)

    minority_label = original_counts.idxmin()
    majority_label = original_counts.idxmax()

    label_difference = original_counts[majority_label] - original_counts[minority_label]

    if label_difference == 0:
        print("Labels are already balanced.")
        return X, y

    majority_indices = df[df['label'] == majority_label].index
    indices_to_remove = np.random.choice(majority_indices, size=label_difference, replace=False)

    balanced_df = df.drop(indices_to_remove)

    new_counts = balanced_df['label'].value_counts()
    print("\nNew Label Counts:")
    print(new_counts)
    return balanced_df.drop('label', axis=1).values, balanced_df['label'].values

In [12]:
num_features_list = [10, 30, 50, 80, 100, len(X.columns)]

for num_features in num_features_list:
    ## Testing Random forest with and without smote
    results_oversampled = manual_crossValidation(X, y, num_features,use_classifier='randomForest',sampling="smote")
    results_undersampled = manual_crossValidation(X, y, num_features,use_classifier='randomForest',sampling="under")

    print("Random-Forest with SMOTE: ", results_oversampled["average_auc_score"])
    print("Random-Forest no SMOTE: ", results_undersampled["average_auc_score"])

    ## Testing MLP with and without smote
    MLP_results_oversampled = manual_crossValidation(X, y, num_features,use_classifier='MLP', sampling="smote")
    MLP_results_undersampled = manual_crossValidation(X, y, num_features, use_classifier='MLP',sampling="under")

    print("MLP with SMOTE: ", MLP_results_oversampled["average_auc_score"])
    print("MLP no SMOTE: ", MLP_results_undersampled["average_auc_score"])

Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
1.0    1428
0.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Random-Forest with SMOTE:  0.7210969505964713
Random-Forest no SMOTE:  0.7322540120469025




Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
0.0    1428
1.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




MLP with SMOTE:  0.6376834154151859
MLP no SMOTE:  0.6700174533209807
Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
0.0    1428
1.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Random-Forest with SMOTE:  0.765016080705721
Random-Forest no SMOTE:  0.7595262069858734




Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
1.0    1428
0.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




MLP with SMOTE:  0.6592256348302721
MLP no SMOTE:  0.6783573437772463
Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
1.0    1428
0.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Random-Forest with SMOTE:  0.7663072673369936
Random-Forest no SMOTE:  0.75704262423293




Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
1.0    1428
0.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




MLP with SMOTE:  0.6304479452652763
MLP no SMOTE:  0.6838452799476918
Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
0.0    1428
1.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Random-Forest with SMOTE:  0.753650953721017
Random-Forest no SMOTE:  0.757224201688445
Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
0.0    1428
1.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




MLP with SMOTE:  0.5185292100060261
MLP no SMOTE:  0.5153614751321743
Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
1.0    1428
0.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Random-Forest with SMOTE:  0.7642970960147643
Random-Forest no SMOTE:  0.7676295291706383
Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
0.0    1428
1.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




MLP with SMOTE:  0.5192935066194375
MLP no SMOTE:  0.525094245174963
Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
0.0    1428
1.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Random-Forest with SMOTE:  0.7932878005972789
Random-Forest no SMOTE:  0.7760062120930777
Original Label Counts:
label
0.0    121156
1.0      1428
Name: count, dtype: int64

New Label Counts:
label
1.0    1428
0.0    1428
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
0.0    1427
1.0    1427
Name: count, dtype: int64




Original Label Counts:
label
0.0    121157
1.0      1427
Name: count, dtype: int64

New Label Counts:
label
1.0    1427
0.0    1427
Name: count, dtype: int64




MLP with SMOTE:  0.5171943966198682
MLP no SMOTE:  0.5082726606103574


In [57]:

training_data = pd.read_csv('csvData/training_merged_fingerprints207.csv')
test_data = pd.read_csv('csvData/test_merged_fingerprints207.csv')


training_data, column_filter = dprep.create_column_filter(training_data)
training_data, imputation = dprep.create_imputation(training_data)

test_data_filtered = dprep.apply_column_filter(test_data, column_filter)
test_data_preprocessed = dprep.apply_imputation(test_data_filtered, imputation)

if 'INDEX' in test_data_preprocessed.columns:
    test_data_preprocessed = test_data_preprocessed.drop(columns=["INDEX"])

X = training_data.drop(columns=["INDEX", "ACTIVE"])
y = training_data["ACTIVE"]

test_data_final = test_data_preprocessed[X.columns]
test_data_final.replace([np.inf, -np.inf], np.nan, inplace=True)
test_data_final.fillna(test_data_final.mean(), inplace=True)


In [58]:
print(len(X))
print(len(test_data_final))

153230
51076


In [60]:
def write_predictions_to_txt(auc_estimate, predicted_probabilities, output_file_path='output.txt'):
    with open(output_file_path, 'w') as file:
        file.write(f"{auc_estimate}\n")
        for prob in predicted_probabilities:
            file.write(f"{prob}\n")

In [62]:
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split



def final_model(X_train, y_train, random_state=42):
    
    
    smote = SMOTE(random_state=random_state)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
    rf_classifier = RandomForestClassifier(bootstrap=False, max_depth=20, min_samples_leaf=2,min_samples_split=10,n_estimators=300,class_weight="balanced",random_state=42)
    rf_classifier.fit(X_train_smote, y_train_smote)
    
    return rf_classifier

rf_model_smote = final_model(X, y, random_state=42)


In [61]:
predicted_probabilities = rf_model_smote.predict_proba(test_data_final)[:, 1]
auc_score = 0.79
print(predicted_probabilities)


write_predictions_to_txt(auc_score, predicted_probabilities, "8.txt")

predictions_df = pd.read_csv("8.txt", header=None)
assert predictions_df.shape == (51077, 1)
assert np.all((predictions_df.values >= 0) & (predictions_df.values <= 1))

[0.05660967 0.03301795 0.1458733  ... 0.01044888 0.0693356  0.09084364]
