In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import Crippen, Descriptors
from rdkit.Chem import Draw
from scipy.stats import mannwhitneyu

from trialblazer import Trialblazer
from trialblazer.Model.testset_visualization import (
    plot_score_distribution_with_significance,
    plot_correlation,
    SuspectedAdverseDrugEvents_count,
    SuspectedAdverseDrugEvents_count_for_eachdrug,
    SuspectedAdverseDrugEvents_Totalcount_for_eachdrug
)

RDLogger.DisableLog('rdApp.*')


In [None]:
def get_logp(mol):
    """Return the Crippen logP estimate of ``mol``, rounded to 3 decimals."""
    logp = Crippen.MolLogP(mol)
    return round(logp, 3)

def cal_mw(mol):
    """Return the molecular weight of ``mol`` (``Descriptors.MolWt``), rounded to 3 decimals."""
    mol_weight = Descriptors.MolWt(mol)
    return round(mol_weight, 3)

def select_compounds(predictions, approved_drugs_info):
    """Rank predictions by PrOCTOR score and annotate both ends of the ranking.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Must contain the columns ``PrOCTOR_score``, ``id`` (string) and ``smi``.
        The frame is not modified; the score is cast to float on a derived frame.
    approved_drugs_info : pandas.DataFrame
        Drug-name table forwarded unchanged to ``connect_drug_name``.

    Returns
    -------
    tuple
        ``(top_info, bottom_info, top_drug_names, bottom_drug_names)`` as
        produced by ``connect_drug_name`` for the head and the tail of the
        descending-score ranking.
    """
    # Build the ranked table on a new frame so the caller's column dtype is left untouched.
    ranked_predictions = (
        predictions
        .assign(PrOCTOR_score=predictions['PrOCTOR_score'].astype(float))
        .sort_values(by='PrOCTOR_score', ascending=False, ignore_index=True)
    )
    # Multi-component entries (ids matching "<digits>x<digits>") are dropped
    # because they probably do not have a drug name to look up.
    is_multi_component = ranked_predictions['id'].str.contains(r"\d+x\d+")
    single_component_predictions = ranked_predictions[~is_multi_component]
    top_info, top_drug_names = connect_drug_name(approved_drugs_info, single_component_predictions, head=True)
    bottom_info, bottom_drug_names = connect_drug_name(approved_drugs_info, single_component_predictions, tail=True)
    return top_info, bottom_info, top_drug_names, bottom_drug_names

def connect_drug_name(approved_drugs_info, single_component_predictions, head=False, tail=False, n=20):
    """Attach RDKit molecules and drug names to (a slice of) the predictions.

    Parameters
    ----------
    approved_drugs_info : pandas.DataFrame
        Name table with the columns ``chembl_id`` and ``name_to_use``.
    single_component_predictions : pandas.DataFrame
        Predictions with the columns ``id`` and ``smi``. Not modified.
    head, tail : bool
        Keep only the first (``head``) or last (``tail``) ``n`` rows.
        ``head`` takes precedence; with neither set, all rows are kept.
    n : int
        Number of rows kept by ``head`` / ``tail`` (default 20).

    Returns
    -------
    tuple
        ``(merged_info, drug_names)``: the left-merged frame (with ``mol`` and
        ``Name`` columns) and its ``Name`` column as a list. Rows without a
        match in ``approved_drugs_info`` have a missing name.
    """
    if head:
        selected = single_component_predictions.head(n)
    elif tail:
        selected = single_component_predictions.tail(n)
    else:
        selected = single_component_predictions
    # Copy before adding a column: writing into a head()/tail() slice raises
    # SettingWithCopyWarning, and writing into the input would change the caller's frame.
    selected = selected.copy()
    selected['mol'] = selected['smi'].apply(Chem.MolFromSmiles)
    # rename() returns a new frame, so the caller's name table stays untouched.
    name_table = approved_drugs_info.rename(columns={'chembl_id': 'id'})
    name_table['id'] = name_table['id'].astype(str)
    merged_info = selected.merge(name_table, how='left', on='id')
    merged_info = merged_info.rename(columns={'name_to_use': 'Name'})
    drug_names = merged_info['Name'].tolist()
    return merged_info, drug_names

In [None]:
# Load the input tables from ../Data (relative to the notebook's working directory).
# The training table without info is pipe-delimited; the other two files use the default comma.
Training_data = pd.read_csv("../Data/Training_data_withoutInfo.csv", sep='|')
Testset_data= pd.read_csv('../Data/test_set_data.csv')
Training_data_with_features = pd.read_csv("../Data/training_features.csv")

In [None]:
# Quick look at the test set: number of rows and available columns.
print(len(Testset_data))
print(Testset_data.columns)
# Label every test-set compound with Mark = 0 (the same value used below to select the benign training set).
Testset_data['Mark'] = 0

Check whether there is any overlap between the training-set compounds and the test-set compounds

In [None]:
def _smiles_to_inchi(smiles):
    # Parse the SMILES with RDKit and convert the molecule to an InChI string.
    return Chem.MolToInchi(Chem.MolFromSmiles(smiles))

# NOTE(review): the column is named "noStereo" but no stereo-stripping option is passed here;
# presumably 'SmilesForDropDu' already has stereochemistry removed. Confirm.
Testset_data['inchi_noStereo'] = Testset_data['SmilesForDropDu'].apply(_smiles_to_inchi)

In [None]:
# Count the test-set compounds whose InChI also occurs in the training set.
training_set_inchi_list = Training_data['inchi_noStereo'].tolist()
overlap_mask = Testset_data['inchi_noStereo'].isin(training_set_inchi_list)
print(len(Testset_data[overlap_mask]))

# Prediction based on M2FPs features or M2FPs+PBFPs features

To reproduce the results from scratch, use the original_testset_data to run the model. This will take approximately a few hours.

In [None]:
# original_testset_data = pd.read_csv("../Data/Testset_raw_with_drugname.csv")

In [None]:
# raw_testdata_SMILES = original_testset_data['SmilesForDropDu'].tolist()
# model = Trialblazer()
# model.import_smiles(smiles=raw_testdata_SMILES)
# model.run()
# testset_prediction = model.result.copy()

model prediction based on M2FPs+PBFPs 

In [None]:
# Run Trialblazer with its default feature set on the test set and keep a copy of the result table.
# NOTE(review): remove_MultiComponent_cpd=False presumably keeps multi-component entries. Confirm against the Trialblazer docs.
model_combined_feature = Trialblazer(remove_MultiComponent_cpd=False)
model_combined_feature.test_set=Testset_data
model_combined_feature.run()
prediction_combined_feature = model_combined_feature.result.copy()

In [None]:
# Display the prediction table for the test set.
prediction_combined_feature

model prediction based on M2FPs

In [None]:
# Same run as above, but with M2FP_only=True.
# NOTE(review): every other cell reads `model.result.copy()`; this one uses `get_dataframe()`.
# Confirm that both return the same table.
model_M2FP = Trialblazer(M2FP_only=True, remove_MultiComponent_cpd=False)
model_M2FP.test_set=Testset_data
model_M2FP.run()
prediction_M2FP = model_M2FP.get_dataframe()

# Assign scores to benign and toxic compounds in the training set

In [None]:
# Split the featurised training data by label: Mark == 0 -> benign, Mark == 1 -> toxic.
training_mark = Training_data_with_features['Mark']
benign_set = Training_data_with_features.loc[training_mark == 0]
toxic_set = Training_data_with_features.loc[training_mark == 1]

model prediction based on M2FPs

In [None]:
# Score the benign training compounds with the M2FP-only model.
# `model_M2FP` is rebound to a fresh Trialblazer instance here.
model_M2FP= Trialblazer(M2FP_only = True, remove_MultiComponent_cpd=False)
model_M2FP.test_set=benign_set
model_M2FP.run()
prediction_benign_set_M2FP = model_M2FP.result.copy()

In [None]:
# Score the toxic training compounds with the M2FP-only model.
# `model_M2FP` is rebound to a fresh Trialblazer instance here.
model_M2FP = Trialblazer(M2FP_only = True, remove_MultiComponent_cpd=False)
model_M2FP.test_set=toxic_set
model_M2FP.run()
prediction_toxic_set_M2FP = model_M2FP.result.copy()

model prediction based on M2FPs+PBFPs

In [None]:
# Score the benign training compounds with the default (combined-feature) model.
# `model_combined_feature` is rebound to a fresh Trialblazer instance here.
model_combined_feature= Trialblazer(remove_MultiComponent_cpd=False)
model_combined_feature.test_set=benign_set
model_combined_feature.run()
prediction_benign_set = model_combined_feature.result.copy()

In [None]:
# Score the toxic training compounds with the default (combined-feature) model.
# `model_combined_feature` is rebound to a fresh Trialblazer instance here.
model_combined_feature = Trialblazer(remove_MultiComponent_cpd=False)
model_combined_feature.test_set=toxic_set
model_combined_feature.run()
prediction_toxic_set = model_combined_feature.result.copy()

## PrOCTOR score distribution for benign, toxic, and test sets

Example of model based on M2FPs+PBFPs features

In [None]:
# Pull the PrOCTOR scores of the three compound sets out as NumPy arrays.
Testset_withScore = np.array(prediction_combined_feature['PrOCTOR_score'])
benign = np.array(prediction_benign_set['PrOCTOR_score'])
toxic = np.array(prediction_toxic_set['PrOCTOR_score'])

In [None]:
# Mean PrOCTOR score (2 decimals) for the test set, benign training set and toxic training set, in that order.
for score_array in (Testset_withScore, benign, toxic):
    print(np.mean(score_array).round(2))

In [None]:
# Long-format table (one row per compound) used by the distribution plot.
labels = ("Benign compounds\nTraining set","Toxic compounds\nTraining set","Benign compounds\nTest set")
score_arrays = (benign, toxic, Testset_withScore)
data = pd.DataFrame({
    "Value": np.concatenate(score_arrays),
    "Category": np.repeat(labels, repeats=[len(scores) for scores in score_arrays]),
})

# Two-sided Mann-Whitney U test for each pair of categories to compare.
pairs = [("Benign compounds\nTraining set", "Toxic compounds\nTraining set"), ("Toxic compounds\nTraining set", "Benign compounds\nTest set")]
p_values = {}
for cat1, cat2 in pairs:
    first_values = data.loc[data["Category"] == cat1, "Value"].to_numpy()
    second_values = data.loc[data["Category"] == cat2, "Value"].to_numpy()
    test_result = mannwhitneyu(first_values, second_values, alternative="two-sided")
    p_values[(cat1, cat2)] = test_result.pvalue

In [None]:
# Plot the score distributions of the three categories together with the p-values computed above.
plot_score_distribution_with_significance(data, p_values)

# Select compounds from test set according to PrOCTOR score

In [None]:
# Removing multi-components in here in order to find the drug name and evaluate the drug later
model_combined_feature = Trialblazer()
model_combined_feature.test_set=Testset_data
model_combined_feature.run()
prediction_combined_feature = model_combined_feature.result.copy()

In [None]:
# predict_result_sorted_nameInfo_head, predict_result_sorted_nameInfo_tail, drugs_name_head, drugs_name_tail = select_compounds(
#     prediction_combined_feature, 
#     Testset_drugs_with_name_new)

## Top 10 drugs

In [None]:
# Load the previously saved head/tail ranking tables (with matched drug names) from CSV
# instead of recomputing them with select_compounds.
predict_result_sorted_nameInfo_head = pd.read_csv("../Data/predict_result_sorted_nameInfo_head_nameMatching.csv")
predict_result_sorted_nameInfo_tail = pd.read_csv("../Data/predict_result_sorted_nameInfo_tail_nameMatching.csv")

In [None]:
# Expose the matched drug name under the column 'Name' and keep it as a list as well.
drugs_name_head = predict_result_sorted_nameInfo_head['name_to_use'].tolist()
predict_result_sorted_nameInfo_head['Name'] = drugs_name_head

In [None]:
# Only systemically administered drugs are selected to analyze
predict_result_sorted_nameInfo_head = predict_result_sorted_nameInfo_head.iloc[[0,4,6,9,10,11,12,13,14,15]]
drugs_name_head = list(predict_result_sorted_nameInfo_head['Name'])
predict_result_sorted_nameInfo_head.mol = predict_result_sorted_nameInfo_head.smi.apply(Chem.MolFromSmiles)
img_head = Chem.Draw.MolsToGridImage(list(predict_result_sorted_nameInfo_head['mol']),molsPerRow=5,subImgSize=(400,200), returnPNG=False, legends=drugs_name_head)
img_head

molecular weight and logP value calculation

In [None]:
# Molecular weight of each selected top drug, then mean and standard deviation (np.std default) of the first 10 rows.
predict_result_sorted_nameInfo_head['mw'] = predict_result_sorted_nameInfo_head['mol'].apply(cal_mw)
head_mw = predict_result_sorted_nameInfo_head.head(10)['mw']
print(round(np.mean(head_mw), 2))
print(round(np.std(head_mw), 2))

In [None]:
# logP of each selected top drug, then mean and standard deviation (np.std default) of the first 10 rows.
predict_result_sorted_nameInfo_head['logP'] = predict_result_sorted_nameInfo_head['mol'].apply(get_logp)
head_logp = predict_result_sorted_nameInfo_head.head(10)['logP']
print(round(np.mean(head_logp), 2))
print(round(np.std(head_logp), 2))

## Bottom 10 drugs

In [None]:
# Expose the matched drug name under the column 'Name' and keep it as a list as well.
drugs_name_tail = predict_result_sorted_nameInfo_tail['name_to_use'].tolist()
predict_result_sorted_nameInfo_tail['Name'] = drugs_name_tail

In [None]:
# Only systemically administered drugs are selected to analyze
# Hand-picked row positions of the loaded tail table, listed in reverse order.
# Note: this cell overwrites the table it reads from, so it must be run exactly once after the CSV is loaded.
# .copy() makes the selection an independent frame, so the later 'mw'/'logP' column
# assignments do not write into a slice (SettingWithCopyWarning).
predict_result_sorted_nameInfo_tail = predict_result_sorted_nameInfo_tail.iloc[[18,17,15,13,12,11,10,9,8,6]].copy()

In [None]:
# Names of the selected tail drugs, in table order.
drugs_name_tail = list(predict_result_sorted_nameInfo_tail['Name'])
# Manually override the first name; only this list changes, the 'Name' column keeps its original value.
drugs_name_tail[0] = 'Megestrol'

In [None]:
# assign() returns a new frame with the 'mol' column created or replaced. Attribute assignment
# (`df.mol = ...`) would only set a column if 'mol' already existed in the table.
predict_result_sorted_nameInfo_tail = predict_result_sorted_nameInfo_tail.assign(
    mol=predict_result_sorted_nameInfo_tail['smi'].apply(Chem.MolFromSmiles)
)
# Molecules and legends are both reversed so they stay paired in the grid.
mols = predict_result_sorted_nameInfo_tail['mol'].tolist()[::-1]
legends = drugs_name_tail[::-1]
img_tail = Draw.MolsToGridImage(list(mols),molsPerRow=5,subImgSize=(400,200), returnPNG=False, legends=legends)
img_tail

molecular weight and logP value calculation

In [None]:
# Molecular weight of each selected tail drug, then mean and standard deviation (np.std default) over all rows.
predict_result_sorted_nameInfo_tail['mw'] = predict_result_sorted_nameInfo_tail['mol'].apply(cal_mw)
tail_mw = predict_result_sorted_nameInfo_tail['mw']
print(round(np.mean(tail_mw), 2))
print(round(np.std(tail_mw), 2))

In [None]:
# logP of each selected tail drug, then mean and standard deviation (np.std default) of the first 10 rows.
predict_result_sorted_nameInfo_tail['logP'] = predict_result_sorted_nameInfo_tail['mol'].apply(get_logp)
tail_logp = predict_result_sorted_nameInfo_tail.head(10)['logP']
print(round(np.mean(tail_logp), 2))
print(round(np.std(tail_logp), 2))

# Correlation between test set compounds' PrOCTOR scores and molecular weight

In [None]:
# Parse each test-set SMILES into an RDKit molecule and compute its molecular weight.
prediction_combined_feature['mol'] = prediction_combined_feature['smi'].apply(Chem.MolFromSmiles)
prediction_combined_feature['mw'] = prediction_combined_feature['mol'].apply(cal_mw)

In [None]:
# The 'mol' column is recomputed here (same expression as in the molecular-weight cell),
# so this cell also works when run on its own; then logP is computed per compound.
prediction_combined_feature['mol'] = prediction_combined_feature['smi'].apply(Chem.MolFromSmiles)
prediction_combined_feature['logP'] = prediction_combined_feature['mol'].apply(get_logp)

In [None]:
# Split the test-set predictions by predicted class.
predicted_label = prediction_combined_feature['prediction']
predict_result_toxic = prediction_combined_feature.loc[predicted_label == 'toxic']
predict_result_benign = prediction_combined_feature.loc[predicted_label == 'benign']

In [None]:
# Correlation plot for the predicted-toxic and predicted-benign test compounds.
plot_correlation(predict_result_toxic, predict_result_benign)

# Analyze the adverse events for ten predicted toxic and ten predicted benign drugs

In [None]:
# Reaction-group names used below as column labels of the adverse-reaction tables
# (they select the columns that are converted to numbers, summed and plotted).
# NOTE(review): these look like a subset of the reaction groups in the EudraVigilance export. Confirm the selection criterion.
true_adverse_drug_reactions = [
    'Blood and lymphatic system disorders',
    'Cardiac disorders',
    'Ear and labyrinth disorders',
    'Endocrine disorders',
    'Eye disorders',
    'Gastrointestinal disorders',
    'General disorders and administration site conditions',
    'Hepatobiliary disorders',
    'Immune system disorders',
    'Infections and infestations',
    'Metabolism and nutrition disorders',
    'Musculoskeletal and connective tissue disorders',
    'Neoplasms benign, malignant and unspecified (incl cysts and polyps)',
    'Nervous system disorders',
    'Psychiatric disorders',
    'Renal and urinary disorders',
    'Reproductive system and breast disorders',
    'Respiratory, thoracic and mediastinal disorders',
    'Skin and subcutaneous tissue disorders',
    'Vascular disorders'
]

Data source: EudraVigilance -- adverse reaction report analysis

In [None]:
# Pipe-delimited adverse-reaction report tables for the predicted-benign and predicted-toxic drugs.
pre_benign = pd.read_csv('../Data/pre_benign.csv',sep='|')
pre_toxic = pd.read_csv('../Data/pre_toxic.csv',sep='|')

Total number of suspected adverse reactions reported for the selected drugs

In [None]:
# Keep the per-seriousness rows (everything except the 'Total' rows) and drop the 'Total' column.
benign_not_total = pre_benign['Seriousness'] != 'Total'
toxic_not_total = pre_toxic['Seriousness'] != 'Total'
pre_benign_1 = pre_benign.loc[benign_not_total].drop(columns=['Total'])
pre_toxic_1 = pre_toxic.loc[toxic_not_total].drop(columns=['Total'])

In [None]:
# Names of all columns from the fifth one onwards.
list_of_adverse_reaction = pre_benign_1.columns[4:].tolist()

In [None]:
# Compare the benign and toxic groups over the selected reaction columns.
SuspectedAdverseDrugEvents_count(pre_benign_1, pre_toxic_1, true_adverse_drug_reactions)

Number of suspected adverse reactions reported for specific drugs

In [None]:
# Keep only the 'Total' rows of each table.
# .copy() makes these independent frames: a later cell assigns to their 'Total' column,
# which would otherwise be a write into a filtered slice (SettingWithCopyWarning).
pre_benign_2 = pre_benign[pre_benign['Seriousness'] == 'Total'].copy()
pre_toxic_2 = pre_toxic[pre_toxic['Seriousness'] == 'Total'].copy()

In [None]:
# Stack the benign rows on top of the toxic rows with a fresh 0..n-1 index.
pre_combine = pd.concat([pre_benign_2, pre_toxic_2], ignore_index=True)

In [None]:
# Strip the ',' characters from the reaction columns and convert them to float.
# Only pre_combine is converted here; pre_benign_2 / pre_toxic_2 keep their original values.
pre_combine[true_adverse_drug_reactions] = pre_combine[true_adverse_drug_reactions].replace(',', '', regex=True).astype(float)

In [None]:
# Strip the ',' characters from the 'Total' column of each table and convert it to float.
benign_total_numeric = pre_benign_2['Total'].replace(',', '', regex=True).astype(float)
toxic_total_numeric = pre_toxic_2['Total'].replace(',', '', regex=True).astype(float)
pre_benign_2['Total'] = benign_total_numeric
pre_toxic_2['Total'] = toxic_total_numeric

In [None]:
# The reaction columns of pre_benign_2 / pre_toxic_2 were never converted to numbers (only
# pre_combine and the 'Total' column were), so convert them here before averaging. Averaging
# the raw comma-formatted strings would fail or silently skip columns.
benign_reaction_counts = pre_benign_2[true_adverse_drug_reactions].replace(',', '', regex=True).astype(float)
toxic_reaction_counts = pre_toxic_2[true_adverse_drug_reactions].replace(',', '', regex=True).astype(float)
# NOTE(review): np.mean on a DataFrame delegates to DataFrame.mean(axis=None), whose result
# depends on the pandas version (per-column means vs. one overall mean). Confirm which is
# intended and consider spelling it out with .mean(axis=0) or .to_numpy().mean().
print(np.mean(benign_reaction_counts))
print(np.mean(toxic_reaction_counts))

In [None]:
# Replace 'Total' with the row-wise sum over the selected reaction columns only.
pre_combine['Total'] = pre_combine[true_adverse_drug_reactions].sum(axis=1)

In [None]:
# Per-drug view based on the combined table (including the recomputed 'Total' column).
SuspectedAdverseDrugEvents_Totalcount_for_eachdrug(pre_combine)

In [None]:
# Per-drug view over the selected reaction columns.
SuspectedAdverseDrugEvents_count_for_eachdrug(pre_combine, true_adverse_drug_reactions)