### Check SparseChem predictions for compounds with known labels

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import sklearn

In [None]:
# Copied from SparseChem utils.py
def all_metrics(y_true, y_score):
    """Compute binary-classification metrics for one set of labels and scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels, one per sample.
    y_score : array-like
        Predicted scores, one per sample. Scores above 0.5 count as the
        positive class when computing Cohen's kappa.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``roc_auc_score``, ``auc_pr``,
        ``avg_prec_score``, ``max_f1_score`` and ``kappa``. Every value is NaN
        when there are fewer than two samples or when all labels are
        identical, because the metrics are undefined in those cases.
    """
    metric_names = ["roc_auc_score", "auc_pr", "avg_prec_score", "max_f1_score", "kappa"]

    # Work on plain arrays: positional access such as y_true[0] is a *label*
    # lookup on a pandas Series and fails when the index does not contain 0
    # (e.g. after filtering a DataFrame). This also lets plain lists through.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    if len(y_true) <= 1 or (y_true[0] == y_true).all():
        return pd.DataFrame({name: [np.nan] for name in metric_names})

    y_classes = np.where(y_score > 0.5, 1, 0)

    roc_auc = sklearn.metrics.roc_auc_score(
          y_true  = y_true,
          y_score = y_score)
    # Scores are passed positionally: the keyword for this argument was renamed
    # from `probas_pred` to `y_score` in newer scikit-learn releases.
    precision, recall, thresholds = sklearn.metrics.precision_recall_curve(y_true, y_score)

    ## calculating F1 for all cutoffs
    # Entries with zero precision keep F1 = 0, which also avoids a 0/0 division.
    F1_score       = np.zeros(len(precision))
    mask           = precision > 0
    F1_score[mask] = 2 * (precision[mask] * recall[mask]) / (precision[mask] + recall[mask])

    max_f1_score = F1_score.max()
    auc_pr = sklearn.metrics.auc(x = recall, y = precision)
    avg_prec_score = sklearn.metrics.average_precision_score(
          y_true  = y_true,
          y_score = y_score)
    kappa = sklearn.metrics.cohen_kappa_score(y_true, y_classes)
    values = [roc_auc, auc_pr, avg_prec_score, max_f1_score, kappa]
    return pd.DataFrame({name: [value] for name, value in zip(metric_names, values)})

In [None]:
# Read the SparseChem prediction array from disk (e.g. predictions for new
# compounds or for a random subset of the training set).
prediction_file = r"y_hat.npy"
pred = np.load(prediction_file)

In [None]:
# Reshape the prediction matrix into a long-format DataFrame with one row per
# (compound, task) pair, carrying the continuous compound/task IDs.
# Assumes `pred` is a dense 2-D array with compounds as rows and tasks as columns.
n_compounds, n_tasks = pred.shape
df = pd.DataFrame({
    # Row index of every element: 0,0,...,0,1,1,...,1,...
    "cont_descriptor_vector_id": np.repeat(np.arange(n_compounds, dtype=np.int64), n_tasks),
    # Column index of every element: 0,1,...,n_tasks-1,0,1,...
    "cont_classification_task_id": np.tile(np.arange(n_tasks, dtype=np.int64), n_compounds),
    # Row-major flattening matches the ordering of the two ID columns above.
    "prediction": pred.ravel(),
})

In [None]:
# Load the prediction mapping table (T5) and keep only the first row seen for
# each input_compound_id.
# NOTE: the path uses Windows-style separators.
T5_pred = pd.read_csv(r"results_tmp\T2_pred_mapping_table_T5.csv").drop_duplicates(
    subset="input_compound_id"
)

In [None]:
# Join the input compound IDs (T5) with the T11 prediction file, matching rows
# on descriptor_vector_id (inner join, which is the merge default).
# NOTE: the path uses Windows-style separators.
T11_pred = pd.read_csv(r"results\T2_pred_T11.csv")
df2 = T5_pred.merge(right=T11_pred, how="inner", on="descriptor_vector_id")

In [None]:
# Attach the compound ID columns to the long-format predictions by matching on
# the continuous compound ID (inner join, which is the merge default).
df = df.merge(right=df2, how="inner", on="cont_descriptor_vector_id")

In [None]:
# Load the mapping table from model training and join it on the continuous
# task ID to get the initial task and assay IDs.
# NOTE: the path uses Windows-style separators.
T3_train = pd.read_csv(r"training\results\weight_table_T3_mapped.csv")
df = df.merge(T3_train, on="cont_classification_task_id")
# Remove columns that are not needed for the evaluation. The `columns=` keyword
# is used because passing `axis` positionally to DataFrame.drop is no longer
# accepted by pandas 2.x.
df = df.drop(columns=["weight", "assay_type", "fp_val_json", "fp_json", "fold_id"])
# Call head() so the cell shows the first rows rather than the bound method.
df.head()

In [None]:
# Merge with the input labels on compound and task IDs. A T4-like file has to
# be prepared for the prediction compounds beforehand.
def _most_frequent(values):
    # value_counts() sorts by descending frequency, so the first index entry
    # is the most common value of the group.
    return values.value_counts().index[0]


label_keys = ["input_compound_id", "classification_task_id"]
act = pd.read_csv(r"T4_like.csv")
# Collapse repeated (compound, task) rows to their most frequent value per column.
act = act.groupby(label_keys).agg(_most_frequent)
df = df.merge(act, on=label_keys)
df.shape

In [None]:
# Check predictions for compounds labelled 1 in the input (most should be close to 1).
# drop() returns a new frame here instead of modifying a filtered slice in
# place, and uses `columns=` because positional `axis` is no longer accepted
# by pandas 2.x.
df_actives = df.loc[df["class_label"] == 1].drop(
    columns=["cont_classification_task_id", "classification_task_id"]
)
print(df_actives.sort_values("prediction", ascending=False).to_string())

In [None]:
# Show the first ten rows of df, ordered by compound ID and then task ID.
preview_keys = ["input_compound_id", "classification_task_id"]
df.sort_values(by=preview_keys).head(10)

In [None]:
# Compute the metrics once over all tasks pooled together.
all_metrics(y_true=df["class_label"], y_score=df["prediction"])

In [None]:
# Report the metrics separately for every assay ID present in df.
for assay_id in df["input_assay_id"].unique():
    in_assay = df["input_assay_id"] == assay_id
    # reset_index() gives the subset a fresh 0-based index.
    assay_pred = df[in_assay].reset_index()
    print(assay_id)
    print(all_metrics(assay_pred["class_label"], assay_pred["prediction"]))