# Import required modules
Make sure you installed the conda environment and activated it first by running:

```
conda env create -f env.yml
```

Then you can choose that environment for the jupyter kernel.

## Important!!
If you want to use the witwidget and haven't already run these commands you must do so!

```
jupyter nbextension install --py --symlink --sys-prefix witwidget
jupyter nbextension enable --py --sys-prefix witwidget
```

In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (average_precision_score, confusion_matrix,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import StratifiedKFold, train_test_split
from tensorflow import convert_to_tensor, float64
from tensorflow.data import Dataset
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from witwidget.notebook.visualization import WitConfigBuilder, WitWidget

# Load data and make required directories

In [2]:
dat_in_pth = "../data/features.pkl"
dat_df = pd.read_pickle(dat_in_pth)

results_dir = "results"
seed = 8675309
np.random.seed(seed)

if not os.path.exists(results_dir):
    os.makedirs(results_dir)

training_set_tss_ids_pth = results_dir + "/training_set_tss_ids.txt"
heldout_test_set_tss_ids_pth = results_dir + "/heldout_test_set_tss_ids.txt"
crossval_results_pth = results_dir + "/crossval_results.csv"
test_performance_pth = results_dir + "/heldout_test_performance.csv"
coefs_pth = results_dir + "/coefficients.csv"

dat_df.head()

Unnamed: 0_level_0,M0004_1.02_FWD_1,M0004_1.02_FWD_2,M0004_1.02_FWD_3,M0004_1.02_FWD_4,M0004_1.02_FWD_5,M0004_1.02_FWD_6,M0004_1.02_FWD_7,M0005_1.02_FWD_1,M0005_1.02_FWD_2,M0005_1.02_FWD_3,...,GA_REV_2,GA_REV_3,GA_REV_4,GA_REV_5,GA_REV_6,GA_REV_7,GCcontent,CAcontent,GAcontent,class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Chr4_+_7750398_AT4G13310_root_0,1.440441,0.0,0.0,4.508889,9.438149,9.714371,17.533581,5.50111,0.0,0.0,...,11.805773,9.450124,12.967102,8.974781,14.424215,13.597577,0.3,0.59,0.58,1
Chr4_+_11302680_AT4G21200_leaf_0,11.11125,11.11125,0.0,0.0,12.846526,16.47229,5.290287,2.406347,0.0,0.0,...,0.824729,9.927819,16.496307,6.568488,4.86594,4.86594,0.34,0.54,0.43,1
Chr4_-_10918262_AT4G20210_root_0,0.293377,4.510425,4.217049,4.559656,5.156968,0.597312,0.0,0.0,0.0,0.0,...,2.222053,11.24296,9.343733,6.638287,25.419486,18.781199,0.41,0.545,0.475,1
Chr5_-_26757236_AT5G67030_tair_0,0.55542,2.275065,2.275065,0.0,0.0,0.189317,22.804014,0.0,0.0,0.0,...,9.204328,3.095468,0.0,11.282284,13.337318,11.061596,0.425,0.495,0.44,1
Chr1_-_25046860_AT1G67080_tair_0,0.0,1.398959,1.398959,1.3773,2.470973,1.093673,0.0,0.0,0.0,0.0,...,11.599996,10.826384,11.891774,11.529227,11.948718,8.52895,0.305,0.545,0.55,1


# Split data into train/test sets

In [23]:
x_train, x_heldout, y_train, y_heldout = train_test_split(
    dat_df.iloc[:, :-1], dat_df.iloc[:, -1], test_size=0.2, stratify=dat_df["class"]
)

### Optionally save IDs

In [None]:
np.savetxt(training_set_tss_ids_pth, x_train.index.values, fmt="%s")
np.savetxt(heldout_test_set_tss_ids_pth, x_heldout.index.values, fmt="%s")

# Define model object

In [4]:
model = Sequential()
model.add(Dense(1, input_shape=(x_train.shape[1],), activation="sigmoid"))
model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["binary_accuracy", "AUC"]
)

# Fit the model on training data

In [5]:
n_cv_slices = 10

for slice_num, (train_ids, test_ids) in enumerate(
    StratifiedKFold(n_splits=n_cv_slices).split(x_train, y_train), 1
):
    slice_train_xs, slice_test_xs = x_train.iloc[train_ids], x_train.iloc[test_ids]
    slice_train_ys, slice_test_ys = y_train.iloc[train_ids], y_train.iloc[test_ids]

    model.fit(x=slice_train_xs, y=slice_train_ys, epochs=10, verbose=0)

    model.evaluate(x=slice_test_xs, y=slice_test_ys)



# Test the model

In [6]:
model.evaluate(x=x_heldout, y=y_heldout)



[2.2190513610839844, 0.6346153616905212, 0.6708579659461975]

# What-If Tool

In [24]:
def df_to_examples(df, columns=None):
    examples = []
    if columns == None:
        columns = df.columns.values.tolist()
    for index, row in df.iterrows():
        example = tf.train.Example()
        for col in columns:
            if row[col].dtype is np.dtype(np.int64):
                example.features.feature[col].int64_list.value.append(int(row[col]))
            elif row[col].dtype is np.dtype(np.float64):
                example.features.feature[col].float_list.value.append(row[col])
            else:
                raise TypeError("Issue!!")
        examples.append(example)
    return examples

n_features = 10

wit_df = pd.concat([x_train.iloc[:, :n_features], y_train], axis=1)
wit_data = df_to_examples(wit_df)

In [31]:
config_builder = (
    WitConfigBuilder(wit_data)
#     .set_custom_predict_fn(model.predict) # this doesn't work :(
)
WitWidget(config_builder, height=500)

WitWidget(config={'model_type': 'classification', 'label_vocab': [], 'are_sequence_examples': False, 'inferenc…

# Compute performance metrics for predictions on the test set

In [None]:
y_true = y_heldout.values
y_prediction_probabilities = model.predict(x_heldout)

roc_curve_df = (
    pd.DataFrame(roc_curve(y_true, y_prediction_probabilities))
    .T.rename(columns={0: "FPR", 1: "TPR", 2: "threshold"})
    .drop(index=0)
    .reset_index(drop=True)
)
roc_curve_df["Y"] = roc_curve_df["TPR"] - roc_curve_df["FPR"]
youden_T = roc_curve_df.iloc[roc_curve_df["Y"].idxmax()]["threshold"]

y_pred_youden = np.where(y_prediction_probabilities >= youden_T, 1, 0)

heldout_perf_roc = roc_auc_score(y_true, y_prediction_probabilities)
heldout_perf_prc = average_precision_score(
    y_true, y_prediction_probabilities
)

confusion_mat = confusion_matrix(y_true, y_pred_youden)
TN = confusion_mat[0][0]
FP = confusion_mat[0][1]
FN = confusion_mat[1][0]
TP = confusion_mat[1][1]

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP / (TP + FN)
# Specificity or true negative rate
TNR = TN / (TN + FP)
# Precision or positive predictive value
PPV = TP / (TP + FP)

heldout_perf_f1 = 2 * ((PPV * TPR) / (PPV + TPR))

perf_metrics = pd.Series({
    "auROC": heldout_perf_roc,
    "auPRC": heldout_perf_prc,
    "youden_T": youden_T,
    "sensitivity": TPR,
    "specificity": TNR,
    "precision": PPV,
    "f1": heldout_perf_f1,
    "seed": seed,
})
perf_metrics

In [None]:
coefficients = pd.Series(model., columns=dat_df.columns[:-1])
coefficients

# Save files

In [None]:
# save metrics
perf_metrics.to_csv(test_performance_pth, header=True)

# save coefficient values
coefficients.to_csv(coefs_pth, header=True)