In [117]:
#import libs
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

In [118]:
# Load the three per-virus sheets (P100, MS2, Phi6) from the ILS workbook.
#
# Passing a list to `sheet_name` makes pandas open and parse the workbook once
# and return a dict of DataFrames keyed by sheet name, instead of re-reading the
# same file for every sheet. `index_col=0` is applied to each sheet, so the
# first column of every sheet becomes that frame's index.
ils_sheets = pd.read_excel('ILS-SETS.xlsx', sheet_name=['P100', 'MS2', 'Phi6'], index_col=0)

df_p100 = ils_sheets['P100']
df_MS2 = ils_sheets['MS2']
df_Phi6 = ils_sheets['Phi6']

# P100 MODEL

In [119]:
# P100 training split: rows whose 'Split' column equals 'T'.
# Features are the last three columns of the sheet (selected by position);
# the target is the 'Activity Class' column.
# NOTE(review): positional selection assumes the feature columns stay last in
# the sheet -- confirm against the workbook layout.
train_p100 = df_p100.loc[df_p100['Split'] == 'T']
Xt_p100 = train_p100.iloc[:, -3:]
yt_p100 = train_p100['Activity Class']

# P100 validation split: rows whose 'Split' column equals 'V', same columns.
val_p100 = df_p100.loc[df_p100['Split'] == 'V']
Xv_p100 = val_p100.iloc[:, -3:]
yv_p100 = val_p100['Activity Class']

In [120]:
# Fit a shallow decision tree for P100 and report its accuracy.
#
# max_depth=3 limits the depth of the tree; class_weight='balanced' weights
# samples inversely to their class frequency in the training labels.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split: without a fixed seed, ties between equally good
# splits can be broken differently on each run and the scores printed below
# would not be reproducible.
dt_classifier = DecisionTreeClassifier(max_depth=3, class_weight='balanced', random_state=42)
dt_classifier.fit(Xt_p100, yt_p100)

# predictions for training and validation sets
predicted_train_p100 = dt_classifier.predict(Xt_p100)
predicted_val_p100 = dt_classifier.predict(Xv_p100)

# Accuracy computed from the predictions above. This is the same value that
# dt_classifier.score(X, y) returns, without predicting every sample twice.
accuracy_train = accuracy_score(yt_p100, predicted_train_p100)
accuracy_val = accuracy_score(yv_p100, predicted_val_p100)

# 5-fold cross-validation (an integer cv with a classifier uses stratified folds).
# cross_val_predict clones the estimator and refits it on folds of the
# *validation* split only, so this score describes trees trained on validation
# data; it is independent of the tree fitted on the training split above.
# NOTE(review): confirm that cross-validating on the validation split (rather
# than the training split) is the intended protocol.
predicted_val_p100_cv = cross_val_predict(estimator=dt_classifier, X=Xv_p100, y=yv_p100, cv=5)
val_cv_score = accuracy_score(yv_p100, predicted_val_p100_cv)

print(f'Accuracy for training set: {np.round(accuracy_train, 2)}\nAccuracy for validation set: {np.round(accuracy_val, 2)}')
print(f'Cross-validation score: {np.round(val_cv_score, 3)}')

Accuracy for training set: 0.93
Accuracy for validation set: 0.86
Cross-validation score: 0.765


In [121]:
# Per-class precision / recall / F1 for the P100 validation predictions.
# In binary classification, recall of the positive class is also called
# "sensitivity" and recall of the negative class is called "specificity".
print(classification_report(y_true=yv_p100, y_pred=predicted_val_p100))

              precision    recall  f1-score   support

      Active       0.60      0.67      0.63         9
    Inactive       0.93      0.90      0.92        42

    accuracy                           0.86        51
   macro avg       0.76      0.79      0.77        51
weighted avg       0.87      0.86      0.87        51



# MS2 MODEL

In [122]:
# MS2 training split: rows whose 'Split' column equals 'T'.
# Features are the last three columns of the sheet (selected by position);
# the target is the 'Activity Class' column.
# NOTE(review): positional selection assumes the feature columns stay last in
# the sheet -- confirm against the workbook layout.
train_MS2 = df_MS2.loc[df_MS2['Split'] == 'T']
Xt_MS2 = train_MS2.iloc[:, -3:]
yt_MS2 = train_MS2['Activity Class']

# MS2 validation split: rows whose 'Split' column equals 'V', same columns.
val_MS2 = df_MS2.loc[df_MS2['Split'] == 'V']
Xv_MS2 = val_MS2.iloc[:, -3:]
yv_MS2 = val_MS2['Activity Class']

In [123]:
# Fit a shallow decision tree for MS2 and report its accuracy.
#
# max_depth=3 limits the depth of the tree; class_weight='balanced' weights
# samples inversely to their class frequency in the training labels.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split: without a fixed seed, ties between equally good
# splits can be broken differently on each run and the scores printed below
# would not be reproducible.
dt_classifier = DecisionTreeClassifier(max_depth=3, class_weight='balanced', random_state=42)
dt_classifier.fit(Xt_MS2, yt_MS2)

# predictions for training and validation sets
predicted_train_MS2 = dt_classifier.predict(Xt_MS2)
predicted_val_MS2 = dt_classifier.predict(Xv_MS2)

# Accuracy computed from the predictions above. This is the same value that
# dt_classifier.score(X, y) returns, without predicting every sample twice.
accuracy_train = accuracy_score(yt_MS2, predicted_train_MS2)
accuracy_val = accuracy_score(yv_MS2, predicted_val_MS2)

# 5-fold cross-validation (an integer cv with a classifier uses stratified folds).
# cross_val_predict clones the estimator and refits it on folds of the
# *validation* split only, so this score describes trees trained on validation
# data; it is independent of the tree fitted on the training split above.
# NOTE(review): confirm that cross-validating on the validation split (rather
# than the training split) is the intended protocol.
predicted_val_MS2_cv = cross_val_predict(estimator=dt_classifier, X=Xv_MS2, y=yv_MS2, cv=5)
val_cv_score = accuracy_score(yv_MS2, predicted_val_MS2_cv)

print(f'Accuracy for training set: {np.round(accuracy_train, 2)}\nAccuracy for validation set: {np.round(accuracy_val, 2)}')
print(f'Cross-validation score: {np.round(val_cv_score, 3)}')

Accuracy for training set: 0.96
Accuracy for validation set: 0.88
Cross-validation score: 0.863


In [124]:
# Per-class precision / recall / F1 for the MS2 validation predictions.
print(classification_report(y_true=yv_MS2, y_pred=predicted_val_MS2))

              precision    recall  f1-score   support

      Active       0.62      0.62      0.62         8
    Inactive       0.93      0.93      0.93        43

    accuracy                           0.88        51
   macro avg       0.78      0.78      0.78        51
weighted avg       0.88      0.88      0.88        51



# PHI6 MODEL

In [125]:
# Phi6 training split: rows whose 'Split' column equals 'T'.
# Features are the last three columns of the sheet (selected by position);
# the target is the 'Activity Class' column.
# NOTE(review): positional selection assumes the feature columns stay last in
# the sheet -- confirm against the workbook layout.
train_Phi6 = df_Phi6.loc[df_Phi6['Split'] == 'T']
Xt_Phi6 = train_Phi6.iloc[:, -3:]
yt_Phi6 = train_Phi6['Activity Class']

# Phi6 validation split: rows whose 'Split' column equals 'V', same columns.
val_Phi6 = df_Phi6.loc[df_Phi6['Split'] == 'V']
Xv_Phi6 = val_Phi6.iloc[:, -3:]
yv_Phi6 = val_Phi6['Activity Class']

In [126]:
# Fit a shallow decision tree for Phi6 and report its accuracy.
#
# max_depth=3 limits the depth of the tree; class_weight='balanced' weights
# samples inversely to their class frequency in the training labels.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split: without a fixed seed, ties between equally good
# splits can be broken differently on each run and the scores printed below
# would not be reproducible.
dt_classifier = DecisionTreeClassifier(max_depth=3, class_weight='balanced', random_state=42)
dt_classifier.fit(Xt_Phi6, yt_Phi6)

# predictions for training and validation sets
predicted_train_Phi6 = dt_classifier.predict(Xt_Phi6)
predicted_val_Phi6 = dt_classifier.predict(Xv_Phi6)

# Accuracy computed from the predictions above. This is the same value that
# dt_classifier.score(X, y) returns, without predicting every sample twice.
accuracy_train = accuracy_score(yt_Phi6, predicted_train_Phi6)
accuracy_val = accuracy_score(yv_Phi6, predicted_val_Phi6)

# 5-fold cross-validation (an integer cv with a classifier uses stratified folds).
# cross_val_predict clones the estimator and refits it on folds of the
# *validation* split only, so this score describes trees trained on validation
# data; it is independent of the tree fitted on the training split above.
# NOTE(review): confirm that cross-validating on the validation split (rather
# than the training split) is the intended protocol.
predicted_val_Phi6_cv = cross_val_predict(estimator=dt_classifier, X=Xv_Phi6, y=yv_Phi6, cv=5)
val_cv_score = accuracy_score(yv_Phi6, predicted_val_Phi6_cv)

print(f'Accuracy for training set: {np.round(accuracy_train, 2)}\nAccuracy for validation set: {np.round(accuracy_val, 2)}')
print(f'Cross-validation score: {np.round(val_cv_score, 3)}')

Accuracy for training set: 0.88
Accuracy for validation set: 0.9
Cross-validation score: 0.843


In [127]:
# Per-class precision / recall / F1 for the Phi6 validation predictions.
print(classification_report(y_true=yv_Phi6, y_pred=predicted_val_Phi6))

              precision    recall  f1-score   support

      Active       0.90      0.86      0.88        21
    Inactive       0.90      0.93      0.92        30

    accuracy                           0.90        51
   macro avg       0.90      0.90      0.90        51
weighted avg       0.90      0.90      0.90        51

