In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Load in the dataframe

In [None]:
# Load the pickled DataFrame that sits next to the notebook.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle('./main_df.pkl')
# Print the (rows, columns) tuple, then show the first rows as the cell output
print(df.shape)
df.head()

### Load in the taxonomy JSONs

In [None]:
import json
import pathlib

# Read every taxonomy JSON file into `orgs`, keyed by file name without extension
# (later cells look up orgs['host'] and orgs['virus']).
taxonomy_dir = pathlib.Path('./taxonomy/')

orgs = {}
# Only pick up *.json files: a bare iterdir() also yields things like
# `.ipynb_checkpoints/` or `.DS_Store`, which crash open()/json.load().
# sorted() makes the load order independent of the filesystem.
for json_path in sorted(taxonomy_dir.glob('*.json')):
    with open(json_path, 'r', encoding='utf-8') as open_file:
        orgs[json_path.stem] = json.load(open_file)

# Fail early with a clear message instead of a KeyError in a later cell
if not orgs:
    raise FileNotFoundError(f'No *.json taxonomy files found in {taxonomy_dir.resolve()}')

orgs.keys()

### Find labels (which viruses infect which hosts in nature)

In [None]:
def _natural_host_label(row):
    """Return 1 if the row's host and the virus's recorded host share the same
    last `lineage_names` entry in the taxonomy data, otherwise 0."""
    candidate_host = orgs['host'][row['host']]['lineage_names'][-1]
    recorded_host = orgs['virus'][row['#virus']]['host']['lineage_names'][-1]
    return 1 if candidate_host == recorded_host else 0


# Label every (virus, host) pair row by row
df['y'] = df.apply(_natural_host_label, axis=1)

In [None]:
# Tally how many rows carry each label value in 'y'
df['y'].value_counts()

Number of positive cases:

In [None]:
# Count the rows labelled 1 by summing the boolean mask
int((df['y'] == 1).sum())

Number of negative cases:

In [None]:
# Count the rows labelled 0 by summing the boolean mask
int((df['y'] == 0).sum())

### Get all of positive cases + the same number of negative cases randomly

In [None]:
# Build a class-balanced training frame: every positive row plus an equally
# sized random draw of negative rows.
positive_df = df[df['y'] == 1]
# Fixed random_state so the sampled negatives (and every downstream score)
# are identical on each Restart & Run All.
negative_learning_df = df[df['y'] == 0].sample(n=len(positive_df), random_state=0)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
# and keeps the original row index, exactly as append did.
learning_df = pd.concat([positive_df, negative_learning_df])
print(learning_df.shape)
learning_df.head()

### fillna - FILLING WITH mean() SEEMS LIKE A REALLY BAD IDEA

In [None]:
# Replace missing values with the per-column mean.
# numeric_only=True is required: the frame still holds the string columns
# '#virus' and 'host', and pandas >= 2.0 raises TypeError when asked for
# their mean instead of silently skipping them.
# NOTE(review): the means are computed over all rows, including those later
# held out during cross-validation - confirm this leakage is acceptable.
filled_df = learning_df.fillna(learning_df.mean(numeric_only=True))

In [None]:
# THIS IS A BETTER WAY, BUT IT RETURNS ONLY POSITIVE CASES
# filled_df = learning_df.dropna(axis=0)
# filled_df.head()

In [None]:
# Check the label balance after filling missing values
filled_df['y'].value_counts()

### Encode categorical values

In [None]:
# One-hot encode the two identifier columns; all other columns pass through unchanged
categorical_columns = ['#virus', 'host']
transformed_data = pd.get_dummies(filled_df, columns=categorical_columns)
transformed_data

### Extract X and y arrays

In [None]:
# Target vector: the 'y' label column
y = transformed_data['y'].to_numpy()
# Feature matrix: every remaining column
X = transformed_data.drop(columns='y').to_numpy()
# print(X[0])
# Number of samples
y.shape[0]

### train_test_split

In [None]:
# SIMPLE SPLIT
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

In [None]:
# Leave one out split
# Each split produced by `cv` holds out a single sample as the test set
from sklearn.model_selection import LeaveOneOut
cv = LeaveOneOut()

### Feature scaling (if necessary) 

### Train the Random Forest Classifier

In [None]:
# ONLY FOR SIMPLE SPLIT USE
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
# classifier.fit(X_train, y_train)

In [None]:
# Debug probe: show what kind of object cv.split is
type(cv.split)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Leave-one-out evaluation: for every sample, fit a fresh forest on all other
# samples and predict the single held-out one.
y_t, y_p = list(), list()
# cv.split(X) is a generator with no length, so tqdm cannot infer a total;
# pass the split count explicitly to get a percentage and an ETA.
for train_ix, test_ix in tqdm(cv.split(X), total=cv.get_n_splits(X)):
    # split data
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    # fit model (fixed random_state keeps every fold reproducible)
    model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state=1)
    model.fit(X_train, y_train)
    # evaluate model on the one held-out sample
    yhat = model.predict(X_test)
    # store the true label and the prediction for that sample
    y_t.append(y_test[0])
    y_p.append(yhat[0])
# calculate accuracy over all out-of-fold predictions
acc = accuracy_score(y_t, y_p)
print(f'Accuracy: {acc:.3f}')

Explicit approach

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Leave-one-out evaluation with a manually driven progress bar.
# y_true / y_pred collect the label and the out-of-fold prediction per sample.
y_true, y_pred = list(), list()

# The context manager closes the bar even if fit/predict raises part-way
# through; a bare tqdm(...) followed by pbar.close() would leave it open.
with tqdm(total=len(X)) as pbar:
    # enumerate splits
    for train_ix, test_ix in cv.split(X):
        # split data
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]
        # fit model (fixed random_state keeps every fold reproducible)
        model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state=1)
        model.fit(X_train, y_train)
        # evaluate model on the one held-out sample
        yhat = model.predict(X_test)
        # store
        y_true.append(y_test[0])
        y_pred.append(yhat[0])
        pbar.update(1)

# calculate accuracy over all out-of-fold predictions
acc = accuracy_score(y_true, y_pred)
print(f'Accuracy: {acc:.3f}')

Using cross_val_score function

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Estimator to evaluate, configured like the one in the manual loops
model = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=1,
)
# Leave-one-out splitter
cv = LeaveOneOut()
# One accuracy value per split, computed on all available cores
scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
# Mean accuracy with its standard deviation in parentheses
print(f'Accuracy: {np.mean(scores):.3f} ({np.std(scores):.3f})')

### Predict Test set results

In [None]:
# `classifier` is only created in the commented-out simple-split cell, so
# classifier.predict(X_test) raises NameError on a fresh kernel; in addition,
# after the leave-one-out loops X_test / y_test hold only the last held-out sample.
# Report the out-of-fold predictions collected by the explicit leave-one-out
# loop (`y_true`, `y_pred`) instead.
y_test = np.asarray(y_true)  # true label of every held-out sample
y_pred = np.asarray(y_pred)  # matching out-of-fold prediction
# Side-by-side table: column 0 = predicted label, column 1 = true label
print(np.column_stack((y_pred, y_test)))

### Score the model - Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the leave-one-out predictions. After the LOO loops `y_test` is only the
# final fold's single label, so the full out-of-fold lists `y_true` / `y_pred`
# from the explicit loop are used as ground truth and prediction.
cm = confusion_matrix(y_true, y_pred)
print(cm)
accuracy_score(y_true, y_pred)

### ROC curve

In [None]:
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_predict

# `classifier` is never defined in executable code and X_test is a single
# sample after leave-one-out, so classifier.predict_proba(X_test) cannot work.
# Obtain one out-of-fold probability per sample instead: cross_val_predict
# refits a clone of `model` on every split of `cv` and predicts the held-out row
# (this repeats the full leave-one-out fitting cost).
probs = cross_val_predict(model, X, y, cv=cv, method='predict_proba', n_jobs=-1)
preds = probs[:, 1]  # probability of the positive class (label 1)
# calculate the fpr and tpr for all thresholds of the classification
fpr, tpr, threshold = metrics.roc_curve(y, preds)
roc_auc = metrics.auc(fpr, tpr)

# Plot the ROC curve together with the chance diagonal
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.legend(loc = 'lower right')
ax.plot([0, 1], [0, 1],'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

### ROC curve is a bad metric in this case – use F1 (harmonic mean of Precision & Recall) instead