### Import basic libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Load in the dataframe

In [2]:
df = pd.read_pickle('./main_df.pkl')
print(df.shape)
df.head()

(2213180, 14)


Unnamed: 0,virus,host,blastn,crisprdetect-2mismatch,gc_content,k25,k6-chebyshev,k6-kendalltau,k6-manhattan,piler-2mismatch,wish,group,group_code,y
0,NC_010363,NC_008527,60.8,0.0,0.021973,0.0,0.002122,0.398421,0.382144,0.0,-1.33553,Streptococcaceae,41,1
1,NC_010363,NC_002662,59.0,0.0,0.016709,0.0,0.001929,0.397773,0.377498,0.0,-1.33035,Streptococcaceae,41,1
2,NC_010363,NC_017949,59.0,0.0,0.020818,0.0,0.002088,0.396969,0.379686,0.0,-1.33341,Streptococcaceae,41,1
3,NC_010363,NC_017492,59.0,0.0,0.022209,0.0,0.002131,0.396148,0.38093,0.0,-1.33767,Streptococcaceae,41,1
4,NC_010363,NC_009004,59.0,0.0,0.020871,0.0,0.002088,0.397095,0.379834,0.0,-1.33341,Streptococcaceae,41,1


## Data preprocessing

### Get all of positive cases + the same number of negative cases randomly

In [3]:
# get all the positive cases
learning_df = df[df['y'] == 1]
# get the same amount of negative cases RANDOMLY
negative_learning_df = df[df['y'] == 0].sample(n=len(df[df['y'] == 1].index))

learning_df = pd.concat([learning_df, negative_learning_df], ignore_index=True)
print(learning_df.shape)
learning_df.head()

(33514, 14)


Unnamed: 0,virus,host,blastn,crisprdetect-2mismatch,gc_content,k25,k6-chebyshev,k6-kendalltau,k6-manhattan,piler-2mismatch,wish,group,group_code,y
0,NC_010363,NC_008527,60.8,0.0,0.021973,0.0,0.002122,0.398421,0.382144,0.0,-1.33553,Streptococcaceae,41,1
1,NC_010363,NC_002662,59.0,0.0,0.016709,0.0,0.001929,0.397773,0.377498,0.0,-1.33035,Streptococcaceae,41,1
2,NC_010363,NC_017949,59.0,0.0,0.020818,0.0,0.002088,0.396969,0.379686,0.0,-1.33341,Streptococcaceae,41,1
3,NC_010363,NC_017492,59.0,0.0,0.022209,0.0,0.002131,0.396148,0.38093,0.0,-1.33767,Streptococcaceae,41,1
4,NC_010363,NC_009004,59.0,0.0,0.020871,0.0,0.002088,0.397095,0.379834,0.0,-1.33341,Streptococcaceae,41,1


### Groups dristribution

NOTE: evening out the differences in group compositions will be added later

In [None]:
learning_df[['group','group_code']].value_counts()

group                                              group_code
Enterobacteriaceae                                 16            12661
Staphylococcaceae                                  40             4814
Pseudomonadaceae                                   34             2904
Streptococcaceae                                   41             2237
Bacillaceae                                        5              1546
Burkholderiaceae                                   9              1241
Vibrionaceae                                       46             1130
Listeriaceae                                       22              884
Propionibacteriaceae                               33              674
Mycobacteriaceae                                   25              636
Xanthomonadaceae                                   47              446
Lactobacillaceae                                   20              393
Enterococcaceae                                    17              316
Clostridiaceae 

In [None]:
learning_df[learning_df['y'] == 1][['group', 'group_code']].value_counts()

### Encode categorical values

*Note: encoding not needed because training is not based on virus and host names*

In [None]:
'''
transformed_data = pd.get_dummies(filled_df, columns=['virus', 'host'])
transformed_data
'''

### Extract X and y arrays

\+ get groups for LeaveOneGroupOut

In [4]:
# can also use .values for X and y for speed, but without it it's easier to look at these sets 
X = learning_df.drop(['virus', 'host', 'group', 'group_code', 'y'], axis=1)
y = learning_df['y']
groups = learning_df['group_code'].values

print(f'shape of X: {X.shape}')
print(f'len(y): {len(y)}')

shape of X: (33514, 9)
len(y): 33514


# Training using cross validation

### Using cross_validate function

In [5]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# determine the scoring method
scoring = ['f1']
# create logo cv procedure
logo = LeaveOneGroupOut()
# create model
model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state=1)
# evaluate model
results = cross_validate(model, X, y, scoring=scoring, 
                        cv=logo, groups=groups, n_jobs=-1, return_estimator=True)

In [8]:
results.keys()

dict_keys(['fit_time', 'score_time', 'estimator', 'test_f1'])

In [9]:
results['test_f1']

array([0.66666667, 0.66666667, 0.        , 0.9       , 0.33333333,
       0.62008734, 0.44444444, 0.        , 0.66666667, 0.60818713,
       0.39215686, 0.75      , 0.03076923, 0.77894737, 0.        ,
       0.        , 0.53096586, 0.68518519, 0.8       , 1.        ,
       0.8125    , 0.55555556, 0.90611542, 0.        , 0.575     ,
       0.68292683, 0.88888889, 1.        , 0.45454545, 0.        ,
       0.92929293, 0.91489362, 0.28571429, 0.296     , 0.33570863,
       0.91666667, 0.44444444, 1.        , 0.8       , 0.        ,
       0.88719815, 0.86373166, 0.47058824, 0.72727273, 0.66666667,
       1.        , 0.39564428, 0.48484848])

In [40]:
len(results['estimator'])

48

## Get probabilities of class assignment for each element in main_df

for each estimator:
1. take the viruses from the test part of the estimator
2. for each of this virus estimate class for ALL the bacteria in main_df
3. check if estimation was correct
4. take best predictions (highest probabilities) for each virus
5. write these predictions down

### For main df:

Create X, y and groups sets for the main df

In [6]:
X_main = df.drop(['virus', 'host', 'group', 'group_code', 'y'], axis=1)
y_main = df['y']
groups_main = df['group_code'].values

Check if each subgroup contains all the hosts

In [7]:
hosts_num = len(pd.unique(df.loc[:, 'host']))
for i in range(0, max(groups_main)+1):
    if len(pd.unique(df.loc[groups_main == i, 'host'])) != hosts_num:
        print(f'ERROR: not enough hosts in subgroup {i} (hosts: {len(pd.unique(df.loc[groups_main == i, "host"]))})!')

Calculate the probabilities of classification for viruses in each of the test sets (i.e. where groups_main == i)

In [8]:
prob_df_main = pd.DataFrame(index=range(len(df['y'])), columns=['0', '1'])
prob_df_main['0'] = prob_df_main['0'].astype('float')
prob_df_main['1'] = prob_df_main['1'].astype('float')
for i in range(0, max(groups_main)+1):
    mask_main = groups_main == i
    prob_df_main.loc[mask_main, ['0', '1']] = results['estimator'][i].predict_proba(X_main.loc[mask_main,:])

Create a df with the results

In [9]:
df_all = pd.concat([df, prob_df_main], axis=1)

In [10]:
df_all.head()

Unnamed: 0,virus,host,blastn,crisprdetect-2mismatch,gc_content,k25,k6-chebyshev,k6-kendalltau,k6-manhattan,piler-2mismatch,wish,group,group_code,y,0,1
0,NC_010363,NC_008527,60.8,0.0,0.021973,0.0,0.002122,0.398421,0.382144,0.0,-1.33553,Streptococcaceae,41,1,0.9,0.1
1,NC_010363,NC_002662,59.0,0.0,0.016709,0.0,0.001929,0.397773,0.377498,0.0,-1.33035,Streptococcaceae,41,1,0.8,0.2
2,NC_010363,NC_017949,59.0,0.0,0.020818,0.0,0.002088,0.396969,0.379686,0.0,-1.33341,Streptococcaceae,41,1,0.8,0.2
3,NC_010363,NC_017492,59.0,0.0,0.022209,0.0,0.002131,0.396148,0.38093,0.0,-1.33767,Streptococcaceae,41,1,0.9,0.1
4,NC_010363,NC_009004,59.0,0.0,0.020871,0.0,0.002088,0.397095,0.379834,0.0,-1.33341,Streptococcaceae,41,1,0.8,0.2


Check if there indeed are all the expected predictions (2699 hosts for each virus)

In [11]:
hosts_num = len(pd.unique(df['host']))
for i, el in enumerate(df_all.groupby('virus')['host'].count()):
    if el != hosts_num:
        print(f'Error for virus {i}')

Determine if the predition was correct

In [12]:
df_all['estimator_correct'] = df['y']
df_all['estimator_correct'] = df_all['estimator_correct'].astype(bool)
mask = df_all['y'] == 0
df_all.loc[mask,'estimator_correct'] = df_all['0'] > df_all['1']
df_all.loc[mask, 'prob'] = df_all['0']
mask = df_all['y'] == 1
df_all.loc[mask, 'estimator_correct'] = df_all['1'] > df_all['0']
df_all.loc[mask, 'prob'] = df_all['1']
df_all['prob'] = df_all['prob'].astype('float')

In [15]:
correct_df = df_all[df_all['estimator_correct'] == True]
correct_df

Unnamed: 0,virus,host,blastn,crisprdetect-2mismatch,gc_content,k25,k6-chebyshev,k6-kendalltau,k6-manhattan,piler-2mismatch,wish,group,group_code,y,0,1,estimator_correct,prob
6,NC_010363,NC_012438,57.2,0.0,0.009055,0.0,0.002532,0.515620,0.588794,0.0,-1.35836,Streptococcaceae,41,0,0.8,0.2,True,0.8
7,NC_010363,NC_006814,53.6,0.0,0.010561,0.0,0.001443,0.445837,0.412691,0.0,-1.34041,Streptococcaceae,41,0,1.0,0.0,True,1.0
8,NC_010363,NC_021181,53.6,0.0,0.010478,0.0,0.001445,0.445844,0.412664,0.0,-1.34044,Streptococcaceae,41,0,1.0,0.0,True,1.0
9,NC_010363,NC_021721,53.6,0.0,0.125869,0.0,0.003324,0.691869,0.655933,0.0,-1.37431,Streptococcaceae,41,0,1.0,0.0,True,1.0
10,NC_010363,NC_018420,53.6,0.0,0.047558,0.0,0.008447,0.441399,0.475200,0.0,-1.35649,Streptococcaceae,41,0,0.9,0.1,True,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2213175,NC_024392,NC_009664,0.0,0.0,0.380006,0.0,0.006918,1.525475,1.535981,0.0,-1.46593,Listeriaceae,22,0,1.0,0.0,True,1.0
2213176,NC_024392,NC_011891,0.0,0.0,0.382821,0.0,0.010627,1.509128,1.523096,0.0,-1.47323,Listeriaceae,22,0,1.0,0.0,True,1.0
2213177,NC_024392,NC_015514,0.0,0.0,0.382899,0.0,0.007646,1.544129,1.575653,0.0,-1.45837,Listeriaceae,22,0,1.0,0.0,True,1.0
2213178,NC_024392,NC_011145,0.0,0.0,0.384063,0.0,0.010793,1.507242,1.526537,0.0,-1.47286,Listeriaceae,22,0,1.0,0.0,True,1.0


Number of hosts for each virus where the estimator's prediction was correct:

In [16]:
correct_df.groupby(['virus'])['host'].count()

virus
NC_000866    2572
NC_000871    2612
NC_000872    2623
NC_000896    2646
NC_000902    2564
             ... 
NC_024387    2601
NC_024388    2677
NC_024389    2673
NC_024391    2605
NC_024392    2655
Name: host, Length: 820, dtype: int64

Determine best hosts for each virus - ones with the highest probabilities (and correct classification) 

In [41]:
pd.unique(df.loc[groups_main == 1, 'virus'])

array(['NC_001447', 'NC_001341'], dtype=object)

NOTE - need to use .groupby and .agg for scalability below !!!

In [17]:
# WATCH OUT - long (approx. 4 min.)
viruses = pd.unique(correct_df['virus'])
best_hosts = {}

for vir in viruses:
    best_hosts[vir] = {}
    best_hosts[vir]['hosts'] = []
    vir_max_prob = max(correct_df[correct_df['virus'] == vir]['prob'])

    for _, row in correct_df[correct_df['virus'] == vir].iterrows():
        if row['prob'] == vir_max_prob:
            best_hosts[vir]['hosts'].append((row['host'], row['prob']))

### Write down the classification results

Load in the taxonomy JSONs

In [18]:
import json
import pathlib

orgs = {}
for file in pathlib.Path('./taxonomy/').iterdir():
    with open(file, 'r') as open_file:
        orgs[file.stem] = json.load(open_file)
        
orgs.keys()

dict_keys(['host', 'virus'])

Write down the results to file

Use .tsv because some names contain ',' - potential problems with .csv

In [27]:
with open('host_predicions_log.tsv', 'w') as of:
    of.write('virus_id\tvirus_name\thost_id\thost_name\tprobability\n')
    for key, val in best_hosts.items():
        for el in val["hosts"]:
            of.write(f'{key}\t{orgs["virus"][key]["organism_name"].split(",")[0]}\t{el[0]}\t{orgs["host"][el[0]]["organism_name"]}\t{el[1]}\n')


In [3]:
check_df = pd.read_csv('host_predicions_log.tsv',sep='\t')

In [6]:
check_df

Unnamed: 0,virus_id,virus_name,host_id,host_name,probability
0,NC_010363,Lactococcus phage asccphi28,NC_006814,Lactobacillus acidophilus NCFM,1.0
1,NC_010363,Lactococcus phage asccphi28,NC_021181,Lactobacillus acidophilus La-14,1.0
2,NC_010363,Lactococcus phage asccphi28,NC_021721,Lactobacillus casei LOCK919,1.0
3,NC_010363,Lactococcus phage asccphi28,NC_023063,Ehrlichia muris AS145,1.0
4,NC_010363,Lactococcus phage asccphi28,NC_015856,Collimonas fungivorans Ter331,1.0
...,...,...,...,...,...
1613644,NC_007805,Pseudomonas phage F10,NC_018417,Candidatus Carsonella ruddii HT isolate Thao2000,1.0
1613645,NC_007805,Pseudomonas phage F10,NC_018415,Candidatus Carsonella ruddii CS isolate Thao2000,1.0
1613646,NC_007805,Pseudomonas phage F10,NC_018416,Candidatus Carsonella ruddii HC isolate Thao2000,1.0
1613647,NC_007805,Pseudomonas phage F10,NC_018414,Candidatus Carsonella ruddii CE isolate Thao2000,1.0


In [5]:
check_df[check_df['probability'] != 1]

Unnamed: 0,virus_id,virus_name,host_id,host_name,probability


TODO: why do we have so many good predictions with such high probability?

# TODO: implement F1 + precision/recall curves for all estimators in cross validation

e.g. take estimator from cross_validate function + use from_estimator 

# Additional/old code

Explicit tqdm handling (better visualisaition)

*Watch out -* ***long!***

*This code is for LeaveOneOut* ***only!***

In [None]:
'''
# FOR LEAVE ONE OUT ONLY!
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
# enumerate splits
y_true, y_pred = list(), list()
pbar = tqdm(total=len(X))

for train_ix, test_ix in cv.split(X):
    # split data
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    # fit model
    model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state=1)
    model.fit(X_train, y_train)
    # evaluate model
    yhat = model.predict(X_test)
    # store
    y_true.append(y_test[0])
    y_pred.append(yhat[0])
    pbar.update(1)
pbar.close()
# calculate accuracy
acc = accuracy_score(y_true, y_pred)
print(f'Accuracy: {acc:.3f}')
'''

### Is the prediction replicable + does it work the way it's supposed to?

In [None]:
'''
idx = learning_df[learning_df['group_code'] == 0].index
X_ref_train = learning_df.drop(index=idx).drop(['virus', 'host', 'group', 'group_code', 'y'], axis=1).values
X_ref_test = learning_df.iloc[idx,:].drop(['virus', 'host', 'group', 'group_code', 'y'], axis=1).values
y_ref_train = learning_df.drop(index=idx).iloc[:, -1]
y_ref_test = learning_df.iloc[idx,-1]
'''

In [None]:
'''
model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state=1)
model.fit(X_ref_train, y_ref_train)
y_pred = model.predict(X_ref_test)
'''

In [None]:
'''
from sklearn.metrics import f1_score
print(f'F1 is: {f1_score(y_ref_test, y_pred)}')
'''