In [3]:
import pandas as pd
import numpy as np
import os
import joblib
import itertools
import shared_functions

from sklearn import model_selection, ensemble, metrics
import warnings
warnings.filterwarnings('ignore')

# Base location of the held-out test projects; per-project paths are built below
# with os.path.join(data_path, <project id>).
data_path = 'data/projects_for_test'
# Sample metadata for PMID32078625: read with ';' as separator and indexed by
# `sample_name`; its DESCRIPTION column is used to tell dog samples from cat samples.
test_srr_info1 = 'data/projects_for_test/PMID32078625_metadata.csv'
# Run metadata for PRJNA493726: only the `Run` and `SampleName` columns are read,
# to split the human samples of that project into groups.
test_srr_info2 = 'data/projects_for_test/PRJNA493726_metadata.csv'

In [4]:
# Object loaded from disk and indexed later as bf[(level, features)]; each value is
# unpacked into two items that are passed to shared_functions.transformer as `bf`
# and `chao` (presumably the selected features per model variant -- TODO confirm).
# NOTE(review): joblib.load unpickles arbitrary Python objects, so this file must
# come from a trusted source.
bf = joblib.load('joblib/catsNdogs_mw_bf.joblib')

## Load cats and dogs test projects

In [5]:
# Build `cd_test_data`: every sample of the cats-and-dogs test projects, annotated
# with Host_type, Host and a per-project/per-host group label (`project_name2`).
cd_test_projects = ['PRJNA592436_f1','PRJNA589580_f1','PRJNA401442','PMID32078625_f1']
cd_project_paths = [os.path.join(data_path, project) for project in cd_test_projects]
cd_project_names = [project.replace('_f1','') for project in cd_test_projects]
cd_raw = shared_functions.read_data_set(zip(cd_project_paths, cd_project_names),1)

# Strip read/lane markers so the index matches `sample_name` in the metadata file.
# NOTE(review): str.replace removes *every* occurrence of '_1', not only a trailing
# one, so an id containing '_1' in the middle would be altered too -- confirm that
# no sample id in these projects is affected.
cd_raw.index = [sample.replace('_1','').replace('_L001_R1_001','') for sample in cd_raw.index]
cd_raw['Host_type'] = 'catNdog'

cd_metadata = pd.read_csv(test_srr_info1,sep=';').set_index('sample_name')
cd_merged = cd_raw.merge(cd_metadata,how='left',left_index=True,right_index=True)

# A sample counts as a dog when its description is 'Dog feces' or when it has no
# metadata row at all (missing DESCRIPTION becomes the string 'nan'); everything
# else is labelled as a cat.
description = cd_merged['DESCRIPTION'].apply(str)
is_dog = description.isin(['nan','Dog feces'])

cd_test_data = cd_merged.drop('DESCRIPTION',axis=1)
cd_test_data['Host'] = is_dog.map({True: 'Canis familiaris', False: 'Felis catus'})
cd_test_data['project_name2'] = cd_merged['project_name'] + is_dog.map({True: '_dog', False: '_cat'})

print(cd_test_data.shape)
print(cd_test_data.project_name.value_counts())
print(cd_test_data.project_name2.value_counts())

(358, 286)
PMID32078625    238
PRJNA401442      56
PRJNA589580      35
PRJNA592436      29
Name: project_name, dtype: int64
PMID32078625_dog    192
PRJNA401442_dog      56
PMID32078625_cat     46
PRJNA589580_dog      35
PRJNA592436_dog      29
Name: project_name2, dtype: int64


## Load human test projects

In [6]:
# Build `human_test_data`: every sample of the human test projects, annotated with
# Host_type, Host and a per-project/per-group label (`project_name2`).
human_test_projects = ['PRJNA385551_f1','PRJNA493726_f1']
human_project_paths = [os.path.join(data_path, project) for project in human_test_projects]
human_project_names = [project.replace('_f1','') for project in human_test_projects]
human_raw = shared_functions.read_data_set(zip(human_project_paths, human_project_names),1)
human_raw.index = [run.replace('_1','') for run in human_raw.index]
human_raw['Host_type'] = 'human'
human_raw['Host'] = 'Homo sapiens'

run_to_sample = pd.read_csv(test_srr_info2,sep=',').loc[:,['Run','SampleName']].set_index('Run')
human_merged = human_raw.merge(run_to_sample,how='left',left_index=True,right_index=True)

# The group is the part of SampleName before the first '.'; runs without a
# metadata row (SampleName missing -> 'nan') are labelled 'HC'.
sample_prefix = human_merged['SampleName'].apply(str).str.split('.').str[0]
health = sample_prefix.where(sample_prefix != 'nan', 'HC')

human_test_data = human_merged.drop('SampleName',axis=1)
human_test_data['project_name2'] = human_merged['project_name'] + '_' + health

print(human_test_data.shape)
print(human_test_data.project_name.value_counts())
print(human_test_data.project_name2.value_counts())

(358, 279)
PRJNA385551    284
PRJNA493726     74
Name: project_name, dtype: int64
PRJNA385551_HC     284
PRJNA493726_SLE     37
PRJNA493726_HC      19
PRJNA493726_RA      18
Name: project_name2, dtype: int64


In [7]:
# Stack both test sets and split the columns into the three parts expected by
# shared_functions.mydata: taxa table, chao values and sample annotations.
# NOTE(review): the two frames have different column counts in the recorded output
# (286 vs 279); pd.concat fills columns missing from one frame with NaN -- confirm
# the downstream transformer copes with that.
info_columns = ['Host','Host_type','project_name','project_name2']
all_test_df = pd.concat([cd_test_data,human_test_data])
test_data = shared_functions.mydata(
    taxa_df = all_test_df.drop(['mean_chao'] + info_columns,axis=1),
    chao_df = all_test_df['mean_chao'],
    info_df = all_test_df.loc[:,info_columns])

## Model accuracy on test data

In [8]:
def test_each_project(pr_col):
    """Score every saved model on each group of test samples.

    For each combination of level, feature set and CLR flag the matching model is
    loaded from the ``joblib/`` folder, and its accuracy is computed separately for
    every distinct value of ``pr_col`` in ``test_data.info``.

    Relies on the notebook globals ``test_data`` and ``bf``.

    Parameters
    ----------
    pr_col : str
        Name of the column of ``test_data.info`` used to group the samples.

    Returns
    -------
    pandas.DataFrame
        One row per (model, group) with columns ``model_name``, ``test_project``,
        ``Host_type`` and ``accuracy`` (a string formatted with three decimals).
    """
    levels = [5]
    feature_sets = ['all','best_holm','best_fdr']
    clr_options = [False,True]

    # The grouping is identical for every model, so it is resolved once up front.
    info = test_data.info
    groups = pd.unique(info[pr_col])

    rows = []
    # Loop variables use their own names so they do not overwrite the option lists.
    for level, feature_set, use_clr in itertools.product(levels, feature_sets, clr_options):
        view_name = shared_functions.view_name(level,feature_set,use_clr)
        print(view_name)
        filename = shared_functions.make_name(level,feature_set,use_clr,True)
        rf = joblib.load('joblib/'+filename)
        selected, chao = bf[(level,feature_set)]
        tf = shared_functions.transformer(bf=selected,chao=chao,level=level,clr_b=use_clr)
        for group in groups:
            group_info = info[info[pr_col]==group]
            # The first Host_type found in the group is reported for the whole group.
            host_type = pd.unique(group_info.Host_type)[0]
            taxa_df,chao_df,y = test_data.get_data_from_ind(group_info.index,False)
            X = tf.transform_df(taxa_df,chao_df)
            y_predict = rf.predict(X)
            acc = metrics.accuracy_score(y,y_predict)
            rows.append({'model_name':view_name,
                         'test_project':group,
                         'Host_type':host_type,
                         'accuracy':'%.3f' % acc})
    return pd.DataFrame(rows)

In [9]:
# Accuracy of every model on each project/group, reshaped to one row per model
# and one column per (Host_type, test_project) pair.
res = test_each_project(pr_col='project_name2')
accuracy_wide = res.pivot_table(values='accuracy',
                                index=['model_name'],
                                columns=['Host_type','test_project'],
                                aggfunc='first')
t_acc = accuracy_wide.reset_index()
display(t_acc)

Genus_ALL
Genus_ALL_CLR
Genus_MW-Holm
Genus_MW-Holm_CLR
Genus_MW-FDR
Genus_MW-FDR_CLR


Host_type,model_name,catNdog,catNdog,catNdog,catNdog,catNdog,human,human,human,human
test_project,Unnamed: 1_level_1,PMID32078625_cat,PMID32078625_dog,PRJNA401442_dog,PRJNA589580_dog,PRJNA592436_dog,PRJNA385551_HC,PRJNA493726_HC,PRJNA493726_RA,PRJNA493726_SLE
0,Genus_ALL,0.87,0.984,1.0,1.0,1.0,1.0,1.0,1.0,0.919
1,Genus_ALL_CLR,0.913,0.979,1.0,1.0,1.0,1.0,1.0,1.0,0.865
2,Genus_MW-FDR,0.87,0.948,1.0,1.0,1.0,1.0,1.0,0.889,0.784
3,Genus_MW-FDR_CLR,0.935,0.995,1.0,1.0,1.0,1.0,1.0,0.944,0.838
4,Genus_MW-Holm,1.0,0.995,1.0,1.0,1.0,0.993,1.0,0.889,0.703
5,Genus_MW-Holm_CLR,0.978,0.984,1.0,1.0,1.0,0.993,0.842,0.889,0.73


In [10]:
# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs('results', exist_ok=True)
# Save the per-project accuracy table as tab-separated text.
t_acc.to_csv('results/catsNdogs_Table5_1.txt',sep='\t')

## Model accuracy on a mixed-class sample

In [11]:
# Sample annotations split by host type; `HC_humans` additionally leaves out the
# SLE and RA groups of PRJNA493726.
ti = test_data.info
cd = ti.loc[ti['Host_type'] == 'catNdog']
humans = ti.loc[ti['Host_type'] == 'human']
non_hc_groups = ['PRJNA493726_SLE','PRJNA493726_RA']
HC_humans = humans.loc[~humans['project_name2'].isin(non_hc_groups)]

In [12]:
def test_on_sample(human_df,tf,rf,n_iter=100,sample_size=200):
    """Evaluate a model on repeated mixed samples of pet and human test data.

    In each iteration one sample is drawn from the global ``cd`` frame and one from
    ``human_df`` with ``shared_functions.sample_equal_categories_with_replacements``
    (grouped by ``project_name2``); the two are combined and scored.
    Relies on the notebook globals ``cd`` and ``test_data``.

    Parameters
    ----------
    human_df : pandas.DataFrame
        Annotations of the human samples to draw from; needs a ``project_name2`` column.
    tf : object
        Transformer exposing ``transform_df(taxa_df, chao_df)``.
    rf : object
        Fitted model exposing ``predict(X)``.
    n_iter : int, default 100
        Number of resampling rounds; the round number is passed to the sampler as
        its last argument (presumably the random seed -- TODO confirm).
    sample_size : int, default 200
        Forwarded as the second argument of the sampler (presumably the number of
        rows to draw -- TODO confirm against shared_functions).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``acc``, ``precision``, ``recall`` and ``f1`` with a single
        ``mean_std`` column holding ``'mean ± std'`` strings over all rounds.
        With ``n_iter=1`` the standard deviation is undefined and shown as ``nan``.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError('n_iter must be at least 1')

    scores = []
    for seed in range(n_iter):
        cd_sample = shared_functions.sample_equal_categories_with_replacements(
            cd,sample_size,'project_name2',None,seed)
        human_sample = shared_functions.sample_equal_categories_with_replacements(
            human_df,sample_size,'project_name2',None,seed)
        mixed_index = pd.concat([cd_sample,human_sample]).index
        taxa_df,chao_df,y = test_data.get_data_from_ind(mixed_index,False)
        X = tf.transform_df(taxa_df,chao_df)
        y_predict = rf.predict(X)
        # precision/recall/f1 use scikit-learn's defaults (binary labels,
        # positive class 1) -- assumes y is encoded accordingly.
        scores.append({'acc':metrics.accuracy_score(y,y_predict),
                       'precision':metrics.precision_score(y,y_predict),
                       'recall':metrics.recall_score(y,y_predict),
                       'f1':metrics.f1_score(y,y_predict)})

    score_df = pd.DataFrame(scores)
    means = score_df.mean()
    stds = score_df.std()
    mean_std = ['%.3f ± %.3f' % (m, s) for m, s in zip(means, stds)]
    return pd.DataFrame({'mean_std': mean_std}, index=means.index)

In [13]:
# For every model variant, summarise the mixed-sample scores on all human samples
# and on the subset without the SLE/RA groups, then place the per-model tables
# side by side in `t5_all`.
levels = [5]
feature_sets = ['all','best_fdr']
clr_options = [False,True]
tables_by_model = {}
# Loop variables use their own names so the option lists above are not overwritten.
for level, feature_set, use_clr in itertools.product(levels, feature_sets, clr_options):
    filename = shared_functions.make_name(level,feature_set,use_clr,True)
    view_name = shared_functions.view_name(level,feature_set,use_clr)
    rf = joblib.load('joblib/'+filename)
    selected, chao = bf[(level,feature_set)]
    tf = shared_functions.transformer(bf=selected,chao=chao,level=level,clr_b=use_clr)
    per_dataset = {'total dataset' : test_on_sample(humans,tf,rf),
                   'healthy people dataset' : test_on_sample(HC_humans,tf,rf)}
    t5 = pd.concat(per_dataset, axis=1)
    # Drop the inner column level so only the dataset name remains.
    t5.columns = t5.columns.droplevel(1)
    tables_by_model[view_name] = t5
t5_all = pd.concat(tables_by_model, axis=1)
t5_all

Unnamed: 0_level_0,Genus_ALL,Genus_ALL,Genus_ALL_CLR,Genus_ALL_CLR,Genus_MW-FDR,Genus_MW-FDR,Genus_MW-FDR_CLR,Genus_MW-FDR_CLR
Unnamed: 0_level_1,total dataset,healthy people dataset,total dataset,healthy people dataset,total dataset,healthy people dataset,total dataset,healthy people dataset
acc,0.976 ± 0.008,0.986 ± 0.006,0.973 ± 0.010,0.990 ± 0.006,0.942 ± 0.011,0.982 ± 0.006,0.965 ± 0.009,0.993 ± 0.005
precision,0.972 ± 0.012,0.972 ± 0.012,0.980 ± 0.011,0.981 ± 0.011,0.963 ± 0.013,0.966 ± 0.012,0.985 ± 0.010,0.986 ± 0.009
recall,0.980 ± 0.010,1.000 ± 0.000,0.966 ± 0.013,1.000 ± 0.000,0.919 ± 0.017,1.000 ± 0.000,0.945 ± 0.014,1.000 ± 0.000
f1,0.976 ± 0.008,0.986 ± 0.006,0.973 ± 0.010,0.990 ± 0.006,0.940 ± 0.011,0.982 ± 0.006,0.965 ± 0.009,0.993 ± 0.005


In [14]:
# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs('results', exist_ok=True)
# Save the mixed-sample summary table as tab-separated text.
t5_all.to_csv('results/catsNdogs_Table5_2.txt',sep='\t')

In [15]:
# def test_on_sample_2(human_df):
#     results_list = []
#     for i in range(100):
#         cd_cats = shared_functions.sample_equal_categories_with_replacements(
#             cd_test_data,200,'project_name2',None,i)
#         hum_p = shared_functions.sample_equal_categories_with_replacements(
#             human_df,200,'project_name2',None,i)
#         #print(cd_p.project_name.value_counts())
#         #print(hum_p.project_name.value_counts())
#         test_data = pd.concat([cd_p,hum_p])
#         test_data.drop('project_name2',axis=1,inplace=True)
#         X,y = tf.transform_df(df=test_data)
#         y_predict = rf.predict(X)
#         acc = metrics.accuracy_score(y,y_predict)
#         pr = metrics.precision_score(y,y_predict)
#         recall = metrics.recall_score(y,y_predict)
#         f1 = metrics.f1_score(y,y_predict)
#         rd = {'acc':acc,'precision':pr,'recall':recall,'f1':f1}
#         results_list.append(rd)
#     x = pd.DataFrame(results_list)
#     m = x.mean()
#     st = x.std()
#     res = pd.DataFrame.from_dict({'mean':m,'std':st})
#     return(res)