In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import warnings
warnings.filterwarnings('ignore')

import shared_functions
animal_data_path = 'data/animal_data/'
human_data_path = 'data/human_data/'
catsNdogs_projects_info_path = 'data/animal_data/pets_SRR_info.txt'

In [2]:
# cat dog projects
def get_catNdog_projects():
    """Load every cat/dog project found under ``animal_data_path``.

    Project directories are the entries matching ``PRJ*``; the project name is
    the directory name with ``'_f1'`` removed. The samples are read with
    ``shared_functions.read_data_set`` and then joined with the ``Host``
    column of the run-info table at ``catsNdogs_projects_info_path``.

    Returns
    -------
    pandas.DataFrame
        The loaded samples with two extra columns: ``Host_type`` (always
        ``'pet'``) and ``Host`` (taken from the run-info table).
    """
    project_dirs = glob.glob(os.path.join(animal_data_path, 'PRJ*'))
    project_ids = [os.path.basename(path).replace('_f1', '') for path in project_dirs]

    # NOTE(review): the meaning of the second argument (1) is defined in
    # shared_functions.read_data_set and is not visible here.
    data = shared_functions.read_data_set(zip(project_dirs, project_ids), 1)

    # Strip '_1' from the sample ids so they can be matched against the 'Run'
    # index of the host-info table below.
    data.index = [sample_id.replace('_1', '') for sample_id in data.index]
    data['Host_type'] = 'pet'

    # load host info
    host_info = pd.read_csv(catsNdogs_projects_info_path, index_col='Run', sep='\t')
    # Index-on-index merge with the default (inner) join: samples without a
    # row in the host-info table are not kept.
    data = data.merge(host_info[['Host']], left_index=True, right_index=True)

    # First line reports the number of distinct projects, second the number of samples.
    print('cat & dog projects:', len(pd.unique(data.project_name)))
    print('cat & dog projects:', len(data))
    return data
def get_human_projects():
    """Load the human samples stored in the ``HC`` folder of each disease directory.

    For every disease directory under ``human_data_path`` the projects listed
    in its ``HC`` sub-directory (presumably healthy controls - TODO confirm)
    are collected. A project name that appears under several disease
    directories is loaded only once, from the first directory where it is seen.

    Returns
    -------
    pandas.DataFrame
        The loaded samples, one row per index label, with ``Host_type`` set to
        ``'human'`` and ``Host`` set to ``'Homo sapiens'``.
    """
    seen_projects = set()
    paths = []
    projects_list = []
    for disease in ['OB', 'PD', 'T2D', 'CVD', 'IBD']:
        control_dir = os.path.join(human_data_path, disease, 'HC')
        for project in os.listdir(control_dir):
            if project not in seen_projects:
                seen_projects.add(project)
                paths.append(os.path.join(control_dir, project))
                projects_list.append(project)

    # NOTE(review): the meaning of the second argument (0) is defined in
    # shared_functions.read_data_set and is not visible here.
    data = shared_functions.read_data_set(zip(paths, projects_list), 0)

    # Rows that share an index label are collapsed into a single row.
    data = data.sort_index()
    data = data.groupby(data.index).first()

    data['Host_type'] = 'human'
    data['Host'] = 'Homo sapiens'

    # First line reports the number of distinct projects, second the number of samples.
    print('anothers human projects:', len(pd.unique(data.project_name)))
    print('anothers human projects:', len(data))
    return data

In [3]:
# def fancy_plot(df,th,sm):
#     sns.set_style("whitegrid")
#     bins = [0]
#     bins.extend(10.0**(np.arange(sm,1,step=1)))

#     g = sns.JointGrid(x=df.iloc[0,:],y=df.iloc[1,:],height=5)
#     g = g.plot_joint(sns.scatterplot, color="k",alpha=.6)
#     _ = g.ax_marg_x.hist(df.iloc[0,:], color="c", bins=bins,alpha=.6)
#     _ = g.ax_marg_y.hist(df.iloc[1,:], color="m", bins=bins,orientation="horizontal",alpha=.6)

#     ax = g.ax_joint
#     ax.axvline(x=th,color='c',alpha=.6,ls='dashed')
#     ax.axhline(y=th,color='m',alpha=.6,ls='dashed')
#     ax.set_xscale('symlog',linthreshx=10.0**(sm))
#     ax.set_yscale('symlog',linthreshy=10.0**(sm))
#     ax.set_xlim((-10.0**(sm)/2, 2))
#     ax.set_ylim((-10.0**(sm)/2, 2))
#     ax.set_xlabel('Taxon (genus) frequency in cats/dog samples')
#     ax.set_ylabel('Taxon (genus) frequency in human samples')
#     ax.fill([0,th,th,0], [0,0,th,th],'k',alpha=0.1)
def filter_data(taxa,host_group,nonzero_th=0.1):
    """Keep the taxa that are present often enough in at least one host group.

    For every column of ``taxa`` the fraction of samples with a value > 0 is
    computed separately for each group of ``host_group``. A column is kept
    when that fraction is strictly greater than ``nonzero_th`` in any group.

    Parameters
    ----------
    taxa : pandas.DataFrame
        One row per sample, one column per taxon.
    host_group : pandas.Series
        Group key of every sample (used as the ``groupby`` key for ``taxa``).
    nonzero_th : float, default 0.1
        Minimal fraction of non-zero samples a taxon needs in some group.

    Returns
    -------
    columns_to_stay : pandas.Index
        Names of the columns that pass the filter.
    group_nonzero : pandas.DataFrame
        One row per taxon with the per-group fraction of non-zero samples and
        a boolean ``stayed_after_filtering`` column, sorted by
        ``stayed_after_filtering`` and then by the frequency columns. The
        ``'pet'`` group is reported as ``freq_in_catNdog_samples`` and the
        ``'human'`` group as ``freq_in_human_samples``; any other group ``g``
        is reported as ``freq_in_<g>_samples``.
    """
    # groupby orders its groups by key ('human' comes before 'pet'), so the
    # frequency columns have to be labelled by group name. Labelling them by
    # position attaches the cat/dog label to the human frequencies.
    known_labels = [('pet', 'freq_in_catNdog_samples'),
                    ('human', 'freq_in_human_samples')]
    label_of_group = dict(known_labels)

    # Mean of the 0/1 presence indicator == fraction of samples with value > 0.
    group_nonzero = (taxa > 0).astype(float).groupby(host_group).mean()
    columns_to_stay = group_nonzero.loc[:,(group_nonzero>nonzero_th).any()].columns
    print('Filtered to:',len(columns_to_stay),'of',len(taxa.columns),'columns; non-zero threshold:',nonzero_th)

    group_nonzero = group_nonzero.T.rename(
        columns=lambda group: label_of_group.get(group, 'freq_in_{}_samples'.format(group)))
    group_nonzero.columns.name = None

    # Keep the established column layout: cat/dog first, human second, any
    # further groups after them.
    freq_columns = [label for _, label in known_labels if label in group_nonzero.columns]
    freq_columns += [label for label in group_nonzero.columns if label not in freq_columns]
    group_nonzero = group_nonzero.reindex(columns=freq_columns)

    group_nonzero['stayed_after_filtering'] = group_nonzero.index.isin(columns_to_stay)
    group_nonzero.index.name = None
    group_nonzero = group_nonzero.sort_values(['stayed_after_filtering'] + freq_columns)
    return(columns_to_stay,group_nonzero)

## Load data

In [4]:
# Load both cohorts and stack them into a single table.
catsNdog_data = get_catNdog_projects()
humansA = get_human_projects()

# Every NaN left after the concatenation is replaced by 0.
df = pd.concat([catsNdog_data, humansA], sort=False).fillna(0)
print(df.shape)

# Split the table into its three parts: sample info, the 'mean_chao' column
# and the remaining columns, which are passed on as the taxa table.
info_columns = ['Host', 'Host_type', 'project_name']
data = shared_functions.mydata(
    taxa_df=df.drop(['mean_chao'] + info_columns, axis=1),
    chao_df=df['mean_chao'],
    info_df=df.loc[:, info_columns],
)

cat & dog projects: 12
cat & dog projects: 321
anothers human projects: 10
anothers human projects: 1242
(1563, 376)


## Filter low-abundance OTUs

In [5]:
# Filter every taxonomic level (1..5, printed below as Phylum..Genus) and
# collect the per-level frequency tables for the supplementary table.
level_tables = []
for i in range(1,6):
    data_tax_level_i = data.get_taxa(i,False)
    data_features,filter_df_level_i = filter_data(data_tax_level_i,data.info['Host_type'])
    data.set_filtered_taxa(data_features,i)
    level_name = shared_functions.get_tax_name_by_level(i)
    print(level_name)
    filter_df_level_i['level'] = level_name
    level_tables.append(filter_df_level_i)
# Concatenate once at the end instead of growing a DataFrame inside the loop.
filter_df = pd.concat(level_tables)
filter_df.to_csv('results/catsNdogs_SupplementaryTable1.txt',sep='\t')

Filtered to: 10 of 15 columns; non-zero threshold: 0.1
Phylum
Filtered to: 18 of 26 columns; non-zero threshold: 0.1
Class
Filtered to: 29 of 54 columns; non-zero threshold: 0.1
Order
Filtered to: 55 of 123 columns; non-zero threshold: 0.1
Family
Filtered to: 138 of 372 columns; non-zero threshold: 0.1
Genus


## Make balanced class dataset

In [6]:
# Build a class-balanced dataset: all pet samples plus a sample of the human
# samples whose requested size is the number of pet samples.
di = data.info
pets = di.loc[di['Host_type'] == 'pet']
humans = di.loc[di['Host_type'] == 'human']

# NOTE(review): the meaning of the trailing arguments (None, 0) is defined in
# shared_functions.sample_equal_categories_iterative and is not visible here.
humans_sample = shared_functions.sample_equal_categories_iterative(
    humans, len(pets), 'project_name', None, 0)

balanced_dataset_info = pd.concat([pets, humans_sample])
print('humans:',humans.shape,'pets:',pets.shape,' humans sample:',
      humans_sample.shape,'balanced dataset',balanced_dataset_info.shape)

humans: (1242, 3) pets: (321, 3)  humans sample: (321, 3) balanced dataset (642, 3)


In [7]:
# Per-project sample counts for the full data set and for the balanced one,
# joined side by side on their shared index.
pt1 = shared_functions.projects_table(di)
pt2 = shared_functions.projects_table(balanced_dataset_info)
pt = pt1.merge(
    pt2,
    left_index=True,
    right_index=True,
    suffixes=('', ' in balanced dataset'),
)
pt = pt.sort_values(
    ['Host_type', 'Host', 'Samples #', 'Samples # in balanced dataset'],
    ascending=False,
)
display(pt)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Samples #,Samples # in balanced dataset
project_name,Host,Host_type,Unnamed: 3_level_1,Unnamed: 4_level_1
PRJNA504021,Felis catus,pet,65,65
PRJNA349988,Felis catus,pet,44,44
PRJNA248757,Felis catus,pet,30,30
PRJNA338653,Felis catus,pet,19,19
PRJNA350163,Felis catus,pet,6,6
PRJNA488105,Canis familiaris,pet,34,34
PRJNA525542,Canis familiaris,pet,32,32
PRJNA358232,Canis familiaris,pet,30,30
PRJNA391562,Canis familiaris,pet,23,23
PRJNA493249,Canis familiaris,pet,19,19


In [8]:
## write tables about projects used and dataset
pt.to_csv('results/catsNdogs_Table1_1.txt', sep='\t')

# Per-sample table of the balanced dataset, ordered by host type, host and project.
t2_columns = ['project_name', 'Host_type', 'Host']
t2 = balanced_dataset_info.loc[:, t2_columns]
t2 = t2.sort_values(['Host_type', 'Host', 'project_name'])

# Clean the identifiers: drop '.fastq' from sample ids and '_f1' from project names.
# The index is assigned from a plain list on purpose, which also drops its name.
t2.index = [sample_id.replace('.fastq', '') for sample_id in t2.index]
t2['project_name'] = [project.replace('_f1', '') for project in t2['project_name']]
t2.to_csv('results/catsNdogs_SupplementaryTable2.txt', sep='\t')

In [9]:
# joblib sampled dataset for further usage
# Persist the data object (with the filtered taxa set above) and the balanced
# sample info table so that other notebooks can load them.
joblib.dump(data,'joblib/catsNdogs_data.joblib')
# Assigning the return value to `x` keeps it from being echoed as the cell output.
x=joblib.dump(balanced_dataset_info,'joblib/catsNdogs_dataset_info.joblib')