In [1]:
import pandas as pd
import numpy as np
from skbio.stats.composition import clr
from skbio.stats.composition import multiplicative_replacement

In [2]:
# read OTU tables and chao
def read_data(path,skip_rows,filter_features=False):
    """Load one project's chao1 table and level-6 OTU table into a single frame.

    Parameters
    ----------
    path : str
        Project directory; the two file locations are built by string
        concatenation onto it.
    skip_rows : int or list-like
        Forwarded to ``pd.read_csv(skiprows=...)`` for the OTU table.
    filter_features : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        Transposed concatenation of a single ``mean_chao`` row and the OTU
        table, so the columns of the input files become the rows of the result.
    """
    chao_path = path + '/hitdb_94/alpha_rar/chao1.txt'
    otu_path = path + '/hitdb_94/OTUs/summarized_taxa/otu_table_L6.txt'

    # Average every numeric column of chao1.txt per row, then lay the result
    # out as one row named 'mean_chao'.
    # NOTE(review): presumably each row of chao1.txt is one sample - confirm.
    chao_table = pd.read_csv(chao_path, sep='\t', index_col='Unnamed: 0')
    mean_chao_row = chao_table.mean(axis=1).to_frame('mean_chao').T

    # Scale every OTU-table column so that it sums to 1.
    otu_table = pd.read_csv(otu_path, sep='\t', index_col='#OTU ID', skiprows=skip_rows)
    otu_fractions = otu_table.div(otu_table.sum(axis=0), axis=1)

    return pd.concat([mean_chao_row, otu_fractions]).T
def read_data_set(paths,skip_rows):
    """Load several projects with ``read_data`` and stack them into one frame.

    Parameters
    ----------
    paths : iterable of (path, project_name) pairs
        Each path is passed to ``read_data``; the name is stored in a
        ``project_name`` column on that project's rows.
    skip_rows : int or list-like
        Forwarded unchanged to ``read_data``.

    Returns
    -------
    pandas.DataFrame
        All projects that loaded successfully, concatenated row-wise with
        missing values replaced by 0. Empty when nothing could be loaded.

    Notes
    -----
    Loading is best-effort: a project that fails is reported and skipped.
    Only ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still stop the loop.
    """
    df = pd.DataFrame()
    for p,p_name in paths:
        try:
            project_df = read_data(p,skip_rows).assign(project_name=p_name)
            # Concatenation stays inside the try so that a project whose frame
            # cannot be aligned with the others is skipped, not fatal.
            df = pd.concat([df,project_df],sort=False)
        except Exception as err:
            # Report the reason as well, otherwise a typo in a path and a
            # malformed file are indistinguishable.
            print('Unable to load project:', p_name, '-', repr(err))
    return(df.fillna(0))

In [3]:
# sampling functions
def sample_equal_categories_with_replacements(df,n,category_column,categories_list=None,random_state=None):
    """Draw about ``n`` rows split evenly across categories, sampling with replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    n : int
        Total number of rows wanted.
    category_column : str
        Column of ``df`` holding the category labels.
    categories_list : sequence, optional
        Categories to draw from, in order. May be a list, numpy array or
        pandas Index. Defaults to the unique values of ``category_column``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for every category.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, one run per category in ``categories_list`` order.
        An empty frame when ``categories_list`` is empty.
    """
    # `is None`, not `== None`: an array or Index compared with `==` yields an
    # element-wise result whose truth value is ambiguous and raises.
    if categories_list is None:
        categories_list = pd.unique(df[category_column])
    pieces = []
    drawn = 0
    for i,cat in enumerate(categories_list):
        # Share the rows still needed evenly among the categories still to come.
        avg_size = int((n-drawn)/(len(categories_list)-i))
        piece = df[df[category_column]==cat].sample(avg_size,replace=True,random_state=random_state)
        pieces.append(piece)
        drawn += len(piece)
    if not pieces:
        return(pd.DataFrame())
    return(pd.concat(pieces))

def sample_equal_categories_iterative(df,n,category_column,categories_list=None,random_state=None):
    """Draw up to ``n`` rows split as evenly as possible across categories, without replacement.

    A category with fewer rows than its share contributes all of its rows,
    and the shortfall is spread over the categories that follow it.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    n : int
        Total number of rows wanted; the result can be shorter when the
        categories do not hold enough rows.
    category_column : str
        Column of ``df`` holding the category labels.
    categories_list : sequence, optional
        Categories to draw from, in order. May be a list, numpy array or
        pandas Index. Defaults to the categories sorted by ascending size,
        so the smallest ones are handled first.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for every category.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, one run per category in ``categories_list`` order.
        An empty frame when ``categories_list`` is empty.
    """
    # `is None`, not `== None`: an array or Index compared with `==` yields an
    # element-wise result whose truth value is ambiguous and raises.
    if categories_list is None:
        categories_list = df.groupby(category_column).size().sort_values().index
    pieces = []
    drawn = 0
    for i,cat in enumerate(categories_list):
        # Share the rows still needed evenly among the categories still to come.
        avg_size = int((n-drawn)/(len(categories_list)-i))
        sample_df = df[df[category_column]==cat]
        # Never ask for more rows than the category actually has.
        sample_size = min(avg_size,len(sample_df))
        piece = sample_df.sample(sample_size,random_state=random_state)
        pieces.append(piece)
        drawn += len(piece)
    if not pieces:
        return(pd.DataFrame())
    return(pd.concat(pieces))

In [10]:
# names
# Taxonomic rank name for each depth index used throughout this notebook
# (0 = Kingdom ... 5 = Genus).
level_names = dict(enumerate(['Kingdom','Phylum','Class','Order','Family','Genus']))
def view_name(level,features,clr_b):
    """Build a display label of the form ``<Rank>_<FeatureSet>[_CLR]``.

    ``level`` is looked up in ``level_names``. ``features`` selects the
    middle part: 'best_holm' gives 'MW-Holm', 'best_fdr' gives 'MW-FDR',
    anything else gives 'ALL'. A truthy ``clr_b`` appends '_CLR'.
    """
    rank = level_names[level]
    if features == 'best_holm':
        feature_label = 'MW-Holm'
    elif features == 'best_fdr':
        feature_label = 'MW-FDR'
    else:
        feature_label = 'ALL'
    clr_suffix = '_CLR' if clr_b else ''
    return rank + '_' + feature_label + clr_suffix
def make_name(level,features,clr_b,fit):
    """Encode model settings into a ``.joblib`` file name.

    The name is ``catsNdogs_model_<level>:<f>:<c>:<ft>.joblib`` where
    ``<f>`` is 'bh' / 'bf' / 'all' for 'best_holm' / 'best_fdr' / anything
    else, ``<c>`` is 'T' or 'F' for a truthy or falsy ``clr_b``, and ``<ft>``
    is 'fit' or 'empty' for a truthy or falsy ``fit``.
    """
    if features == 'best_holm':
        feature_code = 'bh'
    elif features == 'best_fdr':
        feature_code = 'bf'
    else:
        feature_code = 'all'
    clr_code = 'T' if clr_b else 'F'
    fit_code = 'fit' if fit else 'empty'
    fields = [str(level), feature_code, clr_code, fit_code]
    return 'catsNdogs_model_' + ':'.join(fields) + '.joblib'
def decode_name(name):
    """Parse a file name with the ``<level>:<f>:<c>:<ft>`` layout back into settings.

    Only the text after the last underscore and before the first following
    dot is inspected; it is split on ':'.

    Returns
    -------
    tuple
        ``(level, features, clr_b, fit)`` where ``level`` is an int,
        ``features`` is 'best_holm' for 'bh', 'best_fdr' for 'bf' and 'all'
        otherwise, ``clr_b`` is True only for 'T', and ``fit`` is True only
        for 'fit'.
    """
    stem = name.split('_')[-1].split('.')[0]
    fields = stem.split(':')
    level = int(fields[0])
    feature_code = fields[1]
    if feature_code == 'bf':
        features = 'best_fdr'
    elif feature_code == 'bh':
        features = 'best_holm'
    else:
        features = 'all'
    clr_b = fields[2] == 'T'
    fit = fields[3] == 'fit'
    return level, features, clr_b, fit
def get_tax_name_by_level(i):
    """Return the rank name stored in ``level_names`` under key ``i``."""
    rank_name = level_names[i]
    return rank_name

In [2]:
# group taxa by taxonomic level (the level index is a parameter; not limited to phyla)
def gp(x,level):
    """Truncate a ';'-separated lineage string to its first ``level + 1`` fields.

    Returns the fields joined by ';' again. A negative ``level`` yields an
    empty string; a ``level`` beyond the last field raises ``IndexError``.
    """
    if level < 0:
        return ''
    ranks = x.split(';')
    # Index explicitly (not a slice) so a lineage that is too short still raises.
    return ';'.join(ranks[depth] for depth in range(level + 1))
def get_phylums(df,level):
    """Sum the columns of ``df`` that share the same lineage prefix at ``level``.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per taxon; column labels are ';'-separated lineage strings
        that ``gp`` can truncate.
    level : int
        Depth passed to ``gp`` for every column label.

    Returns
    -------
    pandas.DataFrame
        Same rows as ``df``; one column per distinct truncated lineage
        (sorted), holding the sum of the matching input columns. The column
        axis is named 'phylum'.

    Notes
    -----
    ``df`` is left untouched. The grouping key is passed as a separate array
    instead of being stored in helper columns on the transposed frame. This
    avoids setting a name on an Index object that may be shared with
    ``df.columns``, and avoids clashes with row labels that happen to be
    called 'index' or 'phylum'.
    """
    # An object ndarray is unambiguous for groupby; a plain list of strings
    # could be interpreted as a list of column labels.
    group_keys = np.array([gp(label,level) for label in df.columns],dtype=object)
    summed = df.transpose().groupby(group_keys).sum()
    # `summed` is a fresh frame, so naming its index has no side effects.
    return(summed.rename_axis('phylum').transpose())

In [12]:
# data transformation
# Ordered categorical dtypes for the label columns. The position of a value
# in `categories` is the integer code produced by `.cat.codes`.
host_type_dtype = pd.api.types.CategoricalDtype(
    ['pet','human'],
    ordered=True,
)
host_dtype = pd.api.types.CategoricalDtype(
    ['Canis familiaris','Felis catus','Homo sapiens'],
    ordered=True,
)
# Lookup from label column name to the dtype used to encode it.
my_dtype = dict(Host_type=host_type_dtype, Host=host_dtype)
class transformer:
    """Turn a taxa table (plus an optional chao column) into a feature matrix.

    Parameters
    ----------
    bf : sequence of hashable
        Feature (column) labels to keep, in output order.
    chao : bool
        When true, ``transform_df`` appends a ``mean_chao`` column.
    level : int
        Depth passed to ``get_phylums`` when aggregating the taxa table.
    clr_b : bool
        When true, the table goes through ``multiplicative_replacement`` and
        ``clr`` before the features are selected.
    """
    def __init__(self,bf,chao=True,level=5,clr_b=False):
        self.bf=bf
        self.add_chao=chao
        self.level=level
        self.clr_b=clr_b
    def transform_df(self,taxa_df,chao_df):
        """Return the feature matrix for ``taxa_df`` as a numpy array.

        Parameters
        ----------
        taxa_df : pandas.DataFrame
            Taxa table handed to ``get_phylums``.
        chao_df : pandas.Series or compatible
            Assigned to the ``mean_chao`` column when ``add_chao`` is set;
            pandas aligns it on the row index.

        Returns
        -------
        numpy.ndarray
            Columns follow ``self.bf`` order, with ``mean_chao`` last when
            ``add_chao`` is set. Features absent from the data are 0 without
            CLR, or the CLR of their replaced zero with CLR.
        """
        X_df = get_phylums(taxa_df,self.level)
        # Ordered union: the data's columns first, then requested features
        # that are missing from the data. A plain `set` would feed columns to
        # the CLR step in hash-dependent order, so floating-point results
        # could differ between interpreter runs.
        features_union = list(dict.fromkeys(list(X_df.columns)+list(self.bf)))
        X_df = X_df.reindex(columns=features_union).fillna(0)
        if (self.clr_b):
            # The transform is applied to the full table (not only the
            # selected features); selection happens afterwards.
            X_clr = clr(multiplicative_replacement(np.array(X_df)))
            X_df = pd.DataFrame(X_clr,columns = X_df.columns,index = X_df.index)
        X_df = X_df.reindex(columns=self.bf)
        if (self.add_chao):
            X_df['mean_chao'] = chao_df
        X = np.array(X_df)
        return(X)

In [8]:
#import joblib
#bf = joblib.load('joblib/catsNdogs_mw_bf.joblib')
#data = joblib.load('joblib/catsNdogs_data.joblib')
#dataset_info = joblib.load('joblib/catsNdogs_dataset_info.joblib')
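#taxa_df,chao_df,y = data.get_data_from_ind(dataset_info.index,False)
# NOTE(review): mydata.get_data_from_ind takes (ind,level,filtered); if `data` is a mydata instance this call needs a level, e.g. (dataset_info.index,5,False) - confirm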
##display(taxa_df)
##display(chao_df)
#f,chao = bf[(5,'all')]
##print(f)
##print(chao)
#tf = transformer(bf=f,chao=chao,level=5,clr_b=True)
#X = tf.transform_df(taxa_df,chao_df)

In [3]:
class mydata:
    """Bundle of a taxa table, a chao table and sample metadata.

    On construction the taxa table is copied, its missing values are set to
    0 and every row is divided by its own sum. The chao and info tables are
    stored as copies. ``y_col`` names the metadata column used as the label.
    """
    def __init__(self,taxa_df,chao_df,info_df,y_col='Host_type'):
        zero_filled = taxa_df.fillna(0)
        # Row-wise normalisation: each row is divided by its own total.
        self.taxa = zero_filled.div(zero_filled.sum(axis=1),axis=0)
        self.chao = chao_df.copy()
        self.info = info_df.copy()
        self.y_col = y_col
        # level -> column labels registered through set_filtered_taxa
        self.filtered_features = {}

    def set_filtered_taxa(self,data_f,level):
        """Register ``data_f`` as the feature labels to keep at ``level``."""
        self.filtered_features[level] = data_f

    def get_taxa(self,level,filtered):
        """Return the taxa table aggregated at ``level`` via ``get_phylums``.

        With a truthy ``filtered`` the result is reindexed to the labels
        registered for that level (``KeyError`` if none were registered).
        """
        aggregated = get_phylums(self.taxa,level)
        if not filtered:
            return aggregated
        return aggregated.reindex(columns=self.filtered_features[level])

    def get_taxa_from_ind(self,ind,level,filtered):
        """Return the rows ``ind`` of ``get_taxa(level, filtered)``."""
        return self.get_taxa(level,filtered).loc[ind,:]

    def get_data_from_ind(self,ind,level,filtered):
        """Return ``(taxa, chao, y)`` restricted to the rows ``ind``.

        ``y`` is a numpy array of category codes for the ``y_col`` column,
        encoded with the dtype found in ``my_dtype`` under that column name.
        """
        labels = self.info.loc[ind,self.y_col]
        label_codes = labels.astype(my_dtype[self.y_col]).cat.codes
        y = np.array(label_codes)
        taxa_rows = self.get_taxa_from_ind(ind,level,filtered)
        chao_rows = self.chao.loc[ind]
        return taxa_rows, chao_rows, y

In [None]:
def projects_table(data):
    """Count rows per (project_name, Host, Host_type) combination.

    Returns a one-column frame named 'Samples #', indexed by the three
    grouping keys and ordered by Host, then by count, both descending.
    """
    group_sizes = data.groupby(['project_name','Host','Host_type']).size()
    # First order by count alone, as a pre-sort before the final ordering.
    counts = group_sizes.sort_values(ascending=False).to_frame(name='Samples #')
    return counts.sort_values(['Host','Samples #'], ascending=False)