In [6]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Put the sibling sourcepredictlib directory first on the import path so the
# modules imported below can be found from this notebook.
import sys
sys.path.insert(0, '../sourcepredictlib/')

# Presumably resolved through the path added above (project-local modules).
import normalize
import utils

In [635]:
class sourceforest():
    """Container for source/sink count tables, their labels and derived features.

    Typical usage is ``__init__`` -> ``add_unknown_2`` -> ``normalize``.

    Attributes:
        source: DataFrame read from the ``source`` CSV (first column as index).
        tmp_sink: DataFrame read from the ``sink`` CSV as int64.
        combined: outer join of ``source`` and ``tmp_sink`` on the index,
            missing values filled with 0.
        y: label Series; after ``normalize`` it is the source labels followed
            by the ``'unknown'`` labels.
        unk, unk_labs: simulated unknown columns and their labels, set by
            ``add_unknown_2`` (``None`` until then).
        normalized, feat, sink: tables produced by ``normalize``.
    """

    def __init__(self, source, sink, labels):
        """Read the three CSV inputs and build the combined table.

        Args:
            source: path of the source CSV; its first column becomes the index.
            sink: path of the sink CSV, read with ``dtype='int64'``; its first
                column becomes the index.
            labels: path of a CSV holding a ``labels`` column; its first
                column becomes the index.
        """
        self.source = pd.read_csv(source, index_col=0)
        y = pd.read_csv(labels, index_col=0)
        # Keep the pristine source labels separately: ``self.y`` is later
        # extended with the unknown labels and must not feed back into the
        # class-size computation or be extended twice.
        self._source_labels = y['labels']
        self.y = self._source_labels
        self.tmp_sink = pd.read_csv(sink, index_col=0, dtype='int64')
        self.combined = pd.merge(
            left=self.source, right=self.tmp_sink, how='outer',
            left_index=True, right_index=True).fillna(0)
        self.unk = None
        self.unk_labs = None

    def __repr__(self):
        return f'A sourceforest object of source {self.source} and sink {self.tmp_sink}'

    def add_unknown_2(self, alpha, seed=None):
        """Simulate the ``unknown`` class from the sink column.

        ``UNKNOWN_0`` is ``floor(sink * alpha)``. Every further replicate
        ``UNKNOWN_i`` is drawn element-wise from a normal distribution centred
        on ``UNKNOWN_0`` with a standard deviation of 10% of that value, and
        truncated to an integer. The number of columns equals the average
        number of source samples per label (truncated), with a minimum of one.

        Results are stored in ``self.unk`` (DataFrame) and ``self.unk_labs``
        (Series of ``'unknown'`` indexed by the new column names).

        Args:
            alpha: factor applied to the sink values before flooring.
            seed: optional seed for reproducible draws. ``None`` (default)
                uses NumPy's global random state, as before.

        Raises:
            ValueError: if the sink table does not hold exactly one column.
        """
        if self.tmp_sink.shape[1] != 1:
            raise ValueError(
                f"sink must contain exactly one column, got {self.tmp_sink.shape[1]}")

        # Class sizes come from the source labels only, so repeated calls are
        # not skewed by 'unknown' labels added to ``self.y`` by ``normalize``.
        class_sizes = Counter(self._source_labels)
        label_avg = int(np.mean(list(class_sizes.values())))

        rng = np.random if seed is None else np.random.RandomState(seed)

        base = np.floor(self.tmp_sink.iloc[:, 0].to_numpy() * alpha)
        columns = {"UNKNOWN_0": base}
        for i in range(1, label_avg):
            # One vectorised draw per replicate; astype truncates toward zero
            # exactly like int() did on each value.
            columns[f"UNKNOWN_{i}"] = rng.normal(base, 0.1 * base).astype('int64')

        self.unk = pd.DataFrame(columns, index=self.tmp_sink.index)
        self.unk_labs = pd.Series(data=['unknown'] * len(columns), index=list(columns))

    def normalize(self, method):
        """Normalize the combined table and derive ``feat``, ``sink`` and ``y``.

        The normalized table is outer-joined with the simulated unknown
        columns. ``feat`` is the transposed table without the sink column,
        restricted to columns whose 0.8 quantile is positive. ``sink`` is the
        transposed sink column restricted to the same columns. ``y`` becomes
        the source labels followed by the unknown labels; calling this method
        repeatedly yields the same ``y``.

        Args:
            method: one of ``'RLE'``, ``'SUBSAMPLE'`` or ``'CLR'``.

        Raises:
            ValueError: if ``method`` is not one of the supported names.
            RuntimeError: if ``add_unknown_2`` has not been called yet.
        """
        normalizer_names = {
            'RLE': 'RLE_normalize',
            'SUBSAMPLE': 'subsample_normalize_pd',
            'CLR': 'CLR_normalize',
        }
        if method not in normalizer_names:
            raise ValueError(
                f"unknown normalization method {method!r}; "
                f"expected one of {sorted(normalizer_names)}")
        if getattr(self, 'unk', None) is None:
            raise RuntimeError("add_unknown_2() must be called before normalize()")

        normalized = getattr(normalize, normalizer_names[method])(self.combined)
        self.normalized = pd.merge(
            left=normalized, right=self.unk, how='outer',
            left_index=True, right_index=True).fillna(0)

        feat = self.normalized.drop(self.tmp_sink.columns, axis=1).T
        informative = feat.columns[feat.quantile(0.8, 0) > 0]
        self.feat = feat.loc[:, informative]

        sink = self.normalized.drop(self.source.columns, axis=1).drop(
            self.unk.columns, axis=1).T
        self.sink = sink.loc[:, self.feat.columns]

        # Series.append was removed in pandas 2.0; rebuilding from the pristine
        # source labels also keeps repeated calls from stacking duplicates.
        self.y = pd.concat([self._source_labels, self.unk_labs])

In [636]:
# Load the source table, one sink sample and the source labels into a
# sourceforest object.
a = sourceforest(
    source="../data/sourcepredict_sources.csv",
    sink="../data/test/dog_test_sample.csv",
    labels="../data/sourcepredict_labels.csv",
)

In [637]:
# The unknown columns must exist before normalizing: normalize() merges
# them into the normalized table and appends their labels to a.y.
a.add_unknown_2(alpha=0.1)
a.normalize(method='RLE')

In [638]:
# Labels after normalize(): the source labels with the 'unknown' labels appended.
a.y

ERR1914197    Canis_familiaris
ERR1914272    Canis_familiaris
ERR1914092    Canis_familiaris
ERR1914908    Canis_familiaris
ERR1914999    Canis_familiaris
ERR1914572    Canis_familiaris
ERR1914926    Canis_familiaris
ERR1914242    Canis_familiaris
ERR1914475    Canis_familiaris
ERR1914938    Canis_familiaris
ERR1914917    Canis_familiaris
ERR1914619    Canis_familiaris
ERR1914543    Canis_familiaris
ERR1914476    Canis_familiaris
ERR1914069    Canis_familiaris
ERR1914108    Canis_familiaris
ERR1914652    Canis_familiaris
ERR1914484    Canis_familiaris
ERR1914073    Canis_familiaris
ERR1914029    Canis_familiaris
ERR1914262    Canis_familiaris
ERR1914800    Canis_familiaris
ERR1914850    Canis_familiaris
ERR1914978    Canis_familiaris
ERR1914553    Canis_familiaris
ERR1914227    Canis_familiaris
ERR1914065    Canis_familiaris
ERR1914397    Canis_familiaris
ERR1914070    Canis_familiaris
ERR1914166    Canis_familiaris
                    ...       
UNKNOWN_29             unknown
UNKNOWN_

In [582]:
def add_unknown(comb, sink, alpha, labels, seed=None):
    '''
    Simulate 'unknown' samples from a single-column sink table.

    UNKNOWN_0 is floor(sink * alpha). Each further column UNKNOWN_i is drawn
    element-wise from a normal distribution centred on UNKNOWN_0 with a
    standard deviation of 10% of that value, truncated to an integer. The
    number of columns is the average number of samples per label in
    `labels` (truncated), with a minimum of one.

    comb: unused; kept so existing calls keep working
    sink: single-column DataFrame of counts (not modified)
    alpha: proportion of unknown for each OTU
    labels: iterable of class labels used to size the unknown class
    seed: optional seed for reproducible draws; None (default) uses NumPy's
        global random state

    Returns a tuple (unknown DataFrame, Series of 'unknown' labels indexed by
    the new column names).

    Raises ValueError if `sink` does not hold exactly one column.
    '''
    if sink.shape[1] != 1:
        raise ValueError(
            f"sink must contain exactly one column, got {sink.shape[1]}")

    label_avg = int(np.mean(list(Counter(labels).values())))

    rng = np.random if seed is None else np.random.RandomState(seed)

    base = np.floor(sink.iloc[:, 0].to_numpy() * alpha)
    columns = {'UNKNOWN_0': base}
    for i in range(1, label_avg):
        # One vectorised draw per replicate; astype truncates toward zero
        # exactly like int() did on each value.
        columns[f"UNKNOWN_{i}"] = rng.normal(base, 0.1 * base).astype('int64')

    tmp_unk = pd.DataFrame(columns, index=sink.index)
    ukn_labs = pd.Series(data=['unknown'] * len(columns), index=list(columns))
    return (tmp_unk, ukn_labs)

In [583]:
# b is the (unknown table, unknown labels) tuple; the labels passed in
# determine how many UNKNOWN_i columns are generated.
b = add_unknown(
    comb=a.combined,
    sink=a.tmp_sink,
    alpha=0.1,
    labels=a.y,
)

In [585]:
# Second element of the returned tuple: the Series of 'unknown' labels.
b[1]

UNKNOWN_0     unknown
UNKNOWN_1     unknown
UNKNOWN_2     unknown
UNKNOWN_3     unknown
UNKNOWN_4     unknown
UNKNOWN_5     unknown
UNKNOWN_6     unknown
UNKNOWN_7     unknown
UNKNOWN_8     unknown
UNKNOWN_9     unknown
UNKNOWN_10    unknown
UNKNOWN_11    unknown
UNKNOWN_12    unknown
UNKNOWN_13    unknown
UNKNOWN_14    unknown
UNKNOWN_15    unknown
UNKNOWN_16    unknown
UNKNOWN_17    unknown
UNKNOWN_18    unknown
UNKNOWN_19    unknown
UNKNOWN_20    unknown
UNKNOWN_21    unknown
UNKNOWN_22    unknown
UNKNOWN_23    unknown
UNKNOWN_24    unknown
UNKNOWN_25    unknown
UNKNOWN_26    unknown
UNKNOWN_27    unknown
UNKNOWN_28    unknown
UNKNOWN_29    unknown
UNKNOWN_30    unknown
UNKNOWN_31    unknown
UNKNOWN_32    unknown
UNKNOWN_33    unknown
UNKNOWN_34    unknown
UNKNOWN_35    unknown
UNKNOWN_36    unknown
UNKNOWN_37    unknown
UNKNOWN_38    unknown
UNKNOWN_39    unknown
UNKNOWN_40    unknown
UNKNOWN_41    unknown
UNKNOWN_42    unknown
UNKNOWN_43    unknown
UNKNOWN_44    unknown
UNKNOWN_45