In [1]:
import pandas as pd
import numpy as np

In [24]:
def _numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it does not parse."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Leave genuinely non-numeric columns exactly as they were read.
        return column


def add_new_data(old_data, new_data, old_labels, label):
    """
    Update the sourcepredict learning table with the samples of a new OTU table.

    INPUT:
        old_data(str): path to csv file of existing sourcepredict source data table.
            Its first column is read as the index; a row named 'labels' is
            dropped if present (a table saved from a previous call has none).
        new_data(str): path to csv file of new OTU table, with a TAXID column
            (conventionally the 1st column); every other column is a sample
        old_labels(str): path to sourcepredict csv file of labels
        label(str): scientific name of the new samples' species. Example: 'Sus_scrofa'
    OUTPUT:
        merged(pd.DataFrame): merged old and new source data table for sourcepredict,
            outer-joined on TAXID, with missing counts filled with 0
        labels(pd.DataFrame): updated labels data table (old labels followed by
            one row per new sample, all set to `label`)
    """
    old = pd.read_csv(old_data, index_col=0)
    # errors='ignore': a table without the 'labels' row must not raise KeyError.
    old = old.drop(['labels'], axis=0, errors='ignore')
    # If the 'labels' row held text, read_csv typed the sample columns as
    # strings; convert them back so the merged table is numeric throughout.
    old = old.apply(_numeric_if_possible)

    new = pd.read_csv(new_data)

    # Outer join keeps taxa seen in either table; a taxon absent from one
    # side gets a count of 0 for that side's samples.
    merged = pd.merge(left=old, right=new, how='outer', on='TAXID')
    merged = merged.fillna(0)

    known_labels = pd.read_csv(old_labels, index_col=0)
    # Every column of the new table except the join key is a sample id.
    sample_ids = new.columns.drop('TAXID')
    new_labels = pd.DataFrame({'labels': [label] * len(sample_ids)}, index=sample_ids)
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    labels = pd.concat([known_labels, new_labels])
    return merged, labels

In [25]:
# Existing sourcepredict inputs: the source table and its labels file (absolute local paths).
old_data = '/Users/borry/Documents/GitHub/sourcepredict/data/dog_human_pig_sources.csv'
old_labels = '/Users/borry/Documents/GitHub/sourcepredict/data/labels.csv'
# New OTU table whose samples are added to the source table.
new_data = '/Users/borry/mnt/kraken_profiling/hmp_results/merged/kraken_merged_non_norm.csv'
# Label given to every sample of the new table.
label = 'human'

In [26]:
# Run the merge and keep only the first returned element: the merged source table.
res = add_new_data(old_data, new_data, old_labels, label)[0]

In [27]:
# Display the merged source table returned by add_new_data.
res

Unnamed: 0,TAXID,ERR1914197,ERR1914272,ERR1914092,ERR1914908,ERR1914999,ERR1914572,ERR1914926,ERR1914242,ERR1914475,...,SRR1175001,SRR062388,SRR346691,SRR1179027,SRR059395,SRR1179043,SRR1175003,SRR646438,SRR1179046,SRR642022
0,0.0,1151107.0,2971753.0,2176978.0,2179152.0,2785744.0,2415209.0,2325619.0,4328242.0,1719638.0,...,6185930.0,16614742.0,27882223.0,6744487.0,17955643.0,6081629.0,1215071.0,10554992.0,5697083.0,14131163.0
1,561.0,145.0,124.0,0.0,90.0,15451.0,0.0,99.0,91.0,71.0,...,0.0,754.0,3952.0,0.0,123.0,0.0,2797.0,2408.0,0.0,6225.0
2,286.0,58.0,71.0,177.0,101.0,105.0,112.0,101.0,182.0,0.0,...,250.0,673.0,1433.0,174.0,785.0,205.0,106.0,2981452.0,93.0,2590844.0
3,32008.0,296.0,384.0,936.0,210.0,1021.0,892.0,250.0,155319.0,132.0,...,67.0,0.0,221.0,0.0,258.0,0.0,0.0,187.0,0.0,125.0
4,194.0,490.0,181.0,518.0,735.0,2627.0,340.0,753.0,164.0,137.0,...,1539.0,2660.0,4151.0,1481.0,795.0,1378.0,118.0,1232.0,232.0,196.0
5,31988.0,108.0,77.0,234.0,72.0,154.0,104.0,69.0,84.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,572511.0,13560.0,16025.0,77685.0,6095.0,64073.0,53044.0,6570.0,13566.0,12353.0,...,2119.0,7732.0,8362.0,2058.0,5149.0,2019.0,313.0,2596.0,2436.0,536.0
7,1506553.0,766.0,503.0,2190.0,585.0,2544.0,1368.0,609.0,583.0,501.0,...,7224.0,59523.0,14989.0,7868.0,17126.0,7644.0,726.0,10333.0,2991.0,857.0
8,841.0,169.0,176.0,718.0,187.0,677.0,360.0,220.0,198.0,158.0,...,9585.0,44235.0,42226.0,14973.0,21795.0,14058.0,892.0,11076.0,12259.0,1578.0
9,207244.0,57.0,129.0,430.0,113.0,558.0,375.0,138.0,96.0,91.0,...,2058.0,14290.0,17825.0,2948.0,4473.0,2752.0,3595.0,2722.0,6943.0,1038.0


In [29]:
# Save the merged source table under a new file name, next to the original table.
res.to_csv(path_or_buf="/Users/borry/Documents/GitHub/sourcepredict/data/dog_human_pig_sources_new.csv")

In [30]:
# Same call as for `res` (the whole merge is run again); this time keep the
# second returned element: the updated labels table.
labs = add_new_data(old_data, new_data, old_labels, label)[1]

In [31]:
# Save the updated labels table.
# NOTE(review): this is the same path that `old_labels` points to, so the input
# labels file is overwritten; running add_new_data again afterwards will add the
# new samples' labels a second time. Confirm that overwriting is intended.
labs.to_csv(path_or_buf="/Users/borry/Documents/GitHub/sourcepredict/data/labels.csv")