In [1]:
import pandas as pd

### Westernized samples from [curatedMetagenomicsData](http://bioconductor.org/packages/release/data/experiment/html/curatedMetagenomicData.html) package

In [2]:
west_count = pd.read_csv("curated_metagenomics/data/westernized_count.csv", index_col=0)

Choosing randomly 100 samples

In [3]:
west_count = west_count.sample(100, axis=1, random_state=42)

In [4]:
west_count.head()

Unnamed: 0,ZeeviD_2015.metaphlan_bugs_list.stool:PNP_Main_265,VatanenT_2016.metaphlan_bugs_list.stool:G80343,NielsenHB_2014.metaphlan_bugs_list.stool:MH0039,LiJ_2014.metaphlan_bugs_list.stool:O2.UC37-1,ChengpingW_2017.metaphlan_bugs_list.stool:AS96raw,LouisS_2016.metaphlan_bugs_list.stool:AS64_3,HMP_2012.metaphlan_bugs_list.stool:SRS014613,NielsenHB_2014.metaphlan_bugs_list.stool:O2_UC31_0,NielsenHB_2014.metaphlan_bugs_list.stool:MH0146,ThomasAM_2018b.metaphlan_bugs_list.stool:CRC_MR_SBJ55C_17,...,LouisS_2016.metaphlan_bugs_list.stool:AS62_12,NielsenHB_2014.metaphlan_bugs_list.stool:MH0118,VatanenT_2016.metaphlan_bugs_list.stool:G80356,BackhedF_2015.metaphlan_bugs_list.stool:SID195_12M,JieZ_2017.metaphlan_bugs_list.stool:SAMEA104142343,ZeeviD_2015.metaphlan_bugs_list.stool:PNP_Main_111,SchirmerM_2016.metaphlan_bugs_list.stool:G88935,VatanenT_2016.metaphlan_bugs_list.stool:G80272,ZeeviD_2015.metaphlan_bugs_list.stool:PNP_Validation_87,ZeeviD_2015.metaphlan_bugs_list.stool:PNP_Main_464
k__Bacteria,30791437,34027498,42700343,93568024,46643834,23014264,109953562,61969008,57590023,68372734,...,21279037,59182692,43512088,46307677,44856805,29125652,28566971,34473143,9163795,39983010
k__Bacteria|p__Bacteroidetes,1875829,22908002,26700859,42671303,25230447,19532454,82516619,18862646,44921316,6786688,...,10983798,8784024,25434106,809539,22753291,10559236,5003525,19096469,5130207,11544595
k__Bacteria|p__Proteobacteria,751053,500442,642897,1164701,242038,431094,905731,671689,488953,83390,...,169018,207305,654994,1066795,652661,247656,200137,2896982,276433,794393
k__Bacteria|p__Firmicutes,23260022,9800467,14863258,33589060,20481299,2090558,26318276,39041970,8494594,58886157,...,9602898,48121955,7916162,34917666,20922678,17359765,19518406,12080584,3526189,25196111
k__Bacteria|p__Actinobacteria,4902410,410133,379707,15992546,492113,960157,66478,3385462,93473,2605744,...,179291,2064901,3628071,9512695,480159,959000,3172648,390932,129729,2217743


### Non-westernized samples from [curatedMetagenomicsData](http://bioconductor.org/packages/release/data/experiment/html/curatedMetagenomicData.html) package

In [5]:
non_west_count = pd.read_csv("curated_metagenomics/data/non_westernized_count.csv", index_col=0)

In [6]:
non_west_count = non_west_count.sample(100, axis=1, random_state=42)

### Sediment samples from Archaelogical sediments

In [7]:
sediment_count = pd.read_csv("PRJEB18629/data/sediment_count.csv", index_col=1).drop("Unnamed: 0", 1)

### Marsha's samples

In [8]:
import os

In [9]:
def get_samp_basename(samp_name):
    return("_".join(samp_name.split("_")[0:2]))

In [10]:
def read_sample(samp_name):
    return(pd.read_csv("wibowo_mp/data/"+samp_name, 
                       sep="\t", index_col="#SampleID")
           .rename(columns={'Metaphlan2_Analysis':get_samp_basename(samp_name)}))

In [11]:
wib_metadata = pd.read_csv("wibowo_mp/samples_metadata.txt", sep="\t", index_col=0)

In [12]:
wib_metadata['total_reads'] = wib_metadata["Number of reads (pair 1)"] +  wib_metadata['Number of reads (pair 2)']

In [13]:
soil_samp = ['Lib4_10','Lib4_11','Lib4_12']

In [14]:
wib_samp = [i for i in os.listdir("wibowo_mp/data/")]

In [15]:
wib = read_sample(wib_samp[0])

In [16]:
for i in wib_samp[1:]:
    wib = wib.merge(read_sample(i), left_index=True, right_index=True, how='outer')

In [17]:
wib = wib.fillna(0)

In [18]:
for col in wib.columns:
    wib[col] = wib[col]/100*wib_metadata.loc[col,'total_reads']

### Merging all samples

In [19]:
all_samp = (west_count.merge(non_west_count, left_index=True, right_index=True, how='outer')
            .merge(sediment_count, left_index=True, right_index=True, how='outer')
            .merge(wib, left_index=True, right_index=True, how='outer').fillna(0))

In [20]:
all_samp.shape

(6558, 314)

Keeping only Taxons that occur in more than 0.1% of samples

In [21]:
all_samp = all_samp.loc[all_samp.astype(bool).sum(axis=1) >= int(all_samp.shape[1]/1000),:]

Keeping only samples that have more 10000 mapping reads

In [22]:
all_samp = all_samp.loc[:,all_samp.sum(axis=0) >= 10000]

In [23]:
all_samp.shape

(6558, 289)

### Preparing metadata 

In [24]:
meta_west = pd.Series(index=west_count.columns, data=['westernized_humans']*west_count.shape[1]).to_frame(name='Env')
meta_west['SourceSink'] = ["source"]*west_count.shape[1]

In [25]:
meta_non_west = pd.Series(index=non_west_count.columns, data=['non_westernized_humans']*non_west_count.shape[1]).to_frame(name='Env')
meta_non_west['SourceSink'] = ["source"]*non_west_count.shape[1]

In [26]:
meta_sediment = pd.Series(index=sediment_count.columns, data=['archeo_sediment']*sediment_count.shape[1]).to_frame(name='Env')
meta_sediment['SourceSink'] = ["source"]*sediment_count.shape[1]

In [27]:
meta_wib_sink = pd.Series(index=wib.columns, data=["-"]*wib.shape[1]).to_frame(name='Env')
meta_wib_sink['SourceSink'] = ['sink']*wib.shape[1]
meta_wib_sink.drop(soil_samp, axis=0, inplace=True)

In [28]:
meta_wib_source = pd.Series(index=soil_samp, data=['archeo_sediment']*len(soil_samp)).to_frame(name="Env")
meta_wib_source['SourceSink'] = ['source']*len(soil_samp)

In [29]:
meta = meta_west.append(meta_non_west).append(meta_wib_sink).append(meta_sediment).append(meta_wib_source)

In [30]:
meta = meta[['SourceSink','Env']]

Checking for concordance between metadata and data

In [31]:
all_samp=all_samp[meta.index.intersection(all_samp.T.index)]

In [32]:
all_samp = all_samp.astype(int)

In [33]:
meta = meta.loc[meta.index.intersection(all_samp.T.index),:]

In [34]:
meta.Env.value_counts()

westernized_humans        100
non_westernized_humans    100
archeo_sediment            78
-                          11
Name: Env, dtype: int64

### Exporting to csv

In [35]:
all_samp.to_csv("all_samp_st2.tsv", sep="\t", index_label='Taxon')

In [36]:
meta.to_csv("labels_st2.tsv", sep="\t", index_label='#SampleID')

### Converting counts file to biom format

In [37]:
!biom convert -i all_samp_st2.tsv -o all_samp_st2.biom --table-type="Taxon table" --to-hdf5

Minimum read count for SourceTracker rarefaction depth

In [38]:
all_samp.sum(axis=0).min()

10470