# Preprocesamiento

In [1]:
import pandas as pd
import numpy as np
import warnings
from biom import load_table


## Global Atlas

### Files
The Global Atlas dataset contained the following files:

* GlobalAtlas-16S/Dataset_01_22_2018.xlsx
* GlobalAtlas-16S/aap9516_Table_S1_v2.xlsx

In [2]:
# Parse the dataset workbook only once: given a list of sheet positions,
# read_excel returns a {position: DataFrame} dict, so sheets 0 and 2 come
# from a single pass over the file instead of two separate reads.
_dataset_sheets=pd.read_excel("../data/GlobalAtlas-16S/Dataset_01_22_2018.xlsx",sheet_name=[0,2])
GAtlas_SampleMeta=_dataset_sheets[0]
GAtlas_Raw=_dataset_sheets[2]
# First sheet of the supplementary table (taxonomy metadata).
GAtlas_TaxMeta=pd.read_excel("../data/GlobalAtlas-16S/aap9516_Table_S1_v2.xlsx",sheet_name=0)

First, the samples were matched to their corresponding longitude and latitude, and the OTU count per sample was included.
After this, the OTU counts of each sample were normalized to relative frequencies, so that the OTU values of every sample sum to 1.

In [None]:
# Transpose the raw sheet and label its columns with the values found in the
# "Dominant_taxa_ID/ID_Environmental" row, then drop the first row of the
# transposed table (presumably that same label row -- confirm against the sheet).
GAtlas_Partial=GAtlas_Raw.T
GAtlas_Partial.columns=GAtlas_Partial.loc["Dominant_taxa_ID/ID_Environmental"]
GAtlas_Partial=GAtlas_Partial.drop(GAtlas_Partial.index[0])

# Coordinates of every sample, re-indexed 1..N so that they align with the
# index of the transposed count table when concatenated column-wise.
GAtlas_Joined=GAtlas_SampleMeta[["Latitude","Longitude"]].copy()
GAtlas_Joined.index=np.arange(GAtlas_Joined.shape[0])+1
# A single astype call converts every column (replaces the per-column loop).
GAtlas_Joined=pd.concat(objs=[GAtlas_Joined,GAtlas_Partial],axis=1).astype(float)

# Normalise each row so that its OTU columns (everything after Latitude and
# Longitude) sum to 1. Dividing by the row sums in one vectorised operation
# replaces the row-by-row iloc loop and yields the same values.
GAtlas_Partial=GAtlas_Joined.iloc[:,2:]
GAtlas_Joined=pd.concat(
    [GAtlas_Joined.iloc[:,:2],GAtlas_Partial.div(GAtlas_Partial.sum(axis=1),axis=0)],
    axis=1,
)
# Release the intermediate table.
GAtlas_Partial=None

### Name Standardization
The taxonomic information was extracted as part of the name standardization process, in order to have the same name format as the EMP dataset.

In [4]:
# Keep the first seven columns of the taxonomy sheet, index them by "Taxa",
# and discard the three leading columns that remain (Order/Family/Genus are
# the ones used below).
GAtlas_Tax_Partial=GAtlas_TaxMeta.drop(columns=GAtlas_TaxMeta.columns[7:]).set_index("Taxa")
GAtlas_Tax_Partial=GAtlas_Tax_Partial.drop(columns=GAtlas_Tax_Partial.columns[:3])

# Collect the taxonomy row of every OTU column of GAtlas_Joined that is
# present in the taxonomy table (the first two columns are the coordinates).
taxser=[
    GAtlas_Tax_Partial.loc[taxon]
    for taxon in GAtlas_Joined.columns[2:]
    if taxon in GAtlas_Tax_Partial.index
]

GAtlas_Tax=pd.concat(taxser,axis=1).T
# Prefix each rank with its one-letter tag.
for rank_name,rank_prefix in (("Order","o__"),("Family","f__"),("Genus","g__")):
    GAtlas_Tax[rank_name]=rank_prefix+GAtlas_Tax[rank_name]
GAtlas_Tax.columns=["o","f","g"]
# Missing ranks become empty strings.
GAtlas_Tax=GAtlas_Tax.fillna("")
GAtlas_Tax_Partial=None

### OTU Taxonomic data extraction
The two datasets identify their OTUs in different formats, which matters because these identifiers are part of the data that feeds the neural network. Therefore, the taxonomic information of the OTUs was extracted and used for this purpose, because it avoids the use of the "taxid".

After this, the cumulative count of each taxonomic level of the OTUs was calculated for each sample.

For the GAtlas dataset all the samples were significant given the amount of them, hence the taxonomic data of all their OTUs was used.

In [None]:
# Start the result table with the coordinate columns of every sample.
GAtlas=GAtlas_Joined[["Latitude","Longitude"]].copy()

def decompose(identifier):
    """Return the leading, informative taxonomy labels of one OTU.

    Looks ``identifier`` up in the index of the module-level ``GAtlas_Tax``
    table and walks its labels in column order. The walk stops at the first
    label that is missing (not a string, e.g. NaN) or that is at most three
    characters long (a bare ``x__`` prefix or an empty string).

    Parameters
    ----------
    identifier : hashable
        Row label to look up in ``GAtlas_Tax.index``.

    Returns
    -------
    list
        The labels preceding the first uninformative one; an empty list when
        ``identifier`` is not in ``GAtlas_Tax.index``.
    """
    if identifier not in GAtlas_Tax.index:
        return []

    levels=list(GAtlas_Tax.loc[identifier])
    for position,label in enumerate(levels):
        # The previous check compared the loop index with np.nan, which is
        # always False; test the label itself so a NaN cannot reach len().
        if not isinstance(label,str) or len(label)<=3:
            return levels[:position]

    return levels

# One zero-filled column per distinct label found in each taxonomy column,
# sharing the sample index of GAtlas.
series_lst=[
    pd.Series(data=np.zeros(GAtlas.shape[0]),index=GAtlas.index,name=label)
    for rank in GAtlas_Tax.columns
    for label in GAtlas_Tax[rank].unique()
]
# Attach all columns in one concat and take a fresh copy of the result.
GAtlas=pd.concat([GAtlas,*series_lst],axis=1).copy()

from fastprogress import *
def PandasNightmares():
    """Accumulate OTU values into the taxon columns of ``GAtlas``.

    For every sample (row of ``GAtlas_Joined``) and every OTU column after the
    two coordinate columns, a positive value is added to each ``GAtlas``
    column named by ``decompose`` for that OTU. ``GAtlas`` is updated in
    place; nothing is returned.
    """
    global GAtlas
    sample_bar=master_bar(GAtlas_Joined.index)
    otu_labels=GAtlas_Joined.columns[2:]
    for sample in sample_bar:
        for ident in progress_bar(otu_labels,parent=sample_bar):
            value=GAtlas_Joined.loc[sample,ident]
            # Anything that is not a plain float64 scalar is unwrapped to its
            # first element (e.g. a Series returned for a repeated label).
            if type(value)!=np.float64:
                value=value.values[0]
            if not value>0:
                continue
            for taxon in decompose(ident):
                GAtlas.loc[sample,taxon]+=value

# Run the accumulation with every warning silenced for the duration of the
# call (presumably to hide the pandas warnings raised by the element-wise updates).
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    PandasNightmares()

In [6]:
# Square brackets are stripped from the taxonomic level names.
bracket_free=str.maketrans("","","[]")
newcol={}
for column_name in GAtlas.columns:
    newcol[column_name]=column_name.translate(bracket_free)
GAtlas.rename(columns=newcol,inplace=True)
GAtlas.columns

Index(['Latitude', 'Longitude', 'o__Rhizobiales', 'o__Actinomycetales',
       'o__Burkholderiales', 'o__Sphingomonadales', 'o__Solibacterales',
       'o__WD2101', 'o__Ellin329', 'o__Solirubrobacterales',
       ...
       'g__Rubellimicrobium', 'g__Planctomyces', 'g__Corallococcus',
       'g__Kibdelosporangium', 'g__OR-59', 'g__Methylobacterium',
       'g__Pseudomonas', 'g__Blastomonas', 'g__Friedmanniella',
       'g__Bdellovibrio'],
      dtype='object', length=191)

### GAtlas Resultant Table
The resulting table has one sample per row; its columns contain the Latitude, the Longitude, and the proportion of each taxonomic group in that sample.

In [7]:
# Preview the first rows of the Global Atlas result table.
GAtlas.head()

Unnamed: 0,Latitude,Longitude,o__Rhizobiales,o__Actinomycetales,o__Burkholderiales,o__Sphingomonadales,o__Solibacterales,o__WD2101,o__Ellin329,o__Solirubrobacterales,...,g__Rubellimicrobium,g__Planctomyces,g__Corallococcus,g__Kibdelosporangium,g__OR-59,g__Methylobacterium,g__Pseudomonas,g__Blastomonas,g__Friedmanniella,g__Bdellovibrio
1,-26.733333,-54.683333,0.287751,0.057317,0.002037,0.040442,0.009892,0.013093,0.004655,0.037824,...,0.0,0.0,0.000582,0.0,0.0,0.0,0.000291,0.0,0.0,0.0
2,64.8,-148.25,0.457778,0.122963,0.007407,0.004444,0.176296,0.002963,0.031111,0.002222,...,0.0,0.0,0.0,0.0,0.0,0.000741,0.001481,0.0,0.0,0.0
3,42.53,-72.19,0.399563,0.024017,0.00655,0.002183,0.203057,0.005459,0.108079,0.007642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001092,0.0,0.0,0.0
4,39.09,-96.57,0.176337,0.103312,0.013869,0.040759,0.007642,0.040476,0.01302,0.059723,...,0.0,0.0,0.0,0.001132,0.0,0.0,0.000283,0.000849,0.0,0.0
5,44.21,-122.26,0.346849,0.076676,0.027298,0.015255,0.020474,0.015255,0.028101,0.010839,...,0.0,0.0,0.0,0.000401,0.0,0.000401,0.0,0.000401,0.0,0.0


## EMP



### Files
A similar process was followed for the EMP data set. The files used in this section are the following:

* EMP-16S/emp_or_gg_13_8.release1_CAMDA_2019_sel.biom
* EMP-16S/CAMDA_2019_EMP_metainformation.tsv
* otu_info/EMP_otu_taxonomy @ EMP FTP

However, the file contains more than 2,900,000 samples, with their respective OTUs. Before processing this information, a subset of the 5000 most abundant OTUs across all samples was selected.

In [8]:
from biom import load_table
# Load the BIOM table, then swap its two axes in a separate step.
EMP_Raw=load_table(
    "../data/EMP-16S/emp_or_gg_13_8.release1_CAMDA_2019_sel.biom"
)
EMP_Raw=EMP_Raw.transpose()

In [9]:
# Convert the transposed BIOM table into a DataFrame.
# NOTE(review): this rebinds EMP_Raw, so running the cell a second time would
# call to_dataframe() on the converted object and presumably fail -- run it
# once per kernel, right after the loading cell.
EMP_Raw=EMP_Raw.to_dataframe()

### Data Filtration according to the most frequent OTU
After the data is uploaded, a count of each OTU frequency for the samples is done after its normalization. The count was used to filter the data.

The first 5000 OTUs with enough taxonomic information were used to form the final table.

In [None]:
# Total of every column of EMP_Raw, computed one column at a time so that the
# progress bar can report how far the pass has got.
n_otus=EMP_Raw.shape[1]
Frequencies=np.zeros(n_otus)
for col in progress_bar(range(n_otus)):
    otu_counts=EMP_Raw.iloc[:,col]
    Frequencies[col]=otu_counts.sum()

In [11]:
# Number of columns to keep.
K=5000
# Positions of the K largest column totals, largest first. A stable sort on
# the negated totals breaks ties by lowest position, which is the order the
# previous repeated-argmax loop produced. Unlike that loop it leaves
# Frequencies untouched (so the cell can be re-run), cannot select the same
# zero-valued column twice, and simply returns fewer than K positions when
# there are fewer than K columns.
Significant=np.argsort(-Frequencies,kind="stable")[:K].astype(int)

In [12]:
# Keep only the columns at the positions listed in Significant; .copy() makes
# the selection independent of EMP_Raw.
EMP_Sampled=EMP_Raw.iloc[:,Significant].copy()

In [13]:
# Drop the reference to the full table (EMP_Sampled is an independent copy),
# so that its memory can be reclaimed.
EMP_Raw=None

### Name Standardization
After obtaining the 5000 most frequent OTUs, the format was standardized according to the previously established format.
The names for Latitude and Longitude were also standardized.
Additionally, the taxonomic information for these OTUs was extracted and used to compute their frequencies.

In [14]:
# Sample metadata (tab separated).
EMP_Meta=pd.read_csv("../data/EMP-16S/CAMDA_2019_EMP_metainformation.tsv", sep="\t")
# Upper-case the values of the first column in one vectorised pass instead of
# assigning them row by row through .iloc; missing values stay missing rather
# than raising.
EMP_Meta[EMP_Meta.columns[0]]=EMP_Meta.iloc[:,0].str.upper()

EMP_Meta=EMP_Meta.set_index("SampleID")
# OTU taxonomy: one "; "-separated record per line, given explicit column
# names (ID followed by the seven ranks). The python engine is required for
# the multi-character separator.
EMP_TaxMeta=pd.read_csv("../data/EMP_otu_taxonomy.txt", sep="; ",names=["ID","k","p","c","o","f","g","s"], engine="python")
EMP_TaxMeta=EMP_TaxMeta.set_index("ID")

In [15]:
# Look every sample up in EMP_Meta by its upper-cased identifier and attach
# its coordinates. One vectorised lookup per column replaces the per-row .loc
# loop; .to_numpy() assigns by position, so the original sample labels of
# EMP_Sampled are kept. A sample missing from EMP_Meta still raises KeyError.
sample_keys=EMP_Sampled.index.str.upper()
EMP_Sampled["Latitude"]=EMP_Meta.loc[sample_keys,"latitude_deg"].to_numpy()
EMP_Sampled["Longitude"]=EMP_Meta.loc[sample_keys,"longitude_deg"].to_numpy()

In [16]:
import re

# Start from the two coordinate columns.
EMP_Indexed=EMP_Sampled[["Latitude","Longitude"]].copy()

# Raw string: "\d" in a normal string literal is an invalid escape sequence
# and triggers a warning on current Python versions. Compiled once, outside the loop.
trailing_digits=re.compile(r"\d+$")

series_lst=[EMP_Indexed]
# Every column except the last two (the coordinates added earlier) is renamed
# to the integer formed by the digits at the end of its label.
for c in EMP_Sampled.columns[:-2]:
    match=trailing_digits.findall(c)[0]

    series_lst.append(pd.Series(np.asarray(EMP_Sampled[c]),index=EMP_Sampled.index,name=int(match)))
EMP_Indexed=pd.concat(series_lst,axis=1)

In [17]:
# Taxonomy rows of the OTU columns of EMP_Indexed (everything after the two
# coordinate columns) that are present in the taxonomy table, one row per OTU.
taxser=[
    EMP_TaxMeta.loc[otu_id]
    for otu_id in EMP_Indexed.columns[2:]
    if otu_id in EMP_TaxMeta.index
]
EMP_TaxFiltered=pd.concat(taxser,axis=1).T

After this, the name format was set as the one used in the GAtlas dataset.

In [None]:
# Start the result table with the coordinate columns of every sample.
EMP=EMP_Indexed[["Latitude","Longitude"]].copy()

def decompose(identifier):
    """Return the leading, informative rank labels of one EMP OTU.

    ``identifier`` is converted to ``np.int64`` and looked up in the index of
    the module-level ``EMP_TaxFiltered`` table. Of that row, the labels at
    positions 3 up to (not including) the last one are considered, in order;
    the result stops before the first label of at most three characters.

    Returns an empty list when the identifier is not in the table.
    """
    otu_id=np.int64(identifier)
    if otu_id not in EMP_TaxFiltered.index:
        return []

    candidate_labels=list(EMP_TaxFiltered.loc[otu_id])[3:-1]
    informative=[]
    for label in candidate_labels:
        # A label this short carries only the rank prefix.
        if len(label)<=3:
            break
        informative.append(label)
    return informative

# One zero-filled column per distinct label, sharing the sample index of EMP.
# NOTE(review): `[3:-1]` is applied to a single column (a Series), so it
# appears to skip the first three and the last *rows* of every taxonomy
# column, whereas decompose() applies [3:-1] to the rank positions of a row
# -- confirm this is intended.
series_lst=[
    pd.Series(data=np.zeros(EMP.shape[0]),index=EMP.index,name=label)
    for rank in EMP_TaxFiltered.columns
    for label in EMP_TaxFiltered[rank][3:-1].unique()
]
# Attach all columns in one concat and take a fresh copy of the result.
EMP=pd.concat([EMP,*series_lst],axis=1).copy()

from fastprogress import *
def PandasNightmares():
    """Accumulate OTU values into the taxon columns of ``EMP``.

    For every sample (row of ``EMP_Indexed``) and every OTU column after the
    two coordinate columns, a positive value is added to each ``EMP`` column
    named by ``decompose`` for that OTU. ``EMP`` is updated in place; nothing
    is returned.
    """
    global EMP
    sample_bar=master_bar(EMP_Indexed.index)
    otu_ids=EMP_Indexed.columns[2:]

    for sample in sample_bar:
        for ident in progress_bar(otu_ids,parent=sample_bar):
            value=EMP_Indexed.loc[sample,ident]
            # Anything that is not a plain float64 scalar is unwrapped to its
            # first element (e.g. a Series returned for a repeated label).
            if type(value)!=np.float64:
                value=value.values[0]
            if not value>0:
                continue
            for taxon in decompose(ident):
                EMP.loc[sample,taxon]+=value
        
import warnings
# Run the accumulation with every warning silenced for the duration of the
# call (presumably to hide the pandas warnings raised by the element-wise updates).
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    PandasNightmares()

In [None]:
# Strip the square brackets from the column names.
newcol={c:c.replace("[","").replace("]","") for c in EMP.columns}
EMP.rename(columns=newcol,inplace=True)
# Remove the placeholder columns whose name is only a rank prefix. A single
# drop replaces five separate calls; errors="ignore" keeps the cell
# re-runnable once the columns are already gone.
EMP.drop(columns=["c__","o__","f__","g__","s__"],inplace=True,errors="ignore")
# Upper-case the first character after the three-character rank prefix
# (presumably to match the capitalisation of the Global Atlas column names).
# c[3:4] instead of c[3] keeps a name of three characters or fewer from
# raising IndexError.
newcol={c:c[:3]+c[3:4].upper()+c[4:] for c in EMP.columns[2:]}
EMP.rename(columns=newcol,inplace=True)

### EMP dataset resultant table
The resulting table has one sample per row; its columns contain the Latitude, the Longitude, and the proportion of each taxonomic group in that sample.

In [None]:
# Preview the first rows of the EMP result table.
EMP.head()

## Merged table

In [None]:
# Outer-merge the two result tables and replace every resulting NaN with 0.
# No `on=` is given, so pandas joins on all column names the two frames share.
# NOTE(review): fillna(0) also turns any missing Latitude/Longitude into 0 --
# confirm that is acceptable for downstream use.
Merged=pd.merge(EMP,GAtlas,how="outer").fillna(0)

In [None]:
# Preview the first rows of the merged table.
Merged.head()