In [1]:
import pandas as pds
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

## function to execute SQL (SQLite compliant) over pandas dataframes
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals()) 

## function for printing pandas tables as markdown
from tabulate import tabulate
def print_pandas_md(df):
    print(tabulate(df, tablefmt="pipe", headers="keys"))

# Build data frame to explore values that map to EVNO classes

In [2]:
all_data_df = pds.read_csv("Biosample_all.tsv.gz", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
elevels = ['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_TYPE', 'ECOSYSTEM_SUBTYPE', 'SPECIFIC_ECOSYSTEM']
elevelsdf = all_data_df[elevels]

### Build top 7 dataset (cf. previous analyis in gold-env-elevels-analysis.ipynb)

In [4]:
## create EAV dataframe
## fist give the index column a name and reset
elevelsdf.index.name = 'id'
elevelsdf.reset_index(inplace=True)
elevelsdf.head()

Unnamed: 0,id,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE,ECOSYSTEM_SUBTYPE,SPECIFIC_ECOSYSTEM
0,0,Host-associated,Plants,Phyllosphere,Caulosphere,
1,1,Engineered,Food production,Dairy products,,
2,2,Engineered,Food production,Dairy products,,
3,3,Engineered,Food production,Dairy products,,
4,4,Engineered,Food production,Dairy products,,


In [5]:
## next use melt function to unpivot/unstack the data
eavdf = elevelsdf.melt(id_vars='id')
eavdf.sort_values(by=['id'], inplace=True)
print(pds.DataFrame(elevelsdf.iloc[8024])) ## print a random record from the original df and eav to make sure they values match
print(eavdf[eavdf.id == 8024])

                             8024
id                           8024
ECOSYSTEM           Environmental
ECOSYSTEM_CATEGORY        Aquatic
ECOSYSTEM_TYPE         Freshwater
ECOSYSTEM_SUBTYPE     Groundwater
SPECIFIC_ECOSYSTEM            NaN
          id            variable          value
264600  8024  SPECIFIC_ECOSYSTEM            NaN
72168   8024  ECOSYSTEM_CATEGORY        Aquatic
200456  8024   ECOSYSTEM_SUBTYPE    Groundwater
8024    8024           ECOSYSTEM  Environmental
136312  8024      ECOSYSTEM_TYPE     Freshwater


In [6]:
## get totals for each variable/value pair
q = """
select 
    variable, value, count(*) as total
from 
    eavdf
group by 
    variable, value
order by 
    variable, total desc
"""
totalsdf = pysqldf(q)
totalsdf.head(10)

Unnamed: 0,variable,value,total
0,ECOSYSTEM,Host-associated,30868
1,ECOSYSTEM,Environmental,27954
2,ECOSYSTEM,Engineered,5211
3,ECOSYSTEM,,111
4,ECOSYSTEM_CATEGORY,Human,19719
5,ECOSYSTEM_CATEGORY,Aquatic,19622
6,ECOSYSTEM_CATEGORY,Terrestrial,8214
7,ECOSYSTEM_CATEGORY,Mammals,4151
8,ECOSYSTEM_CATEGORY,Plants,3467
9,ECOSYSTEM_CATEGORY,Built environment,1890


In [7]:
len(totalsdf)

395

In [8]:
## remove NaN from totals ... we aren't interested in the number of missing values
dropna_totalsdf = totalsdf.dropna()

In [9]:
len(dropna_totalsdf)

390

In [10]:
q = """
WITH TOPVALS AS (
    SELECT *, ROW_NUMBER() 
    over (
        PARTITION BY variable
        order by total desc
    ) AS RowNo 
    FROM dropna_totalsdf
)
SELECT * FROM TOPVALS WHERE RowNo <= 7
"""
top7_eavdf = pysqldf(q)
top7_eavdf.head(10)

Unnamed: 0,variable,value,total,RowNo
0,ECOSYSTEM,Host-associated,30868,1
1,ECOSYSTEM,Environmental,27954,2
2,ECOSYSTEM,Engineered,5211,3
3,ECOSYSTEM_CATEGORY,Human,19719,1
4,ECOSYSTEM_CATEGORY,Aquatic,19622,2
5,ECOSYSTEM_CATEGORY,Terrestrial,8214,3
6,ECOSYSTEM_CATEGORY,Mammals,4151,4
7,ECOSYSTEM_CATEGORY,Plants,3467,5
8,ECOSYSTEM_CATEGORY,Built environment,1890,6
9,ECOSYSTEM_CATEGORY,Wastewater,1478,7


In [11]:
top7_eavdf[top7_eavdf.RowNo == 5] # test out the logic: should be able to filter on RowNo to get different counts

Unnamed: 0,variable,value,total,RowNo
7,ECOSYSTEM_CATEGORY,Plants,3467,5
14,ECOSYSTEM_SUBTYPE,Lake,1579,5
21,ECOSYSTEM_TYPE,Unclassified,1585,5
28,SPECIFIC_ECOSYSTEM,Agricultural land,725,5


In [12]:
len(top7_eavdf)
top7_eavdf.variable.unique()

array(['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_SUBTYPE',
       'ECOSYSTEM_TYPE', 'SPECIFIC_ECOSYSTEM'], dtype=object)

In [13]:
## create tuples of values in top7_eav
top7_values = tuple(top7_eavdf['value'].unique())
ecosytem_values = tuple(top7_eavdf[top7_eavdf.variable == 'ECOSYSTEM']['value'].unique())
ecosytem_category_values = tuple(top7_eavdf[top7_eavdf.variable == 'ECOSYSTEM_CATEGORY']['value'].unique())
ecosytem_type_values = tuple(top7_eavdf[top7_eavdf.variable == 'ECOSYSTEM_TYPE']['value'].unique())
ecosytem_subtype_values = tuple(top7_eavdf[top7_eavdf.variable == 'ECOSYSTEM_SUBTYPE']['value'].unique())
specific_ecosytem_values = tuple(top7_eavdf[top7_eavdf.variable == 'SPECIFIC_ECOSYSTEM']['value'].unique())
specific_ecosytem_values
# top7_eavdf[['variable', 'value']].pivot(columns='variable', values='value')

('Fecal',
 'Unclassified',
 'Sediment',
 'Forest Soil',
 'Agricultural land',
 'Serum',
 'Microbial mats')

In [14]:
## build dataframe using top 7 values from each field

q = f"""
select 
    *
from
    elevelsdf
where
    ECOSYSTEM in {ecosytem_values}
    and ECOSYSTEM_CATEGORY in {ecosytem_category_values}
    and ECOSYSTEM_TYPE in {ecosytem_type_values}
    and ECOSYSTEM_SUBTYPE in {ecosytem_subtype_values}
    and SPECIFIC_ECOSYSTEM in {specific_ecosytem_values}
"""
top7df = pysqldf(q)
print(len(top7df))
print(len(elevelsdf))
print(len(elevelsdf.dropna()))

22399
64144
34600


### ECOSYSTEM values

In [15]:
q ="""
select distinct 
    ECOSYSTEM
from
    top7df
"""
sqldf(q)

Unnamed: 0,ECOSYSTEM
0,Environmental
1,Host-associated
2,Engineered


In [16]:
#print_pandas_md(sqldf(q))

## proposed mappings
* Host-associated -> **envo:ENVO_01001000**; rdfs:label "environmental system determined by an organism"@en  
note: this is subclass of envo:ecosystem

* Engineered -> **envo:ENVO_01000313**; rdfs:label "anthropogenic environment"@en  
note: this is NOT a subclass of envo:ecosystem

* Environmental -> **envo:ENVO_01000951**; rdfs:label "natural environment"@en    
note: this is NOT a subclass of envo:ecosystem


### ECOSYSTEM_CATEGORY values

In [17]:
q = """
select distinct 
    ECOSYSTEM, ECOSYSTEM_CATEGORY
from
    top7df
order by
    ECOSYSTEM
"""
pysqldf(q)

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY
0,Engineered,Built environment
1,Engineered,Wastewater
2,Environmental,Aquatic
3,Environmental,Terrestrial
4,Host-associated,Human
5,Host-associated,Mammals
6,Host-associated,Plants


In [18]:
# print_pandas_md(sqldf(q))

## Proposed mappings
* Engineered / Built environment -> ???  
note: unsure of the distinction 'engineered' and 'built'; perhaps a building?  
  
* Engineered / Wastewater -> ???  
note: perhaps add class 'environmental system determined by waste water' unless they mean a portion of waste water differing labels with 'environment determined by' and 'environmental system determined'; 
  
* Environmental / Aquatic -> **envo:ENVO_00002030**; rdfs:label "aquatic biome" **OR** **envo:ENVO_01000317**; rdfs:label "aquatic environment"  
note: not sure of the distinction between biome and environment here; environment seem more general, so might be more appropriate  
  
* Environmental / Terrestrial ->  **envo:ENVO_01001226**; rdfs:label "terrestrial natural environment"@en  
  
* Host-associated / Mammals -> ???  
note: perhaps add class 'mammal-associated environment' as subclass of 'animal-associated environment'
  
* Host-associated / Human -> ???  
note: perhaps add class 'human-associated enviroment' as subclass of 'mammal-associated enviroment' (referrenced above)
  
* Host-associated / Plants ->  **envo:ENVO_01001001**; rdfs:label "plant-associated environment"@en

### ECOSYSTEM_TYPE values

In [26]:
q = """
select distinct 
    ECOSYSTEM, ECOSYSTEM_CATEGORY, ECOSYSTEM_TYPE
from
    top7df
order by
    ECOSYSTEM
"""
pysqldf(q)

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE
0,Engineered,Built environment,Unclassified
1,Engineered,Built environment,City
2,Engineered,Wastewater,Unclassified
3,Environmental,Aquatic,Freshwater
4,Environmental,Terrestrial,Soil
5,Environmental,Aquatic,Marine
6,Environmental,Aquatic,Unclassified
7,Environmental,Terrestrial,Unclassified
8,Environmental,Aquatic,Thermal springs
9,Host-associated,Human,Digestive system


In [28]:
# print_pandas_md(sqldf(q))

## Proposed mappings
* Engineered / Built environment / City -> **envo:ENVO_01000248**; rdfs:label "dense settlement biome" **OR** **envo:ENVO_01000249**; rdfs:label "urban biome"
note: 'city' might refer to a type of environment zone; if so, this might require adding new classes

* Engineered / Wastewater / Unclassified ->  ???  
note: Do we want special class to identify unclassified data?  

* Environmental / Aquatic / Freshwater -> **envo:ENVO_01000306**; rdfs:label "freshwater environment"  

* Environmental / Terrestrial / Soil -> **envo:ENVO_01001044**; rdfs:label "soil environment"@en

* Environmental / Aquatic / Marine -> **ENVO_01000307**; rdfs:label "saline water environment"  

* Environmental / Aquatic / Thermal springs -> ???  
note: add as subclass of 'aquatic environment'  

* Host-associated / Mammals / Digestive system -> ???  
note: add class 'mammalian digestive tract enviroment' as subclass of 'digestive tract environment'  

* Host-associated / Human / Digestive system -> ???  
note: note: add class 'human digestive tract environment' as subclass of 'mammalian digestive tract environment' (referenced above)  

### Unclassified values
* Engineered / Built environment / Unclassified -> ???  
* Host-associated / Plants / Unclassified -> ???  
* Host-associated / Human / Unclassified -> ???  
* Host-associated / Mammals / Unclassified -> ???  
* Environmental / Aquatic / Unclassified -> ???  
* Environmental / Terrestrial / Unclassified -> ???  

### ECOSYSTEM_SUBTYPE values

In [29]:
q = """
select distinct 
    ECOSYSTEM, ECOSYSTEM_CATEGORY, ECOSYSTEM_TYPE, ECOSYSTEM_SUBTYPE
from
    top7df
order by
    ECOSYSTEM
"""
pysqldf(q)

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE,ECOSYSTEM_SUBTYPE
0,Engineered,Built environment,Unclassified,Unclassified
1,Engineered,Built environment,City,Unclassified
2,Engineered,Wastewater,Unclassified,Unclassified
3,Engineered,Built environment,City,Subway
4,Environmental,Aquatic,Freshwater,Lake
5,Environmental,Terrestrial,Soil,Unclassified
6,Environmental,Aquatic,Marine,Oceanic
7,Environmental,Aquatic,Unclassified,Unclassified
8,Environmental,Aquatic,Freshwater,Unclassified
9,Environmental,Aquatic,Marine,Unclassified


In [31]:
# print_pandas_md(sqldf(q))

### Proposed mappings
* Engineered / Built environment / City / Subway -> ???
note: add new classes 1) 'Subway' as subclass of 'transportation feature'; 2) 'environmental system determined by transportation feature'; 3) 'environmental system determined by subway'

* Environmental / Aquatic / Freshwater / Lake -> ???
note: add new class 'lake environment' as subclass of 'freshwater environment'

* Environmental / Aquatic / Marine / Oceanic -> **envo:ENVO_01000321**; rdfs:label "sea water environment" **OR** **envo:ENVO_01000048**; rdfs:label "ocean biome"
note: Do we want to create a subclass of 'sea water environment' called 'ocean environment'? Also, is 'Oceanic' intended to reference an environmental system or a biome?

* Environmental / Aquatic / Freshwater / Groundwater -> ???  
note: add class 'groundwater environment' as subclass of 'freshwater environment'

* Host-associated / Mammals / Digestive system / Large intestine -> ???  
note: add class 'mammalian large intestine environment' as subclass of 'mammalian digestive tract environment' 

* Host-associated / Human / Digestive system / Large intestine -> ???  
note: add class 'human large intestine environment' as subclass of 'mammalian large intestine environment' (referenced above)

* Host-associated / Human / Digestive system / Oral -> ???  
note: add class 'human oral environment' as subclass of 'human digestive tract environment'

### Unclassifed values
* Engineered / Built environment / Unclassified / Unclassified -> ???  
* Engineered / Built environment / City / Unclassified -> ???  
* Engineered / Wastewater / Unclassified / Unclassified -> ???  
* Environmental / Terrestrial / Soil / Unclassified -> ???  
* Environmental / Aquatic / Unclassified / Unclassified -> ???  
* Environmental / Aquatic / Freshwater / Unclassified -> ???  
* Environmental / Aquatic / Marine / Unclassified -> ???  
* Environmental / Terrestrial / Unclassified / Unclassified -> ???  
* Environmental / Aquatic / Thermal springs / Unclassified -> ???  
* Host-associated / Human / Digestive system / Unclassified -> ???  
* Host-associated / Mammals / Digestive system / Unclassified -> ???  
* Host-associated / Human / Unclassified / Unclassified -> ???  
* Host-associated / Mammals / Unclassified / Unclassified -> ???  
* Host-associated / Plants / Unclassified / Unclassified -> ???  


### SPECIFIC_ECOSYSTEM values

In [23]:
q = """
select distinct 
    ECOSYSTEM, ECOSYSTEM_CATEGORY, ECOSYSTEM_TYPE, ECOSYSTEM_SUBTYPE, SPECIFIC_ECOSYSTEM
from
    top7df
order by
    ECOSYSTEM
"""
pysqldf(q)

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE,ECOSYSTEM_SUBTYPE,SPECIFIC_ECOSYSTEM
0,Engineered,Built environment,Unclassified,Unclassified,Unclassified
1,Engineered,Built environment,City,Unclassified,Unclassified
2,Engineered,Wastewater,Unclassified,Unclassified,Unclassified
3,Engineered,Built environment,City,Subway,Unclassified
4,Environmental,Aquatic,Freshwater,Lake,Sediment
5,Environmental,Aquatic,Freshwater,Lake,Unclassified
6,Environmental,Terrestrial,Soil,Unclassified,Forest Soil
7,Environmental,Aquatic,Marine,Oceanic,Unclassified
8,Environmental,Aquatic,Unclassified,Unclassified,Unclassified
9,Environmental,Terrestrial,Soil,Unclassified,Unclassified


In [33]:
# print_pandas_md(sqldf(q))

### Proposed mappings
#### Need to verify that data refers to an environment or the sample itself. For example, when I look at the descriptions for SPECIFIC_ECOSYSTEM == 'Fecal', I see descriptions like "Human feces microbial communities from a cholera patient". The physical specimen is the microbes ... right?

* Environmental / Aquatic / Freshwater / Lake / Sediment -> **envo:ENVO_01001049**; rdfs:label "non-saline sediment environment"  
note: Need to add classes that the sediment is from a lake. 

* Environmental / Terrestrial / Soil / Unclassified / Forest Soil -> ???   
note: Add class "forest soil enviroment" as subclass of "soil enviroment".  
Also, this is interesting b/c the **"ECOSYSTEM_SUBTYPE" value is "Unclassified"**. 

* Environmental / Aquatic / Marine / Oceanic / Sediment -> **envo:ENVO_01001050**; rdfs:label "saline sediment environment"@en     
note: add information that the sediment is from an ocean  

* Environmental / Terrestrial / Soil / Unclassified / Agricultural land -> **envo:ENVO_01000311**; rdfs:label "cultivated environment"  
note: Do we need to further specify that the cultivated environment is used for growing food?  
Also, the **"ECOSYSTEM_SUBTYPE" value is "Unclassified"**. 

* Host-associated / Mammals / Digestive system / Large intestine / Fecal -> ???   
note: add subclass 'mammalian fecal environment' as subclass of 'fecal environment'  

* Host-associated / Human / Digestive system / Large intestine / Fecal -> ???     
note: add subclass 'human fecal environment' as subclass of 'mammalian fecal environment' (referenced above)  

### Unclassified values
* Engineered / Built environment / Unclassified / Unclassified / Unclassified -> ???  
* Engineered / Built environment / City / Unclassified / Unclassified -> ???  
* Engineered / Wastewater / Unclassified / Unclassified / Unclassified -> ???  
* Engineered / Built environment / City / Subway / Unclassified -> ???  
* Environmental / Aquatic / Freshwater / Lake / Unclassified -> ???  
* Environmental / Aquatic / Marine / Oceanic / Unclassified -> ???  
* Environmental / Aquatic / Unclassified / Unclassified / Unclassified -> ???  
* Environmental / Terrestrial / Soil / Unclassified / Unclassified -> ???  
* Environmental / Aquatic / Freshwater / Unclassified / Unclassified -> ???  
* Environmental / Aquatic / Marine / Unclassified / Unclassified -> ???  
* Environmental / Terrestrial / Unclassified / Unclassified / Unclassified -> ???  
* Environmental / Aquatic / Freshwater / Groundwater / Unclassified -> ???  
* Environmental / Aquatic / Thermal springs / Unclassified / Unclassified -> ???  
* Host-associated / Human / Digestive system / Oral / Unclassified -> ???  
* Host-associated / Human / Digestive system / Unclassified / Unclassified -> ???  
* Host-associated / Mammals / Digestive system / Unclassified / Unclassified -> ???  
* Host-associated / Mammals / Digestive system / Large intestine / Unclassified -> ???  
* Host-associated / Human / Unclassified / Unclassified / Unclassified -> ???  
* Host-associated / Mammals / Unclassified / Unclassified / Unclassified -> ???  
* Host-associated / Human / Digestive system / Large intestine / Unclassified -> ???  
* Host-associated / Plants / Unclassified / Unclassified / Unclassified -> ???  