In [1]:
import pandas as pds
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

# Build data frame to explore values that map to EVNO classes

In [2]:
all_data_df = pds.read_csv("Biosample_all.tsv.gz", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
elevels = ['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_TYPE', 'ECOSYSTEM_SUBTYPE', 'SPECIFIC_ECOSYSTEM']
elevelsdf = all_data_df[elevels]

### Build top 7 dataset (cf. previous analyis in gold-env-elevels-analysis.ipynb)

In [4]:
## create EAV dataframe
## fist give the index column a name and reset
elevelsdf.index.name = 'id'
elevelsdf.reset_index(inplace=True)
elevelsdf.head()

Unnamed: 0,id,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE,ECOSYSTEM_SUBTYPE,SPECIFIC_ECOSYSTEM
0,0,Host-associated,Plants,Phyllosphere,Caulosphere,
1,1,Engineered,Food production,Dairy products,,
2,2,Engineered,Food production,Dairy products,,
3,3,Engineered,Food production,Dairy products,,
4,4,Engineered,Food production,Dairy products,,


In [5]:
## next use melt function to unpivot/unstack the data
eavdf = elevelsdf.melt(id_vars='id')
eavdf.sort_values(by=['id'], inplace=True)
print(pds.DataFrame(elevelsdf.iloc[8024])) ## print a random record from the original df and eav to make sure they values match
print(eavdf[eavdf.id == 8024])

                             8024
id                           8024
ECOSYSTEM           Environmental
ECOSYSTEM_CATEGORY        Aquatic
ECOSYSTEM_TYPE         Freshwater
ECOSYSTEM_SUBTYPE     Groundwater
SPECIFIC_ECOSYSTEM            NaN
          id            variable          value
264600  8024  SPECIFIC_ECOSYSTEM            NaN
72168   8024  ECOSYSTEM_CATEGORY        Aquatic
200456  8024   ECOSYSTEM_SUBTYPE    Groundwater
8024    8024           ECOSYSTEM  Environmental
136312  8024      ECOSYSTEM_TYPE     Freshwater


In [6]:
## get totals for each variable/value pair
q = """
select 
    variable, value, count(*) as total
from 
    eavdf
group by 
    variable, value
order by 
    variable, total desc
"""
totalsdf = pysqldf(q)
totalsdf.head(10)

Unnamed: 0,variable,value,total
0,ECOSYSTEM,Host-associated,30868
1,ECOSYSTEM,Environmental,27954
2,ECOSYSTEM,Engineered,5211
3,ECOSYSTEM,,111
4,ECOSYSTEM_CATEGORY,Human,19719
5,ECOSYSTEM_CATEGORY,Aquatic,19622
6,ECOSYSTEM_CATEGORY,Terrestrial,8214
7,ECOSYSTEM_CATEGORY,Mammals,4151
8,ECOSYSTEM_CATEGORY,Plants,3467
9,ECOSYSTEM_CATEGORY,Built environment,1890


In [7]:
len(totalsdf)

395

In [8]:
## remove NaN from totals ... we aren't interested in the number of missing values
dropna_totalsdf = totalsdf.dropna()

In [9]:
len(dropna_totalsdf)

390

In [10]:
q = """
WITH TOPVALS AS (
    SELECT *, ROW_NUMBER() 
    over (
        PARTITION BY variable
        order by total desc
    ) AS RowNo 
    FROM dropna_totalsdf
)
SELECT * FROM TOPVALS WHERE RowNo <= 7
"""
top7_eavdf = pysqldf(q)
top7_eavdf.head(10)

Unnamed: 0,variable,value,total,RowNo
0,ECOSYSTEM,Host-associated,30868,1
1,ECOSYSTEM,Environmental,27954,2
2,ECOSYSTEM,Engineered,5211,3
3,ECOSYSTEM_CATEGORY,Human,19719,1
4,ECOSYSTEM_CATEGORY,Aquatic,19622,2
5,ECOSYSTEM_CATEGORY,Terrestrial,8214,3
6,ECOSYSTEM_CATEGORY,Mammals,4151,4
7,ECOSYSTEM_CATEGORY,Plants,3467,5
8,ECOSYSTEM_CATEGORY,Built environment,1890,6
9,ECOSYSTEM_CATEGORY,Wastewater,1478,7


In [11]:
top7_eavdf[top7_eavdf.RowNo == 5] # test out the logic: should be able to filter on RowNo to get different counts

Unnamed: 0,variable,value,total,RowNo
7,ECOSYSTEM_CATEGORY,Plants,3467,5
14,ECOSYSTEM_SUBTYPE,Lake,1579,5
21,ECOSYSTEM_TYPE,Unclassified,1585,5
28,SPECIFIC_ECOSYSTEM,Agricultural land,725,5


In [12]:
len(top7_eavdf)
top7_eavdf.variable.unique()

array(['ECOSYSTEM', 'ECOSYSTEM_CATEGORY', 'ECOSYSTEM_SUBTYPE',
       'ECOSYSTEM_TYPE', 'SPECIFIC_ECOSYSTEM'], dtype=object)

In [13]:
## create tuples of values in top7_eav
top7_values = tuple(top7_eavdf['value'].unique())
ecosytem_values = tuple(top7_eavdf[top7_eavdf.variable == 'ECOSYSTEM']['value'].unique())
ecosytem_category_values = tuple(top7_eavdf[top7_eavdf.variable == 'ECOSYSTEM_CATEGORY']['value'].unique())
ecosytem_type_values = tuple(top7_eavdf[top7_eavdf.variable == 'ECOSYSTEM_TYPE']['value'].unique())
ecosytem_subtype_values = tuple(top7_eavdf[top7_eavdf.variable == 'ECOSYSTEM_SUBTYPE']['value'].unique())
specific_ecosytem_values = tuple(top7_eavdf[top7_eavdf.variable == 'SPECIFIC_ECOSYSTEM']['value'].unique())
specific_ecosytem_values
# top7_eavdf[['variable', 'value']].pivot(columns='variable', values='value')

('Fecal',
 'Unclassified',
 'Sediment',
 'Forest Soil',
 'Agricultural land',
 'Serum',
 'Microbial mats')

In [14]:
## build dataframe using top 7 values from each field

q = f"""
select 
    *
from
    elevelsdf
where
    ECOSYSTEM in {ecosytem_values}
    and ECOSYSTEM_CATEGORY in {ecosytem_category_values}
    and ECOSYSTEM_TYPE in {ecosytem_type_values}
    and ECOSYSTEM_SUBTYPE in {ecosytem_subtype_values}
    and SPECIFIC_ECOSYSTEM in {specific_ecosytem_values}
"""
top7df = pysqldf(q)
print(len(top7df))
print(len(elevelsdf))
print(len(elevelsdf.dropna()))

22399
64144
34600


### ECOSYSTEM values

In [15]:
list(top7df.ECOSYSTEM.unique())

['Environmental', 'Host-associated', 'Engineered']

## proposed mappings
Host-associated -> **envo:ENVO_01001000**; rdfs:label "environmental system determined by an organism"@en  
note: this is subclass of envo:ecosystem

Engineered -> **envo:ENVO_01000313**; rdfs:label "anthropogenic environment"@en  
note: this is NOT a subclass of envo:ecosystem

Environmental -> **envo:ENVO_01000951**; rdfs:label "natural environment"@en    
note: this is NOT a subclass of envo:ecosystem


### ECOSYSTEM_CATEGORY values

In [16]:
q = """
select distinct 
    ECOSYSTEM, ECOSYSTEM_CATEGORY
from
    top7df
order by
    ECOSYSTEM
"""
pysqldf(q)

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY
0,Engineered,Built environment
1,Engineered,Wastewater
2,Environmental,Aquatic
3,Environmental,Terrestrial
4,Host-associated,Human
5,Host-associated,Mammals
6,Host-associated,Plants


## proposed mappings
Engineered / Built environment -> ???  
note: unusure of the distinction 'engineered' and 'built'; perhaps a building?  
  
Engineered / Wastewater -> ???  
note: perhaps add class 'enviromental system determined by waste water' unless they mean a portion of waste water differing lables with 'enironment deterimed by' and 'enviromental system determined'; 
  
Environmental / Aquatic -> **envo:ENVO_00002030**; rdfs:label "aquatic biome"^^xsd:string **OR** **envo:ENVO_01000317**; rdfs:label "aquatic environment"  
note: not sure of the distinction between biome and enviroment here; evironment seem more general, so might be more appropropiate  
  
Environmental / Terrestrial ->  **envo:ENVO_01001226**; rdfs:label "terrestrial natural environment"@en  
  
Host-associated / Mammals -> ???  
note: perhaps add class 'mammal-associated enviroment' as subclass of 'animal-associted enviroment'
  
Host-associated / Human -> ???  
note: perhaps add class 'human-associated enviroment' as subclass of 'mammal-associated enviroment' (referrenced above)
  
Host-associated / Plants ->  **envo:ENVO_01001001**; rdfs:label "plant-associated environment"@en

### ECOSYSTEM_TYPE values

In [17]:
q = """
select distinct 
    ECOSYSTEM, ECOSYSTEM_CATEGORY, ECOSYSTEM_TYPE
from
    top7df
order by
    ECOSYSTEM
"""
pysqldf(q)

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE
0,Engineered,Built environment,Unclassified
1,Engineered,Built environment,City
2,Engineered,Wastewater,Unclassified
3,Environmental,Aquatic,Freshwater
4,Environmental,Terrestrial,Soil
5,Environmental,Aquatic,Marine
6,Environmental,Aquatic,Unclassified
7,Environmental,Terrestrial,Unclassified
8,Environmental,Aquatic,Thermal springs
9,Host-associated,Human,Digestive system


## proposed mappings
Engineered / Built environment / City -> **envo:ENVO_01000248**; rdfs:label "dense settlement biome" **OR** **envo:ENVO_01000249**; rdfs:label "urban biome"
note: 'city' might refer to a type of enviroment zone; if so, this might require adding new classes

Engineered / Wastewater / Unclassified ->  ???  
note: Do we want special class to indentify unclassified data?  

Environmental / Aquatic / Freshwater -> **envo:ENVO_01000306**; rdfs:label "freshwater environment"  

Environmental / Terrestrial / Soil -> **envo:ENVO_01001044**; rdfs:label "soil environment"@en

Environmental / Aquatic / Marine -> **ENVO_01000307**; rdfs:label "saline water environment"  

Environmental / Aquatic / Unclassified -> ???  
note: Do we want special class to indentify unclassified data?  

Environmental / Terrestrial / Unclassified -> ???  
note: Do we want special class to indentify unclassified data?  

Environmental / Aquatic / Thermal springs -> ???  
note: add as sublcass of 'aquatic enviroment'  

Host-associated / Mammals / Digestive system -> ???  
note: add class 'mammalian digestive tract enviroment' as sublcass of 'digestive tract enviroment'  

Host-associated / Human / Digestive system -> ???  
note: note: add class 'human digestive tract enviroment' as sublcass of 'mammalian digestive tract enviroment' (referrenced above)  

Host-associated / Human / Unclassified -> ???  
note: Do we want special class to indentify unclassified data?  

Host-associated / Mammals / Unclassified -> ???  
note: Do we want special class to indentify unclassified data?  

Host-associated / Plants / Unclassified -> ???  
note: Do we want special class to indentify unclassified data?  

### ECOSYSTEM_SUBTYPE values

In [18]:
q = """
select distinct 
    ECOSYSTEM, ECOSYSTEM_CATEGORY, ECOSYSTEM_SUBTYPE
from
    top7df
order by
    ECOSYSTEM
"""
pysqldf(q)

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_SUBTYPE
0,Engineered,Built environment,Unclassified
1,Engineered,Wastewater,Unclassified
2,Engineered,Built environment,Subway
3,Environmental,Aquatic,Lake
4,Environmental,Terrestrial,Unclassified
5,Environmental,Aquatic,Oceanic
6,Environmental,Aquatic,Unclassified
7,Environmental,Aquatic,Groundwater
8,Host-associated,Human,Large intestine
9,Host-associated,Mammals,Large intestine


### proposed mappings

Engineered / Wastewater / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Engineered / Built environment / Subway -> ???
note: add new classes 1) 'Subway' as subclass of 'transportation feature'; 2) 'environmental system determined by transportation feature'; 3) 'enviromental system determined by subway'

Environmental / Aquatic / Lake -> ???
note: add new class 'lake environment' as subclass of 'aquatic enviroment'

Environmental / Terrestrial / Unclassified -> ???
note: need to decide how to handle 'Unclassified' values  

Environmental / Aquatic / Oceanic -> **envo:ENVO_01000321**; rdfs:label "sea water environment" **OR** **envo:ENVO_01000048**; rdfs:label "ocean biome"
note: Do we want to create a subclass of 'sea water enivironment' called 'ocean enviroment'? Also, is 'Oceanic' intended to reference an enviromental system or a biome?

Environmental / Aquatic / Unclassified -> ???   
note: need to decide how to handle 'Unclassified' values  

Environmental / Aquatic / Groundwater -> ???  
note: add class 'groundwater enviroment' as subclass of 'aquatic enviroment'

Host-associated / Mammals / Large intestine -> ???  
note: add class 'mammalian large intestine environment' as subcass of 'mammalian digestive tract enviroment' 

Host-associated / Human / Large intestine -> ???  
note: add class 'human large intestine enviroment' as subclass of 'mammalian large intestine environment' (referenced above)

Host-associated / Human / Oral -> ???  
note: add class 'human oral environment' as subclass of 'human digestive tract environment'

Host-associated / Human / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Host-associated / Mammals / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Host-associated / Plants / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

### SPECIFIC_ECOSYSTEM values

In [19]:
q = """
select distinct 
    ECOSYSTEM, ECOSYSTEM_CATEGORY, ECOSYSTEM_SUBTYPE, SPECIFIC_ECOSYSTEM
from
    top7df
order by
    ECOSYSTEM
"""
pysqldf(q)

Unnamed: 0,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_SUBTYPE,SPECIFIC_ECOSYSTEM
0,Engineered,Built environment,Unclassified,Unclassified
1,Engineered,Wastewater,Unclassified,Unclassified
2,Engineered,Built environment,Subway,Unclassified
3,Environmental,Aquatic,Lake,Sediment
4,Environmental,Aquatic,Lake,Unclassified
5,Environmental,Terrestrial,Unclassified,Forest Soil
6,Environmental,Aquatic,Oceanic,Unclassified
7,Environmental,Aquatic,Unclassified,Unclassified
8,Environmental,Terrestrial,Unclassified,Unclassified
9,Environmental,Aquatic,Oceanic,Sediment


### proposed mappings
#### Need to verify that data refers to an environment or the sample itself. For example, when I look at the descriptions for SPECIFIC_ECOSYSTEM == 'Fecal', I see descriptions like "Human feces microbial communities from a cholera patient". The physical specimen is the microbes ... right?

Engineered / Wastewater / Unclassified / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Engineered / Built environment / Subway / Unclassified ->  ???  
note: need to decide how to handle 'Unclassified' values  

Environmental / Aquatic / Lake / Sediment -> **envo:ENVO_01001048**; rdfs:label "sediment environment"@en  
note: Need to add classes that the sediment is from a lake. See above comment about "Environmental / Aquatic / Lake". Also, is there an assumption that lakes are freshwater?   

Environmental / Aquatic / Lake / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Environmental / Terrestrial / Unclassified / Forest Soil -> ???   
note: Add class "forest soil enviroment" as subclass of "soil enviroment". Also, this is interesting b/c the "ECOSYSTEM_SUBTYPE" value is "Unclassified". 

Environmental / Aquatic / Oceanic / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Environmental / Aquatic / Unclassified / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Environmental / Terrestrial / Unclassified / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Environmental / Aquatic / Oceanic / Sediment -> **envo:ENVO_01001050**; rdfs:label "saline sediment environment"@en     
note: add information that the sediment is from an ocean  

Environmental / Aquatic / Groundwater / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Environmental / Terrestrial / Unclassified / Agricultural land -> **envo:ENVO_01000311**; rdfs:label "cultivated environment"  note: Do we need to further specify that the cultivated enviroment is used for growing food? Also, the "ECOSYSTEM_SUBTYPE" value is "Unclassified". 

Host-associated / Mammals / Large intestine / Fecal -> ???   
note: add subclass 'mammalian fecal environment' as subclass of 'fecal enviroment'  

Host-associated / Human / Large intestine / Fecal ->   
note: add subclass 'human fecal enviroment' as subclass of 'mammalian fecal environment' (referenced above)  

Host-associated / Human / Oral / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Host-associated / Human / Unclassified / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Host-associated / Mammals / Unclassified / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Host-associated / Mammals / Large intestine / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Host-associated / Human / Large intestine / Unclassified -> ???  
note: need to decide how to handle 'Unclassified' values  

Host-associated / Plants / Unclassified / Unclassified ->???  
note: need to decide how to handle 'Unclassified' values  
