In [30]:
import os
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
# Run from the project root so the relative 'Data/...' paths used when reading
# the data files resolve. Set the FLAVORDB_ANALYSIS_DIR environment variable to
# point at a different checkout; the original author's path stays the default.
os.chdir(os.environ.get('FLAVORDB_ANALYSIS_DIR',
                        '/Users/rudraksh/Google Drive/FlavorDB-Analysis'))

### Read Data Files.

In [49]:
# Each table is read, trimmed of columns that are not analysed, and keyed by
# pubchem_id in a single non-mutating chain.

# ADMET properties.
admet = (
    pd.read_csv('Data/FDB_admet.tsv', sep='\t')
    .drop(['ADMET_BBB', 'ADMET_EXT_Hepatotoxic_Applicability', 'ADMET_EXT_PPB_Applicability'],
          axis=1)
    .set_index('pubchem_id')
)

# 2d-3d properties.
twod_threed = (
    pd.read_csv('Data/FDB_twod_threed.tsv', sep='\t')
    .drop(['Molecular_Formula', 'Molecular_Composition'], axis=1)
    .set_index('pubchem_id')
)

# Pubchem properties. Missing values are replaced with '' so that the presence
# of an annotation can be tested with `!= ''`.
pubchem_prop = (
    pd.read_csv('Data/FDB_pubchem_prop.tsv', sep='\t', index_col='Unnamed: 0')
    .fillna('')
    .drop(['IUPACName', 'common_name', 'CanonicalSMILES', 'IsomericSMILES',
           'cas_id', 'fema_number', 'InChI'],
          axis=1)
    .set_index('pubchem_id')
)

### 1. Basic Statistics

#### a. Count of different flavors and overlap

In [80]:
# Collect the pubchem_ids (the index of pubchem_prop) that belong to each
# annotation source. Missing values were replaced with '' when the table was
# loaded, so a non-empty string means the molecule carries that annotation.
fooddb_odor = list(pubchem_prop[pubchem_prop['fooddb_flavor_profile'] != ''].index)
print("Total FoodDB Odor molecules: %d" %len(fooddb_odor))

pubchem_odor = list(pubchem_prop[pubchem_prop['odor'] != ''].index)
print("Total PubChem odor molecules: %d" %len(pubchem_odor))

pubchem_taste = list(pubchem_prop[pubchem_prop['taste'] != ''].index)
print("Total PubChem taste molecules: %d" %len(pubchem_taste))

# 'SuperSweet' holds a label; 'Sweet' and 'Sweet-like' are kept as separate groups.
sweet_like = list(pubchem_prop[pubchem_prop['SuperSweet'] == 'Sweet-like'].index)
sweet = list(pubchem_prop[pubchem_prop['SuperSweet'] == 'Sweet'].index)
print("Total Sweet molecules: %d " %len(sweet))
print("Total Sweet-like molecules: %d" %len(sweet_like))

# 'Bitter' is used as a boolean flag. Build a boolean Series mask rather than
# passing a plain Python list: pandas only treats a list as a mask when every
# element is a bool, otherwise it looks the values up as column labels (which
# would raise KeyError if a missing flag had been filled with '').
bitter = list(pubchem_prop[pubchem_prop['Bitter'].eq(True)].index)
print("Total Bitter molecules: %d" %len(bitter))

fema = list(pubchem_prop[pubchem_prop['fema_flavor_profile'] != ''].index)
print("Total FEMA molecules: %d" %len(fema))

Total FoodDB Odor molecules: 2681
Total PubChem odor molecules: 544
Total PubChem taste molecules: 415
Total Sweet molecules: 8193 
Total Sweet-like molecules: 13787
Total Bitter molecules: 664
Total FEMA molecules: 1808


In [88]:
# Pairwise overlap between the molecule groups: conf[i, j] is the number of
# pubchem_ids present in both group i and group j, so the diagonal holds each
# group's own count of unique ids.
x = [fooddb_odor, pubchem_odor, pubchem_taste, sweet, sweet_like, bitter, fema]
c = ['FooDB Odor', 'PubChem Odor', 'PubChem Taste', 'Sweet', 'Sweet-like', 'Bitter', 'FEMA']

# Convert every group to a set once, instead of rebuilding both sets for each
# (i, j) pair inside the nested loop.
mol_sets = [set(mol_list) for mol_list in x]

conf = np.zeros((len(x), len(x)))
for i, set_i in enumerate(mol_sets):
    for j, set_j in enumerate(mol_sets):
        conf[i, j] = len(set_i & set_j)

# Label rows and columns with the group names for display.
pd.DataFrame(conf, columns=c, index=c)

Unnamed: 0,FooDB Odor,PubChem Odor,PubChem Taste,Sweet,Sweet-like,Bitter,FEMA
FooDB Odor,2681.0,422.0,284.0,21.0,4.0,190.0,1557.0
PubChem Odor,422.0,544.0,348.0,30.0,6.0,112.0,227.0
PubChem Taste,284.0,348.0,415.0,45.0,18.0,124.0,164.0
Sweet,21.0,30.0,45.0,8193.0,0.0,5.0,13.0
Sweet-like,4.0,6.0,18.0,0.0,13787.0,1.0,15.0
Bitter,190.0,112.0,124.0,5.0,1.0,664.0,89.0
FEMA,1557.0,227.0,164.0,13.0,15.0,89.0,1808.0


##### Preprocessing
* A new odor column is created by combining "FooDB Odor" and "PubChem Odor".
* A new taste column is created by combining "Sweet" molecules and "Bitter" molecules.
* Sweet-like molecules are not considered for further analysis since they are found using structural similarity.
* PubChem taste is not considered because of high dimensionality and being limited to just a few molecules.
* FEMA is discarded because no clear distinction is made between odor and taste, and because no source information is present.