In [88]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing
from sklearn import tree
from matplotlib import pyplot as plt

In [5]:
# Visualizing 6 decimals per value
pd.set_option('display.float_format', lambda x: '%.6f' % x)

# Reading data from csv
def load_dataset(path):
    dataset = pd.read_csv(path, header=0, delimiter=',')
    return dataset

sensorama = load_dataset('sensorama_df.csv')

In [6]:
# Since odor name and canonical_smiles variables are strings, we change them to a numerical value
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(sensorama['odor name'])
sensorama['odor name']=label_encoder.transform(sensorama['odor name'])
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(sensorama['canonical_smiles'])
sensorama['canonical_smiles']= label_encoder.transform(sensorama['canonical_smiles'])

In [7]:
# Normalizing function
def standarize(x_train):
    mean = x_train.mean()
    std = x_train.std()
    x_train -= mean
    x_train /= std

# Normalizing data
s= sensorama.columns
s=s.drop(['odor level', 'cid', 'canonical_smiles', 'odor name'])
for c in s:
    standarize(sensorama[c])
# After normalizing the data, we have NaNs values, so we change them to 0
sensorama=sensorama.fillna(0)

In [35]:
# Features
X=sensorama[['xlogp','fsp3', 'small rings','arom rings','ring blocks','arom blocks','rings 3','rings 4', 'rings 5', 'rings 6',
               'rings 7', 'rings 8', 'rings 9', 'tpsa efficiency', 'zagreb indices', 'weiner path', 'weiner polarity', 'wtpt.1',
               'wtpt.2', 'wtpt.3', 'wtpt.4', 'wtpt.5', 'molecular weight', 'VAdjMat', 'topoPSA', 'lipinski failures', 'rotatable bonds',
               'topo shape', 'petitjean number', 'mdec.11', 'mdec.12', 'mdec.13', 'mdec.14', 'mdec.22', 'mdec.23', 'mdec.24',
               'mdec.33', 'mdec.34', 'mdec.44', 'mdeo.11', 'mdeo.12', 'mdeo.22', 'mden.11', 'mden.12', 'mden.13', 'mden.22',
               'mden.23', 'mden.33', 'mlogp', 'atom p', 'atom lc', 'khs.sli', 'khs.ssbe', 'khs.ssssbe', 'khs.ssbh', 'khs.sssb',
               'khs.ssssb', 'khs.sch3', 'khs.dch2', 'khs.ssch2', 'khs.tch', 'khs.dsch', 'khs.aach', 'khs.sssch', 'khs.ddc',
               'khs.tsc', 'khs.dssc', 'khs.aasc', 'khs.aaac', 'khs.ssssc', 'khs.snh3', 'khs.snh2', 'khs.ssnh', 'khs.dnh', 'khs.ssnh.1',
               'khs.aanh', 'khs.tn', 'khs.sssnh', 'khs.dsn', 'khs.aan', 'khs.sssn', 'khs.dsn.1', 'khs.aasn', 'khs.ssssn', 'khs.soh',
               'khs.do', 'khs.sso', 'khs.aao', 'khs.sf', 'khs.ssih3', 'khs.sssih2', 'khs.ssssih', 'khs.sssssi', 'khs.sph2',
               'khs.ssph', 'khs.sssp', 'khs.dsssp', 'khs.sssssp', 'khs.ssh', 'khs.ds', 'khs.sss', 'khs.aas', 'khs.dsss', 'khs.ddsss',
               'khs.scl', 'khs.sGeH3', 'khs.ssGeH2', 'khs.sssGeH', 'khs.ssssGe', 'khs.sash2', 'khs.ssash', 'khs.sssas', 'khs.sssdas',
               'khs.sssssas', 'khs.sseh', 'khs.dse', 'khs.ssse', 'khs.aase', 'khs.dssse', 'khs.ddssse', 'khs.sbr', 'khs.ssnh3', 
               'khs.sssnh2', 'khs.ssssnh', 'khs.sssssn', 'khs.si', 'khs.spbh3', 'khs.sspbh2', 'khs.ssspbh', 'khs.sssspb', 'kier 1',
               'kier 2', 'hybRatio', 'hbdon', 'hbacc', 'fragc', 'fmf', 'eccen', 'SP.0', 'SP.1', 'SP.2', 'SP.3', 'SP.4', 'SP.5', 'SP.6',
               'SP.7', 'VP.0', 'VP.1', 'VP.2', 'VP.3', 'VP.4', 'VP.5', 'VP.6', 'VP.7', 'SPC.4', 'SPC.5', 'SPC.6', 'VPC.4', 'VPC.5',
               'VPC.6', 'SC.3', 'SC.4', 'SC.5', 'SC.6', 'VC.3', 'VC.4', 'VC.5', 'VC.6', 'SCH.3', 'SCH.4', 'SCH.5', 'SCH.6', 'SCH.7',
               'VCH.3', 'VCH.4', 'VCH.3', 'VCH.4', 'VCH.5', 'VCH.6', 'VCH.7', 'C1SP1', 'C2SP1', 'C1SP2', 'C2SP2', 'C3SP2', 'C1SP3',
               'C2SP3', 'C3SP3', 'C4SP3', 'bpol', 'bond count', 'basic group count', 'atsp1', 'atsp2' , 'atsp3', 'atsp4', 'atsp5',
               'atsm1', 'atsm2','atsm3','atsm4','atsm5', 'natom','arom bond', 'arom atom', 'apol','alogp', 'alogp2', 'amr', 'acid']]

### ODOR LEVEL 1

In [120]:
y = sensorama['odor level'] == 1

In [121]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = DecisionTreeClassifier() # Create Decision Tree classifer object
clf = clf.fit(X_train,y_train) # Train Decision Tree Classifer
y_pred = clf.predict(X_test) # Predict the response for test dataset

In [122]:
# how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")

Accuracy: 72.62456010372291 %


In [123]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_13 <= 0.52
|   |--- feature_148 <= 1.30
|   |   |--- feature_40 <= 0.33
|   |   |   |--- feature_50 <= 2.46
|   |   |   |   |--- feature_132 <= -2.25
|   |   |   |   |   |--- feature_19 <= -0.97
|   |   |   |   |   |   |--- feature_189 <= -1.74
|   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |--- feature_189 >  -1.74
|   |   |   |   |   |   |   |--- feature_172 <= 0.01
|   |   |   |   |   |   |   |   |--- feature_24 <= -1.30
|   |   |   |   |   |   |   |   |   |--- feature_31 <= -0.91
|   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |   |--- feature_31 >  -0.91
|   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |--- feature_24 >  -1.30
|   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |--- feature_172 >  0.01
|   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |--- feature_19 >  -0.97
|   |   |   |   |   |   |--- feature_171 

### ODOR LEVEL 2

In [116]:
y = sensorama['odor level'] == 2

In [117]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = DecisionTreeClassifier() # Create Decision Tree classifer object
clf = clf.fit(X_train,y_train) # Train Decision Tree Classifer
y_pred = clf.predict(X_test) # Predict the response for test dataset

In [118]:
# how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")

Accuracy: 76.75495462122616 %


In [119]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_31 <= 3.00
|   |--- feature_24 <= 3.17
|   |   |--- feature_41 <= 4.97
|   |   |   |--- feature_148 <= -1.53
|   |   |   |   |--- feature_201 <= -1.19
|   |   |   |   |   |--- feature_191 <= 5.32
|   |   |   |   |   |   |--- feature_157 <= -0.62
|   |   |   |   |   |   |   |--- feature_59 <= -0.66
|   |   |   |   |   |   |   |   |--- feature_24 <= -0.89
|   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |--- feature_24 >  -0.89
|   |   |   |   |   |   |   |   |   |--- feature_21 <= 1.68
|   |   |   |   |   |   |   |   |   |   |--- feature_197 <= -1.16
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 9
|   |   |   |   |   |   |   |   |   |   |--- feature_197 >  -1.16
|   |   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |   |--- feature_21 >  1.68
|   |   |   |   |   |   |   |   |   |   |--- feature_198 <= -1.82
|   |   |   |   |   |   |   |   |   |   |   |--- truncated bran

### ODOR LEVEL 3

In [112]:
y = sensorama['odor level'] == 3

In [113]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = DecisionTreeClassifier() # Create Decision Tree classifer object
clf = clf.fit(X_train,y_train) # Train Decision Tree Classifer
y_pred = clf.predict(X_test) # Predict the response for test dataset

In [114]:
# how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")

Accuracy: 80.53343211705871 %


In [115]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_192 <= -0.54
|   |--- feature_24 <= 1.53
|   |   |--- feature_170 <= 1.57
|   |   |   |--- feature_199 <= -0.23
|   |   |   |   |--- feature_66 <= 0.70
|   |   |   |   |   |--- feature_152 <= -0.22
|   |   |   |   |   |   |--- feature_189 <= -2.04
|   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |--- feature_189 >  -2.04
|   |   |   |   |   |   |   |--- feature_158 <= -0.39
|   |   |   |   |   |   |   |   |--- feature_160 <= 0.79
|   |   |   |   |   |   |   |   |   |--- feature_179 <= -0.22
|   |   |   |   |   |   |   |   |   |   |--- feature_206 <= -2.38
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |   |--- feature_206 >  -2.38
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 30
|   |   |   |   |   |   |   |   |   |--- feature_179 >  -0.22
|   |   |   |   |   |   |   |   |   |   |--- feature_159 <= -0.42
|   |   |   |   |   |   |   |   |   |   |   |--- trunca

### ODOR LEVEL 4

In [108]:
y = sensorama['odor level'] == 4

In [109]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = DecisionTreeClassifier() # Create Decision Tree classifer object
clf = clf.fit(X_train,y_train) # Train Decision Tree Classifer
y_pred = clf.predict(X_test) # Predict the response for test dataset

In [110]:
# how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")

Accuracy: 84.96017781070569 %


In [111]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_35 <= 4.52
|   |--- feature_39 <= 0.39
|   |   |--- feature_62 <= 1.02
|   |   |   |--- feature_24 <= 4.90
|   |   |   |   |--- feature_206 <= -0.77
|   |   |   |   |   |--- feature_208 <= 0.46
|   |   |   |   |   |   |--- feature_207 <= -0.69
|   |   |   |   |   |   |   |--- feature_29 <= 0.18
|   |   |   |   |   |   |   |   |--- feature_179 <= 0.09
|   |   |   |   |   |   |   |   |   |--- feature_31 <= 0.64
|   |   |   |   |   |   |   |   |   |   |--- feature_19 <= 0.34
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 15
|   |   |   |   |   |   |   |   |   |   |--- feature_19 >  0.34
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 22
|   |   |   |   |   |   |   |   |   |--- feature_31 >  0.64
|   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |--- feature_179 >  0.09
|   |   |   |   |   |   |   |   |   |--- feature_179 <= 0.11
|   |   |   |   |   |   |   |   |   |   |--- class: Fa

### ODOR LEVEL 5

In [104]:
y = sensorama['odor level'] == 5

In [105]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = DecisionTreeClassifier() # Create Decision Tree classifer object
clf = clf.fit(X_train,y_train) # Train Decision Tree Classifer
y_pred = clf.predict(X_test) # Predict the response for test dataset

In [106]:
# how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")

Accuracy: 89.92406001111317 %


In [107]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_40 <= 0.06
|   |--- feature_198 <= 0.76
|   |   |--- feature_195 <= 1.86
|   |   |   |--- feature_201 <= 0.82
|   |   |   |   |--- feature_197 <= 1.00
|   |   |   |   |   |--- feature_20 <= 0.69
|   |   |   |   |   |   |--- feature_198 <= 0.26
|   |   |   |   |   |   |   |--- feature_201 <= 0.36
|   |   |   |   |   |   |   |   |--- feature_66 <= 0.70
|   |   |   |   |   |   |   |   |   |--- feature_30 <= 1.46
|   |   |   |   |   |   |   |   |   |   |--- feature_39 <= 0.47
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 38
|   |   |   |   |   |   |   |   |   |   |--- feature_39 >  0.47
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 14
|   |   |   |   |   |   |   |   |   |--- feature_30 >  1.46
|   |   |   |   |   |   |   |   |   |   |--- feature_141 <= -0.58
|   |   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |   |   |--- feature_141 >  -0.58
|   |   |   |   |   |   |   |   |   

### ODOR LEVEL 6

In [100]:
y = sensorama['odor level'] == 6

In [101]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = DecisionTreeClassifier() # Create Decision Tree classifer object
clf = clf.fit(X_train,y_train) # Train Decision Tree Classifer
y_pred = clf.predict(X_test) # Predict the response for test dataset

In [102]:
# how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")

Accuracy: 92.53565475087979 %


In [103]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_150 <= -0.09
|   |--- feature_20 <= -0.71
|   |   |--- feature_157 <= -0.12
|   |   |   |--- feature_30 <= -0.74
|   |   |   |   |--- feature_189 <= -1.68
|   |   |   |   |   |--- feature_18 <= -5.27
|   |   |   |   |   |   |--- feature_193 <= -1.88
|   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |--- feature_193 >  -1.88
|   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |--- feature_18 >  -5.27
|   |   |   |   |   |   |--- class: False
|   |   |   |   |--- feature_189 >  -1.68
|   |   |   |   |   |--- feature_149 <= -0.59
|   |   |   |   |   |   |--- feature_205 <= -1.22
|   |   |   |   |   |   |   |--- feature_189 <= -1.37
|   |   |   |   |   |   |   |   |--- feature_146 <= -1.80
|   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |--- feature_146 >  -1.80
|   |   |   |   |   |   |   |   |   |--- feature_195 <= -1.15
|   |   |   |   |   |   |   |   |   |   |--- feature_17 <= -2.27
|   |   |   |   |  

### ODOR LEVEL 7

In [96]:
y = sensorama['odor level'] == 7

In [97]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = DecisionTreeClassifier() # Create Decision Tree classifer object
clf = clf.fit(X_train,y_train) # Train Decision Tree Classifer
y_pred = clf.predict(X_test) # Predict the response for test dataset

In [98]:
# how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")

Accuracy: 95.22133728468235 %


In [99]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_201 <= 0.43
|   |--- feature_164 <= 2.19
|   |   |--- feature_33 <= -0.23
|   |   |   |--- feature_136 <= 0.84
|   |   |   |   |--- feature_201 <= 0.43
|   |   |   |   |   |--- feature_148 <= 0.61
|   |   |   |   |   |   |--- feature_14 <= 0.07
|   |   |   |   |   |   |   |--- feature_157 <= -0.44
|   |   |   |   |   |   |   |   |--- feature_206 <= -0.93
|   |   |   |   |   |   |   |   |   |--- feature_148 <= -1.87
|   |   |   |   |   |   |   |   |   |   |--- feature_148 <= -1.99
|   |   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |   |   |--- feature_148 >  -1.99
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 4
|   |   |   |   |   |   |   |   |   |--- feature_148 >  -1.87
|   |   |   |   |   |   |   |   |   |   |--- feature_185 <= 1.94
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |   |--- feature_185 >  1.94
|   |   |   |   |   |   |   

### ODOR LEVEL 8

In [81]:
y = sensorama['odor level'] == 8

In [82]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
clf = DecisionTreeClassifier() # Create Decision Tree classifer object
clf = clf.fit(X_train,y_train) # Train Decision Tree Classifer
y_pred = clf.predict(X_test) # Predict the response for test dataset

In [83]:
# how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")

Accuracy: 97.24022967216152 %


In [86]:
text_representation = tree.export_text(clf)
print(text_representation)

|--- feature_200 <= -0.08
|   |--- feature_131 <= 0.38
|   |   |--- feature_0 <= 1.01
|   |   |   |--- feature_197 <= 0.43
|   |   |   |   |--- feature_138 <= -0.14
|   |   |   |   |   |--- feature_148 <= -0.79
|   |   |   |   |   |   |--- feature_138 <= -2.21
|   |   |   |   |   |   |   |--- feature_205 <= -2.16
|   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |--- feature_205 >  -2.16
|   |   |   |   |   |   |   |   |--- feature_206 <= -0.76
|   |   |   |   |   |   |   |   |   |--- feature_194 <= -1.44
|   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |   |--- feature_194 >  -1.44
|   |   |   |   |   |   |   |   |   |   |--- feature_22 <= -1.99
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 4
|   |   |   |   |   |   |   |   |   |   |--- feature_22 >  -1.99
|   |   |   |   |   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |   |   |--- feature_206 >  -0.76
|   |   |   | 