The first block sets up the environment by importing all the necessary libraries and functions.

In [None]:
import numpy as np 
import pandas as pd
import os
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

This script transforms each InChIKey from a string into an array of 25 integers. (An InChIKey has 27 characters, but we omit the two hyphens (-).)

In [None]:
# encode one InChIKey as a comma-separated string of character codes
def convert_InChIKeyToInt(InChIKey):
  """Return the code points of the letters in ``InChIKey``, joined by commas.

  Characters for which ``str.isalpha()`` is false (e.g. the dashes) are skipped.
  An input without any letters yields an empty string.
  """
  letter_codes = [str(ord(symbol)) for symbol in InChIKey if symbol.isalpha()]
  return ','.join(letter_codes)

def convert_InChIKeyToIntArray(InChIKey):
  """Return a length-25 integer array holding the code points of the letters in ``InChIKey``.

  Characters for which ``str.isalpha()`` is false are skipped. Slots that are
  not filled stay 0. More than 25 letters overflows the buffer and raises
  ``IndexError``.
  """
  codes = np.zeros(25)
  letters = (symbol for symbol in InChIKey if symbol.isalpha())
  for slot, symbol in enumerate(letters):
    codes[slot] = ord(symbol)
  return codes.astype(int)

# print the numeric form of the InChIKeys stored in one .csv file
def getNumericInChIKeysFromFile(filename, noOfDrugs):
  """Print the numeric encoding of the first ``noOfDrugs`` InChIKeys in ``filename``.

  The InChIKey is taken from the second column of the CSV file; one
  comma-separated line is printed per row.
  """
  table = pd.read_csv(filename)
  for row in range(noOfDrugs):
    inchikey = table.iloc[row, 1]
    print(convert_InChIKeyToInt(inchikey))

# e.g.: getNumericInChIKeysFromFile('non-antiTB InChIKey (strings).csv', 71)

We parse the three *.csv* files (one file for antiTB drugs, one for non-antiTB drugs and one for OF- substances) containing the numeric InChIKeys (after the transformation part previously described) and we create one dataset with the drugs from each file.

In [None]:
# File names of the three tables of numeric InChIKeys.
TEST_CSV = 'test - data.csv'
POSITIVE_CSV = 'antiTB InChIKey (numeric).csv'
NEGATIVE_CSV = 'non-antiTB InChIKey (numeric).csv'

test_dataset = pd.read_csv(TEST_CSV)          # substances to be classified
dataset = pd.read_csv(POSITIVE_CSV)           # antiTB drugs
dataset_negative = pd.read_csv(NEGATIVE_CSV)  # non-antiTB drugs

Because the numeric values obtained by converting the InChIKey characters from *char* to *int* are biased (A is 65 in ASCII, Z is 90), we standardize these values using a scaler (*StandardScaler*).

In [None]:
# Columns 0-24 hold the 25 numeric InChIKey characters; in the labelled files
# column 25 is the class label.
array = np.arange(25)

# Fit ONE scaler on all labelled drugs and reuse it for every table.
# Standardising each file with its own scaler gives each file a different
# mean/variance, so the same character code would map to different feature
# values in the positive, negative and unknown sets, and the classifier would be
# trained and applied on incomparable inputs.
labelled_features = np.concatenate((
    dataset.iloc[:, array].to_numpy(dtype=float),
    dataset_negative.iloc[:, array].to_numpy(dtype=float),
))
scaler = StandardScaler().fit(labelled_features)

for frame in (dataset, dataset_negative, test_dataset):
  feature_names = list(frame.columns[array])
  # Replace the whole columns with float data instead of writing floats into
  # integer columns through .iloc, which newer pandas versions warn about or reject.
  frame[feature_names] = scaler.transform(frame[feature_names].to_numpy(dtype=float))

# Scaled features of the substances whose class is not known.
x_not_known = test_dataset.iloc[:, :25].values

We split each dataset (dataset, dataset_negative) into two parts, *x* and *y*: *x* contains the features (a matrix holding the scaled numeric InChIKey values, which is the ML model's input) and *y* is the expected outcome (an array containing one *int* value per drug: *0* - non-antiTB, *1* - antiTB).

In [None]:
def _features_and_labels(frame):
  """Return ``(features, labels)``: columns 0-24, and column 25 kept as an (n, 1) array."""
  return frame.iloc[:, :25].values, frame.iloc[:, 25:26].values

x_positive, y_positive = _features_and_labels(dataset)
x_negative, y_negative = _features_and_labels(dataset_negative)

Then, each of the previously mentioned arrays is split into *train* and *test* parts. We use the *train* arrays to fit the model and the *test* arrays to compute the accuracy of that model.

In [None]:
# Hold out 15% of each class separately, then stack the per-class parts.
# Both splits are seeded so that re-running the notebook reproduces the same
# partition (the negative split was previously unseeded).
X_pos_train, X_pos_test, y_pos_train, y_pos_test = train_test_split(
    x_positive, y_positive, test_size=0.15, shuffle=True, random_state=1)
# NOTE(review): only rows 51..74 of the negative set are used here — confirm
# that this subset is intended.
X_neg_train, X_neg_test, y_neg_train, y_neg_test = train_test_split(
    x_negative[51:75], y_negative[51:75], test_size=0.15, shuffle=True, random_state=1)

X_train = np.concatenate((X_pos_train, X_neg_train))
X_test = np.concatenate((X_pos_test, X_neg_test))
y_train = np.concatenate((y_pos_train, y_neg_train))
y_test = np.concatenate((y_pos_test, y_neg_test))

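The next cells contain the initialization and parametrization of the classifiers used (*KMeans*, *SVC*, *LinearSVC*, *Logistic Regression*, and so on). Each cell rebinds `classifier`, so the model that is trained is the one from the cell executed last.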

In [None]:
# `precompute_distances` was deprecated and then removed from scikit-learn's
# KMeans (passing it raises TypeError on current releases), so it is no longer
# passed; the remaining settings are unchanged.
# NOTE(review): KMeans is unsupervised — its predicted cluster indices are not
# guaranteed to line up with the 0/1 labels used for the accuracy — confirm.
classifier = KMeans(n_clusters=2, init='random', n_init=10000, max_iter=100000,
                    tol=0.00001, verbose=0, random_state=1,
                    copy_x=True, algorithm='elkan')

In [None]:
from sklearn.svm import SVC, LinearSVC

# NOTE(review): the SVC instance is replaced on the very next line, so only
# LinearSVC is bound to `classifier` once this cell has run.
classifier = SVC()
classifier = LinearSVC()

In [None]:
from sklearn.linear_model import LogisticRegression

# The `multi_class` argument is deprecated in recent scikit-learn releases.
# With a two-class target (0 = non-antiTB, 1 = antiTB) the default already fits
# a single binary model, so the argument is dropped.
classifier = LogisticRegression()

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with the library's default settings (no arguments passed, so no
# fixed random_state either).
classifier = RandomForestClassifier()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# NOTE(review): 100000 boosting stages is a very large value; fitting this
# model is presumably slow — confirm the setting is intended.
classifier = GradientBoostingClassifier(n_estimators=100000)

In [None]:
from sklearn.neural_network import MLPClassifier

# Configured variant: one hidden layer of 150 units, seeded with random_state=0.
# NOTE(review): this instance is overwritten by the default MLPClassifier()
# below, so the configuration has no effect once the cell has run — confirm
# which of the two is intended.
classifier = MLPClassifier(max_iter=500, hidden_layer_sizes = (150,),
                           activation= 'relu', solver = 'adam',
                           learning_rate= 'invscaling',random_state = 0)
classifier = MLPClassifier()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# The positional argument 3 is the number of neighbours consulted per prediction.
classifier = KNeighborsClassifier(3)

In [None]:
from sklearn.tree import DecisionTreeClassifier 
# Shallow decision tree (at most two levels of splits). This is the last
# assignment to `classifier` before the fitting cell, so it is the model that is
# trained when the notebook is run top to bottom.
classifier = DecisionTreeClassifier(max_depth=2)

We create the model and then compute the accuracy for each model for both datasets: training and testing.

In [None]:
# The labels may arrive as an (n, 1) column. scikit-learn estimators expect a
# 1-D target and emit a DataConversionWarning otherwise, so flatten them once.
# np.ravel leaves an already 1-D array unchanged.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

# Fit on the training part, then score the model on both parts.
classifier.fit(X_train, y_train_flat)
y_pred_train = classifier.predict(X_train)
a_train = accuracy_score(y_true=y_train_flat, y_pred=y_pred_train)

y_pred_test = classifier.predict(X_test)
a_test = accuracy_score(y_true=y_test_flat, y_pred=y_pred_test)

print('Training acc:', a_train)
print('Test acc:', a_test)

We apply the model and then predict the outcome for the array containing OF-substances' properties, printing the results in the console.

In [None]:
# Predict a class for every unknown substance and print "<prediction>,<name>".
# Iterating over the predictions themselves (instead of a hard-coded count of
# 22) keeps the cell correct when the test file has more or fewer rows.
y_pred = classifier.predict(x_not_known)
substance_names = test_dataset.iloc[:, 26].values  # column 26 holds the substance identifier
for predicted_label, substance_name in zip(y_pred, substance_names):
  print(str(predicted_label)+','+str(substance_name))

1,OF-1180
1,OF-1182
1,OF-1187
1,OF-1189
1,OF-1227
1,OF-1242
1,OF-1250
1,OF-1253
1,OF-1273
1,OF-1276
1,OF-1279
1,OF-1283
1,OF-1285
1,OF-1288
1,OF-1289
1,OF-1290
1,OF-1292
1,OF-1294
1,OF-1295
1,OF-1264
1,OF-1272
1,OF-242
