# Import Libraries

In [1]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Input, Activation, BatchNormalization
from keras.models import Model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import label_binarize
from pandas import DataFrame
import pickle
import pandas as pd
import time
import sqlite3
import csv
from NLPProcess import NLPProcess

import numpy as np
np.random.seed(1)



### Constant

In [35]:
event_num = 65
droprate = 0.3
vector_size = 572
seed = 0
CV = 5
interaction_num = 10

---

# Data Preperation

## Feature Engineering

### Jaccard similarity 
calculates the Jaccard similarity between rows of a matrix.

**J(A, B) = |A ∩ B| / |A ∪ B| = |A ∩ B| / |A| + |B| - |A ∩ B|**

These features vectors have high dimensions and values of most dimensions are 0, thus we compress features and reduce the sparsity.

Instead of using the bit vectors as input



In [31]:
def Jaccard(matrix):
    matrix = np.mat(matrix)
    numerator = matrix * matrix.T
    denominator = (np.ones(np.shape(matrix)) * matrix.T) + \
        (matrix * np.ones(np.shape(matrix.T))) - (matrix * matrix.T)
    return numerator / denominator


**Test**

In [32]:
import numpy as np
from pandas import DataFrame
# Example input matrix
input_matrix = np.array([[1, 0, 1],
                         [1, 1, 0],
                         [0, 1, 1]])

# Calling the Jaccard function
jaccard_similarity = Jaccard(input_matrix)

# Printing the resulting Jaccard similarity matrix
print(jaccard_similarity)


[[1.         0.33333333 0.33333333]
 [0.33333333 1.         0.33333333]
 [0.33333333 0.33333333 1.        ]]


### Feature Extraction

In [33]:
def feature_vector(feature_name, df, vector_size):
    # df are the 572 kinds of drugs
    all_feature = []
    drug_list = np.array(df[feature_name]).tolist()
    # Features for each drug, for example, when feature_name is target, drug_list=["P30556|P05412","P28223|P46098|……"]
    for i in drug_list:
        all_feature.extend(i.split('|'))
    all_feature = list(set(all_feature))

    # create taple of uniques features that columns are uniques feature and rows are number of occurance in each drug
    feature_matrix = np.zeros((len(drug_list), len(all_feature)), dtype=float)
    # Consrtuct feature matrices with key of dataframe
    df_feature = DataFrame(feature_matrix, columns=all_feature)

    for i in range(len(drug_list)):
        for each_feature in df[feature_name].iloc[i].split('|'):
            df_feature[each_feature].iloc[i] = 1

    # Apply Jaccard on the feature
    sim_matrix = Jaccard(np.array(df_feature))

    # Apply dimensionality reduction Using PCA
    # use 572 feature to Create similarity matrix of 572*572
    pca = PCA(n_components=vector_size)
    pca.fit(sim_matrix)
    sim_matrix = pca.transform(sim_matrix)

    return sim_matrix


**Test**

In [34]:
# Example DataFrame
df = pd.DataFrame({
    'name': ['Drug A', 'Drug B', 'Drug C'],
    'target': ['P30556|P05412', 'P28223|P46098', 'P30556|P46098'],
    'feature1': ['A|B|C', 'C|D', 'A|B|D'],
    'feature2': ['X|Y', 'Y|Z', 'X'],
})

# Example feature_name
feature_name = 'target'

# Example vector_size
vector_size = 2

# Call the function
feature_vector_output = feature_vector(feature_name, df, vector_size)
print(feature_vector_output)


['P46098', 'P28223', 'P30556', 'P05412']
   P46098  P28223  P30556  P05412
0     0.0     0.0     1.0     1.0
1     1.0     1.0     0.0     0.0
2     1.0     0.0     1.0     0.0
[[ 7.07106781e-01 -2.35702260e-01]
 [-7.07106781e-01 -2.35702260e-01]
 [ 5.85951041e-17  4.71404521e-01]]




## Data Preparation
 Prepares the data by transforming interaction events into numbers, splicing the features, and obtaining feature vectors and labels.

### Obtaining feautre vectors

In [None]:

def prepare(df_drug, feature_list, vector_size, mechanism, action, drugA, drugB):

    d_feature = {}
    d_label = {}
    d_event = [f"{i} {j}" for i, j in zip(mechanism, action)]

    # sort according to the occurrences of each interaction event
    d_event_count = sorted(set(d_event), key=d_event.count, reverse=True)
    # make dict of postion of each event according to number  of occurrences
    d_label = dict(zip(d_event_count, range(len(d_event_count))))

    # Initializes a zero-filled NumPy array vector from n*0 to 0*n
    vector = np.zeros(
        (len(np.array(df_drug['name']).tolist()), 0), dtype=float)
    # Append Features generated features from each feature in drug table
    for i in feature_list:
        vector = np.hstack((vector, feature_vector(i, df_drug, vector_size)))
    # create dict that key is drug name and values it's new generated features
    for i in range(len(np.array(df_drug['name']).tolist())):
        d_feature[np.array(df_drug['name']).tolist()[i]] = vector[i]

    # Use the dictionary to obtain feature vector and label
    new_feature = []
    new_label = []
    # Prepare the Feature and label for training and test set
    for i in range(len(d_event)):
        new_feature.append(
            np.hstack((d_feature[drugA[i]], d_feature[drugB[i]])))
        new_label.append(d_label[d_event[i]])
    new_feature = np.array(new_feature)  # 37264 * 1144
    new_label = np.array(new_label)     # 37264 * 1
    return (new_feature, new_label, event_num)


---

# Model Building
 * **Input Layer**: The model expects input data with a shape of (vector_size * 2,). This means that the input consists of a concatenation of two feature vectors, each of size vector_size.

* **Dense Layers**: The input is passed through a series of fully connected dense layers. 
    * The first dense layer has 512 units and uses the ReLU activation function.
    * Batch normalization is applied after each dense layer to normalize the activations and improve training stability. 
    * Dropout is also applied to prevent overfitting, with the droprate parameter controlling the dropout rate.

* **Output Layer**: The final dense layer has event_num units, representing the number of different events or classes in the classification task. The activation function used in this layer is softmax, which produces a probability distribution over the classes.

* **Model Compilation**: The model is compiled using the Adam optimizer, which is an adaptive learning rate optimization algorithm. The loss function used is categorical cross-entropy, which is suitable for multi-class classification problems. The accuracy metric is also specified to evaluate the model's performance during training.




In [None]:
def DNN():
    train_input = Input(shape=(vector_size * 2,), name='Inputlayer')
    train_in = Dense(512, activation='relu')(train_input)
    train_in = BatchNormalization()(train_in)
    train_in = Dropout(droprate)(train_in)
    train_in = Dense(256, activation='relu')(train_in)
    train_in = BatchNormalization()(train_in)
    train_in = Dropout(droprate)(train_in)
    train_in = Dense(event_num)(train_in)
    out = Activation('softmax')(train_in)
    model = Model(train_input, out)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy', metrics=['accuracy'])

    return model


---

# Model Training

In [None]:
def get_index(label_matrix, event_num, seed, CV):
    index_all_class = np.zeros(len(label_matrix))
    for j in range(event_num):
        index = np.where(label_matrix == j)
        kf = KFold(n_splits=CV, shuffle=True, random_state=seed)
        k_num = 0
        for train_index, test_index in kf.split(range(len(index[0]))):
            index_all_class[index[0][test_index]] = k_num
            k_num += 1

    return index_all_class


In [None]:
def fit_model(clf_type:str, X_train, y_train, X_test, y_test=0):
    # Use Selected Model to train the model
    if clf_type == 'DDIMDL':
        dnn = DNN()
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')
        dnn.fit(X_train, y_train, batch_size=128, epochs=100, validation_data=(X_test, y_test),
                callbacks=[early_stopping])

        # Save model for each flood and feature
        filename = f"model_Flood{k}_Feature{i}.h5"
        dnn.save(filename)
        return dnn
    elif clf_type == 'RF':
        clf = RandomForestClassifier(n_estimators=100)
    elif clf_type == 'GBDT':
        clf = GradientBoostingClassifier()
    elif clf_type == 'SVM':
        clf = SVC(probability=True)
    elif clf_type == 'FM':
        clf = GradientBoostingClassifier()
    elif clf_type == 'KNN':
        clf = KNeighborsClassifier(n_neighbors=4)
    else:
        clf = LogisticRegression()
    clf.fit(X_train, y_train)
    return clf
    

In [None]:
def cross_validation(feature_matrix, label_matrix, clf_type, event_num, seed, CV):
    all_eval_type = 11
    result_all = np.zeros((all_eval_type, 1), dtype=float)
    each_eval_type = 6
    result_eve = np.zeros((event_num, each_eval_type), dtype=float)

    y_true = np.array([])
    y_pred = np.array([])
    y_score = np.zeros((0, event_num), dtype=float)
    index_all_class = get_index(label_matrix, event_num, seed, CV)
    matrix = []
    if type(feature_matrix) != list:
        matrix.append(feature_matrix)
        feature_matrix = matrix

    # Iteration Along the Floodes
    for k in range(CV):
        train_index = np.where(index_all_class != k)
        test_index = np.where(index_all_class == k)
        pred = np.zeros((len(test_index[0]), event_num), dtype=float)

        # Itrate Along Selected Features
        for i in range(len(feature_matrix)):
            # Get Train, Test Dataset
            x_train = feature_matrix[i][train_index]
            x_test = feature_matrix[i][test_index]
            y_train = label_matrix[train_index]
            
            # one-hot encoding
            y_train_one_hot = np.array(y_train)
            y_train_one_hot = (np.arange(y_train_one_hot.max() + 1)
                               == y_train[:, None]).astype(dtype='float32')
            y_test = label_matrix[test_index]
            # one-hot encoding
            y_test_one_hot = np.array(y_test)
            y_test_one_hot = (np.arange(y_test_one_hot.max() + 1)
                              == y_test[:, None]).astype(dtype='float32')

            # Fit The Selected Model
            clf = fit_model(x_train,y_train_one_hot,x_test, y_test_one_hot )
            # Predict current flood model
            pred += clf.predict_proba(x_test)
        
        # Get Avrage Predict of all Selected Feature
        pred_score = pred / len(feature_matrix)
        pred_type = np.argmax(pred_score, axis=1)
        
        y_true = np.hstack((y_true, y_test))
        y_pred = np.hstack((y_pred, pred_type))
        y_score = np.row_stack((y_score, pred_score))

    return y_pred, y_score, y_true


---

# Model Evaluatoin

In [None]:
def evaluate(pred_type, pred_score, y_test, event_num):
    all_eval_type = 11
    result_all = np.zeros((all_eval_type, 1), dtype=float)
    each_eval_type = 6
    result_eve = np.zeros((event_num, each_eval_type), dtype=float)
    y_one_hot = label_binarize(y_test, np.arange(event_num))
    pred_one_hot = label_binarize(pred_type, np.arange(event_num))

    precision, recall, th = multiclass_precision_recall_curve(y_one_hot, pred_score)

    result_all[0] = accuracy_score(y_test, pred_type)
    result_all[1] = roc_aupr_score(y_one_hot, pred_score, average='micro')
    result_all[2] = roc_aupr_score(y_one_hot, pred_score, average='macro')
    result_all[3] = roc_auc_score(y_one_hot, pred_score, average='micro')
    result_all[4] = roc_auc_score(y_one_hot, pred_score, average='macro')
    result_all[5] = f1_score(y_test, pred_type, average='micro')
    result_all[6] = f1_score(y_test, pred_type, average='macro')
    result_all[7] = precision_score(y_test, pred_type, average='micro')
    result_all[8] = precision_score(y_test, pred_type, average='macro')
    result_all[9] = recall_score(y_test, pred_type, average='micro')
    result_all[10] = recall_score(y_test, pred_type, average='macro')
    for i in range(event_num):
        result_eve[i, 0] = accuracy_score(y_one_hot.take([i], axis=1).ravel(), pred_one_hot.take([i], axis=1).ravel())
        result_eve[i, 1] = roc_aupr_score(y_one_hot.take([i], axis=1).ravel(), pred_one_hot.take([i], axis=1).ravel(),
                                          average=None)
        result_eve[i, 2] = roc_auc_score(y_one_hot.take([i], axis=1).ravel(), pred_one_hot.take([i], axis=1).ravel(),
                                         average=None)
        result_eve[i, 3] = f1_score(y_one_hot.take([i], axis=1).ravel(), pred_one_hot.take([i], axis=1).ravel(),
                                    average='binary')
        result_eve[i, 4] = precision_score(y_one_hot.take([i], axis=1).ravel(), pred_one_hot.take([i], axis=1).ravel(),
                                           average='binary')
        result_eve[i, 5] = recall_score(y_one_hot.take([i], axis=1).ravel(), pred_one_hot.take([i], axis=1).ravel(),
                                        average='binary')
    return [result_all, result_eve] 

In [None]:
def roc_aupr_score(y_true, y_score, average="macro"):
    def _binary_roc_aupr_score(y_true, y_score):
        precision, recall, pr_thresholds = precision_recall_curve(y_true, y_score)
        return auc(recall, precision)

    def _average_binary_score(binary_metric, y_true, y_score, average):  # y_true= y_one_hot
        if average == "binary":
            return binary_metric(y_true, y_score)
        if average == "micro":
            y_true = y_true.ravel()
            y_score = y_score.ravel()
        if y_true.ndim == 1:
            y_true = y_true.reshape((-1, 1))
        if y_score.ndim == 1:
            y_score = y_score.reshape((-1, 1))
        n_classes = y_score.shape[1]
        score = np.zeros((n_classes,))
        for c in range(n_classes):
            y_true_c = y_true.take([c], axis=1).ravel()
            y_score_c = y_score.take([c], axis=1).ravel()
            score[c] = binary_metric(y_true_c, y_score_c)
        return np.average(score)

    return _average_binary_score(_binary_roc_aupr_score, y_true, y_score, average)


---

# Run the Program

In [None]:
def save_result(feature_name, result_type, clf_type, result):
    with open(feature_name + '_' + result_type + '_' + clf_type+ '.csv', "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        for i in result:
            writer.writerow(i)
    return 0

In [8]:

def main(args):
    # GET ENTARED DATA
    feature_list = args['featureList']
    featureName="+".join(feature_list)
    clf_list = args['classifier']
    nlp_arg = args['NLPProcess']
    
    # GET REQUIRED DATA FROM DATABASW
    conn = sqlite3.connect("event.db")
    df_drug = pd.read_sql('select * from drug;', conn)
    df_drug.info()
    df_event = pd.read_sql('select * from event_number;', conn)
    df_interaction = pd.read_sql('select * from event;', conn)
   
    result_all = {}
    result_eve = {}
    all_matrix = []
    drugList=[]
    
    for line in open("DrugList.txt",'r'):
        drugList.append(line.split()[0])
    
    ###################### MODEL STEPS #######################
    # STEP1: GET DRUG PREPROCESSED FEATURES
    # prevoisle saved 
    if nlp_arg=="read":
        extraction = pd.read_sql('select * from extraction;', conn)
        mechanism = extraction['mechanism']
        action = extraction['action']
        drugA = extraction['drugA']
        drugB = extraction['drugB']
    else:
        # preprocess features
        mechanism,action,drugA,drugB=NLPProcess(drugList,df_interaction)
    # STEP2: PREPARATION THE DATA
    for feature in feature_list:
        print(feature)
        new_feature, new_label, event_num = prepare(df_drug, [feature], vector_size, mechanism,action,drugA,drugB)
        all_matrix.append(new_feature)
    start = time.time()
    for clf in clf_list:
        # STEP3: TRAIN & PREDICT THE MODEL
        pred_type, pred_score, y_test = cross_validation(all_matrix, new_label, clf, event_num, seed, CV)
        
        # STEP4: EVALUATE THE MODEL
        all_result, each_result = evaluate(pred_type, pred_score, y_test, event_num)
        save_result(featureName, 'all', clf, all_result)
        save_result(featureName, 'each', clf, each_result)
        result_all[clf] = all_result
        result_eve[clf] = each_result
    print("time used:", (time.time() - start)/60)

In [None]:
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-f","--featureList",default=["smile","target","enzyme"],help="features to use",nargs="+")
    parser.add_argument("-c","--classifier",choices=["DDIMDL","RF","KNN","LR"],default=["DDIMDL"],help="classifiers to use",nargs="+")
    parser.add_argument("-p","--NLPProcess",choices=["read","process"],default="read",help="Read the NLP extraction result directly or process the events again")
    args=vars(parser.parse_args())
    print("mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm")
    print(args)
    main(args)

---

---

In [7]:
feature_list = ["osama", "mohamed", "abd"]

for feature in feature_list:
    set_name = feature + '+'
set_name = set_name[:]
set_name
    

'abd+'

In [5]:
set_name = "+".join(feature_list)

In [6]:
set_name

'osama+mohamed+abd'

---

In [87]:
import h5py
new_model = h5py.File("model_fold_0.h5", 'r')


In [88]:
loaded_dict = np.load('Drug_Smile_Features.npz')

# Convert the loaded dictionary to a regular Python dictionary
d_feature = dict(loaded_dict)


In [89]:
test = np.hstack((d_feature["Diltiazem"], d_feature["Abemaciclib"]))


In [90]:
test = test.reshape((1, 1144))
print(test.shape)


(1, 1144)


In [91]:
x = np.argmax(new_model.predict(test), axis=1)
x


AttributeError: 'File' object has no attribute 'predict'

In [None]:
conn = sqlite3.connect("event.db")
df_drug = pd.read_sql('select * from event_number;', conn)
#
x = dict(zip([str(i) for i in range(len(df_drug["event"]))], df_drug["event"]))
x


In [82]:
np.savez(f'Event_Number.npz', **x)


In [86]:
event = x[str(np.argmax(new_model.predict(test), axis=1)[0])]




'The metabolism of name can be decreased when combined with name.'

 * Serving Flask app '__name__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:57884
 * Running on http://192.168.1.3:57884
Press CTRL+C to quit


Glucosamine
Anagrelide


[2023-06-04 23:47:49,381] ERROR in app: Exception on /user [GET]
Traceback (most recent call last):
  File "c:\Users\Eng Osama Mo\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
  File "c:\Users\Eng Osama Mo\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "c:\Users\Eng Osama Mo\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
  File "c:\Users\Eng Osama Mo\AppData\Local\Programs\Python\Python310\lib\site-packages\flask\app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "C:\Users\Eng Osama Mo\AppData\Local\Temp\ipykernel_11920\3648745965.py", line 21, in request_page
    print(predict_event(drug1, drug2))
  File "C:\Users\Eng Osa

In [35]:
import numpy as np
import pandas as pd

from keras.models import load_model


def load_npz(file):
    loaded_dict = np.load(file)
    # Convert the loaded dictionary to a regular Python dictionary
    dict_file = dict(loaded_dict)
    return dict_file


d_feature = load_npz("Drug_Smile_Features.npz")
d_event = load_npz("Event_Number.npz")


def predict_event(drug1: str, drug2: str):
    drugs_feature = np.hstack((d_feature[drug1], d_feature[drug2]))
    drugs_feature = drugs_feature.reshape((1,1144))

    new_model = load_model("model_fold_0.h5")
    event = d_event[str(np.argmax(new_model.predict(drugs_feature), axis=1)[0])]
    event = str(event)
    # Replace the first occurrence of "name" with drug1
    new_event = event.replace("name", drug1, 1)
    # Replace the second occurrence of "name" with drug2
    new_event = new_event.replace("name", drug2, 1)
    return new_event


In [34]:

def load_npz(file):
    loaded_dict = np.load(file)
    # Convert the loaded dictionary to a regular Python dictionary
    dict_file = dict(loaded_dict)
    return dict_file


d_feature = load_npz("Drug_Smile_Features.npz")
d_event = load_npz("Event_Number.npz")

drugs_feature = np.hstack((d_feature[drug1], d_feature[drug2]))
drugs_feature = drugs_feature.reshape((1,1144))

new_model = load_model("model_fold_0.h5")
event = d_event[str(np.argmax(new_model.predict(drugs_feature), axis=1)[0])]
event = str(event)
# Replace the first occurrence of "name" with drug1
new_event = event.replace("name", drug1, 1)
# Replace the second occurrence of "name" with drug2
new_event = new_event.replace("name", drug2, 1)


(1, 1144)
Glucosamine may increase the antiplatelet activities of Glucosamine.


In [37]:
from flask import Flask, request
import json
import time
app = Flask('__name__')


@app.route("/", methods=['GET'])
def home_page():
    data_set = {"page": "Home", "Message": "Succes", "Timetamp": time.time()}
    json_dump = json.dumps(data_set)

    return json_dump


@app.route("/user", methods=['GET'])
def request_page():
    user_query = str(request.args.get("user"))  # /user/?user=
    drug1, drug2 = user_query.split("-")
    print(drug1)
    print(drug2)
    print(predict_event(drug1, drug2))
    data_set = {"Event": predict_event(drug1, drug2)}
    json_dump = json.dumps(data_set)

    return json_dump


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=0000)


 * Serving Flask app '__name__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:58286
 * Running on http://192.168.1.3:58286
Press CTRL+C to quit


Glucosamine
Anagrelide
The metabolism of Glucosamine can be decreased when combined with Anagrelide.


127.0.0.1 - - [05/Jun/2023 00:24:42] "GET /user?user=Glucosamine-Anagrelide HTTP/1.1" 200 -
