In [46]:
import numpy as np
import pandas as pd
from nltk import word_tokenize

from paths import *

from sklearn.svm import SVC, LinearSVC
from sklearn.grid_search import GridSearchCV

from feature_transformer import *



### Loading the preprocessed embeddings

In [49]:
X_train = np.load('../XY/STEM_20/X_train.npy')
Y_train = np.load('../XY/STEM_20/Y_train.npy')
X_test = np.load('../XY/STEM_20/X_test2.npy')
Y_test = np.load('../XY/STEM_20/Y_test2.npy')

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(148031, 20) (148031,)
(29841, 20) (29841,)


### Loading the sentences

In [35]:
# TRAIN SET
sentences_df_train = pd.read_csv(SENTENCE_PATH_train)
entities_df_train = pd.read_csv(ENTITY_PATH_train)
pairs_df_train = pd.read_csv(PAIR_PATH_train)

#TEST2 SET
sentences_df_test = pd.read_csv(SENTENCE_PATH_test2)
entities_df_test = pd.read_csv(ENTITY_PATH_test2)
pairs_df_test = pd.read_csv(PAIR_PATH_test2)

pairs_df_train['type'].fillna('null', inplace=True)

In [84]:
Y_tr = np.empty((pairs_df_train.shape[0],))
Y_tr_type = np.zeros([pairs_df_train.shape[0], 1])

for index, row in pairs_df_train.iterrows():
    t = row['type']
    if t == 'mechanism':
        Y_tr_type[index, 0] = 1
    elif t == 'int':
        Y_tr_type[index, 0] = 2
    elif t == 'advise':
        Y_tr_type[index, 0] = 3
    elif t == 'effect':
        Y_tr_type[index, 0] = 4
    else:
        Y_tr_type[index, 0] = 0
        
    Y_tr[index] = 1 if row['ddi'] is True else 0
    
np.save(os.path.join(ROOT_DIR, 'XY', 'Y', 'Task2', 'Y_train.npy'), Y_tr)
np.save(os.path.join(ROOT_DIR, 'XY', 'Y', 'Task2', 'Y_train_type.npy'), Y_tr_type)

In [86]:
Y_tr = np.load(os.path.join(ROOT_DIR, 'XY', 'Y', 'Task2', 'Y_train.npy'))
Y_tr_type = np.load(os.path.join(ROOT_DIR, 'XY', 'Y', 'Task2', 'Y_train_type.npy'))

Y_tr = np.round(Y_tr)

In [87]:
print("entities dataframe")
entities_df_train.head()

entities dataframe


Unnamed: 0,entityID,name,position,type
0,DDI-DrugBank.d157.s0.e0,cimetidine,34-43,drug
1,DDI-DrugBank.d157.s0.e1,warfarin,49-56,drug
2,DDI-DrugBank.d157.s0.e2,Femara,97-102,brand
3,DDI-DrugBank.d157.s1.e0,Femara,48-53,brand
4,DDI-DrugBank.d157.s1.e1,tamoxifen,59-67,drug


In [88]:
print("sentences dataframe")
sentences_df_train.head()

sentences dataframe


Unnamed: 0,sentenceID,sentenceText
0,DDI-DrugBank.d157.s0,Clinical interaction studies with cimetidine a...
1,DDI-DrugBank.d157.s1,(See CLINICAL PHARMACOLOGY) Coadministration o...
2,DDI-DrugBank.d157.s2,There is no clinical experience to date on the...
3,DDI-DrugBank.d157.s3,Drug/Laboratory Test-Interactions None observed.
4,DDI-DrugBank.d110.s0,The administration of local anesthetic solutio...


In [89]:
print("pairs dataframe")
pairs_df_train.head(100)

pairs dataframe


Unnamed: 0,index,ddi,entityID1,entityID2,sentenceID,type
0,0,False,DDI-DrugBank.d157.s0.e0,DDI-DrugBank.d157.s0.e1,DDI-DrugBank.d157.s0,
1,1,False,DDI-DrugBank.d157.s0.e0,DDI-DrugBank.d157.s0.e2,DDI-DrugBank.d157.s0,
2,2,False,DDI-DrugBank.d157.s0.e1,DDI-DrugBank.d157.s0.e2,DDI-DrugBank.d157.s0,
3,3,True,DDI-DrugBank.d157.s1.e0,DDI-DrugBank.d157.s1.e1,DDI-DrugBank.d157.s1,mechanism
4,4,False,DDI-DrugBank.d157.s1.e0,DDI-DrugBank.d157.s1.e2,DDI-DrugBank.d157.s1,
5,5,False,DDI-DrugBank.d157.s1.e1,DDI-DrugBank.d157.s1.e2,DDI-DrugBank.d157.s1,
6,6,False,DDI-DrugBank.d110.s0.e0,DDI-DrugBank.d110.s0.e1,DDI-DrugBank.d110.s0,
7,7,False,DDI-DrugBank.d110.s0.e0,DDI-DrugBank.d110.s0.e2,DDI-DrugBank.d110.s0,
8,8,False,DDI-DrugBank.d110.s0.e0,DDI-DrugBank.d110.s0.e3,DDI-DrugBank.d110.s0,
9,9,True,DDI-DrugBank.d110.s0.e0,DDI-DrugBank.d110.s0.e4,DDI-DrugBank.d110.s0,effect


In [90]:
sentences_train = [row['sentenceText'] for index, row in sentences_df_train.iterrows()]
sentenceIDs_train = [row['sentenceID'] for index, row in sentences_df_train.iterrows()]

#sentences_test = [row['sentenceText'] for index, row in sentences_df_test.iterrows()]
#sentences_IDs_test = [row['sentenceID'] for index, row in sentences_df_test.iterrows()]


In [11]:
for index, row in pairs_df_train.iterrows():
    #print(row)
    sentenceID = row['sentenceID']
    sentence = sentences_train[sentenceIDs_train.index(sentenceID)]
    entityID1 = row['entityID1']
    entityID2 = row['entityID2']
    #print(sentence)
    #print(entityID1, entityID2)
    

In [12]:
has_2_entities = lambda x: len(x) == 2
has_3_or_more_entities = lambda x: len(x) >= 3

feature_list = [has_2_entities, 
                 has_3_or_more_entities]

for index, row in sentences_df_train.iterrows():
    sentenceId = row['sentenceID']
    temp_df = pairs_df_train[pairs_df_train.sentenceID == sentenceId]
    print(temp_df.head())
    if temp_df.shape[0] > 1:
        print("ALERT: more pairs in a sentence")
        for _, pair in temp_df.iterrows():
            print(pair)
    
    for feature in feature_list:
        print(feature(temp_df))
    
    break

   index    ddi                entityID1                entityID2  \
0      0   True  DDI-DrugBank.d281.s0.e0  DDI-DrugBank.d281.s0.e1   
1      1  False  DDI-DrugBank.d281.s0.e0  DDI-DrugBank.d281.s0.e2   
2      2  False  DDI-DrugBank.d281.s0.e1  DDI-DrugBank.d281.s0.e2   

             sentenceID       type  
0  DDI-DrugBank.d281.s0  mechanism  
1  DDI-DrugBank.d281.s0       null  
2  DDI-DrugBank.d281.s0       null  
ALERT: more pairs in a sentence
index                               0
ddi                              True
entityID1     DDI-DrugBank.d281.s0.e0
entityID2     DDI-DrugBank.d281.s0.e1
sentenceID       DDI-DrugBank.d281.s0
type                        mechanism
Name: 0, dtype: object
index                               1
ddi                             False
entityID1     DDI-DrugBank.d281.s0.e0
entityID2     DDI-DrugBank.d281.s0.e2
sentenceID       DDI-DrugBank.d281.s0
type                             null
Name: 1, dtype: object
index                               2
ddi

In [31]:
def token_distance(sentence, entity1, entity2):
    e1 = sentence.index(entity1)
    e2 = sentence[e1+1:].index(entity2) + (e1 + 1)
    return abs(e2 - e1) - 1

In [15]:
token_distance(["Hey", "medic", "is", "not", "like", "anything", "unusual", "penicilin"], "medic", "penicilin")

5

In [16]:
triggers = {}
with open(os.path.join(ROOT_DIR, 'triggers.txt'), 'r') as triggerfile:
    for line in triggerfile:
        interaction_type, trigger = line.split()
        triggers[trigger] = interaction_type
#print(triggers)

In [17]:
def check_for(all_words, words_list):
    for word in all_words:
        if word.lower() in words_list:
            return 1
    return 0

In [18]:
def are_there_triggers(sentence, entity1, entity2):
    e1 = sentence.index(entity1.split()[0])
    e2 = sentence[e1+1:].index(entity2.split()[-1]) + (e1 + 1)
    between = sentence[e1+1:e2]
    for word in sentence[:e2]:
        if word in triggers.keys():
            return 1
    return 0

In [19]:
are_there_triggers(["Hey", "medic", "is", "not", "mustn't", "anything", "unusual", "penicilin"], "medic", "penicilin")

1

In [20]:
conjunctions = ["and", "or", "even", "but", "for", "nor", "so", "yet"]

def are_there_conjunctions(sentence, entity1, entity2):
    e1 = sentence.index(entity1.split()[0])
    e2 = sentence[e1+1:].index(entity2.split()[-1]) + (e1 + 1)
    between = sentence[e1+1:e2]
    return check_for(between, conjunctions)


In [21]:
are_there_conjunctions(["Hey", "medic", "is", "not", "mustn't", "and", "anything", "unusual", "penicilin"], "medic", "penicilin")

1

In [22]:
punctuation = [",", ";", ":", "/", "\\", "?", "!", "(", ")"]
def is_there_punctuation(sentence, entity1, entity2):
    e1 = sentence.index(entity1.split()[0])
    e2 = sentence[e1+1:].index(entity2.split()[-1]) + (e1 + 1)
    between = sentence[e1+1:e2]
    return check_for(between, punctuation)


In [23]:
is_there_punctuation(["Hey", "medic", "is", "not",",", "mustn't", "and", "anything", "unusual", "penicilin"], "medic", "penicilin")

1

In [24]:
negations = ["no", "not", "none", "nobody", "nothing", "neither", "nowhere", "never", "n\'t"]

In [25]:
def is_there_negation(sentence, entity1, entity2):
    e1 = sentence.index(entity1.split()[0])
    e2 = sentence[e1+1:].index(entity2.split()[-1]) + (e1 + 1)
    upto2nd = sentence #[:e2]
    return check_for(upto2nd, negations)

In [26]:
is_there_negation(["Hey", "medic", "is", ",", "Not",  "mustn't", "and", "anything", "unusual", "penicilin"], "medic", "penicilin")

1

In [42]:
m = 5 #nmb of features

def build_feature_matrix(df, entities_df):
    X = np.zeros([df.shape[0], m])
    for index, row in df.iterrows():
        sentenceID = row['sentenceID']
        entityID1 = row['entityID1']
        entityID2 = row['entityID2']
        entity1 = entities_df_train.loc[entities_df.entityID == entityID1]['name'].values[0]
        entity2 = entities_df_train.loc[entities_df.entityID == entityID2]['name'].values[0]
        sentence = sentences_train[sentenceIDs_train.index(sentenceID)]   
        #print(sentence, "----", entity2, "\n\n")
        #print(entity1, "|", entity2, "\n", sentence)
        vector = [is_there_negation(sentence, entity1, entity2), 
                are_there_triggers(sentence, entity1, entity2),
                token_distance(sentence, entity1, entity2),
                is_there_punctuation(sentence, entity1, entity2),
                are_there_conjunctions(sentence, entity1, entity2)]
                # + add embeddings if wanted
        vector = np.array(vector).reshape(1, -1)
        #print(vector.shape, X_tr.shape)
        X[index,:] = vector
    return X

X_tr = build_feature_matrix(pairs_df_train, entities_df_train)
print(X_tr.shape)

(27663, 5)


In [53]:
print(X_tr.shape, Y_tr.shape)

(27663, 5) (27663, 1)


### svm grid search

In [91]:
model = SVC(kernel='rbf', 
                C=1.0,
                class_weight=None,
                gamma='auto',
                #penalty='l2',
                #loss='squared_hinge',
                tol=0.001, random_state=None)

In [92]:
param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.01, 0.1, 1, 10] }

gs_clf = GridSearchCV(model, param_grid, cv=5, verbose=50)
gs_clf = gs_clf.fit(X_tr, Y_tr)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] kernel=linear, C=0.01 ...........................................
[CV] .................. kernel=linear, C=0.01, score=0.855413 -   8.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.0s remaining:    0.0s
[CV] kernel=linear, C=0.01 ...........................................
[CV] .................. kernel=linear, C=0.01, score=0.855413 -   9.7s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   17.7s remaining:    0.0s
[CV] kernel=linear, C=0.01 ...........................................
[CV] .................. kernel=linear, C=0.01, score=0.855413 -   9.5s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   27.2s remaining:    0.0s
[CV] kernel=linear, C=0.01 ...........................................
[CV] .................. kernel=linear, C=0.01, score=0.855413 -  10.3s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   37.5s remaining:    0.0s
[CV] kernel=linear, C=0.01 .........................

KeyboardInterrupt: 

In [58]:
model.fit(X_tr, Y_tr)

  y = column_or_1d(y, warn=True)


KeyboardInterrupt: 

In [30]:
Y_predicted = model.predict(X_tr)

In [31]:
sum(Y_predicted == Y_train)/len(Y_predicted)

array([0.85543867, 0.85543867, 0.85543867, ..., 0.85543867, 0.85543867,
       0.85543867])