In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
PATH = '/home/arsentii/prog/camda/'
import numpy as np
import re
import spacy
import pickle
from collections import defaultdict
from tqdm import tqdm
spacy.prefer_gpu()
nlp = spacy.load("en_core_sci_md", disable=['ner', 'parser'])
#device = torch.device('cuda:0')
device = torch.device('cpu')

CUDARuntimeError: cudaErrorNoDevice: no CUDA-capable device is detected

In [2]:
torch.cuda.is_available()

True

In [2]:
nlp.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']

# Reading data, dividing by train and test sets



In [3]:
data_raw = pd.read_csv(PATH + 'DILI_data.csv')
validation_data = pd.read_csv(PATH + "Validation.tsv", sep='\t')

In [4]:
def print_example(train_data, N=5, cols=["Title", "Label"]):
    frac = N / train_data.shape[0]
    sample_data = train_data.sample( frac=frac)
    for idx, row in sample_data.iterrows():
        cmd = input()
        if cmd == "s":
            break
        print( "%s | %s >> " %  (row['Title'], row["Label"]))
        cmd = input()
        if cmd == "a":
            if type(row['Abstract']) == str:
                abstract = row['Abstract']
                abstract = abstract.replace('. ', '.\n') 
                print(abstract)
        else:
            continue
def keyword_searcher(string, keyword):
    matches = re.search(keyword, string)
    return matches

def keyword_classifier(train_data, keywords):
    results = np.zeros((train_data.shape[0]))
    targets = train_data["Label"].values
    keyword = '|'.join(keywords)
    print(keyword)
    for idx, row in train_data.iterrows():
        title = re.sub("[\?\+\*\)\(\[\]]", "", str(row["Title"]) + str(row["Abstract"]))
        matches = keyword_searcher(title, keyword)
        if matches:
            results[idx] = 1
    precision = np.sum( np.logical_and(results, targets) ) / np.sum(targets)
    accuracy = (results == targets).mean()
    return (accuracy, precision)

In [None]:
print_example(train_data, N=40)

# Point mutual information

# PMI calculation
x - class, y - word
$$
pmi(x, y) = log \frac{p(x, y)}{p(x) \cdot p(y)}
$$


$$
p(x, y) = \frac{f(y, x)}{N_x}
$$

In [5]:
def tokenize(string):
    doc = nlp.make_doc(string)
    words = [token.text.lower() for token in doc if token.is_alpha and not token.is_stop and len(token.text) > 1 ]
    return words

def tokenization(train_data):
    tokenized_texts = []
    for _, row in train_data.iterrows():
        text = row['Title'] + ' ' + str(row['Abstract'])
        words = tokenize(text)
        tokenized_texts.append(words)
    return tokenized_texts

# TFIDF (Term frequency and inverse document frequency)

def get_word_stat(tokenized_texts):
    '''Words counts in documents
    finds in how many documents this word 
    is present
    '''
    texts_number = len(tokenized_texts)
    word2text_count = defaultdict(int)
    for text in tokenized_texts:
        uniquewords = set(text)
        for word in uniquewords:
            word2text_count[word] +=1
    return word2text_count

def get_doc_tfidf(words, word2text_count, N):
    num_words = len(words)
    word2tfidf = defaultdict(int)
    for word in words:
        if word2text_count[word] > 0:
            idf = np.log(N/(word2text_count[word]))
            word2tfidf[word] += (1/num_words) * idf
        else:
            word2tfidf[word] = 1
    return word2tfidf

def create_pmi_dict(tokenized_texts, targets, min_count=5):
    np.seterr(divide = 'ignore')
    
    # words count
    d = {0:defaultdict(int), 1:defaultdict(int), 'tot':defaultdict(int)}
    for idx, words in enumerate(tokenized_texts):
        target = targets[idx]
        for w in words:
            d[ target ][w] += 1
            
    Dictionary = set(list(d[0].keys()) + list(d[1].keys()))
    d['tot'] = {w:d[0][w] + d[1][w] for w in Dictionary}
    
    # pmi calculation
    N_0 = sum(d[0].values())
    N_1 = sum(d[1].values())
    d[0] = {w: -np.log((v/N_0 + 10**(-15)) / (0.5 * d['tot'][w]/(N_0 + N_1))) / np.log(v/N_0 + 10**(-15))
            for w, v in d[0].items() if d['tot'][w] > min_count}
    
    d[1] = {w: -np.log((v/N_1+ 10**(-15)) / (0.5 * d['tot'][w]/(N_0 + N_1))) / np.log(v/N_1 + 10**(-15))
            for w, v in d[1].items() if d['tot'][w] > min_count}
    del d['tot']
    return d    

def classify_pmi_based(words_pmis, word2text_count, tokenized_test_texts, N):
    results = np.zeros(len(tokenized_test_texts))
    for idx, words in enumerate(tokenized_test_texts):
        word2tfidf = get_doc_tfidf(words, word2text_count, N)
        # print(word2tfidf)
        # PMI - determines significance of the word for the class
        # TFIDF - determines significance of the word for the document
        tot_pmi0 = [ words_pmis[0][w] * word2tfidf[w] for w in set(words) if w in words_pmis[0] ]
        tot_pmi1 = [ words_pmis[1][w] * word2tfidf[w] for w in set(words) if w in words_pmis[1] ]
        pmi0 = np.sum(tot_pmi0)
        pmi1 = np.sum(tot_pmi1)
        diff = pmi1 - pmi0
        if diff > 0.006:
            results[idx] = 1
    return results

def text_embeddings(text_tokenized, words_pmis, word2text_count, N):
    embeddings = []
    for words in tqdm(text_tokenized):
        word2tfidf = get_doc_tfidf(words, word2text_count, N)
        embedding = torch.FloatTensor(np.zeros( nlp(text_tokenized[0][0]).vector.shape[0] + 2)).to(device)
        pmi0 = 0;
        pmi1 = 0;
        for word in words:
            embedding[:200] += torch.FloatTensor(nlp(word).vector).to(device)
            try:
                pmi0 += words_pmis[0][word] * word2tfidf[word]
                pmi1 += words_pmis[1][word] * word2tfidf[word]
            except:
                continue
        embedding[-1] = pmi0
        embedding[-2] = pmi1
        embeddings.append(embedding / len(words))
    return embeddings
        

# Preprocessing

In [6]:
data = data_raw.sample(frac=1)
idx = int(data.shape[0] * 0.2)
test_data = data.iloc[:idx]
train_data = data.iloc[idx:]
print(test_data.shape)
print(train_data.shape)

(2841, 5)
(11364, 5)


In [7]:
tokenized_texts = tokenization(train_data)
tokenized_test_texts = tokenization(test_data)

In [38]:
%%time
word2text_count = get_word_stat( tokenized_texts )
targets_train = train_data['Label'].values
targets_test = test_data['Label'].values 
N = len(tokenized_texts)
words_pmis = create_pmi_dict(tokenized_texts, targets_train, min_count=5)
embeddings = text_embeddings(tokenized_texts, words_pmis, word2text_count, N)

100%|██████████| 11364/11364 [46:08<00:00,  4.11it/s] 

CPU times: user 46min 12s, sys: 3.38 s, total: 46min 16s
Wall time: 46min 9s





In [69]:
word2text_count = get_word_stat( tokenized_texts )
N = len(tokenized_texts)

targets_train = train_data['Label'].values
targets_test = test_data['Label'].values

words_pmis = create_pmi_dict(tokenized_texts, targets_train, min_count=5)
results = classify_pmi_based(words_pmis, word2text_count, tokenized_test_texts, N)

precision = np.sum( np.logical_and(results, targets_test) ) / np.sum(targets_test)
accuracy = (results == targets_test).mean()

print("Accuracy: %s \nPrecision: %s" % (accuracy, precision))

Accuracy: 0.9412178810278071 
Precision: 0.9502046384720327


# Neural Network classifier
1. NN takes a vector as an input that consists of pmi, tfidf and multiplied embedding for all
words in the text <br>
2. NN has one hidden layer with N nonlinear neurons <br>
3. It has one linear output neuron <br>
4. Loss function -- BinaryCrossEntropy

In [25]:
class net(torch.nn.Module):
    def __init__(self, n_hidden_neurons, in_features, out_features):
        super(net, self).__init__()
        
        self.layer1 = torch.nn.Linear(in_features, n_hidden_neurons)
        self.act = torch.nn.Sigmoid()
        self.layer2 = torch.nn.Linear(n_hidden_neurons, out_features)
        self.act_out = torch.nn.Sigmoid()
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.act(x)
        x = self.layer2(x)
        x = self.act_out(x)
        return x

loss = torch.nn.BCELoss().to(device)

In [17]:
test_embeddings2 = text_embeddings(tokenized_test_texts, words_pmis, word2text_count, N=N)
with open('test_embeddings.p', 'wb') as f:
    pickle.dump(test_embeddings2, f)

100%|██████████| 2841/2841 [11:10<00:00,  4.24it/s]


In [14]:
with open('test_embeddings.p', 'rb') as f:
    data = pickle.load(f)
    test_embeddings3 = data

In [18]:
%%time
embeddings2 = text_embeddings(tokenized_texts, words_pmis, word2text_count, N)
with open('train_embeddings.p', 'wb') as f:
    pickle.dump(embeddings2, f)

100%|██████████| 11364/11364 [50:45<00:00,  3.73it/s] 


CPU times: user 50min 43s, sys: 4.7 s, total: 50min 48s
Wall time: 50min 46s


In [44]:
with open('embeddings_new.p', 'wb') as f:
    pickle.dump(embeddings_new, f)
with open('test_embeddings_new.p', 'wb') as f:
    pickle.dump(test_embeddings_new, f)
with open('targets_train.p', 'wb') as f:
    pickle.dump(targets_train, f)
with open('targets_test.p', 'wb') as f:
    pickle.dump(targets_test, f)

In [19]:
targets_train = train_data['Label'].values
targets_test = test_data['Label'].values
targets_train = torch.LongTensor(targets_train)
targets_test = torch.LongTensor(targets_test)
targets_train.unsqueeze_(1)
targets_test.unsqueeze_(1)
targets_train.to(device)
targets_test.to(device)

tensor([[1],
        [1],
        [1],
        ...,
        [1],
        [1],
        [0]])

In [20]:
embeddings_unsq = [e.unsqueeze(0) for e in embeddings2]
test_embeddings_unsq = [e.unsqueeze(0) for e in test_embeddings2]

In [21]:
embeddings_new = torch.cat(embeddings_unsq, dim=0)
test_embeddings_new = torch.cat(test_embeddings_unsq, dim=0)

In [22]:
embeddings_new = embeddings_new.to(device)
test_embeddings_new = test_embeddings_new.to(device)

## Save embeddings

In [39]:
def train(X, Y, X_test, Y_test, batch_size=13, epochs=87):
    for epoch in range(epochs):
        order = np.random.permutation(len(X))
        for start_index in range(0, len(X), batch_size):
            optimizer.zero_grad()
            batch_indices = order[start_index:start_index+batch_size]
            x_batch = X[batch_indices]
            y_batch = torch.LongTensor( Y[batch_indices] )
            preds = dili_net.forward(x_batch)
            
            loss_value = loss(preds.float(), y_batch.float())
            loss_value.backward()
            
            optimizer.step()
            
        if epoch % 50 == 0:
            test_preds = dili_net.forward(X_test)
       
            test_preds = torch.where(test_preds > 0.3, 1, 0)
  
            print((test_preds == Y_test).float().mean().numpy())


tensor([[1],
        [1],
        [1],
        ...,
        [1],
        [1],
        [0]])

In [40]:
dili_net = net(5, 202, 1)
dili_net.to(device)
optimizer = torch.optim.Adam(dili_net.parameters(), 
                             lr=1.0e-3)
train(X=embeddings_new, Y=targets_train, X_test=test_embeddings_new, Y_test=targets_test, batch_size=100, epochs=1500)

0.51601547
0.9232665
0.9281943
0.9334741
0.937346
0.94156986
0.9412179
0.9391059
0.9391059
0.93945795
0.9408659
0.93805
0.937346
0.93629
0.93593806
0.936994
0.93875396
0.9391059
0.93805
0.93945795
0.9391059
0.9408659
0.9398099
0.937698
0.937346
0.936994
0.9345301
0.93593806
0.9334741
0.93312216


In [None]:
-5.42 / np.log(0.0000000000001)
tokenize("Hepato and hepatocytes and 8SDiasadfgh;vnjskF bnGIADFBVdvgnk%%SA:F<L")