### Load BioCaser Datasets

### get promed

### get promed-extended

In [1]:
from utils.data_util import (
    biocaser2text,
    get_raw_extended_promed_df,
    get_raw_promed_df,
)
data_df = get_raw_extended_promed_df()
data_df.head()

found 3430 files
Remain 3377 files after filter_out with alerting
found files: 3377
found 3430 files
Remain 53 files after filter_in with alerting
found files: 53
found 3862 files
found files: 3862


Unnamed: 0,docs,labels
0,"\n \n \n \n AVIAN INFLUENZA, HUMAN (109) - IND...",1
1,Health officials in Hong Kong say that...,1
2,<h> Coronavirus Detected In Patient Quarantine...,1
3,\n \n EQUINE INFLUENZA - CHINA\n \n **********...,1
4,"\n \n \n \n SALMONELLOSIS SENFTENBERG, BASIL -...",1


In [2]:

# model = gensim.models.KeyedVectors.load_word2vec_format('/home/zm324/workspace/doc_cls/resources/word2vec/GoogleNews-vectors-negative300.bin.gz', binary=True) 
# word2vec = model.wv

In [3]:
# import torch
# import gensim
# import time
# import numpy as np
# from tqdm import tqdm
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import DataLoader
# embedding = nn.EmbeddingBag(1, 300, sparse=True)
# embedding.from_pretrained(torch.tensor([word2vec["word"]]))

### Model

In [2]:
import torch
import gensim
import time
import numpy as np
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

class TextLinear(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class,emb_pretrain):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.pre_train = emb_pretrain
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.from_pretrained(self.pre_train)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)
def generate_batch_wo_label(batch):
    text = [torch.tensor(entry) for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets

def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [torch.tensor(entry[1]) for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.
    # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label
    
class RandomWord2vec(object):
    
    def __init__(self, data_df, emb_dim=300, batch_size=128, max_epochs = 50, num_class = 2):
        self.device = 'cuda:1' if torch.cuda.is_available() else "cpu"
        from utils.preprocess import PreProcess
        pre_processor = PreProcess(data_df, "docs",lower=False)
        # todo: change code to provide all functions in class definition.
        pre_processor.clean_html()
        pre_processor.remove_non_ascii()
        pre_processor.remove_spaces()
        pre_processor.remove_punctuation()
        pre_processor.stop_words()
        # pre_processor.tokenize()
        data_df.head()
        self.data_df = data_df
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.max_epochs = max_epochs
        self.num_class = num_class
        self.word2vec = gensim.models.KeyedVectors.load_word2vec_format('/home/zm324/workspace/doc_cls/resources/word2vec/GoogleNews-vectors-negative300.bin.gz', binary=True).wv
        self.build_vocab()
        self.pre_train = torch.tensor(self.pre_train).to(self.device)
        self.data_df.docs=self.data_df.docs.apply(self.doc2idx)
        
    def doc2idx(self, doc):
        idxs = []
        if type(doc) is not list:
            doc = doc.split()
        for word in doc:
            if word in self.word_to_ix:
                idxs.append(self.word_to_ix[word])
        return idxs
        
    def build_vocab(self):
        self.pre_train = []
        word_to_ix = {}
        for idx,row in self.data_df.iterrows():
            doc = row.docs
            if type(doc) is not list:
                doc = doc.split()
            for word in doc:
                if word not in word_to_ix and word in self.word2vec:
                    word_to_ix[word] = len(word_to_ix)
                    self.pre_train.append(self.word2vec[word])
        self.word_to_ix = word_to_ix
        self.vocab_size = len(self.word_to_ix)
    
    
    def train_epoch(self,X,y):

        # Train the model
        train_loss = 0
        train_acc = 0
        tran_data = np.stack((y,X), axis=-1)
        data = DataLoader(tran_data, batch_size=self.batch_size, shuffle=True,
                          collate_fn=generate_batch)
        for i, (text, offsets, cls) in enumerate(data):
            self.optimizer.zero_grad()
            text, offsets, cls = text.to(self.device), offsets.to(self.device), cls.to(self.device)
            output = self.model(text, offsets)
            loss = self.criterion(output, cls)
            train_loss += loss.item()
            loss.backward()
            self.optimizer.step()
            train_acc += (output.argmax(1) == cls).sum().item()

        # Adjust the learning rate
        self.scheduler.step()

        return train_loss / len(X), train_acc / len(X)
    
    def fit(self,X,y):
        self.model = TextLinear(self.vocab_size, self.emb_dim, self.num_class, self.pre_train).to(self.device)
        min_valid_loss = float('inf')
        self.criterion = torch.nn.CrossEntropyLoss().to(self.device)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=4.0)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1, gamma=0.9)

        for epoch in tqdm(range(self.max_epochs)):
            start_time = time.time()
            train_loss, train_acc = self.train_epoch(X,y)
            secs = int(time.time() - start_time)
            mins = secs / 60
            secs = secs % 60

#             print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
#             print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')

    def predict(self,X):
        loss = 0
        acc = 0
        data = DataLoader(X, batch_size=self.batch_size, collate_fn=generate_batch_wo_label)
        pred_y = []
        for text, offsets in data:
            text, offsets = text.to(self.device), offsets.to(self.device)
            with torch.no_grad():
                output = self.model(text, offsets).to('cpu')
                pred_y+=list(output.argmax(1))
        return pred_y
    #     return loss / len(data_), acc / len(data_)

    def test(self,data_):
        loss = 0
        acc = 0
        data = DataLoader(data_, batch_size=self.batch_size, collate_fn=generate_batch)
        pred_y = []
        for text, offsets, cls in data:
            text, offsets, cls = text.to(self.device), offsets.to(self.device), cls.to(self.device)
            with torch.no_grad():
                output = self.model(text, offsets)
                loss = criterion(output, cls)
                loss += loss.item()
                pred_y = output.argmax(1)
                acc += (output.argmax(1) == cls).sum().item()
    #     return pred_y
        return loss / len(data_), acc / len(data_)

In [3]:
model = RandomWord2vec(data_df)
data_df

  self.word2vec = gensim.models.KeyedVectors.load_word2vec_format('/home/zm324/workspace/doc_cls/resources/word2vec/GoogleNews-vectors-negative300.bin.gz', binary=True).wv


Unnamed: 0,docs,labels
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1
1,"[46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 5...",1
2,"[165, 166, 167, 168, 169, 170, 47, 171, 164, 1...",1
3,"[327, 328, 329, 330, 331, 332, 333, 334, 15, 3...",1
4,"[447, 48, 448, 449, 450, 451, 452, 453, 51, 45...",1
...,...,...
7287,"[277, 129, 1128, 3370, 84550, 5604, 3087, 5739...",2
7288,"[902, 49, 1306, 1713, 12, 752, 756, 6860, 759,...",2
7289,"[5688, 14500, 14800, 6139, 5518, 81178, 756, 9...",2
7290,"[2292, 56543, 742, 1277, 451, 7516, 752, 24535...",2


In [4]:
from sklearn.model_selection import RepeatedKFold
from metrics import Accuracy,Precision,Recall,F1Score
metrics = [Accuracy(),Precision(),Recall(),F1Score()]

avg_results = {m.name:[] for m in metrics}
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=2652124)
for train_index, test_index in rkf.split(data_df):
    train_set = data_df.iloc[train_index]
    test_set = data_df.iloc[test_index]
    X = list(train_set["docs"])
    y_true = list(train_set["labels"])
    model.fit(X,y_true)

    X = list(test_set["docs"])
    y_true = list(test_set["labels"])
    y_pred = model.predict(X)
    for mt in metrics:
        avg_results[mt.name].append(mt.compute(y_true, y_pred))

  return array(a, dtype, copy=False, order=order, subok=True)
  0%|          | 0/50 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered

In [None]:
for mt in metrics:
    avg_results[mt.name]=np.mean(avg_results[mt.name])

In [None]:
avg_results