In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import nltk
from gensim.models import KeyedVectors

In [2]:
# Directory holding the preprocessed CSV (relative to the notebook's working directory).
data_path = './data/'
df_data = pd.read_csv(f'{data_path}data_processed.csv')

# Tokenise every cleaned note with NLTK; the token lists live in a new TOKENS column.
df_data['TOKENS'] = df_data['CLEAN_TEXT'].apply(nltk.word_tokenize)

# One two-column view per prediction target (tokens + that target's 0/1 label column).
# NOTE(review): GEN_RE / 30_RE look like "general" and "30-day" readmission flags — confirm.
df_data_genRe = df_data.loc[:, ['TOKENS', 'GEN_RE']]
df_data_30Re = df_data.loc[:, ['TOKENS', '30_RE']]

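Download the pre-trained word2vec file `PubMed-and-PMC-w2v.bin` from Kaggle (https://www.kaggle.com/datasets/alexiscorona/pubmed-and-pmc-w2v/) and place it in the notebook's working directory, where the next cell loads it from.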

In [3]:
# Load the binary word2vec embeddings from the working directory.
# The rest of the notebook assumes 200-dimensional vectors (see the 200s in vectorize / HFCNN).
w2v = KeyedVectors.load_word2vec_format(
    fname="PubMed-and-PMC-w2v.bin",
    binary=True,
)

In [4]:
# Positional row indices (usable with .iloc) of the positive (label == 1) and
# negative (label == 0) examples, computed separately for each target column.
pos_genRe_idx = np.flatnonzero(df_data['GEN_RE'] == 1)
neg_genRe_idx = np.flatnonzero(df_data['GEN_RE'] == 0)
pos_30Re_idx = np.flatnonzero(df_data['30_RE'] == 1)
neg_30Re_idx = np.flatnonzero(df_data['30_RE'] == 0)

In [5]:
def vectorize(tokens, embeddings=None, dim=None):
    """Turn a list of tokens into a ``(len(tokens), dim)`` float32 matrix.

    Tokens found in ``embeddings`` are replaced by their stored vector. Tokens
    that are missing get a fresh random vector drawn uniformly from [-1, 1) on
    every call, so the same unknown token is not mapped to a stable vector.

    Args:
        tokens: iterable of string tokens.
        embeddings: any object supporting ``token in embeddings`` and
            ``embeddings[token]``. Defaults to the notebook-level ``w2v``.
        dim: embedding width used for unknown tokens and for empty input.
            Defaults to ``embeddings.vector_size`` when that attribute exists,
            otherwise 200.

    Returns:
        ``np.ndarray`` of shape ``(len(tokens), dim)``. An empty token list
        yields shape ``(0, dim)`` rather than a 1-D ``(0,)`` array, so callers
        that pad along axis 0 of a 2-D array keep working.
    """
    if embeddings is None:
        embeddings = w2v
    if dim is None:
        dim = getattr(embeddings, 'vector_size', 200)

    vectors = [
        embeddings[token] if token in embeddings
        else np.random.uniform(-1, 1, (dim,)).astype(np.float32)
        for token in tokens
    ]
    if not vectors:
        # np.array([]) would be 1-D float64; keep the 2-D float32 contract instead.
        return np.zeros((0, dim), dtype=np.float32)
    return np.array(vectors)

def generate_dataset(df_data, pos_idx, neg_idx):
    """Build a class-balanced dataset by undersampling the larger class.

    Args:
        df_data: DataFrame with a ``TOKENS`` column of token lists.
        pos_idx: positional row indices of the positive examples.
        neg_idx: positional row indices of the negative examples.

    Returns:
        ``(vectors_all, labels)``: a list with one embedding matrix per selected
        row (positives first, then negatives) and the matching list of 1/0 labels.

    When there are at least as many negatives as positives, all positives are
    kept and an equal number of negatives is drawn without replacement. If the
    negatives are the smaller class, the positives are undersampled instead, so
    ``np.random.choice(..., replace=False)`` is never asked for more items than
    exist.
    """
    pos_idx = np.asarray(pos_idx)
    neg_idx = np.asarray(neg_idx)
    num_per_class = min(len(pos_idx), len(neg_idx))

    if len(pos_idx) > num_per_class:
        # Fewer negatives than positives: sample the positives down to match.
        pos_idx = np.random.choice(pos_idx, size=num_per_class, replace=False)
    neg_idx_sample = np.random.choice(neg_idx, size=num_per_class, replace=False)

    labels = [1] * num_per_class + [0] * num_per_class
    all_idx = pos_idx.tolist() + neg_idx_sample.tolist()
    tokens_all = df_data.iloc[all_idx, :]['TOKENS'].to_list()
    vectors_all = [vectorize(tokens) for tokens in tokens_all]
    return vectors_all, labels

def collate_data(batch):
    """Collate ``(matrix, label)`` samples into a zero-padded batch.

    Samples are ordered longest-first (ascending stable sort, then reversed),
    every matrix is padded with zero rows at the end up to the longest length,
    and the results are returned as float tensors ``(features, labels)`` where
    ``features`` has shape ``(batch, longest, width)``.
    """
    ordered = sorted(batch, key=lambda sample: sample[0].shape[0])
    ordered.reverse()
    longest = ordered[0][0].shape[0]

    padded = []
    targets = []
    for sample in ordered:
        matrix = sample[0]
        targets.append(sample[1])
        missing = longest - matrix.shape[0]
        if missing > 0:
            # Append `missing` all-zero rows; the embedding axis is left untouched.
            matrix = np.pad(matrix, ((0, missing), (0, 0)), mode='constant', constant_values=0)
        padded.append(matrix)

    stacked = np.stack(padded, axis=0)
    features = torch.tensor(stacked, dtype=torch.float)
    label_tensor = torch.tensor(targets, dtype=torch.float)
    return features, label_tensor

In [6]:
class HFDataset(Dataset):
    """Map-style dataset that pairs each sample's feature matrix with its label.

    Both containers are stored by reference and indexed in parallel; no copying
    or tensor conversion happens here (that is left to the collate function).
    """

    def __init__(self, vectors_all, labels):
        """Keep references to the per-sample features and their labels."""
        self.labels = labels
        self.vectors_all = vectors_all

    def __len__(self):
        """Number of samples, taken from the label container."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.vectors_all[index], self.labels[index]

In [None]:
# Balanced GEN_RE dataset: one embedding matrix per document plus 0/1 labels.
vectors_all_genRe, labels_genRe = generate_dataset(df_data_genRe, pos_genRe_idx, neg_genRe_idx)

# 90/10 train/test split, then 90/10 train/validation split of the remainder.
# Stratifying keeps the 50/50 class balance produced above in every subset,
# so accuracy has the same chance baseline on train, validation and test.
x_train, x_test, y_train, y_test = train_test_split(
    vectors_all_genRe, labels_genRe, test_size=0.1, shuffle=True, stratify=labels_genRe)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, shuffle=True, stratify=y_train)

dataset_train = HFDataset(x_train, y_train)
dataset_val = HFDataset(x_val, y_val)
dataset_test = HFDataset(x_test, y_test)

# Only the training loader needs shuffling; evaluation loaders keep a fixed
# order so batch composition (and therefore padding) is repeatable.
train_loader = DataLoader(dataset=dataset_train, batch_size=32, shuffle=True, collate_fn=collate_data)
val_loader = DataLoader(dataset=dataset_val, batch_size=32, shuffle=False, collate_fn=collate_data)
test_loader = DataLoader(dataset=dataset_test, batch_size=32, shuffle=False, collate_fn=collate_data)

In [37]:
class HFCNN(nn.Module):
    """Text CNN producing one logit per document.

    Three parallel convolutions with kernel heights 1, 2 and 3 (each spanning
    the full embedding width) slide over the token axis. Each branch is passed
    through ReLU, max-pooled over all positions, and batch-normalised; the
    three pooled vectors are concatenated, passed through dropout and mapped to
    a single output by a linear layer.

    Args:
        embed_dim: width of the token embeddings (kernel width). Default 200.
        out_channels: number of filters per convolution branch. Default 64.
        dropout: dropout probability applied before the linear layer. Default 0.5.
    """

    def __init__(self, embed_dim=200, out_channels=64, dropout=0.5):
        super(HFCNN, self).__init__()
        # Tallest kernel below; inputs shorter than this are zero-padded in forward().
        self.max_kernel_height = 3
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(1, embed_dim))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(2, embed_dim))
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(3, embed_dim))
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.bn3 = nn.BatchNorm1d(out_channels)
        self.dp = nn.Dropout(dropout)
        self.FC1 = nn.Linear(in_features=out_channels * 3, out_features=1)

    def forward(self, x):
        """Compute logits for a batch.

        Args:
            x: float tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch,)`` holding raw logits (no sigmoid applied).
        """
        # A batch whose sequences are shorter than the tallest kernel would make
        # conv2/conv3 fail; append zero rows along the token axis, matching the
        # zero padding already used when batching.
        if x.size(1) < self.max_kernel_height:
            x = F.pad(x, (0, 0, 0, self.max_kernel_height - x.size(1)))

        x = x.unsqueeze(1)  # add the single input channel: (batch, 1, seq_len, embed_dim)
        # Kernel width equals embed_dim, so the last axis collapses to 1 and is squeezed.
        feature1 = F.relu(self.conv1(x).squeeze(3))
        feature2 = F.relu(self.conv2(x).squeeze(3))
        feature3 = F.relu(self.conv3(x).squeeze(3))

        # Global max pooling over all positions -> (batch, out_channels).
        feature1 = F.max_pool1d(feature1, feature1.size(2)).squeeze(2)
        feature2 = F.max_pool1d(feature2, feature2.size(2)).squeeze(2)
        feature3 = F.max_pool1d(feature3, feature3.size(2)).squeeze(2)

        feature1 = self.bn1(feature1)
        feature2 = self.bn2(feature2)
        feature3 = self.bn3(feature3)

        features = torch.cat((feature1, feature2, feature3), dim=1)
        features = self.dp(features)
        output = self.FC1(features).squeeze(1)

        return output

In [38]:
# Train on GPU when one is available, otherwise on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HFCNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.05)
# The model outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
num_epoch = 15
# Number of samples per phase, used to turn summed loss / correct counts into averages.
sizes = {'training': len(x_train), 'validation': len(x_val)}

for epoch in range(num_epoch):
    print('Epoch {} of {}'.format(epoch+1, num_epoch))
    print('-' * 10)
    for phase in ['training', 'validation']:
        if phase == 'training':
            model.train()
            dataloader = train_loader
        else:
            model.eval()
            dataloader = val_loader
        epoch_loss = 0.
        # Plain Python int so the accumulator has one type for the whole phase
        # (even when the loader yields no batches) and accuracy is not rounded to float32.
        epoch_correct = 0
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Gradients are tracked only while training.
            with torch.set_grad_enabled(phase == 'training'):
                output = model(inputs)
                loss = criterion(output, targets)
                if phase == 'training':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
            # criterion returns the batch mean; re-weight by batch size before summing.
            epoch_loss += loss.item() * inputs.shape[0]
            preds = (torch.sigmoid(output) > 0.5).float()
            epoch_correct += (preds == targets).sum().item()

        epoch_loss = epoch_loss / sizes[phase]
        epoch_acc = epoch_correct / sizes[phase]
        print('{} phase, current loss is {}'.format(phase, epoch_loss))
        print('{} phase, current accuracy is {}'.format(phase, epoch_acc))
        print()

Epoch 1 of 15
----------
training phase, current loss is 0.7078165553044284
training phase, current accuracy is 0.5514718890190125

validation phase, current loss is 0.66152753482418
validation phase, current accuracy is 0.6065830588340759

Epoch 2 of 15
----------
training phase, current loss is 0.5791538959283734
training phase, current accuracy is 0.7172966599464417

validation phase, current loss is 0.6516427614472129
validation phase, current accuracy is 0.6253918409347534

Epoch 3 of 15
----------
training phase, current loss is 0.5612895263962745
training phase, current accuracy is 0.7221738696098328

validation phase, current loss is 0.6314202898348387
validation phase, current accuracy is 0.6504701972007751

Epoch 4 of 15
----------
training phase, current loss is 0.5489493644397405
training phase, current accuracy is 0.7456889152526855

validation phase, current loss is 0.6516947054937715
validation phase, current accuracy is 0.6144200563430786

Epoch 5 of 15
----------
train