In [2]:
import sys
sys.path.append('../') 
from data_preprocessing.features import Features
import pandas as pd

## Task2

In [3]:
features = Features()
features.extract_features("C:\\Users\\User\\Documents\\8.semestar\\APT\\Projekt\\TAR_Project\\data\\features.csv")

In [4]:
dict_features = {}
dict_feature_num = {}
duplicates = {}

k = 1
for i,f in zip(features.feature_idx, features.features):
    pronaden = 0
    for key, value in dict_features.items():
        if value == f:
            pronaden = 1;
            duplicates[i] = key

    if (pronaden == 0):
        dict_features[i] = f
        dict_feature_num[i] = k
        k += 1

In [5]:
features_df = pd.DataFrame(dict_features.items(), columns=['Key', 'Value'])

In [6]:
train_df = pd.read_csv("../data/train.csv")

In [7]:
patient_df = pd.read_csv("../data/patient_notes.csv") 

In [8]:
train_data = train_df.merge(patient_df, on=['pn_num', 'case_num'], how='left')

In [9]:
train_df = train_df[train_df['annotation'].apply(lambda x: x != "[]")]

In [11]:
import ast
import re
from transformers import BertTokenizer, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

tekst_preproccesed = []
labels = []
labels_binary = []

for index, p in patient_df.iterrows():
    mapa = tokenizer(p["pn_history"], return_offsets_mapping=True)
    off_map = mapa["offset_mapping"][1:-1]
    
    tekst_p = tokenizer.tokenize(p["pn_history"])
    #tekst_p = re.split(r'[\s,.!?;:{}()/-]+', p["pn_history"].lower().strip())
    
    label_p = len(tekst_p) * [0]
    label_b = len(tekst_p) * [0]

    row_df = pd.DataFrame.from_dict([p])
    feature_df = pd.merge(train_df, row_df[['case_num', 'pn_num']], on=['case_num', 'pn_num'], how='inner')
    
    if not feature_df.empty:
        for locs in feature_df["location"]:
            feat_num = feature_df.loc[feature_df["location"] == locs, "feature_num"].item()
            if feat_num in duplicates.keys():
                feat_num = dict_feature_num[duplicates[feat_num]]
            else:
                feat_num = dict_feature_num[feat_num]

            locs = re.sub(r';', "','", locs)
            locs = [tuple(map(int, pair.split())) for pair in ast.literal_eval(locs)]
            for l in locs:
                x, y = l[0], l[1]

                for index, (start_i, end_i) in enumerate(off_map):
                    if start_i >= x and y >= end_i:
                        label_p[index] = feat_num
                        label_b[index] = 1

        tekst_preproccesed.append(tekst_p)
        labels.append(label_p)
        labels_binary.append(label_b)

In [28]:
import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from transformers import BertTokenizerFast, BertModel

from torch.utils.data import Dataset
from torch.utils.data import DataLoader, random_split

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_recall_fscore_support

def encode_data(data):
    inputs_ids = []
    
    for t in tekst_preproccesed:
        token_ids = tokenizer.convert_tokens_to_ids(t)
        #input_ids = torch.tensor([token_ids])
        
        #inputs_ids.append([input_ids.squeeze()])
        inputs_ids.append(token_ids)

    return inputs_ids

class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
        self.lengths = [len(text) for text in data]
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = (torch.tensor(self.data[idx]), self.targets[idx], self.lengths[idx])
        return sample

def pad_collate_fn(batch, pad_index):
    texts, labels, lengths = zip(*batch)

    texts = pad_sequence(texts, batch_first=True, padding_value=pad_index)
    
    max_length = texts.shape[1]
    
    padded_labels = []
    for label, length in zip(labels, lengths):
        padded_label = torch.cat([torch.tensor(label), torch.full((max_length - length,), 133)])
        padded_labels.append(padded_label)
    padded_labels = torch.stack(padded_labels)

    return texts, padded_labels, torch.tensor(lengths)

class ModelSeqLab(nn.Module):
    def __init__(self, bert_model_name, hidden_dim, num_layers=2, output_size=132):
        super(ModelSeqLab, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        self.pad_index = 133
        
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        self.translate_vec = nn.Linear(self.bert.config.hidden_size, 300) #je li bolje koristiti PCA?
        self.lstm = nn.LSTM(300, hidden_dim, num_layers=self.num_layers, batch_first=True, bidirectional=True, dropout=0.2)
        
        #self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, num_layers=2, batch_first=True, bidirectional=True, dropout=0.2)
        self.fc = nn.Linear(hidden_dim * 2, output_size)

    def forward(self, input_ids, lengths):
        attention_mask = input_ids.ne(self.pad_index)
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings_org = bert_outputs.last_hidden_state

        embeddings = self.translate_vec(embeddings_org)

        #batch_size  = input_ids.shape[0]
        #h_01 = torch.zeros(2 * self.num_layers, batch_size, self.hidden_size).to(input_ids.device)
        #c_01 = Variable(torch.zeros(2 * self.num_layers, batch_size, self.hidden_size)).to(input_ids.device)

        packed_input = pack_padded_sequence(embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        output_scores = self.fc(output)
        return output_scores


num_classes = 132
#num_classes = 2

hidden_dim = 150
output_dim = num_classes

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = ModelSeqLab('bert-base-uncased', hidden_dim, output_dim).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=133)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

pad_id = BertTokenizerFast.from_pretrained("bert-base-uncased").pad_token_id

processed_data = encode_data(tekst_preproccesed)

custom_dataset = CustomDataset(processed_data, labels) #labels_binary
train_size = int(0.8 * len(custom_dataset))
val_size = len(custom_dataset) - train_size
train_dataset, val_dataset = random_split(custom_dataset, [train_size, val_size])

train_dataloader = DataLoader(dataset=train_dataset, batch_size=10, shuffle=True, collate_fn=lambda batch: pad_collate_fn(batch, pad_index=pad_id))
val_dataloader = DataLoader(dataset=val_dataset, batch_size=20, shuffle=True, collate_fn=lambda batch: pad_collate_fn(batch, pad_index=pad_id))

print("Training...")
for epoch in range(1):
    model.train()
    
    for batch_num, (podaci, targets, lengths) in enumerate(train_dataloader):
        podaci = podaci.to(device)
        targets = targets.to(device)
        lengths = lengths.to(device)
        
        model.zero_grad()
    
        output_scores = model(podaci, lengths)
        print(output_scores.view(-1, num_classes).shape)
        print(targets.view(-1).shape)
        #print(targets)
        
        #predictions = torch.nn.functional.softmax(output_scores, dim=1)
        loss = criterion(output_scores.view(-1, num_classes), targets.long().view(-1))
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        optimizer.step()

    if epoch % 1 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

print("Evaluating...")
model.eval()

targets_all = []
predictions_all = []
loss_all = 0

with torch.no_grad():
    for batch_num, (podaci, targets, lengths) in enumerate(val_dataloader):
        podaci = podaci.to(device)
        targets = targets.to(device)
        lengths = lengths.to(device)

        output_scores = model(podaci, lengths)
        loss = criterion(output_scores.view(-1, num_classes), targets.long().view(-1))

        loss_all += loss

        predictions = torch.argmax(output_scores.view(-1, num_classes), dim=1)

        targets_all.extend(targets.view(-1).cpu().tolist())
        predictions_all.extend(predictions.cpu().tolist())

    #accuracy = accuracy_score(targets_all, predictions_all)
    #f1 = f1_score(targets_all, predictions_all)
    #precision, recall, f1, support = precision_recall_fscore_support(targets_all, predictions_all)
    #conf_matrix = confusion_matrix(targets_all, predictions_all)

cpu
Training...
torch.Size([2720, 132])
torch.Size([2720])
torch.Size([2690, 132])
torch.Size([2690])


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000002475D50D570>>
Traceback (most recent call last):
  File "C:\Users\User\miniconda3\envs\tar\lib\site-packages\ipykernel\ipkernel.py", line 783, in _clean_thread_parent_frames
    if phase != "start":
KeyboardInterrupt: 


KeyboardInterrupt: 