In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification


In [17]:
from torch import cuda

# Run on the GPU when torch reports one as available; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [18]:
# Token-level corpus: the CSV is read into one DataFrame, one row per token.
# NOTE(review): absolute local path -- consider a configurable data directory.
csv_path = '/home/chudeo/coding-evidence-extraction-main/33k_sentence.csv'
token_df = pd.read_csv(csv_path)

In [19]:
# Number of non-null entries in each column.
token_df.count()

sentence_id    625510
words          625262
labels         625510
dtype: int64

In [20]:
# Count the missing (NaN) values in each column.
token_df.isnull().sum()

sentence_id      0
words          248
labels           0
dtype: int64

In [21]:
# Fill each missing value with the value from the previous row.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# NOTE(review): a forward-filled `words` cell duplicates the preceding token.
# Presumably the missing cells are literal tokens (e.g. "NA"/"null") that
# read_csv parsed as NaN -- confirm, and consider keep_default_na=False instead.
data = token_df.ffill()
data.head()

  data = token_df.fillna(method='ffill')


Unnamed: 0,sentence_id,words,labels
0,0,Baseline,O
1,0,artifact,O
2,0,.,O
3,1,Probable,O
4,1,sinus,B


In [22]:
# Group the token rows by sentence once, then broadcast the per-sentence
# aggregates back onto every token row of that sentence.
tokens_by_sentence = data[['sentence_id', 'words', 'labels']].groupby(['sentence_id'])

# "sentence": the words of the sentence joined with single spaces.
data['sentence'] = tokens_by_sentence['words'].transform(lambda tokens: ' '.join(tokens))
# "word_labels": the labels of the sentence joined with commas.
data['word_labels'] = tokens_by_sentence['labels'].transform(lambda tags: ','.join(tags))
data.head()

Unnamed: 0,sentence_id,words,labels,sentence,word_labels
0,0,Baseline,O,Baseline artifact .,"O,O,O"
1,0,artifact,O,Baseline artifact .,"O,O,O"
2,0,.,O,Baseline artifact .,"O,O,O"
3,1,Probable,O,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
4,1,sinus,B,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"


In [23]:

# Lookup tables between label strings and integer ids; ids follow the order
# in which each label first appears in the `labels` column.
unique_labels = data.labels.unique()
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for idx, label in enumerate(unique_labels)}
label2id

{'O': 0, 'B': 1, 'I': 2}

In [24]:
# Keep only the sentence-level columns and collapse the repeated token rows
# into one row per distinct (sentence, word_labels) pair, renumbered from 0.
sentence_level_columns = ["sentence", "word_labels"]
data = (
    data[sentence_level_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
data.head()

Unnamed: 0,sentence,word_labels
0,Baseline artifact .,"O,O,O"
1,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
2,Low limb lead voltage .,"O,O,O,O,O"
3,Leftward axis .,"O,O,O"
4,Late R wave progression .,"O,O,O,O,O"


In [25]:
# Number of rows currently in `data`.
len(data)


11262

In [26]:
# Spot-check: sentence text of the second row.
data.iloc[1].sentence

'Probable sinus tachycardia with atrial premature beats .'

In [27]:
# Spot-check: comma-separated labels of the second row.
data.iloc[1].word_labels

'O,B,I,O,O,O,O,O'

##### **Sampled sentences**

In [28]:
from sklearn.utils import resample


# Flag the sentences in which every word is labelled 'O'; computed once and
# reused for both halves of the split.
is_all_outside = data['word_labels'].apply(
    lambda tags: all(tag == 'O' for tag in tags.split(','))
)
majority_class = data[is_all_outside]    # sentences with only 'O' labels
minority_class = data[~is_all_outside]   # sentences with at least one non-'O' label

# Draw, without replacement, as many all-'O' sentences as there are
# sentences containing a non-'O' label; the fixed seed keeps the draw reproducible.
majority_downsampled = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Stack the two halves; the original row index labels are kept as-is.
balanced_data = pd.concat([majority_downsampled, minority_class])



In [29]:
len(balanced_data)

4378

In [33]:
# Rebind `data` to the balanced subset: later cells that read `data` see the
# downsampled frame, not the full de-duplicated one.
data = balanced_data

In [34]:
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from models.bert_crf import BertCrf
from train import train_ner

In [35]:
# Extract per-sentence word lists and label lists from the DataFrame.
# Both entries are lists of lists with one inner list per row of `data`, so
# split_data["words"][i] and split_data["labels"][i] describe the same sentence.
# (Flattening the words into a single list, as before, left the two entries
# with different lengths and no sentence-level alignment.)
split_data = {
    "words": [sentence.split() for sentence in data["sentence"]],
    "labels": [tags.split(',') for tags in data["word_labels"]],
}

In [36]:
# Pull the two parallel entries out of `split_data` under shorter names.
sentences, labels = split_data["words"], split_data["labels"]

In [None]:
class NERDataset(Dataset):
    """Token-classification dataset producing fixed-length BERT-style inputs.

    Each row of ``dataframe`` must provide a ``sentence`` column (words
    separated by whitespace) and a ``word_labels`` column (comma-separated
    labels, one per word).

    Args:
        dataframe: pandas DataFrame with ``sentence`` and ``word_labels`` columns.
            Rows are accessed by position, so any index is accepted.
        tokenizer: object exposing ``tokenize(str) -> list[str]`` and
            ``convert_tokens_to_ids(list[str]) -> list[int]``.
        max_len: length of every returned sequence, including the ``[CLS]``
            and ``[SEP]`` positions. Longer sentences are truncated, shorter
            ones are padded with ``[PAD]``.
        label_map: optional ``label -> id`` mapping. When omitted, the
            notebook-level ``label2id`` is looked up at item-access time.
        special_label: label given to the ``[CLS]``, ``[SEP]`` and ``[PAD]``
            positions. NOTE(review): these positions therefore carry a real
            class id; confirm the model/loss in use is meant to see them.

    Raises:
        ValueError: if ``max_len`` is smaller than 2 (no room for the two
            special tokens).
    """

    # Special-token strings; '[PAD]' matches the literal the original mask used.
    CLS_TOKEN = '[CLS]'
    SEP_TOKEN = '[SEP]'
    PAD_TOKEN = '[PAD]'

    def __init__(self, dataframe, tokenizer, max_len, label_map=None, special_label='O'):
        if max_len < 2:
            raise ValueError(f"max_len must be at least 2, got {max_len}")
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = label_map
        self.special_label = special_label

    def __getitem__(self, index):
        """Return the ``index``-th example as a dict of ``max_len``-long tensors.

        Keys: ``input_ids``, ``attention_mask`` (1 for real tokens, 0 for
        padding) and ``labels``; all are ``torch.long``.
        """
        # Positional access: the frame may carry a non-contiguous index
        # (e.g. after pd.concat), which label-based lookup would miss.
        sentence = self.data['sentence'].iloc[index]
        word_labels = self.data['word_labels'].iloc[index]
        tokens, labels = self.tokenize_and_preserve_labels(sentence, word_labels)

        # Reserve two positions for [CLS]/[SEP], truncating the sentence if needed.
        room = self.max_len - 2
        tokens = [self.CLS_TOKEN] + tokens[:room] + [self.SEP_TOKEN]
        labels = [self.special_label] + labels[:room] + [self.special_label]

        # Pad to max_len so the default DataLoader collation can stack examples.
        n_real = len(tokens)
        n_pad = self.max_len - n_real
        tokens = tokens + [self.PAD_TOKEN] * n_pad
        labels = labels + [self.special_label] * n_pad
        # Mask built from counts, so a literal "[PAD]" inside the text stays attended.
        attn_mask = [1] * n_real + [0] * n_pad

        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        label_map = self.label_map if self.label_map is not None else label2id
        label_ids = [label_map[label] for label in labels]

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(attn_mask, dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.len

    def tokenize_and_preserve_labels(self, sentence, text_labels):
        """Tokenize word by word, repeating each word's label for all its pieces.

        Args:
            sentence: whitespace-separated words.
            text_labels: comma-separated labels, one per word of ``sentence``.

        Returns:
            ``(tokens, labels)`` -- two lists of equal length. A word for which
            the tokenizer returns no pieces contributes nothing to either list.

        Raises:
            ValueError: if the number of words and labels differ.
        """
        words = sentence.strip().split()
        word_labels = text_labels.split(',') if text_labels else []
        if len(words) != len(word_labels):
            raise ValueError(
                f"sentence has {len(words)} words but {len(word_labels)} labels: {sentence!r}"
            )

        tokenized_sentence = []
        labels = []
        for word, label in zip(words, word_labels):
            pieces = self.tokenizer.tokenize(word)
            tokenized_sentence.extend(pieces)
            # One label per word piece keeps tokens and labels the same length.
            labels.extend([label] * len(pieces))
        return tokenized_sentence, labels
