In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification


In [17]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [18]:
token_df = pd.read_csv('/home/chudeo/coding-evidence-extraction-main/33k_sentence.csv')

In [19]:
token_df.count()

sentence_id    625510
words          625262
labels         625510
dtype: int64

In [20]:
#checking for null values
token_df.isnull().sum()

sentence_id      0
words          248
labels           0
dtype: int64

In [21]:
data = token_df.fillna(method='ffill')
data.head()

  data = token_df.fillna(method='ffill')


Unnamed: 0,sentence_id,words,labels
0,0,Baseline,O
1,0,artifact,O
2,0,.,O
3,1,Probable,O
4,1,sinus,B


In [22]:
# let's create a new column called "sentence" which groups the words by sentence
data['sentence'] = data[['sentence_id','words','labels']].groupby(['sentence_id'])['words'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
data['word_labels'] = data[['sentence_id','words','labels']].groupby(['sentence_id'])['labels'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,sentence_id,words,labels,sentence,word_labels
0,0,Baseline,O,Baseline artifact .,"O,O,O"
1,0,artifact,O,Baseline artifact .,"O,O,O"
2,0,.,O,Baseline artifact .,"O,O,O"
3,1,Probable,O,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
4,1,sinus,B,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"


In [23]:

label2id = {k: v for v, k in enumerate(data.labels.unique())}
id2label = {v: k for v, k in enumerate(data.labels.unique())}
label2id

{'O': 0, 'B': 1, 'I': 2}

In [24]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Baseline artifact .,"O,O,O"
1,Probable sinus tachycardia with atrial prematu...,"O,B,I,O,O,O,O,O"
2,Low limb lead voltage .,"O,O,O,O,O"
3,Leftward axis .,"O,O,O"
4,Late R wave progression .,"O,O,O,O,O"


In [25]:
len(data)


11262

In [26]:
data.iloc[1].sentence

'Probable sinus tachycardia with atrial premature beats .'

In [27]:
data.iloc[1].word_labels

'O,B,I,O,O,O,O,O'

##### **Sampled sentences**

In [28]:
from sklearn.utils import resample


# Separate majority and minority classes
majority_class = data[data['word_labels'].apply(lambda x: all(label == 'O' for label in x.split(',')))]
minority_class = data[~data['word_labels'].apply(lambda x: all(label == 'O' for label in x.split(',')))]

# Downsample majority class
majority_downsampled = resample(majority_class, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(minority_class),  # match minority class size
                                 random_state=42)  # reproducible results

# Combine minority class with downsampled majority class
balanced_data = pd.concat([majority_downsampled, minority_class])



In [29]:
len(balanced_data)

4378

In [37]:
balanced_data.to_csv('sampled.csv', index=False)

##### Part of Speech

In [41]:
import csv
import spacy

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to add POS tags to word labels
def add_pos_tags(sentence):
    # Process sentence using SpaCy
    doc = nlp(sentence)
    # Extract POS tags for each word
    pos_tags = [token.pos_ for token in doc]
    return pos_tags

In [42]:
# Path to your CSV file
csv_file = "/home/chudeo/coding-evidence-extraction-main/sampled.csv"

# Path to save the updated CSV file
output_file = "pos_sentence.csv"

In [43]:
# Open CSV file and create a new file for writing
with open(csv_file, "r", newline="", encoding="utf-8") as infile, open(output_file, "w", newline="", encoding="utf-8") as outfile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames + ["pos_tags"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    # Process each row in the CSV file
    for row in reader:
        sentence = row["sentence"]
        word_labels = row["word_labels"]
        # Add POS tags to word labels
        pos_tags = add_pos_tags(sentence)
        # Add POS tags to the row
        row["pos_tags"] = ",".join(pos_tags)
        # Write the updated row to the new CSV file
        writer.writerow(row)

print("POS tags added successfully.")

POS tags added successfully.
