In [1]:
# Explore the dataset
# Save test and train datasets as csv for a token classification task


In [74]:
import json
import pandas as pd

In [75]:
with open("data/train_raw.txt", encoding="utf8") as f:
    raw_train = f.read()

with open("data/train_clean.txt", encoding="utf8") as g:
    clean_train = g.read()

with open("data/train_labels.json", encoding="utf8") as h:
    labels_train = json.load(h)

In [76]:
print(raw_train[:515])

Tampa Scale for Kinesiophobia
(Miller , Kori and Todd 1991)
1 = <a>strongly disagree</a>
2 = <a>disagree</a>
3 = <a>agree</a>
4 = <a>strongly agree</a>
1. <q>I'm afraid that I might injury myself if I exercise</q> 1 2 3 4
2. <q>If I were to try to overcome it, my pain would
increase</q>
1 2 3 4
3. <q>My body is telling me I have something
dangerously wrong</q>
1 2 3 4
4. <q>My pain would probably be relieved if I were to
exercise</q>
1 2 3 4
5. <q>People aren't taking my medical condition
seriously enough</q>



In [77]:
print(clean_train[:451])


Tampa Scale for Kinesiophobia
(Miller , Kori and Todd 1991)
1 = strongly disagree
2 = disagree
3 = agree
4 = strongly agree
1. I'm afraid that I might injury myself if I exercise 1 2 3 4
2. If I were to try to overcome it, my pain would
increase
1 2 3 4
3. My body is telling me I have something
dangerously wrong
1 2 3 4
4. My pain would probably be relieved if I were to
exercise
1 2 3 4
5. People aren't taking my medical condition
seriously enough


In [78]:
start, end = labels_train["q"][0]

clean_train[start:end]

"I'm afraid that I might injury myself if I exercise"

In [79]:
from intervaltree import Interval, IntervalTree

tree_q = IntervalTree(
    Interval(start, end) for start, end in labels_train["q"] if start != end
)

tree_a = IntervalTree(
    Interval(start, end) for start, end in labels_train["a"] if start != end
)
labels_train['q'][0]

[127, 178]

In [90]:
# Look at label distribution in test set
# Tokenize the text then evaluate label distribution
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

label_list = ["other", "question", "answer"]

id2label = {k: v for k, v in enumerate(label_list)}
label2id = {v: k for k, v in enumerate(label_list)}

MAX_LENGTH = 512
STRIDE = 32


def tokenize(text, tokenizer, tree_q, tree_a):
    encodings = tokenizer(
        text,
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        truncation=True,
        max_length=MAX_LENGTH,
        stride=STRIDE,
        add_special_tokens=True,  # Includes the [CLS] and [SEP] tokens
    )

    all_token_labels = []
    for batch_index, (input_ids, offsets) in enumerate(
        zip(encodings["input_ids"], encodings["offset_mapping"])
    ):
        word_ids = encodings.word_ids(batch_index=batch_index)

        token_labels = []
        current_word_idx = None

        for word_id, (start, end) in zip(word_ids, offsets):
            if word_id is None:  # Special tokens like [CLS] or [SEP]
                token_labels.append(-100)
            elif word_id != current_word_idx:  # New word
                if len(tree_q.overlap(start, end)) > 0:
                    label = "question"
                elif len(tree_a.overlap(start, end)) > 0:
                    label = "answer"
                else:
                    label = "other"

                token_labels.append(label2id[label])
                current_word_idx = word_id
            else:  # Subword token
                token_labels.append(-100)

        all_token_labels.append(token_labels)

    encodings["labels"] = all_token_labels

    return encodings

tokenized_dataset = tokenize(clean_train, tokenizer, tree_q, tree_a)

dataset = Dataset.from_dict(
    {
        "input_ids": tokenized_dataset["input_ids"],
        "attention_mask": tokenized_dataset["attention_mask"],
        "labels": tokenized_dataset["labels"],
    }
)

dataset = dataset.train_test_split(test_size=0.2, shuffle=False)
training_dataset = dataset['train']
test_dataset = dataset['test']

In [96]:
training_dataset = training_dataset.shuffle()

In [98]:
out_tokens = []
out_labels = []
for i, (input_ids, labels) in enumerate(  # type: ignore
    zip(training_dataset["input_ids"], training_dataset["labels"])  # type: ignore
):
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    for token, label in zip(tokens, labels):
        real_label = id2label.get(label)
        print(f"Token: {token:<20} Label: {real_label}")
        out_tokens.append(token)
        out_labels.append(real_label)

    if i > 50:
        break

Token: [CLS]                Label: None
Token: of                   Label: other
Token: reality              Label: other
Token: orientation          Label: other
Token: and                  Label: other
Token: adaptive             Label: other
Token: behaviour            Label: other
Token: .                    Label: other
Token: basis                Label: other
Token: for                  Label: other
Token: rating               Label: other
Token: -                    Label: other
Token: inter                Label: other
Token: ##personal           Label: None
Token: behaviour            Label: other
Token: observed             Label: other
Token: during               Label: other
Token: the                  Label: other
Token: course               Label: other
Token: of                   Label: other
Token: interview            Label: other
Token: .                    Label: other
Token: 1                    Label: other
Token: absent               Label: other
Token: -          

In [52]:
df = pd.DataFrame({'tokens': out_tokens, 'labels': out_labels})

In [53]:
len(df)

389115

In [54]:
total = len(df['labels'])
out_labels == 'question'

False

In [73]:
print('question: ', 100*sum(df['labels'] == 'question') / total)
print('answer: ', 100*sum(df['labels'] == 'answer') / total)
print('other: ', 100*sum(df['labels'] == 'other') / total)

question:  22.370507433535074
answer:  11.608650398982306
other:  59.90928131786233
