In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import spacy

In [3]:
def load_dataset(file_path):
    sentences = []
    labels = []
    
    with open(file_path, 'r') as f:
        lines = f.readlines()
        sentence = []
        
        for line in lines:
            line = line.strip() # remove leading/trailing whitespaces
            if line:
                if not line.startswith('ARG') and not line.startswith('REL') and not line.startswith('NONE'):
                    sentence = line
                else:
                    current_label = line
                    sentences.append(sentence)
                    labels.append(current_label)
                    
    return sentences, labels

In [4]:
# load the dataset into a pandas dataframe
sentences, labels = load_dataset('./Dataset/original_cleaned')
df = pd.DataFrame({
    'Sentence': sentences,
    'Labels': labels
})

In [5]:
df.shape

(166889, 2)

In [6]:
df

Unnamed: 0,Sentence,Labels
0,Simon is quoted as saying `` if you 'd ever se...,ARG1 REL REL ARG2 ARG2 ARG2 ARG2 ARG2 ARG2 ARG...
1,Simon is quoted as saying `` if you 'd ever se...,NONE NONE NONE NONE NONE NONE NONE NONE NONE N...
2,Simon is quoted as saying `` if you 'd ever se...,NONE NONE NONE NONE NONE NONE NONE ARG1 REL TI...
3,Simon is quoted as saying `` if you 'd ever se...,ARG1 NONE NONE REL REL NONE NONE NONE NONE NON...
4,The couple had no children .,ARG1 ARG1 REL ARG2 ARG2 NONE
...,...,...
166884,TIME ARG1 REL NONE NONE ARG2 ARG2 ARG2 ARG2 AR...,NONE NONE NONE NONE NONE ARG1 REL ARG2 ARG2 AR...
166885,TIME ARG1 REL NONE NONE ARG2 ARG2 ARG2 ARG2 AR...,NONE NONE NONE NONE NONE NONE NONE NONE NONE N...
166886,This was the time when Yang Luchan made the Ch...,ARG1 REL ARG2 ARG2 ARG2 ARG2 ARG2 ARG2 ARG2 AR...
166887,This was the time when Yang Luchan made the Ch...,NONE NONE TIME TIME NONE ARG1 ARG1 REL ARG2 AR...


In [7]:
# convert all sentences to lower case
df['Sentence'] = df['Sentence'].str.lower()

In [8]:
# function to tokenize and compare the lengths
nlp = spacy.load('en_core_web_sm')

def check_token_label_length(row):
    # tokenize each sentence
    doc = nlp(row['Sentence'])
    tokens = [token.text for token in doc]
    labels = row['Labels'].split()
    
    return len(tokens) == len(labels), len(tokens), len(labels)

In [None]:
# tokenization
df['Token_Label_Match'] = df.apply(check_token_label_length, axis=1)

In [None]:
df['Token_Label_Match']

In [None]:
# splitting the dataset
