In [2]:
import pickle
import numpy as np
import pandas as pd
import plotly.express as px
from numpy.random import RandomState

from src.extraction.jsonl_data_reader import JsonlDataReader

In [3]:
## Utility function to save as pickly file

def save_file(data:dict, filename:str, path:str="/Users/melloo21/Desktop/NUS Items/CS4248/2024_CS4248/Project/cs4248/data"):
    if '.pickle' not in filename:
        filename = f"{filename}.pickle"
    with open(f"{path}/{filename}", 'wb') as handle:
        pickle.dump(data,handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f" saved {filename}")

In [4]:
seed = 7
random_state = RandomState(seed=seed)
np.random.seed(seed)

In [5]:
all_data = { "train_data"  : JsonlDataReader(file_name='train.jsonl').read(),
"dev_data" : JsonlDataReader(file_name='dev.jsonl').read(),
"test_data" : JsonlDataReader(file_name='test.jsonl').read() }

In [None]:
# Do not do anything at all
# Change to numeric token <num_token>
# Remove duplicate 
# Remove citation

In [13]:
from src.preprocessing.simple_preprocessor import SimplePreprocessor

# Choose the type to create dataset
# Returns Document class 
# raw_instances: Collection[DataInstance]
# texts: Collection[str]
# id: Collection[str]
# labels: Collection[Labels]
# label_indices: Collection[LabelIndices] = field(init=False)

# Create remove_citation set
removed_citation = dict()
remove_cite_preprocessor = SimplePreprocessor(remove_citations=True, remove_duplicates=False)

for key, value in all_data.items():
    print(f" Removing citations for {key}")
    proc_doc = remove_cite_preprocessor.preprocess(value)
    removed_citation[key] = {"sentence": proc_doc.texts, "labels":proc_doc.labels} 
    
save_file(removed_citation, filename="removed_citation")

 Removing citations for train_data
 Removing citations for dev_data
 Removing citations for test_data
 saved removed_citation.pickle


In [16]:
# Create removed_duplicates set
removed_duplicates = dict()
remove_duplicates_preprocessor = SimplePreprocessor(remove_citations=False, remove_duplicates=True)

for key, value in all_data.items():
    print(f" Removing duplicates for {key} , shape {len(value.texts)}")
    proc_doc = remove_duplicates_preprocessor.preprocess(value)
    removed_duplicates[key] = {"sentence": proc_doc.texts, "labels":proc_doc.labels} 
    print(len(proc_doc.texts))

save_file(removed_duplicates, filename="removed_duplicates")

 Removing duplicates for train_data , shape 8243
7549
 Removing duplicates for dev_data , shape 916
916
 Removing duplicates for test_data , shape 1861
1861
 saved removed_duplicates.pickle


In [5]:
sorted(all_data["train_data"].texts)

['%) and in the Registro\n Informatizado de la Enfermedad TromboEmboÃÅlica (RIETE) (major bleeding at 3 months: 3.4% vs. VTE recurrence at 3 months: 2.1%).16 Similar results were also observed in the Worcester Venous Thromboembolism Study.11\n The main strength of our study is the multicentric prospective inclusion of unselected VTE patients in a real-world setting that provides a representative picture of the age distribution in patients with acute VTE.',
 '%) fallow deer were positive to A. phagocytophylum DNA, however a work recently conducted by Michalik et al. (2009) on a population of 44 fallow deer living in West–Central Poland using only a PCR approach, showed a prevalence rate (20.5',
 '%) individuals with psoriatic arthritis had CD (the Swedish study used no controls; Lindqvist et al., 2002).',
 '%) was a close relative (at 97% identity) of the facultative denitrifying bacterium Aromatoleum aromaticum (which belongs to the Azoarcus cluster) (Kuhner et al., 2005), and the clos

In [18]:
from src.tokenize.spacy_tokenizer import SpacyTokenizer

tokenizer = SpacyTokenizer(        
        merge_nouns = False,
        merge_entities= False,
        remove_stopwords= False,
        replace_numbers= True,
        lowercase= False,
        lemmatize= False)

# # Returns Tokenizer class
# # tokens: Collection[Collection[str]]
# # id: Collection[str]
# # labels: Collection[Labels]
# tokenized_train = tokenizer.tokenize(train_data)
# tokenized_test = tokenizer.tokenize(test_data)

# Create replace number set
replace_numbers = dict()

for key, value in all_data.items():
    print(f" Replacing numbers for {key}")
    proc_doc = tokenizer.tokenize(value)
    all_sentences = list()
    # Processing to correct format
    for elem in proc_doc.tokens:
        all_sentences.append(" ".join(elem))
    print(proc_doc.labels)
    replace_numbers[key] = {"sentence": all_sentences, "labels":proc_doc.labels} 

save_file(replace_numbers, filename="replace_numbers")

 Replacing numbers for train_data
['background', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'result', 'background', 'method', 'background', 'background', 'result', 'background', 'background', 'background', 'background', 'method', 'method', 'background', 'background', 'background', 'background', 'method', 'result', 'background', 'background', 'result', 'result', 'result', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'background', 'method', 'background', 'method', 'background', 'background', 'background', 'background', 'background', 'background', 'result', 'background', 'background', 'method', 'background', 'background', 'background', 'background', 'method', 'background', 'background', 'result', 'background', 'back

In [8]:
# all_sentences = list()
# for elem in tokenized_train.tokens:
#     all_sentences.append(" ".join(elem))

In [9]:
# all_sentences[0]