# Dataset creation process

In [1]:
import os
import pandas as pd

from Source_code.z_utils.data_preparing import split_data
from Source_code.z_utils.data_preprocessing import xml_to_df
from Source_code.z_utils.global_constants import RANDOM_SEED


# Build the dataset: collect the XML files of both corpora, parse them into
# DataFrames, balance case reports against all other text types, split the
# result, and add a combined "title_abstract" column to every split.

datasets = ["human_medical_data/", "veterinary_medical_data/"]
data_path = "./data/"

# One list of file paths per corpus folder, in the same order as `datasets`.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the file order identical across machines and runs; the seeded sampling
# below is only reproducible if its input order is stable.
xml_files = [
    [f"{data_path}{folder}{xml}" for xml in sorted(os.listdir(data_path + folder))]
    for folder in datasets
]

hum_df, vet_df = xml_to_df(xml_files)


def _sample_by_text_type(df, case_reports, n=None):
    """Return a seeded random sample of the rows of one text-type group.

    Args:
        df: DataFrame with a "text_types" column whose values support the
            `in` operator.
        case_reports: if True keep rows whose "text_types" contains
            "Case Reports", otherwise keep all remaining rows.
        n: number of rows to draw; None keeps (and shuffles) every matching row.

    Returns:
        The sampled rows with a fresh 0..k-1 index.

    Raises:
        ValueError: raised by pandas when `n` exceeds the number of matching rows.
    """
    mask = df["text_types"].apply(lambda types: ("Case Reports" in types) == case_reports)
    group = df[mask]
    if n is None:
        sampled = group.sample(frac=1, random_state=RANDOM_SEED)
    else:
        sampled = group.sample(n, random_state=RANDOM_SEED)
    return sampled.reset_index(drop=True)


def _concat_shuffled(*parts):
    """Concatenate the given DataFrames and shuffle the rows with the fixed seed."""
    combined = pd.concat(list(parts))
    return combined.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)


# balance case reports and other text types
# The veterinary case reports set the group size: all of them are kept, and
# the other three groups are down-sampled to that same number of rows.
vet_case_rep = _sample_by_text_type(vet_df, case_reports=True)
max_num = len(vet_case_rep)
vet_jour_art = _sample_by_text_type(vet_df, case_reports=False, n=max_num)
hum_case_rep = _sample_by_text_type(hum_df, case_reports=True, n=max_num)
hum_jour_art = _sample_by_text_type(hum_df, case_reports=False, n=max_num)

# Split each text-type group separately so that every split receives rows
# from both groups.
train_set_case_rep, val_set_case_rep, test_set_case_rep = split_data(hum_case_rep, vet_case_rep)
train_set_jour_art, val_set_jour_art, test_set_jour_art = split_data(hum_jour_art, vet_jour_art)

train_set = _concat_shuffled(train_set_case_rep, train_set_jour_art)
val_set = _concat_shuffled(val_set_case_rep, val_set_jour_art)
test_set = _concat_shuffled(test_set_case_rep, test_set_jour_art)

# Add a single text field made of the title and the abstract, separated by a space.
# NOTE(review): astype(str) turns missing values into literal text such as
# "None"/"nan" -- confirm xml_to_df never yields a missing title or abstract.
for split_df in (train_set, val_set, test_set):
    split_df["title_abstract"] = split_df[["title", "abstract"]].apply(
        lambda row: ' '.join(row.values.astype(str)), axis=1
    )

  4%|███████                                                                                                                                                                                            | 3869/105930 [00:00<00:05, 19459.84it/s]

Processing medical field: human_medicine


 32%|██████████████████████████████████████████████████████████████▍                                                                                                                                   | 34093/105930 [00:03<00:05, 12965.68it/s]

Processing medical field: veterinary_medicine


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105930/105930 [00:10<00:00, 10063.50it/s]


In [2]:
# Report the (rows, columns) shape of every split.
print("Shapes")
print("-------------")
for split_name, split_df in (("Training", train_set), ("Validation", val_set), ("Test", test_set)):
    print(f"{split_name} set: {split_df.shape}")

Shapes
-------------
Training set: (38360, 7)
Validation set: (4796, 7)
Test set: (4796, 7)


In [3]:
# Report the mean of the "labels" column for every split.
print("Class balance")
print("-------------")
for split_name, split_df in (("Training", train_set), ("Validation", val_set), ("Test", test_set)):
    label_mean = split_df.labels.describe().loc['mean']
    print(f"{split_name} set: {label_mean}")

Class balance
-------------
Training set: 0.5
Validation set: 0.5
Test set: 0.5


In [4]:
# Write every split to its own JSON file (one record per row) in the current
# working directory, keeping only the columns listed below, in this order.
export_columns = ["pmid", "text_types", "title", "abstract", "title_abstract", "meshtermlist", "labels"]
for out_file, split_df in (("train.json", train_set), ("valid.json", val_set), ("test.json", test_set)):
    split_df[export_columns].to_json(out_file, orient="records")