## Dependencies

In [56]:
# Add ../lib (the folder holding data.py, which the shell cells of this
# notebook invoke) to the module search path of this kernel.
import sys

sys.path.append('../lib')

## Prepare training and test datasets

In [9]:
# DRKG data
# Copy the formatted and unformatted DRKG relation files from ../graph_data
# into ./rawdata/drkg, which is where the later cells read them from.
!cp ../graph_data/formatted_relations/drkg/formatted_drkg.tsv ./rawdata/drkg/formatted_drkg.tsv
!cp ../graph_data/formatted_relations/drkg/unformatted_drkg.tsv ./rawdata/drkg/unformatted_drkg.tsv

We need to prepare the training and external test datasets. For every KGE model, the training dataset is used to train the model and the test dataset is used to evaluate it. However, the DRKG dataset has many relations in `unformatted_drkg.tsv` that cannot be formatted, so we use the relations in `formatted_drkg.tsv` to prepare the training and external test datasets.

After that, we split `formatted_drkg.tsv` into training and test datasets. The training dataset is named `formatted_drkg_train.tsv` and the test dataset is named `formatted_drkg_test.tsv`.

In [14]:
# Split formatted_drkg.tsv into a training file (-o1) and an external test
# file (-o2).
# NOTE(review): -r 0.95 is presumably the fraction of rows sent to -o1 --
# confirm against ../lib/data.py.
!python ../lib/data.py split -i ./rawdata/drkg/formatted_drkg.tsv -o1 ./rawdata/drkg/formatted_drkg_train.tsv -o2 ./rawdata/drkg/formatted_drkg_test.tsv -r 0.95

In [15]:
# HSDN data
# Copy the formatted HSDN relations from ../graph_data into ./rawdata/optional.
!cp ../graph_data/formatted_relations/hsdn/formatted_hsdn.tsv ./rawdata/optional/formatted_hsdn.tsv

## Prepare Raw DRKG dataset

In [42]:
# Copy the raw DRKG relations into ./rawdata/drkg; this copy is the -i1 input
# of the `substract` step further down.
!cp ../graph_data/formatted_relations/drkg/raw_drkg.tsv ./rawdata/drkg/raw_drkg.tsv

In [45]:
# Extract raw records from formatted_drkg_test.tsv
#
# The raw_* identifier/type columns of the external test split are kept and
# written out under their un-prefixed names (source_id, target_id, ...); these
# are the column names the `substract` step reports comparing on.
import os

import pandas as pd

# Input column -> output column. The dict order also fixes the column order
# of the written file.
raw_column_names = {
    "raw_source_id": "source_id",
    "raw_target_id": "target_id",
    "raw_source_type": "source_type",
    "raw_target_type": "target_type",
    "relation_type": "relation_type",
    "resource": "resource",
    "pmids": "pmids",
    "key_sentence": "key_sentence",
}
selected_columns = list(raw_column_names)

df = pd.read_csv("./rawdata/drkg/formatted_drkg_test.tsv", sep="\t")
# Rename by explicit mapping rather than by position so that a change to the
# selection can never silently mislabel a column.
df = df[selected_columns].rename(columns=raw_column_names)

# to_csv does not create missing parent directories, so make sure the output
# folder exists (it may be absent on a fresh checkout).
os.makedirs("./raw_drkg", exist_ok=True)
df.to_csv("./raw_drkg/relations_test.tsv", sep="\t", index=False)

In [46]:
!python ../lib/data.py substract -i1 ./rawdata/drkg/raw_drkg.tsv -i2 ./raw_drkg/test.tsv -o ./raw_drkg/relations.tsv

Substracting the two dataframes with the following columns:  ['source_id', 'target_id', 'source_type', 'target_type', 'relation_type', 'resource', 'pmids', 'key_sentence']


In [47]:
# Convert ./raw_drkg/relations.tsv with data.py's `hrt` subcommand.
# NOTE(review): `hrt` presumably emits head/relation/tail triples -- confirm
# against ../lib/data.py.
!python ../lib/data.py hrt -i ./raw_drkg/relations.tsv -o ./raw_drkg/relations_hrt.tsv

In [48]:
# Split the converted relations into train.tsv (-o1) and valid.tsv (-o2).
# NOTE(review): -r 0.95 is presumably the fraction of rows sent to -o1 --
# confirm against ../lib/data.py.
!python ../lib/data.py split -i ./raw_drkg/relations_hrt.tsv -o1 ./raw_drkg/train.tsv -o2 ./raw_drkg/valid.tsv -r 0.95

In [49]:
# Convert the raw-format test relations with the same `hrt` subcommand used
# for the training relations, producing ./raw_drkg/test.tsv.
!python ../lib/data.py hrt -i ./raw_drkg/relations_test.tsv -o ./raw_drkg/test.tsv

## Prepare DRKG dataset

Merge `formatted_drkg_train.tsv` and `unformatted_drkg.tsv` to get `train.tsv` and `valid.tsv`, which are used to train the KGE models.

In [16]:
# Folder holding the DRKG input files read by the next cell.
datadir = './rawdata/drkg'

In [17]:
# Merge the formatted DRKG training split with the unformatted DRKG relations
# and save the combined table as drkg/relations.tsv.
import os
import pandas as pd

# Columns kept from every input file, in the order they are written out.
selected_columns = [
    "relation_type",
    "source_type",
    "source_id",
    "target_type",
    "target_id",
    "resource",
]

formatted_drkg_data = pd.read_csv(
    os.path.join(datadir, "formatted_drkg_train.tsv"), sep="\t"
)
formatted_drkg_data = formatted_drkg_data[selected_columns]
print("Formatted DRKG data shape: ", formatted_drkg_data.shape)

unformatted_drkg_data = pd.read_csv(
    os.path.join(datadir, "unformatted_drkg.tsv"), sep="\t"
)
unformatted_drkg_data = unformatted_drkg_data[selected_columns]
print("Unformatted DRKG data shape: ", unformatted_drkg_data.shape)

# ignore_index gives the merged frame a fresh 0..n-1 index instead of the
# repeated per-file row labels; the saved file is unaffected (index=False).
relations = pd.concat(
    [
        formatted_drkg_data,
        unformatted_drkg_data,
    ],
    ignore_index=True,
)

# Save the merged data
# to_csv does not create missing parent directories, so make sure the output
# folder exists (it may be absent on a fresh checkout).
output_dir = "drkg"
os.makedirs(output_dir, exist_ok=True)
relations.to_csv(os.path.join(output_dir, "relations.tsv"), sep="\t", index=False)

Formatted DRKG data shape:  (5394641, 6)
Unformatted DRKG data shape:  (194412, 6)


In [18]:
# Convert the merged relations with data.py's `hrt` subcommand.
# NOTE(review): `hrt` presumably emits head/relation/tail triples -- confirm
# against ../lib/data.py.
!python ../lib/data.py hrt -i ./drkg/relations.tsv -o ./drkg/relations_hrt.tsv

In [20]:
# Split the converted relations into train.tsv (-o1) and valid.tsv (-o2).
# NOTE(review): -r 0.95 is presumably the fraction of rows sent to -o1 --
# confirm against ../lib/data.py.
!python ../lib/data.py split -i ./drkg/relations_hrt.tsv -o1 ./drkg/train.tsv -o2 ./drkg/valid.tsv -r 0.95

In [21]:
# Convert the external test split (formatted_drkg_test.tsv) with the same
# `hrt` subcommand, producing ./drkg/test.tsv.
!python ../lib/data.py hrt -i ./rawdata/drkg/formatted_drkg_test.tsv -o ./drkg/test.tsv

## Prepare DRKG + HSDN dataset

Merge `formatted_drkg_train.tsv`, `unformatted_drkg.tsv` and `graph_data/formatted_relations/hsdn/formatted_hsdn.tsv` to get `train.tsv` and `valid.tsv`, which are used to train the KGE models.

In [30]:
# Folder holding the DRKG input files read by the next cell.
datadir = "./rawdata/drkg"

In [32]:
# Merge the formatted DRKG training split, the unformatted DRKG relations and
# the formatted HSDN relations, and save the combined table as
# drkg+hsdn/relations.tsv.
import os
import pandas as pd

# Columns kept from every input file, in the order they are written out.
selected_columns = [
    "relation_type",
    "source_type",
    "source_id",
    "target_type",
    "target_id",
    "resource",
]

formatted_drkg_data = pd.read_csv(
    os.path.join(datadir, "formatted_drkg_train.tsv"), sep="\t"
)
formatted_drkg_data = formatted_drkg_data[selected_columns]
print("Formatted DRKG data shape: ", formatted_drkg_data.shape)

unformatted_drkg_data = pd.read_csv(
    os.path.join(datadir, "unformatted_drkg.tsv"), sep="\t"
)
unformatted_drkg_data = unformatted_drkg_data[selected_columns]
print("Unformatted DRKG data shape: ", unformatted_drkg_data.shape)

hsdn_data = pd.read_csv(
    os.path.join(
        "../graph_data/formatted_relations/hsdn",
        "formatted_hsdn.tsv",
    ),
    sep="\t",
)
hsdn_data = hsdn_data[selected_columns]
print("HSDN data shape: ", hsdn_data.shape)

# ignore_index gives the merged frame a fresh 0..n-1 index instead of the
# repeated per-file row labels; the saved file is unaffected (index=False).
relations = pd.concat(
    [
        formatted_drkg_data,
        unformatted_drkg_data,
        hsdn_data,
    ],
    ignore_index=True,
)

# Save the merged data
# to_csv does not create missing parent directories, so make sure the output
# folder exists (it may be absent on a fresh checkout).
output_dir = "drkg+hsdn"
os.makedirs(output_dir, exist_ok=True)
relations.to_csv(os.path.join(output_dir, "relations.tsv"), sep="\t", index=False)

Formatted DRKG data shape:  (5394641, 6)
Unformatted DRKG data shape:  (194412, 6)
HSDN data shape:  (130857, 6)


In [33]:
# Convert the merged relations with data.py's `hrt` subcommand.
# NOTE(review): `hrt` presumably emits head/relation/tail triples -- confirm
# against ../lib/data.py.
!python ../lib/data.py hrt -i ./drkg+hsdn/relations.tsv -o ./drkg+hsdn/relations_hrt.tsv

In [34]:
# Split the converted relations into train.tsv (-o1) and valid.tsv (-o2).
# NOTE(review): -r 0.95 is presumably the fraction of rows sent to -o1 --
# confirm against ../lib/data.py.
!python ../lib/data.py split -i ./drkg+hsdn/relations_hrt.tsv -o1 ./drkg+hsdn/train.tsv -o2 ./drkg+hsdn/valid.tsv -r 0.95

In [35]:
# Convert the external test split (formatted_drkg_test.tsv) with the same
# `hrt` subcommand, producing ./drkg+hsdn/test.tsv.
!python ../lib/data.py hrt -i ./rawdata/drkg/formatted_drkg_test.tsv -o ./drkg+hsdn/test.tsv

## Prepare DRKG + HSDN + Custom ME/CFS + Malacards dataset

Merge `formatted_drkg_train.tsv`, `unformatted_drkg.tsv`, `datasets/rawdata/optional/formatted_custom_mecfs.tsv`, `datasets/rawdata/optional/formatted_malacards.tsv` and `graph_data/formatted_relations/hsdn/formatted_hsdn.tsv` to get `train.tsv` and `valid.tsv`, which are used to train the KGE models.

In [57]:
# Folder holding the DRKG input files read by the next cell.
datadir = "./rawdata/drkg"

In [58]:
# Merge the formatted DRKG training split, the unformatted DRKG relations, the
# formatted HSDN relations, the custom ME/CFS relations and the Malacards
# relations, and save the combined table as
# drkg+hsdn+custom+malacards/relations.tsv.
import os
import pandas as pd

# Columns kept from every input file, in the order they are written out.
selected_columns = [
    "relation_type",
    "source_type",
    "source_id",
    "target_type",
    "target_id",
    "resource",
]

formatted_drkg_data = pd.read_csv(
    os.path.join(datadir, "formatted_drkg_train.tsv"), sep="\t"
)
formatted_drkg_data = formatted_drkg_data[selected_columns]
print("Formatted DRKG data shape: ", formatted_drkg_data.shape)

unformatted_drkg_data = pd.read_csv(
    os.path.join(datadir, "unformatted_drkg.tsv"), sep="\t"
)
unformatted_drkg_data = unformatted_drkg_data[selected_columns]
print("Unformatted DRKG data shape: ", unformatted_drkg_data.shape)

hsdn_data = pd.read_csv(
    os.path.join(
        "../graph_data/formatted_relations/hsdn",
        "formatted_hsdn.tsv",
    ),
    sep="\t",
)
hsdn_data = hsdn_data[selected_columns]
print("HSDN data shape: ", hsdn_data.shape)

custom_mecfs_data = pd.read_csv(
    os.path.join(
        "rawdata/optional",
        "formatted_custom_mecfs.tsv",
    ),
    sep="\t",
)
custom_mecfs_data = custom_mecfs_data[selected_columns]
print("Custom MECFS data shape: ", custom_mecfs_data.shape)

malacards_data = pd.read_csv(
    os.path.join(
        "rawdata/optional",
        "formatted_malacards.tsv",
    ),
    sep="\t",
)
malacards_data = malacards_data[selected_columns]
print("Malacards data shape: ", malacards_data.shape)

# ignore_index gives the merged frame a fresh 0..n-1 index instead of the
# repeated per-file row labels; the saved file is unaffected (index=False).
relations = pd.concat(
    [
        formatted_drkg_data,
        unformatted_drkg_data,
        hsdn_data,
        custom_mecfs_data,
        malacards_data,
    ],
    ignore_index=True,
)

# Save the merged data
# to_csv does not create missing parent directories, so make sure the output
# folder exists (it may be absent on a fresh checkout).
output_dir = "drkg+hsdn+custom+malacards"
os.makedirs(output_dir, exist_ok=True)
relations.to_csv(
    os.path.join(output_dir, "relations.tsv"), sep="\t", index=False
)

Formatted DRKG data shape:  (5394641, 6)
Unformatted DRKG data shape:  (194412, 6)
HSDN data shape:  (130857, 6)
Custom MECFS data shape:  (602, 6)
Malacards data shape:  (201, 6)


In [53]:
# Convert the merged relations with data.py's `hrt` subcommand.
# NOTE(review): `hrt` presumably emits head/relation/tail triples -- confirm
# against ../lib/data.py.
!python ../lib/data.py hrt -i ./drkg+hsdn+custom+malacards/relations.tsv -o ./drkg+hsdn+custom+malacards/relations_hrt.tsv

In [54]:
# Split the converted relations into train.tsv (-o1) and valid.tsv (-o2).
# NOTE(review): -r 0.95 is presumably the fraction of rows sent to -o1 --
# confirm against ../lib/data.py.
!python ../lib/data.py split -i ./drkg+hsdn+custom+malacards/relations_hrt.tsv -o1 ./drkg+hsdn+custom+malacards/train.tsv -o2 ./drkg+hsdn+custom+malacards/valid.tsv -r 0.95

In [55]:
# Convert the external test split (formatted_drkg_test.tsv) with the same
# `hrt` subcommand, producing ./drkg+hsdn+custom+malacards/test.tsv.
!python ../lib/data.py hrt -i ./rawdata/drkg/formatted_drkg_test.tsv -o ./drkg+hsdn+custom+malacards/test.tsv