## Preprocess Datasets into one Format

1. Sentence
2. Entity_1
3. Entity_2
4. Relation

In [1]:
import random
random.seed(0)

import os
import json
import pandas as pd

## 1. Core Data

In [2]:
# Load the core-data training split, then show its shape and first rows.
df_train = pd.read_json("./../datasets/core_data/train.json")
print(df_train.shape)
df_train.head()

(4000, 10)


Unnamed: 0,id,relation,invert_relation,e1_start,e1_end,e2_start,e2_end,e1_name,e2_name,context
0,E8026501,shareholder_of,1,0,0,15,17,Badgeville,Norwest Venture Partners,"[Badgeville, subsequently, raised, a, $, 12M, ..."
1,E8034594,competitor_of,0,0,0,3,3,Bioverativ Inc.,Baxalta,"[Bioverativ, competes, with, Baxalta, (, acqui..."
2,E8029931,client_of,1,0,0,12,14,Baton Rouge Southern Railroad,Kansas City Southern,"[It, also, serves, as, a, switching, and, car,..."
3,E8202016,shareholder_of,1,3,4,36,38,PEG Africa Ltd.,Blue Haven Initiative,"[In, 2017, ,, the, company, raised, a, further..."
4,E8110590,product_or_service_of,1,10,14,34,35,"Lewis Galoob Toys, Inc.",Micro Machines,"[Products, Toys, ,, video, games, ,, consumer,..."


In [7]:
def reformat_core_data(df):
    """Convert a core-data frame to the shared sentence/entity/relation layout.

    The token list in ``context`` is joined with single spaces (and stripped)
    into a new ``sentence`` column, ``e1_name``/``e2_name`` become
    ``entity_1``/``entity_2``, and the two entities are swapped on every row
    whose ``invert_relation`` equals 1. The helper columns ``context``,
    ``invert_relation``, ``e1_start``, ``e1_end``, ``e2_start`` and ``e2_end``
    are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``context``, ``invert_relation``,
        ``e1_name``, ``e2_name``, ``e1_start``, ``e1_end``, ``e2_start`` and
        ``e2_end``. Any other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``sentence`` appended as the last column.
        ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # rename() returns a new frame, so the caller's object is never touched.
    out = df.rename(columns={"e1_name": "entity_1", "e2_name": "entity_2"})

    sentence = out["context"].apply(lambda tokens: " ".join(tokens).strip())

    # Where invert_relation is 1 the two entities trade places. Both
    # replacement columns are computed from the pre-swap values before
    # assign() stores either of them, so the swap cannot read its own output.
    swap = out["invert_relation"] == 1
    out = out.assign(
        sentence=sentence,
        entity_1=out["entity_1"].where(~swap, out["entity_2"]),
        entity_2=out["entity_2"].where(~swap, out["entity_1"]),
    )

    # Drop the columns that are not part of the common format.
    return out.drop(
        columns=[
            "context",
            "invert_relation",
            "e1_start",
            "e1_end",
            "e2_start",
            "e2_end",
        ]
    )

In [8]:
# Rebinds df_train to the reformatted frame. Re-running this cell without
# reloading the raw data raises KeyError, because the raw columns the function
# reads (e.g. "context") are no longer present after the first run.
df_train = reformat_core_data(df_train)
df_train.head()

Unnamed: 0,id,relation,entity_1,entity_2,sentence
0,E8026501,shareholder_of,Norwest Venture Partners,Badgeville,Badgeville subsequently raised a $ 12M Series ...
1,E8034594,competitor_of,Bioverativ Inc.,Baxalta,Bioverativ competes with Baxalta ( acquired by...
2,E8029931,client_of,Kansas City Southern,Baton Rouge Southern Railroad,It also serves as a switching and car storage ...
3,E8202016,shareholder_of,Blue Haven Initiative,PEG Africa Ltd.,"In 2017 , the company raised a further $ 13.5 ..."
4,E8110590,product_or_service_of,Micro Machines,"Lewis Galoob Toys, Inc.","Products Toys , video games , consumer electro..."


In [9]:
# Load the core-data test split and reformat it in a single step, then
# show its shape and first rows.
df_test = reformat_core_data(pd.read_json("./../datasets/core_data/test.json"))
print(df_test.shape)
df_test.head()

(708, 5)


Unnamed: 0,id,relation,entity_1,entity_2,sentence
0,E8294494,undefined,"Winnie, Inc.",Android,The Winnie iPhone app was launched in June 201...
1,E8169451,product_or_service_of,Duplex,Marker International,Hannes Marker felt he could improve on the con...
2,E8256769,shareholder_of,Lion,Stone & Wood Brewing Co.,"In 2012 Cook , Rogers and Jurisich bought back..."
3,E8053281,collaboration,Bell Canada,"Cellport Systems, Inc.",In 1994 a consortium of six cellular carriers ...
4,E8114162,subsidiary_of,GETCO,Global Electronic Trading Company,The firm 's primary business is electronic mar...


In [10]:
# Write both reformatted core-data splits as CSV; index=False keeps the
# DataFrame row index out of the files.
df_train.to_csv("./../datasets/core_data/final_train.csv", index=False)
df_test.to_csv("./../datasets/core_data/final_test.csv", index=False)

## 2. SemEval Data

In [21]:
# Load the SemEval splits. This rebinds df_train / df_test, so the frames
# loaded earlier under those names are no longer reachable from here on.
df_train = pd.read_csv("./../datasets/semeval_2008_task8/train.csv")
df_test = pd.read_csv("./../datasets/semeval_2008_task8/test.csv")

In [22]:
# Preview the SemEval training columns as loaded from CSV.
df_train.head()

Unnamed: 0,sentence,relation,label,entity1,entity2,sentence_without_tags,split
0,The <e1>team</e1> stapled the plastic along th...,"Instrument-Agency (e2, e1)",11,team,guns,The team stapled the plastic along the joists ...,train
1,The <e1>barbels</e1> of the exposed <e2>catfis...,"Component-Whole (e1, e2)",2,barbels,catfish,The barbels of the exposed catfish curled with...,train
2,The confrontation between Martyn and Liz earli...,"Cause-Effect (e1, e2)",0,struggle,drowning,The confrontation between Martyn and Liz earli...,train
3,The <e1>batteries</e1> are kept in a <e2>phone...,"Component-Whole (e1, e2)",2,batteries,phone,The batteries are kept in a phone.,train
4,The first two <e1>papers</e1> define the conte...,"Message-Topic (e1, e2)",14,papers,threat,The first two papers define the contemporary t...,train


In [23]:
def reformat_semeval_data(df):
    """Convert a SemEval frame to the shared sentence/entity/relation layout.

    The ``sentence``, ``label`` and ``split`` columns are dropped, after which
    ``sentence_without_tags`` takes over the name ``sentence`` and
    ``entity1``/``entity2`` become ``entity_1``/``entity_2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sentence``, ``label`` and ``split``.
        ``entity1``, ``entity2`` and ``sentence_without_tags`` are renamed
        when present. Any other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``sentence``, ``label`` or ``split`` is missing.
    """
    # The original "sentence" column must be dropped before the rename,
    # otherwise the frame would end up with two columns called "sentence".
    without_extras = df.drop(columns=["sentence", "label", "split"])
    return without_extras.rename(
        columns={
            "entity1": "entity_1",
            "entity2": "entity_2",
            "sentence_without_tags": "sentence",
        }
    )
# Rebind both splits to their reformatted versions. Running these two lines a
# second time without reloading raises KeyError: the columns the function
# drops ("label", "split") are already gone.
df_train = reformat_semeval_data(df_train)
df_test = reformat_semeval_data(df_test)


In [24]:
# Preview the SemEval training frame after reformatting.
df_train.head()

Unnamed: 0,relation,entity_1,entity_2,sentence
0,"Instrument-Agency (e2, e1)",team,guns,The team stapled the plastic along the joists ...
1,"Component-Whole (e1, e2)",barbels,catfish,The barbels of the exposed catfish curled with...
2,"Cause-Effect (e1, e2)",struggle,drowning,The confrontation between Martyn and Liz earli...
3,"Component-Whole (e1, e2)",batteries,phone,The batteries are kept in a phone.
4,"Message-Topic (e1, e2)",papers,threat,The first two papers define the contemporary t...


In [25]:
# Write both reformatted SemEval splits as CSV; index=False keeps the
# DataFrame row index out of the files.
df_train.to_csv("./../datasets/semeval_2008_task8/final_train.csv", index=False)
df_test.to_csv("./../datasets/semeval_2008_task8/final_test.csv", index=False)

## 3. Refind Data

In [3]:
# Load the Refind splits (rebinding df_train / df_test once more), then show
# both shapes and the first training rows.
df_train = pd.read_csv("./../datasets/refind_data/train.csv")
df_test = pd.read_csv("./../datasets/refind_data/test.csv")
print(df_train.shape, df_test.shape)
df_train.head()

(20070, 13) (4300, 13)


Unnamed: 0,id,docid,sentence,head_entity_text,tail_entity_text,head_entity_char_idxs,tail_entity_char_idxs,relation,label,relation_group,e1_type,e2_type,sentence_with_entity_tags
0,,2016/2017,warrants to purchase Lumos Networks Corp. comm...,Lumos Networks Corp.,the Pamplona Entities,[3 6],[ 9 12],no_relation,0,ORG-ORG,ORG,ORG,warrants to purchase <e1> Lumos Networks Corp....
1,,2016/2017,warrants to purchase Lumos Networks Corp. comm...,the Pamplona Entities,Lumos Networks Corp.,[ 9 12],[3 6],no_relation,0,ORG-ORG,ORG,ORG,warrants to purchase <e2> Lumos Networks Corp....
2,BERTPretrain_10KReports/2017/QTR2/20170518_10-...,2016/2017,turn over to Global Gold at its offices in Rye...,Global Gold,Stockhouse,[115 117],[95 96],no_relation,0,ORG-ORG,ORG,ORG,turn over to Global Gold at its offices in Rye...
3,,2016/2017,ts Eighteen of FelCor Lodging LP Consolidated ...,FelCor Lodging LP,Embassy Suites,[21 24],[15 17],no_relation,0,ORG-ORG,ORG,ORG,ts Eighteen of FelCor Lodging LP Consolidated ...
4,,2016/2017,the WFOE will waive and release you unconditio...,"YOU ON DEMAND HOLDINGS , INC . s",WFOE,[21 29],[1 2],no_relation,0,ORG-ORG,ORG,ORG,the <e2> WFOE </e2> will waive and release you...


In [4]:
def reformat_refind_data(df):
    """Convert a Refind frame to the shared sentence/entity/relation layout.

    Drops the identifier, index, label, grouping, entity-type and tagged
    sentence columns, then renames ``head_entity_text``/``tail_entity_text``
    to ``entity_1``/``entity_2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``docid``,
        ``head_entity_char_idxs``, ``tail_entity_char_idxs``, ``label``,
        ``relation_group``, ``e1_type``, ``e2_type`` and
        ``sentence_with_entity_tags``. ``head_entity_text`` and
        ``tail_entity_text`` are renamed when present. Any other column is
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing.
    """
    unused_columns = [
        "id", "docid",
        "head_entity_char_idxs", "tail_entity_char_idxs",
        "label", "relation_group",
        "e1_type", "e2_type", "sentence_with_entity_tags",
    ]
    # drop() and rename() each return a new frame, so the caller's object
    # is left untouched.
    return df.drop(columns=unused_columns).rename(
        columns={
            "head_entity_text": "entity_1",
            "tail_entity_text": "entity_2",
        }
    )

In [5]:
# Rebind both splits to their reformatted versions and preview the result.
# Running this cell a second time without reloading raises KeyError: the
# columns the function drops (e.g. "id") are already gone.
df_train = reformat_refind_data(df_train)
df_test = reformat_refind_data(df_test)
df_train.head()

Unnamed: 0,sentence,entity_1,entity_2,relation
0,warrants to purchase Lumos Networks Corp. comm...,Lumos Networks Corp.,the Pamplona Entities,no_relation
1,warrants to purchase Lumos Networks Corp. comm...,the Pamplona Entities,Lumos Networks Corp.,no_relation
2,turn over to Global Gold at its offices in Rye...,Global Gold,Stockhouse,no_relation
3,ts Eighteen of FelCor Lodging LP Consolidated ...,FelCor Lodging LP,Embassy Suites,no_relation
4,the WFOE will waive and release you unconditio...,"YOU ON DEMAND HOLDINGS , INC . s",WFOE,no_relation


In [6]:
# Write both reformatted Refind splits as CSV; index=False keeps the
# DataFrame row index out of the files.
df_train.to_csv("./../datasets/refind_data/final_train.csv", index=False)
df_test.to_csv("./../datasets/refind_data/final_test.csv", index=False)