In [1]:
from datasets import load_dataset
import pandas as pd

In [2]:
# NLI class ids and their string names; label2id is the exact inverse of id2label.
id2label = dict(enumerate(('entailment', 'neutral', 'contradiction')))
label2id = {name: class_id for class_id, name in id2label.items()}

# Concatenate IndoNLI and MNLI-translated into the augmented DataFrames

In [3]:
data_basic = load_dataset('indonli')

# The two IndoNLI test splits are kept as separate frames and merged further down.
data_indonli_test_lay_df = pd.DataFrame(data_basic["test_lay"])
data_indonli_test_expert_df = pd.DataFrame(data_basic["test_expert"])

# Build each split and pass its 'label' column through id2label (0/1/2 -> class name).
data_indonli_train_df = pd.DataFrame(data_basic["train"]).assign(
    label=lambda frame: frame['label'].replace(id2label)
)
data_indonli_validation_df = pd.DataFrame(data_basic["validation"]).assign(
    label=lambda frame: frame['label'].replace(id2label)
)
# The combined test frame is "lay" rows followed by "expert" rows (original indices kept).
data_indonli_test_df = pd.concat(
    [data_indonli_test_lay_df, data_indonli_test_expert_df]
).assign(label=lambda frame: frame['label'].replace(id2label))

Reusing dataset indo_nli (/root/.cache/huggingface/datasets/indo_nli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)


  0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Read each JSON-lines split, keep only the sentence pair and gold label,
# rename those columns to premise / hypothesis / label, and drop rows whose
# label is the '-' placeholder.
data_mnli_translated_train_df = (
    pd.read_json(path_or_buf='train.jsonl', lines=True)
    .loc[:, ['sentence1', 'sentence2', 'gold_label']]
    .rename(columns={'sentence1': 'premise',
                     'sentence2': 'hypothesis',
                     'gold_label': 'label'})
    .loc[lambda frame: frame.label != '-']
)

data_mnli_translated_validation_df = (
    pd.read_json(path_or_buf='dev.jsonl', lines=True)
    .loc[:, ['sentence1', 'sentence2', 'gold_label']]
    .rename(columns={'sentence1': 'premise',
                     'sentence2': 'hypothesis',
                     'gold_label': 'label'})
    .loc[lambda frame: frame.label != '-']
)

In [5]:
# Stack IndoNLI on top of MNLI-translated for each split; ignore_index=True
# gives the result a fresh 0..n-1 index.
data_augmented_train_df = pd.concat(
    [data_indonli_train_df, data_mnli_translated_train_df],
    ignore_index=True,
)
data_augmented_validation_df = pd.concat(
    [data_indonli_validation_df, data_mnli_translated_validation_df],
    ignore_index=True,
)
# MNLI-translated does not have test set, so the IndoNLI test frame is reused as-is.
data_augmented_test_df = data_indonli_test_df

# Concatenate the augmented DataFrames with IDK-MRC-NLI_Keep

In [6]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd

# Split name -> CSV file, handed to load_dataset through data_files.
data_files = {
    "train": "data_nli_train_df_keep.csv",
    "validation": "data_nli_val_df_keep.csv",
    "test": "data_nli_test_df_keep.csv",
}

dataset = load_dataset("muhammadravi251001/debug-entailment", data_files=data_files)

selected_columns = ["premise", "hypothesis", "label"]
# selected_columns = dataset.column_names['train'] # Uncomment this line to retrieve all of the columns

# One pandas frame per split, restricted to the selected columns.
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])[selected_columns]
    for split_name in ("train", "validation", "test")
)

# Rebuild a DatasetDict that holds only the selected columns.
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_dict(split_frame) for split_frame in (df_train, df_val, df_test)
)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

# Final IDK-MRC-NLI frames used by the following cells.
df_idk_mrc_nli_train, df_idk_mrc_nli_validation, df_idk_mrc_nli_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

Using custom data configuration muhammadravi251001--debug-entailment-558429883ba3b414
Reusing dataset csv (/root/.cache/huggingface/datasets/muhammadravi251001___csv/muhammadravi251001--debug-entailment-558429883ba3b414/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Append the IDK-MRC-NLI rows to each augmented split and renumber the rows 0..n-1.
data_multilingual_indonesia_train_df = pd.concat(
    [data_augmented_train_df, df_idk_mrc_nli_train],
    ignore_index=True,
)
data_multilingual_indonesia_validation_df = pd.concat(
    [data_augmented_validation_df, df_idk_mrc_nli_validation],
    ignore_index=True,
)
data_multilingual_indonesia_test_df = pd.concat(
    [data_augmented_test_df, df_idk_mrc_nli_test],
    ignore_index=True,
)

In [8]:
# Write each combined split to its own CSV file, without the row index column.
data_multilingual_indonesia_train_df.to_csv(
    path_or_buf="multilingual_nli_train_df.csv", index=False
)
data_multilingual_indonesia_validation_df.to_csv(
    path_or_buf="multilingual_nli_validation_df.csv", index=False
)
data_multilingual_indonesia_test_df.to_csv(
    path_or_buf="multilingual_nli_test_df.csv", index=False
)

In [9]:
from tqdm import tqdm

# Print one line for every training premise that contains 'Douwes Dekker'.
premise_column = data_multilingual_indonesia_train_df['premise']

for premise_text in tqdm(premise_column):
    if 'Douwes Dekker' in premise_text:
        print("Douwes Dekker was here.")

100%|███████████████████████████████████████████████████████████████████████| 413116/413116 [00:01<00:00, 229242.15it/s]

Douwes Dekker was here.
Douwes Dekker was here.
Douwes Dekker was here.
Douwes Dekker was here.
Douwes Dekker was here.
Douwes Dekker was here.





In [10]:
def check_type(val):
    """Return the Python type of ``val``."""
    return type(val)

def check_type_value(data, check_type=check_type):
    """Print how many values of each Python type occur per NLI column.

    For each of the 'premise', 'hypothesis' and 'label' columns, applies
    ``check_type`` to every value and prints the resulting value counts
    under a "Premise" / "Hypothesis" / "Label" heading.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'premise', 'hypothesis' and 'label' columns. It is only
        read; no helper columns are added to it.
    check_type : callable, optional
        Function applied to every value; defaults to the module-level
        ``check_type``, which returns ``type(value)``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, so that ``df = check_type_value(df)``
        keeps the frame instead of rebinding the name to ``None``.

    Raises
    ------
    KeyError
        If one of the three columns is missing from ``data``.
    """
    report_sections = (
        ("Premise", "premise"),
        ("Hypothesis", "hypothesis"),
        ("Label", "label"),
    )

    for heading, column in report_sections:
        # The counts are computed on a temporary Series named '<column>_types'
        # so the printed report keeps that name while ``data`` stays untouched.
        type_counts = (
            data[column].apply(check_type).rename(f"{column}_types").value_counts()
        )
        print(heading)
        print(type_counts)
        print()

    return data

In [11]:
# Print the per-column type report for every split. The call's return value is
# intentionally discarded: the three frame names must keep pointing at their
# DataFrames so that later cells can still use them.
for split_frame in (
    data_multilingual_indonesia_train_df,
    data_multilingual_indonesia_validation_df,
    data_multilingual_indonesia_test_df,
):
    check_type_value(split_frame)

Premise
<class 'str'>    413116
Name: premise_types, dtype: int64

Hypothesis
<class 'str'>    413116
Name: hypothesis_types, dtype: int64

Label
<class 'str'>    413116
Name: label_types, dtype: int64

Premise
<class 'str'>    22608
Name: premise_types, dtype: int64

Hypothesis
<class 'str'>    22608
Name: hypothesis_types, dtype: int64

Label
<class 'str'>    22608
Name: label_types, dtype: int64

Premise
<class 'str'>    6029
Name: premise_types, dtype: int64

Hypothesis
<class 'str'>    6029
Name: hypothesis_types, dtype: int64

Label
<class 'str'>    6029
Name: label_types, dtype: int64



# Check against the existing dataset

In [12]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd

# Split name -> CSV file, handed to load_dataset through data_files.
data_files = {
    "train": "multilingual_nli_train_df.csv",
    "validation": "multilingual_nli_validation_df.csv",
    "test": "multilingual_nli_test_df.csv",
}

dataset = load_dataset("muhammadravi251001/multilingual-nli-dataset", data_files=data_files)

selected_columns = ["premise", "hypothesis", "label"]
# selected_columns = dataset.column_names['train'] # Uncomment this line to retrieve all of the columns

# One pandas frame per split, restricted to the selected columns.
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])[selected_columns]
    for split_name in ("train", "validation", "test")
)

# Rebuild a DatasetDict that holds only the selected columns.
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_dict(split_frame) for split_frame in (df_train, df_val, df_test)
)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

# Frames of the loaded dataset, compared against the locally built ones below.
df_multilingual_train, df_multilingual_validation, df_multilingual_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

Using custom data configuration muhammadravi251001--multilingual-nli-dataset-7737087635c3493e
Reusing dataset csv (/root/.cache/huggingface/datasets/muhammadravi251001___csv/muhammadravi251001--multilingual-nli-dataset-7737087635c3493e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# Number of rows in the augmented (IndoNLI + MNLI-translated) training frame.
data_augmented_train_df.shape[0]

403032

In [14]:
# Number of rows in the IDK-MRC-NLI training frame.
df_idk_mrc_nli_train.shape[0]

10084

In [15]:
# Number of rows in the training frame loaded with load_dataset above.
df_multilingual_train.shape[0]

402989

In [16]:
# Row-count gap between the locally built augmented training frame and the
# loaded multilingual training frame.
data_augmented_train_df.shape[0] - df_multilingual_train.shape[0]

43