<a href="https://colab.research.google.com/github/mypeacefulcode/ml-research/blob/main/Natural_language_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.15.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from trans

In [2]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
%env WANDB_PROJECT=nli

env: WANDB_PROJECT=nli


In [4]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset, load_metric
from transformers import (
      AutoTokenizer, 
      ElectraForSequenceClassification, 
      DataCollatorWithPadding, 
      TrainingArguments, 
      Trainer
    )

In [5]:
!git clone https://github.com/kakaobrain/kor-nlu-datasets.git

Cloning into 'kor-nlu-datasets'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Total 16 (delta 1), reused 1 (delta 1), pack-reused 14[K
Unpacking objects: 100% (16/16), 46.67 MiB | 7.92 MiB/s, done.


In [6]:
!ls ./kor-nlu-datasets/KorNLI/

multinli.train.ko.tsv  snli_1.0_train.ko.tsv  xnli.dev.ko.tsv  xnli.test.ko.tsv


In [7]:
train_df = pd.read_csv("./kor-nlu-datasets/KorNLI/snli_1.0_train.ko.tsv", sep="\t")
train_df = train_df.rename(columns={'gold_label':'label_name'})
test_df = pd.read_csv("./kor-nlu-datasets/KorNLI/xnli.dev.ko.tsv", sep="\t")
test_df = test_df.rename(columns={'gold_label':'label_name'})

In [8]:
train_df.isnull().any()

sentence1     False
sentence2      True
label_name    False
dtype: bool

In [9]:
train_df[train_df['sentence2'].isnull()==True]

Unnamed: 0,sentence1,sentence2,label_name
91479,설명할 그림을 볼 수 없습니다.,,neutral
91480,설명할 그림을 볼 수 없습니다.,,entailment
91481,설명할 그림을 볼 수 없습니다.,,contradiction
311124,보라색 공으로 점프하는 것은 정말 재미있어요!,,contradiction
311125,보라색 공으로 점프하는 것은 정말 재미있어요!,,neutral
311126,보라색 공으로 점프하는 것은 정말 재미있어요!,,entailment


In [10]:
train_df = train_df.drop(index=train_df[train_df['sentence2'].isnull()==True].index)

In [11]:
test_df.isnull().any()

sentence1     False
sentence2     False
label_name     True
dtype: bool

In [12]:
test_df[test_df['label_name'].isnull()==True]

Unnamed: 0,sentence1,sentence2,label_name
898,상처! 스스로 설명해 주실 수 있겠습니까? 휘더가 울버스톤을 떠나 보냈나요?\t너는...,neutral,


In [13]:
test_df = test_df.drop(index=test_df[test_df['label_name'].isnull()==True].index)

In [14]:
labels = train_df['label_name'].unique().tolist()
labels

['neutral', 'contradiction', 'entailment']

In [15]:
id2label = {int(k): v for k, v in enumerate(labels)}
id2label

{0: 'neutral', 1: 'contradiction', 2: 'entailment'}

In [16]:
label2id = {v: k for k, v in id2label.items()}
label2id

{'neutral': 0, 'contradiction': 1, 'entailment': 2}

In [32]:
train_df['label'] = train_df.apply(lambda x: label2id[x['label_name']], axis=1)
test_df['label'] = test_df.apply(lambda x: label2id[x['label_name']], axis=1)

In [33]:
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

In [34]:
train_ds = train_ds.class_encode_column("label")

Stringifying the column:   0%|          | 0/550146 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/550146 [00:00<?, ? examples/s]

In [35]:
train_ds

Dataset({
    features: ['sentence1', 'sentence2', 'label_name', 'label'],
    num_rows: 550146
})

In [36]:
test_ds = test_ds.class_encode_column("label")

Stringifying the column:   0%|          | 0/1570 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1570 [00:00<?, ? examples/s]

In [37]:
test_ds

Dataset({
    features: ['sentence1', 'sentence2', 'label_name', 'label'],
    num_rows: 1570
})

In [24]:
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [25]:
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

Downloading (…)okenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

In [26]:
def preprocess_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True)

In [27]:
tokenized_train_ds = train_ds.map(preprocess_function, batched=True)
tokenized_test_ds = test_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/55014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1570 [00:00<?, ? examples/s]

In [28]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [29]:
metric = load_metric("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [30]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_strategy = "steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [None]:
trainer.train()

In [None]:
wandb.finish()