In [1]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 11.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting

In [2]:
import json
import os
import nltk
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import collections

import datasets
from datasets import load_dataset
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

import tensorflow as tf

from transformers import BertTokenizer, BertForSequenceClassification
import torch

In [3]:
DATA_DIR = '/content/drive/MyDrive/codeml'

In [4]:
df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [5]:
value_map = {label: i  for i, label in enumerate(df.label.unique())}
value_map_inv = {i: label  for i, label in enumerate(df.label.unique())}

df.label = df.label.apply(lambda label: value_map.get(label))

In [6]:
train_df, val_df = train_test_split(df, test_size=.1)

dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(train_df),
    'validation': datasets.Dataset.from_pandas(val_df),
})

In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return tokenizer(examples['claim'], examples['evidence'], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    'bert_trainer',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=5e-5,
    weight_decay=0.01,
)

In [10]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: claim, claim_id, __index_level_0__, evidence.
***** Running training *****
  Num examples = 9000
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5630


Step,Training Loss
500,0.416
1000,0.3245
1500,0.248
2000,0.2102
2500,0.177
3000,0.1357
3500,0.0937
4000,0.0608


Saving model checkpoint to bert_trainer/checkpoint-500
Configuration saved in bert_trainer/checkpoint-500/config.json
Model weights saved in bert_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bert_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert_trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to bert_trainer/checkpoint-1000
Configuration saved in bert_trainer/checkpoint-1000/config.json
Model weights saved in bert_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in bert_trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bert_trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to bert_trainer/checkpoint-1500
Configuration saved in bert_trainer/checkpoint-1500/config.json
Model weights saved in bert_trainer/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in bert_trainer/checkpoint-1500/tokenizer_config.json
Special tokens file saved

Step,Training Loss
500,0.416
1000,0.3245
1500,0.248
2000,0.2102
2500,0.177
3000,0.1357
3500,0.0937
4000,0.0608
4500,0.0217
5000,0.01


Saving model checkpoint to bert_trainer/checkpoint-4500
Configuration saved in bert_trainer/checkpoint-4500/config.json
Model weights saved in bert_trainer/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in bert_trainer/checkpoint-4500/tokenizer_config.json
Special tokens file saved in bert_trainer/checkpoint-4500/special_tokens_map.json
Saving model checkpoint to bert_trainer/checkpoint-5000
Configuration saved in bert_trainer/checkpoint-5000/config.json
Model weights saved in bert_trainer/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in bert_trainer/checkpoint-5000/tokenizer_config.json
Special tokens file saved in bert_trainer/checkpoint-5000/special_tokens_map.json
Saving model checkpoint to bert_trainer/checkpoint-5500
Configuration saved in bert_trainer/checkpoint-5500/config.json
Model weights saved in bert_trainer/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in bert_trainer/checkpoint-5500/tokenizer_config.json
Special tokens file 

TrainOutput(global_step=5630, training_loss=0.15132589685345205, metrics={'train_runtime': 2904.5441, 'train_samples_per_second': 30.986, 'train_steps_per_second': 1.938, 'total_flos': 1.3536112332698928e+16, 'train_loss': 0.15132589685345205, 'epoch': 10.0})

In [None]:
!zip -r /content/saved_model.zip /content/bert_trainer/checkpoint-2500

  adding: content/bert_trainer/checkpoint-2500/ (stored 0%)
  adding: content/bert_trainer/checkpoint-2500/vocab.txt (deflated 49%)
  adding: content/bert_trainer/checkpoint-2500/special_tokens_map.json (deflated 40%)
  adding: content/bert_trainer/checkpoint-2500/training_args.bin (deflated 48%)
  adding: content/bert_trainer/checkpoint-2500/scheduler.pt (deflated 49%)
  adding: content/bert_trainer/checkpoint-2500/trainer_state.json (deflated 61%)
  adding: content/bert_trainer/checkpoint-2500/pytorch_model.bin (deflated 7%)
  adding: content/bert_trainer/checkpoint-2500/rng_state.pth (deflated 27%)
  adding: content/bert_trainer/checkpoint-2500/optimizer.pt (deflated 11%)
  adding: content/bert_trainer/checkpoint-2500/config.json (deflated 51%)
  adding: content/bert_trainer/checkpoint-2500/tokenizer_config.json (deflated 39%)
  adding: content/bert_trainer/checkpoint-2500/tokenizer.json (deflated 57%)


In [None]:
from google.colab import files
files.download("/content/saved_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Validation

In [11]:
val_predictions = trainer.predict(tokenized_dataset['validation'])

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: claim, claim_id, __index_level_0__, evidence.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


In [26]:
val_y_pred = val_predictions.predictions.argmax(axis=1)

In [24]:
f1_score(dataset['validation']['label'], val_y_pred, average='micro')

0.878

In [14]:
print(classification_report(dataset['validation']['label'], y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       543
           1       0.90      0.87      0.89       427
           2       0.06      0.07      0.06        30

    accuracy                           0.88      1000
   macro avg       0.62      0.62      0.62      1000
weighted avg       0.88      0.88      0.88      1000



# Predictions on Test set

In [None]:
test_df

Unnamed: 0,claim,evidence,claim_id
0,Asiatic Society of Bangladesh(housed in Nimtal...,Asiatic Society of Bangladesh The society is h...,14802
1,Herbivore men was coined by Maki Fukasawa and ...,Herbivore men The term was coined by the autho...,70296
2,"Shulin, a 33.1288 km (12.7911 sq mi) land loca...",Shulin District 'forest district') is an inner...,16578
3,Before the first Europeans arrived or copra co...,Maupihaa History Maupihaa Atoll was inhabited ...,1196
4,Sumo wrestler Toyozakura Toshiaki committed ma...,Toyozakura Toshiaki Toyozakura Toshiaki (born ...,5407
...,...,...,...
2495,"Evie Hone, born on April 22, 1894 at Roebuck G...",Evie Hone She is considered to be an early pio...,24421
2496,The Greek parliament changed the rules for Eur...,European Parliament elections in Greece Electo...,27184
2497,"Antwine Perez was born in Westville, New Jerse...",Antwine Perez Early life He was born on April ...,68814
2498,Rachel Bradley never studied a foreign languag...,Rachel Bradley Rachel Louise Bradley is a fict...,25112


In [15]:
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [16]:
test_dataset = datasets.Dataset.from_pandas(test_df)

In [17]:
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

In [18]:
test_predictions = trainer.predict(tokenized_test_dataset)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: claim, claim_id, evidence.
***** Running Prediction *****
  Num examples = 2500
  Batch size = 16


In [19]:
test_y_pred = test_predictions.predictions.argmax(axis=1)

In [20]:
test_pred_labels = [value_map_inv.get(label) for label in test_y_pred]

In [21]:
pred_df = pd.DataFrame({
    'claim_id': test_df.claim_id,
    'label': test_pred_labels
})

pred_df.to_csv(f'Submission-4.csv', index=False)