## Prepare the environment and load the necessary libraries

In [None]:
# Mount Google Drive at /content/drive; the cells below read the datasets from
# there and save the trained predictor there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Verify that TensorFlow can see the Colab GPU before any training starts.
import tensorflow as tf

# Name of the GPU device as reported by TensorFlow.
device_name = tf.test.gpu_device_name()

# Stop right away unless the reported name is exactly that of the first GPU.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch

# Choose the PyTorch device: CPU when CUDA is unavailable, otherwise the GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is present, so PyTorch is pointed at the GPU.
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
# Install a pinned TensorFlow GPU build.
# NOTE(review): the recorded output of this cell reports dependency conflicts with the
# preinstalled tensorflow 2.8.0, and the next cell still prints 2.8.0. TensorFlow was
# already imported in an earlier cell, so this pin apparently does not affect the running
# kernel. Confirm whether the install is needed; if it is, it would have to run before the
# first TensorFlow import (followed by a runtime restart).
!pip3 install -q tensorflow_gpu==2.1.0

[K     |████████████████████████████████| 421.8 MB 5.0 kB/s 
[K     |████████████████████████████████| 448 kB 52.9 MB/s 
[K     |████████████████████████████████| 50 kB 6.4 MB/s 
[K     |████████████████████████████████| 3.8 MB 41.2 MB/s 
[?25h  Building wheel for gast (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which is not installed.
tensorflow 2.8.0 requires tensorboard<2.9,>=2.8, but you have tensorboard 2.1.1 which is incompatible.
tensorflow-probability 0.16.0 requires gast>=0.3.2, but you have gast 0.2.2 which is incompatible.[0m


In [None]:
# Report the TensorFlow version loaded in the running kernel. Importing a module that is
# already imported does not reload it, so a package installed after the first import is
# not reflected here.
import tensorflow as tf
print(tf.__version__)

2.8.0


In [None]:
# Install ktrain, the training wrapper used in the cells below.
# NOTE(review): the version is not pinned, so a re-run may pull a different release than
# the one that produced the recorded results; consider pinning it.
!pip3 install -q ktrain

[K     |████████████████████████████████| 25.3 MB 643 kB/s 
[K     |████████████████████████████████| 22.3 MB 119 kB/s 
[K     |████████████████████████████████| 981 kB 62.3 MB/s 
[K     |████████████████████████████████| 263 kB 55.1 MB/s 
[K     |████████████████████████████████| 2.8 MB 52.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 26.0 MB/s 
[K     |████████████████████████████████| 468 kB 51.8 MB/s 
[K     |████████████████████████████████| 895 kB 59.3 MB/s 
[K     |████████████████████████████████| 596 kB 64.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 34.5 MB/s 
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
[?25h  Building wheel for ktrain (setup.py) ... [?25l[?25hdone
  Building wheel for keras-bert (setup.py) ... [?25l[?25hdone
  Building wheel for keras-transformer (setup.py) ... [?25l[?25hdone
  Building wheel for keras-embed-sim (setup.py) ... [?25l[?25hdone
  Building wheel for keras-layer-normalization (setup.py) ...

# Model 1

## Load the dataset

In [None]:
import pandas as pd

# Load the training set. The path is handed straight to pandas with an explicit
# encoding so the Japanese text is decoded as UTF-8 regardless of the platform's
# default locale encoding (open() without an encoding falls back to that default).
train = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Model/JESC/tuning_train.csv",
    encoding="utf-8",
)

# Fail early if the columns used by the following cells are missing.
assert {'jp_sentence', 'label'}.issubset(train.columns), "missing expected columns"

print(train.head(30))

    Unnamed: 0                   jp_sentence     label
0      1533289            落ち着いて 私のアドバイスを忘れずに  informal
1      1247871                  私 魔女にはなりたくない  informal
2       950993     翌日の4時04分に 404ドルもの賞金を渡しました    polite
3      1736253                  大丈夫ですよ。 行こう。    polite
4      2185184                  5年もやってれば慣れます    polite
5      1537257                     自分の電気ですから    polite
6      1930911              お...おはよう...ございます    polite
7      2402529                判ったよ 静かに、黙るんだ!  informal
8      2144582        5つの強みを知るための有効なテストがあります    polite
9      1823023      あの... シャワー浴びてる 女性がいますけど。    polite
10     1537836                       20分前は3+  informal
11     2050660                     トントン叩くのです    polite
12     1450758            場所によっては 火星よりも寒いのです    polite
13      293247          検事に話しといたわ なかった事にするって  informal
14     2185524                    ごくかすかな音でした    polite
15      447746                あなたにしか話していないのよ  informal
16      752718             指示されてすることでは ない点です    polite
17      41

In [None]:
# ktrain takes plain Python lists: the sentences and, separately, their labels
x_train, y_train = (train[column].tolist() for column in ('jp_sentence', 'label'))

In [None]:
# Load the validation set. The path is handed straight to pandas with an explicit
# encoding so the Japanese text is decoded as UTF-8 regardless of the platform's
# default locale encoding (open() without an encoding falls back to that default).
dev = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Model/JESC/validation.csv",
    encoding="utf-8",
)

# Fail early if the columns used by the following cells are missing.
assert {'jp_sentence', 'label'}.issubset(dev.columns), "missing expected columns"

print(dev.head())

   Unnamed: 0              jp_sentence     label
0         405               地球にもたらされた。  informal
1        1190               うるせーな 返せよッ  informal
2        1132         わからないわ 私は心を読めないの  informal
3         731  曲は完璧なんだよ。 作り直す必要なんかねえよ。  informal
4        1754                座るつもりはない!  informal


In [None]:
# ktrain takes plain Python lists: the sentences and, separately, their labels
x_dev, y_dev = (dev[column].tolist() for column in ('jp_sentence', 'label'))

## Load the libraries required by Tohoku BERT version

In [None]:
pip install fugashi[unidic-lite]

Collecting fugashi[unidic-lite]
  Downloading fugashi-1.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (568 kB)
[K     |████████████████████████████████| 568 kB 5.1 MB/s 
[?25hCollecting unidic-lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[K     |████████████████████████████████| 47.4 MB 53.7 MB/s 
[?25hBuilding wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658836 sha256=c067c9d9291bdb2c2ae4b13e2499b41240193726c6696c2fdfb086eb3d0a14e3
  Stored in directory: /root/.cache/pip/wheels/de/69/b1/112140b599f2b13f609d485a99e357ba68df194d2079c5b1a2
Successfully built unidic-lite
Installing collected packages: unidic-lite, fugashi
Successfully installed fugashi-1.1.2 unidic-lite-1.0.8


In [None]:
pip install ipadic

Collecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 4.9 MB/s 
[?25hBuilding wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.whl size=13556723 sha256=e8349cc267431c1ba46fe43cb2576adcb29dafd7075c04781710e4a270dbcc4f
  Stored in directory: /root/.cache/pip/wheels/33/8b/99/cf0d27191876637cd3639a560f93aa982d7855ce826c94348b
Successfully built ipadic
Installing collected packages: ipadic
Successfully installed ipadic-1.0.0


In [None]:
import ktrain
from ktrain import text

## Create and fit the model

In [None]:
MODEL_NAME = 'cl-tohoku/bert-base-japanese-v2'

# No class list is passed: the labels are strings, so ktrain derives the classes from the
# training labels itself (see t.get_classes()). Passing ['polite', 'informal'] here would
# suggest an index order that does not match the order shown in the evaluation report
# (informal, polite).
t = text.Transformer(MODEL_NAME, maxlen=128)

# Preprocess the training and validation sentences with the model's preprocessor.
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_dev, y_dev)

# Build the classifier and wrap it, together with the data, in a ktrain learner.
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=12)



Downloading:   0%|          | 0.00/517 [00:00<?, ?B/s]

preprocessing train...
language: ja




Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/174 [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: ja


Downloading:   0%|          | 0.00/550M [00:00<?, ?B/s]

In [None]:
# Train with ktrain's one-cycle policy: maximum learning rate 5e-5 for 3 epochs
# (the recorded log reports "max lr of 5e-05" and three epochs).
learner.fit_onecycle(5e-5, 3)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd579e251d0>

## Evaluate the model

In [None]:
# Print a per-class report on the validation data, labelled with the class names held by
# the preprocessor. The returned array is shown as the cell output (presumably the
# confusion matrix; confirm against the ktrain documentation).
learner.validate(class_names=t.get_classes())

              precision    recall  f1-score   support

    informal       0.82      1.00      0.90      1632
      polite       0.00      0.00      0.00       368

    accuracy                           0.82      2000
   macro avg       0.41      0.50      0.45      2000
weighted avg       0.67      0.82      0.73      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[1632,    0],
       [ 368,    0]])

*The second model, which uses a smaller dataset, also tends to assign the same label regardless of the input. We will attempt to resolve this by using a balanced validation set in addition to the balanced training set.*

# Model 2 - test if using a balanced validation set resolves the issue

## Load the dataset

In [None]:
# Load the training set. The path is handed straight to pandas with an explicit
# encoding so the Japanese text is decoded as UTF-8 regardless of the platform's
# default locale encoding (open() without an encoding falls back to that default).
train = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Model/JESC/tuning_train.csv",
    encoding="utf-8",
)

# Fail early if the columns used by the following cells are missing.
assert {'jp_sentence', 'label'}.issubset(train.columns), "missing expected columns"

print(train.head(30))

    Unnamed: 0                   jp_sentence     label
0      1533289            落ち着いて 私のアドバイスを忘れずに  informal
1      1247871                  私 魔女にはなりたくない  informal
2       950993     翌日の4時04分に 404ドルもの賞金を渡しました    polite
3      1736253                  大丈夫ですよ。 行こう。    polite
4      2185184                  5年もやってれば慣れます    polite
5      1537257                     自分の電気ですから    polite
6      1930911              お...おはよう...ございます    polite
7      2402529                判ったよ 静かに、黙るんだ!  informal
8      2144582        5つの強みを知るための有効なテストがあります    polite
9      1823023      あの... シャワー浴びてる 女性がいますけど。    polite
10     1537836                       20分前は3+  informal
11     2050660                     トントン叩くのです    polite
12     1450758            場所によっては 火星よりも寒いのです    polite
13      293247          検事に話しといたわ なかった事にするって  informal
14     2185524                    ごくかすかな音でした    polite
15      447746                あなたにしか話していないのよ  informal
16      752718             指示されてすることでは ない点です    polite
17      41

In [None]:
# ktrain takes plain Python lists: the sentences and, separately, their labels
x_train, y_train = (train[column].tolist() for column in ('jp_sentence', 'label'))

In [None]:
# Load the balanced validation set. The path is handed straight to pandas with an
# explicit encoding so the Japanese text is decoded as UTF-8 regardless of the platform's
# default locale encoding (open() without an encoding falls back to that default).
dev = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Model/JESC/balanced_dev.csv",
    encoding="utf-8",
)

# Fail early if the columns used by the following cells are missing.
assert {'jp_sentence', 'label'}.issubset(dev.columns), "missing expected columns"

print(dev.head())

   Unnamed: 0                         jp_sentence     label
0        1350                            退却! 隠れろ!  informal
1        1373                                 異常?  informal
2         685                           チャーリーは死んだ  informal
3         855         一年に一回, 毎年, みんな... 祝ってくれるんだ.  informal
4        1318  うろたえることなく 答えることが出来る人物 それがなぜ重要なんです?    polite


In [None]:
# ktrain takes plain Python lists: the sentences and, separately, their labels
x_dev, y_dev = (dev[column].tolist() for column in ('jp_sentence', 'label'))

## Create and fit the model

In [None]:
MODEL_NAME = 'cl-tohoku/bert-base-japanese-v2'

# No class list is passed: the labels are strings, so ktrain derives the classes from the
# training labels itself (see t.get_classes()). Passing ['polite', 'informal'] here would
# suggest an index order that does not match the order shown in the evaluation report
# (informal, polite).
t = text.Transformer(MODEL_NAME, maxlen=128)

# Preprocess the training and validation sentences with the model's preprocessor.
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_dev, y_dev)

# Build the classifier and wrap it, together with the data, in a ktrain learner.
# NOTE(review): the training file and the hyperparameters are the same as for Model 1;
# only the validation data differs, and no random seed is set in the cells shown.
# Confirm that the difference in results is not run-to-run variance.
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=12)



preprocessing train...
language: ja


Is Multi-Label? False
preprocessing test...
language: ja


In [None]:
# Train with ktrain's one-cycle policy: maximum learning rate 5e-5 for 3 epochs
# (the recorded log reports "max lr of 5e-05" and three epochs).
learner.fit_onecycle(5e-5, 3)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd58a635ed0>

## Evaluate the model

In [None]:
# Print a per-class report on the balanced validation data, labelled with the class names
# held by the preprocessor. The returned array is shown as the cell output (presumably the
# confusion matrix; confirm against the ktrain documentation).
learner.validate(class_names=t.get_classes())

              precision    recall  f1-score   support

    informal       0.99      1.00      1.00       368
      polite       1.00      0.99      1.00       368

    accuracy                           1.00       736
   macro avg       1.00      1.00      1.00       736
weighted avg       1.00      1.00      1.00       736



array([[368,   0],
       [  2, 366]])

*The results indicate that using a balanced validation set improved the model's performance: the model now assigns both labels to the inputs.*

## Save the predictor

In [None]:
# Save the model to Drive, as the Colab environment does not store models permanently.
# The predictor bundles the trained model with the preprocessor `t`, so it can be
# reloaded later from the saved folder.
predictor = ktrain.get_predictor(learner.model, preproc=t)
predictor.save('/content/drive/MyDrive/mini_jf_classifier')

## Load the test set and predict using the model

In [None]:
# Load the unlabelled test set for the model to make predictions on. The path is handed
# straight to pandas with an explicit encoding so the Japanese text is decoded as UTF-8
# regardless of the platform's default locale encoding (open() without an encoding falls
# back to that default).
test = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Model/JESC/test-unlabelled.csv",
    encoding="utf-8",
)

# Fail early if the column used by the following cells is missing.
assert 'jp_sentence' in test.columns, "missing expected column"

print(test.head())

        jp_sentence
0           ほぼ無関係です
1         ゲイル 酔ってる?
2              注意しろ
3  最後の引き出しが 5日前にあった
4    僕が自殺し 物語を完成させる


In [None]:
# Convert to the format expected by the ktrain model: a plain list of sentences.
# `test` is rebound from the DataFrame to that list. The guard keeps the cell idempotent:
# re-running it after the conversion is a no-op instead of failing when a list is indexed
# with a column name.
if isinstance(test, pd.DataFrame):
    test = test['jp_sentence'].tolist()

In [None]:
# Make predictions on the test set; `pred` holds one predicted label per input
# sentence (the recorded output shows a list of 'polite' / 'informal' strings).
pred = predictor.predict(test)

In [None]:
# Show a small sample and the label distribution instead of dumping every prediction,
# which floods the notebook output without being readable.
from collections import Counter

print(pred[:20])
print(Counter(pred))

['polite', 'informal', 'informal', 'informal', 'informal', 'informal', 'polite', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'polite', 'informal', 'informal', 'polite', 'informal', 'polite', 'informal', 'informal', 'informal', 'polite', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'polite', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'polite', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'polite', 'informal', 'polite', 'informal', 'informal', 'informal', 'polite', 'informal', 'informal', 'informal', 'informal', 'informal', 'polite', 'polite', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'informal', 'polite', 'informal', 'informal', 'informal', 'informal', 'informal', 'polite', 'informal', 'informal

## Evaluate predictions made by the model against the answers

In [None]:
# Load the test set with answers labelled using the Japanese formality recognition
# heuristic. The path is handed straight to pandas with an explicit encoding so the
# Japanese text is decoded as UTF-8 regardless of the platform's default locale encoding
# (open() without an encoding falls back to that default).
test_labelled = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Model/JESC/test-labelled.csv",
    encoding="utf-8",
)

# Fail early if the columns used below are missing.
assert {'jp_sentence', 'label'}.issubset(test_labelled.columns), "missing expected columns"

print(test_labelled.head(30))

# Gold labels as a plain list, to be compared against the predictions.
answers = test_labelled['label'].tolist()

                     jp_sentence     label
0                        ほぼ無関係です    polite
1                      ゲイル 酔ってる?  informal
2                           注意しろ  informal
3               最後の引き出しが 5日前にあった  informal
4                 僕が自殺し 物語を完成させる  informal
5                    国際犯罪に起きたのか?  informal
6      皆さんのiphoneで この活動に繋がってください    polite
7                 なら ついてくればいいでしょ  informal
8        それで こんなザマに 理由なんて どうでもいい  informal
9                      ああ 約束するぜ!  informal
10               ダイナマイト tnt 雷酸水銀  informal
11    売っちゃうよ! あの家。 家族みんな バラバラだよ。  informal
12                          遅いわよ  informal
13                       携帯を出したら  informal
14                     やめろ やめるんだ  informal
15         裁判所の判断によっては 子供たちは定期的に  informal
16  役者も まだ。 監督も まだ。 もちろん 企画も まだ。  informal
17                          問題ない  informal
18        駆除業者に捕まれば 二度と戻っては来られまい  informal
19                     我らの役目...。  informal
20                 レトロウイルスによるもの?  informal
21                          ユーリ!  informal
22         

In [None]:
# Calculate Accuracy Rate manually
# The two lists are compared position by position, so they must be the same length:
# a length mismatch would otherwise either be silently ignored (extra predictions) or
# surface as an opaque IndexError (missing predictions).
if len(answers) != len(pred):
    raise ValueError(
        "answers and pred differ in length: %d vs %d" % (len(answers), len(pred))
    )
if not answers:
    raise ValueError("cannot compute accuracy: no labelled answers")

# Number of positions where the predicted label equals the gold label.
count = sum(label == guess for label, guess in zip(answers, pred))
print("Accuracy Rate is: %f" % (float(count) / len(answers)))

Accuracy Rate is: 0.997500


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Calculate accuracy with the scikit-learn method as a cross-check of the manual
# computation above (gold labels first, predictions second).
print(accuracy_score(answers, pred))

0.9975


In [None]:
# Download the model file from the saved predictor folder on Drive through the
# Colab file-download helper.
from google.colab import files
f_dir = '/content/drive/MyDrive/mini_jf_classifier/tf_model.h5'
files.download(f_dir)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Test on random sentences from BSD corpus

In [None]:
# Reload the predictor from the folder it was saved to on Drive, so the checks below
# exercise the persisted artefact rather than the in-memory model.
reloaded_predictor = ktrain.load_predictor('/content/drive/MyDrive/mini_jf_classifier')

In [None]:
# Test the reloaded predictor on random sentences from the BSD corpus.
# This one is a polite sentence.
reloaded_predictor.predict('はい、K社システム開発部です。')

In [None]:
# Another polite sentence; the recorded output is 'polite'.
reloaded_predictor.predict('そうですね、しましょう。')

'polite'

In [None]:
# Check the probability of the prediction with ktrain's inbuilt method.
# NOTE(review): the recorded output puts ~0.9986 at index 1 for this sentence, which the
# previous cell labels 'polite', so the entries presumably follow the class order
# reported in the evaluation (informal, polite). Confirm before relying on the index.
reloaded_predictor.predict_proba('そうですね、しましょう。')

array([0.00144011, 0.9985599 ], dtype=float32)

In [None]:
# An informal sentence; the recorded output is 'informal'.
reloaded_predictor.predict('良かった。')

'informal'