# CRFを用いたスロットフィリング

## 準備

### パッケージのインストール

In [2]:
!pip install -q sklearn-crfsuite==0.3.6 seqeval==1.2.2 scikit-learn==0.23.2 nltk==3.2.5 scipy==1.4.1 numpy==1.19.5

Collecting sklearn-crfsuite==0.3.6
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting seqeval==1.2.2
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
[?25hCollecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 34.2 MB/s 
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 41.8 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=caecd227e13c5c1ef0cf2ac9434ab60fb723f5103d91ec9c06d9eeeae32ebde5
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3

### インポート

In [35]:
import json
import os

import nltk
import numpy as np
import scipy
from nltk.tag import pos_tag
from seqeval.metrics import classification_report, f1_score
from sklearn_crfsuite import CRF
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### データのアップロード

データセットをアップロードします。ノートブックと同じ階層にDataフォルダがあり、その下にsnipsフォルダがあるので、学習・検証用データセットをアップロードしましょう。Colabでない場合は、データセットを読み込むときに正しいパスを指定します。



In [4]:
from google.colab import files
uploaded = files.upload()

Saving train_PlayMusic_full.json to train_PlayMusic_full.json
Saving validate_PlayMusic.json to validate_PlayMusic.json


データの中身を確認しておきましょう。`data`の中の各要素には`text`が必ず含まれており、つなげると1つの文になります。また、固有表現の場合は`entity`も含まれています。

In [7]:
!head -34 train_PlayMusic_full.json

{
  "PlayMusic": [
    {
      "data": [
        {
          "text": "I need to hear the "
        },
        {
          "text": "song",
          "entity": "music_item"
        },
        {
          "text": " "
        },
        {
          "text": "Aspro Mavro",
          "entity": "track"
        },
        {
          "text": " from "
        },
        {
          "text": "Bill Szymczyk",
          "entity": "artist"
        },
        {
          "text": " on "
        },
        {
          "text": "Youtube",
          "entity": "service"
        }
      ]
    },


### データの読み込み

In [10]:
def load_data(filename):
    with open(filename, encoding="iso-8859-2") as f:
        datalist = json.load(f)
    output = []
    for data in datalist["PlayMusic"]:
        sent = []
        tags = []
        for phrase in data["data"]:
            words = phrase["text"].strip().split()
            if "entity" in phrase:
                label = phrase["entity"]
                labels = [f"B-{label}"] + [f"I-{label}"] * (len(words) - 1)
            else:
                labels = ["O"] * len(words)
            sent.extend(words)
            tags.extend(labels)
        output.append([sent, tags])
    return output

In [12]:
train_file = "train_PlayMusic_full.json"
test_file = "validate_PlayMusic.json"

train_data = load_data(train_file)
test_data = load_data(test_file)
train_data[0]

[['I',
  'need',
  'to',
  'hear',
  'the',
  'song',
  'Aspro',
  'Mavro',
  'from',
  'Bill',
  'Szymczyk',
  'on',
  'Youtube'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-music_item',
  'B-track',
  'I-track',
  'O',
  'B-artist',
  'I-artist',
  'O',
  'B-service']]

### 単語埋め込みの読み込み

特徴として利用するために、単語埋め込みを用意しましょう。今回は、事前学習済み単語埋め込みとしてGloVeを使います。



In [36]:
# GloVeのダウンロードと展開
!wget  https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d data

--2021-09-22 08:54:00--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-09-22 08:54:00--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-09-22 08:56:40 (5.14 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: data/glove.6B.50d.txt   
  inflating: data/glove.6B.100d.txt  
  inflating: data/glove.6B.200d.txt  
  inflating: data/glove.6B.300d.txt  


In [42]:
BASE_DIR = "data"
GLOVE_FILE = os.path.join(BASE_DIR, "glove.6B.100d.txt")
EMBEDDING_DIM = 100

In [43]:
print("Preparing embedding matrix.")
embeddings_index = {}
with open(GLOVE_FILE, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs
print("Found %s word vectors in Glove embeddings." % len(embeddings_index))

def get_embeddings(word):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in embedding index will be all-zeros.
        embedding_vector = np.zeros(shape=(EMBEDDING_DIM,))
    return embedding_vector

Preparing embedding matrix.
Found 400000 word vectors in Glove embeddings.


## 前処理

データを読み込み終えたので、特徴を定義します。今回は前後2単語に関して、以下の特徴を使います。

- 単語
- 品詞

また、用意した単語埋め込みも特徴として使います。

In [53]:
def sent2feats(sentence):
    feats = []
    sen_tags = pos_tag(sentence)  # This format is specific to this POS tagger!
    for i in range(0, len(sentence)):
        word = sentence[i]
        wordfeats = {}
        # word features: word, prev 2 words, next 2 words in the sentence.
        wordfeats["word"] = word
        if i == 0:
            wordfeats["prevWord"] = wordfeats["prevSecondWord"] = "<S>"
        elif i == 1:
            wordfeats["prevWord"] = sentence[0]
            wordfeats["prevSecondWord"] = "</S>"
        else:
            wordfeats["prevWord"] = sentence[i - 1]
            wordfeats["prevSecondWord"] = sentence[i - 2]
        # next two words as features
        if i == len(sentence) - 2:
            wordfeats["nextWord"] = sentence[i + 1]
            wordfeats["nextNextWord"] = "</S>"
        elif i == len(sentence) - 1:
            wordfeats["nextWord"] = "</S>"
            wordfeats["nextNextWord"] = "</S>"
        else:
            wordfeats["nextWord"] = sentence[i + 1]
            wordfeats["nextNextWord"] = sentence[i + 2]

        # POS tag features: current tag, previous and next 2 tags.
        wordfeats["tag"] = sen_tags[i][1]
        if i == 0:
            wordfeats["prevTag"] = wordfeats["prevSecondTag"] = "<S>"
        elif i == 1:
            wordfeats["prevTag"] = sen_tags[0][1]
            wordfeats["prevSecondTag"] = "</S>"
        else:
            wordfeats["prevTag"] = sen_tags[i - 1][1]

            wordfeats["prevSecondTag"] = sen_tags[i - 2][1]
            # next two words as features
        if i == len(sentence) - 2:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = "</S>"
        elif i == len(sentence) - 1:
            wordfeats["nextTag"] = "</S>"
            wordfeats["nextNextTag"] = "</S>"
        else:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = sen_tags[i + 2][1]

        # Adding word vectors
        vector = get_embeddings(word.lower())
        for iv, value in enumerate(vector):
            wordfeats["v{}".format(iv)] = value
        feats.append(wordfeats)
    return feats

In [54]:
# Extract features from the conll data, after loading it.
def get_feats_conll(conll_data):
    feats = []
    labels = []
    for sentence in conll_data:
        feats.append(sent2feats(sentence[0]))
        labels.append(sentence[1])
    return feats, labels

In [55]:
x_train, y_train = get_feats_conll(train_data)
x_valid, y_valid = get_feats_conll(test_data)
print(x_train[0])

[{'word': 'I', 'prevWord': '<S>', 'prevSecondWord': '<S>', 'nextWord': 'need', 'nextNextWord': 'to', 'tag': 'PRP', 'prevTag': '<S>', 'prevSecondTag': '<S>', 'nextTag': 'VBP', 'nextNextTag': 'TO', 'v0': -0.046539, 'v1': 0.61966, 'v2': 0.56647, 'v3': -0.46584, 'v4': -1.189, 'v5': 0.44599, 'v6': 0.066035, 'v7': 0.3191, 'v8': 0.14679, 'v9': -0.22119, 'v10': 0.79239, 'v11': 0.29905, 'v12': 0.16073, 'v13': 0.025324, 'v14': 0.18678, 'v15': -0.31001, 'v16': -0.28108, 'v17': 0.60515, 'v18': -1.0654, 'v19': 0.52476, 'v20': 0.064152, 'v21': 1.0358, 'v22': -0.40779, 'v23': -0.38011, 'v24': 0.30801, 'v25': 0.59964, 'v26': -0.26991, 'v27': -0.76035, 'v28': 0.94222, 'v29': -0.46919, 'v30': -0.18278, 'v31': 0.90652, 'v32': 0.79671, 'v33': 0.24825, 'v34': 0.25713, 'v35': 0.6232, 'v36': -0.44768, 'v37': 0.65357, 'v38': 0.76902, 'v39': -0.51229, 'v40': -0.44333, 'v41': -0.21867, 'v42': 0.3837, 'v43': -1.1483, 'v44': -0.94398, 'v45': -0.15062, 'v46': 0.30012, 'v47': -0.57806, 'v48': 0.20175, 'v49': -1.659

## モデルの学習

In [56]:
%%time
model = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=False
)
model.fit(x_train, y_train)

CPU times: user 10.3 s, sys: 45.2 ms, total: 10.4 s
Wall time: 10.5 s


## モデルの評価

In [57]:
y_pred = model.predict(x_valid)
print(classification_report(y_valid, y_pred, digits=4))

              precision    recall  f1-score   support

       album     0.3333    0.0769    0.1250        13
      artist     0.8939    0.9365    0.9147        63
       genre     0.6667    0.6667    0.6667         3
  music_item     0.9375    0.9677    0.9524        31
    playlist     0.7143    0.5556    0.6250         9
     service     0.9487    0.9487    0.9487        39
        sort     0.9375    0.8824    0.9091        17
       track     0.4000    0.6667    0.5000         6
        year     0.9615    1.0000    0.9804        25

   micro avg     0.8812    0.8641    0.8725       206
   macro avg     0.7548    0.7446    0.7358       206
weighted avg     0.8617    0.8641    0.8561       206



## ハイパーパラメータの最適化

In [58]:
%%time
model = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(f1_score)

rs = RandomizedSearchCV(
    model,
    params_space,
    cv=3,
    verbose=1,
    n_jobs=-1,
    n_iter=30,
    scoring=f1_scorer
)
rs.fit(x_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 32.8min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 64.1min finished


CPU times: user 1h 2min 46s, sys: 48 s, total: 1h 3min 34s
Wall time: 1h 4min 14s


In [59]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.23182413219547646, 'c2': 0.011154765175489217}
best CV score: 0.9180060910576224
model size: 0.13M


テストデータを使って、最適なパラメータのモデルを評価してみましょう。

In [60]:
crf = rs.best_estimator_
y_pred = crf.predict(x_valid)
print(classification_report(y_valid, y_pred, digits=4))

              precision    recall  f1-score   support

       album     0.5000    0.2308    0.3158        13
      artist     0.8636    0.9048    0.8837        63
       genre     0.4000    0.6667    0.5000         3
  music_item     0.9688    1.0000    0.9841        31
    playlist     0.7143    0.5556    0.6250         9
     service     1.0000    0.9744    0.9870        39
        sort     0.9412    0.9412    0.9412        17
       track     0.6250    0.8333    0.7143         6
        year     1.0000    1.0000    1.0000        25

   micro avg     0.8922    0.8835    0.8878       206
   macro avg     0.7792    0.7896    0.7723       206
weighted avg     0.8850    0.8835    0.8796       206

