In [1]:
import json
import random
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.util import minibatch, compounding

In [2]:
# Name of the pretrained spaCy pipeline that the rest of the notebook builds on.
SPACY_MODEL_NAME = "fr_core_news_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
# Closed set of intent labels. Every entry is registered as a text-classifier
# label and becomes a key of each example's category dict.
INTENTS = [
    "find-around-me",
    "find-flight",
    "find-hotel",
    "find-restaurant",
    "find-train",
    "irrelevant",
    "provide-showtimes",
    "purchase",
]

In [4]:
def load_data(limit=0, split=0.8, path="../data/training_set.json"):
    """Load the intent dataset and split it into training and evaluation parts.

    The JSON file is read as a sequence of mappings; each one is queried with
    ``.get("sentence")`` and ``.get("intent")``, so a missing key yields a
    ``None`` text or an all-``False`` label dict rather than an error.

    Args:
        limit: Maximum number of examples to keep, counted from the start of
            the file. ``0`` (the default) or a negative value keeps everything.
        split: Fraction of the kept examples that goes to the training part.
        path: Location of the JSON training file.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. Each ``cats``
        entry maps every label in ``INTENTS`` to a bool that is ``True`` only
        for the example's own intent.
    """
    # Explicit UTF-8: the sentences contain non-ASCII characters (accents, "€"),
    # so relying on the platform's default encoding is not portable.
    with open(path, "r", encoding="utf-8") as f:
        training_set = json.load(f)

    # Honour `limit` (it used to be accepted but silently ignored).
    if limit and limit > 0:
        training_set = training_set[:limit]

    texts = [elem.get("sentence") for elem in training_set]
    cats = [
        {intent: (elem.get("intent") == intent) for intent in INTENTS}
        for elem in training_set
    ]

    # The split follows file order; nothing is shuffled here.
    # NOTE(review): if the file is grouped by intent, the two parts will have
    # different label distributions - confirm the file is already shuffled.
    cut = int(len(training_set) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

def evaluate(tokenizer, textcat, texts, cats):
    """Score the classifier's per-label predictions against gold categories.

    Each text is passed through ``tokenizer`` and the results are streamed
    through ``textcat.pipe``. For every predicted label that also appears in
    the matching gold dict (and is not ``"NEGATIVE"``), the score and the gold
    value are both thresholded at 0.5 and tallied as TP / FP / TN / FN.

    Args:
        tokenizer: Callable turning a text into the object ``textcat.pipe`` consumes.
        textcat: Object whose ``pipe`` yields docs exposing a ``cats`` mapping.
        texts: Iterable of raw texts.
        cats: Gold category dicts, indexable in the same order as ``texts``.

    Returns:
        Dict with ``textcat_p`` (precision), ``textcat_r`` (recall) and
        ``textcat_f`` (F-score).
    """
    true_pos = 0.0
    # Tiny non-zero seeds keep the precision/recall divisions defined when
    # nothing is counted.
    false_pos = 1e-8
    false_neg = 1e-8
    true_neg = 0.0

    tokenized = (tokenizer(text) for text in texts)
    for idx, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[idx]
        for label, score in doc.cats.items():
            # Only labels present in the gold dict count; "NEGATIVE" never does.
            if label not in expected or label == "NEGATIVE":
                continue
            truth = expected[label]
            if score >= 0.5:
                if truth >= 0.5:
                    true_pos += 1.0
                elif truth < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    if (precision + recall) != 0:
        f_score = 2 * (precision * recall) / (precision + recall)
    else:
        f_score = 0.0
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [5]:
# Reuse the pipeline's text classifier when it already has one; otherwise
# create an exclusive-class "simple_cnn" classifier and append it last.
if "textcat" in nlp.pipe_names:
    textcat = nlp.get_pipe("textcat")
else:
    textcat_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
    textcat = nlp.create_pipe("textcat", config=textcat_config)
    nlp.add_pipe(textcat, last=True)

# Register every intent as a label of the classifier.
for intent in INTENTS:
    textcat.add_label(intent)

In [6]:
# Load the dataset with the default limit/split and unpack both parts.
train_split, dev_split = load_data()
train_texts, train_cats = train_split
dev_texts, dev_cats = dev_split

In [7]:
# Report the size of each part before training.
n_train, n_dev = len(train_texts), len(dev_texts)
print(
    "Using {} examples ({} training, {} evaluation)".format(
        n_train + n_dev, n_train, n_dev
    )
)
# Pair each text with its annotations dict, with the labels under the "cats" key.
train_data = [(text, {"cats": cats}) for text, cats in zip(train_texts, train_cats)]
print(train_data[0])

Using 6035 examples (4828 training, 1207 evaluation)
('850€ maximum pour le loyer, à partir de janvier si possible', {'cats': {'find-around-me': False, 'find-flight': False, 'find-hotel': False, 'find-restaurant': False, 'find-train': False, 'irrelevant': True, 'provide-showtimes': False, 'purchase': False}})


In [8]:
n_iter = 2
# Components listed here stay enabled; every other pipe is disabled while training.
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [name for name in nlp.pipe_names if name not in pipe_exceptions]
header_template = "{:^5}\t{:^5}\t{:^5}\t{:^5}"
row_template = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}"
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    print("Training the model...")
    print(header_template.format("LOSS", "P", "R", "F"))
    # Created once, outside the epoch loop: all epochs draw batch sizes from
    # this same generator.
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        losses = {}
        # Reshuffle in place, then feed the examples in minibatches.
        random.shuffle(train_data)
        for batch in minibatch(train_data, size=batch_sizes):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        # Score the dev part from load_data() using the optimizer's averaged parameters.
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        # One table row per epoch: loss, precision, recall, F-score.
        print(
            row_template.format(
                losses["textcat"],
                *(scores[key] for key in ("textcat_p", "textcat_r", "textcat_f"))
            )
        )

Training the model...
LOSS 	  P  	  R  	  F  
13.993	0.882	0.697	0.779
0.862	0.870	0.789	0.827


In [9]:
# Smoke test: run one short query through the trained pipeline and show the
# score assigned to each intent.
test_text = "Trouve un resto"
doc = nlp(test_text)
intent_scores = doc.cats
print(test_text, intent_scores)

Trouve un resto {'find-around-me': 0.022973181679844856, 'find-flight': 0.00016880588373169303, 'find-hotel': 0.009913002140820026, 'find-restaurant': 0.7323805689811707, 'find-train': 0.00010330761870136485, 'irrelevant': 0.02601248398423195, 'provide-showtimes': 0.0003168716502841562, 'purchase': 0.20813176035881042}


In [11]:
# Persist the whole pipeline to a directory relative to the working directory.
# NOTE(review): the recorded PermissionError names '/model', which does not
# match this relative path - the output looks stale; re-run the cell to confirm.
output_dir = Path("./model")
nlp.to_disk(output_dir)

PermissionError: [Errno 13] Permission denied: '/model'