# Spacy  model
This notebook is strongly based on the spacy's documentation for the text classification, you can access it with the link : https://spacy.io/usage/training#textcat.

In [4]:
import json
import random

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from pathlib import Path

In [33]:
import spacy
from spacy.util import minibatch, compounding

In [7]:
INTENTS = [
    'find-around-me',
    'find-flight',
    'find-hotel',
    'find-restaurant',
    'find-train',
    'irrelevant',
    'provide-showtimes',
    'purchase',
]

In [35]:
n_epoch = 6

# Load and format data

In [52]:
def load_data(limit=0, split_dev=0.1, split_test=0.1):

    with open("../data/training_set.json", "r") as f:
        dataset = json.load(f)

    index_test = int((split_test+split_dev) *len(dataset))
    index_dev = int(split_dev *len(dataset))
    random.shuffle(dataset)

    testing_set = dataset[:index_test]
    print(testing_set[:10])
    with open("../data/testing_set.json", "w", encoding='utf-8') as f:
        json.dump(testing_set, f,  ensure_ascii=False)
    
    texts = [elem.get('sentence') for elem in dataset]
    cats = [{ intent : (elem.get('intent')==intent) for intent in INTENTS} for elem in dataset]
    # Separation
    return  ((texts[index_dev:], cats[index_dev:]), 
            (texts[index_test:index_dev], cats[index_test:index_dev]),)

In [53]:
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()

[{'intent': 'irrelevant', 'sentence': 'Celui gratuit'}, {'intent': 'irrelevant', 'sentence': 'Alors oui max 150€'}, {'intent': 'irrelevant', 'sentence': "J'ai bien rdv à 10h au spa de opio atout formes ?"}, {'intent': 'irrelevant', 'sentence': "Tu as des offres intéressantes à me proposer pour une loc de voiture (type sport), je veux du choix et j'ai un budget de 1000€ par semaine. Ça va driver ! 🚗"}, {'intent': 'find-hotel', 'sentence': 'Bonjour je cherche un hôtel autour de Nantes dans un rayon de 50km,au calme avec un bon restaurant sur place. Je ne veux pas que ce soit dans Nantes. Plutôt un genre relais château dans les environs. C’est pour 1 nuit du 31 Juillet au 1er Août'}, {'intent': 'irrelevant', 'sentence': 'A menton'}, {'intent': 'find-train', 'sentence': 'À quelle heure arrive le dernier train de Paris à Noyelles'}, {'intent': 'provide-showtimes', 'sentence': 'Films à voir en famille au ciné'}, {'intent': 'irrelevant', 'sentence': "Serais ce possible de réserver ? C'est pou

In [32]:
print("Using {} examples ({} training, {} evaluation)".format(
    len(train_texts)+ len(dev_texts), len(train_texts), len(dev_texts)))
      
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

Using 5431 examples (4828 training, 603 evaluation)
604


# Create the model

In [39]:
nlp = spacy.load("fr_core_news_sm")

In [40]:
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe(
        "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
    )
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe("textcat")

for intent in INTENTS :
    textcat.add_label(intent)

# Training

In [41]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [42]:
# get names oother pipes to disable them during training
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    batch_sizes = compounding(4.0, 32.0, 1.001)
    loss = []
    for i in range(n_epoch):
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        loss.append(losses["textcat"])
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
        )

Training the model...
LOSS 	  P  	  R  	  F  
14.695	0.886	0.688	0.775
0.939	0.889	0.800	0.842
0.141	0.888	0.838	0.862
0.279	0.886	0.846	0.866
0.011	0.876	0.849	0.862
0.004	0.875	0.853	0.864
0.002	0.872	0.856	0.864
0.003	0.871	0.856	0.863
0.003	0.864	0.849	0.857
0.002	0.870	0.853	0.861


In [43]:
output_notebook()

p = figure(plot_width=500, plot_height=500,title="Training loss")

p.line([i for i in range(len(loss))], loss, line_width=2, color = 'darkblue')

p.xaxis.axis_label = 'epoch'
p.yaxis.axis_label = 'loss'
show(p)

# Save of the model

In [46]:
nlp.to_disk("../model")