## baseline

In [1]:
import nltk
nltk.download('rslp')

[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np

import re
import string
from numpy import inf

# Baseline 
from collections import Counter, OrderedDict
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
stop_words = stopwords.words('portuguese')
import warnings; warnings.simplefilter('ignore')


# SKLearn related imports
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


random_state = 42

In [97]:
df = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv', encoding='utf-8')

In [75]:
test.head()

Unnamed: 0,Request
0,Quais filmes estão atualmente em cartaz no Spe...
1,Faça uma lista do que está faltando.
2,Dá-me os horários do filme para filmes exibido...
3,"reservar um restaurante em Clawson, MS para um"
4,É sobre o tempo que os franceses aprenderam da...


In [5]:
df.describe()

Unnamed: 0,Request,Label
count,26527,26799
unique,25752,8
top,Encontre o horário do filme.,no_intent
freq,5,12645


In [6]:
df.Request.iloc[10]

'Toca a melhor música de Phoebe Snow'

In [7]:
df.Label.unique()

array(['add_to_playlist', 'search_screening_event', 'book_restaurant',
       'rate_book', 'get_weather', 'play_music', 'search_creative_work',
       'no_intent'], dtype=object)

## Train Test Split

In [89]:
df = df.dropna()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    df.Request, df.Label, test_size=0.33, random_state=random_state)

In [10]:
tokenizer = WordPunctTokenizer()
stemmer = RSLPStemmer()
regex_list = [("<[^>]*>", "")]

## Tokenize and Stem

### Tokenize

In [11]:
X_train.iloc[0]

'Vai ser parte do mistério o tempo todo, porque se nasceu em Valparaíso e foi inscrito em Santiago, vai constar na certidão como Santiago.'

In [12]:
def tokenize(text):
    tokenizer = WordPunctTokenizer()
    token_text = text.apply(lambda y: tokenizer.tokenize(y.Request), axis=1)
    
    return token_text

In [13]:
# Custom transformer to implement sentence cleaning
class PortugueseCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, tokenizer, stemmer, regex_list,
                 lower=True, remove_punct=True):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        
    def transform(self, X, *_):
        X = list(map(self._clean_sentence, X))
        return X
    
    def _clean_sentence(self, sentence):
        
        # Replace given regexes
        #for regex in self.regex_list:
        #    sentence = re.sub(regex[0], regex[1], sentence)
            
        # lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)
            
        # Remove punctuation
        if self.remove_punct:
            words = list(filter(lambda x: x not in string.punctuation, words))

        # Stem words
        if self.stemmer:
            words = map(self.stemmer.stem, words)

        # Join list elements into string
        sentence = " ".join(words)
        
        return sentence
    
    def fit(self, *_):
        return self

## Pipeline

In [14]:
# Build the pipeline
if 1==0:
    if 1 == 0:
        pipe = Pipeline([('stemm', PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)),
                         ('vect', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words, max_features=20000)),
                         ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=2000, class_weight='balanced_subsample'))])
        # Train the classifier
        pipe.fit(X_train, y_train)

        predicted = pipe.predict(X_test)
        mean = np.mean(predicted == y_test)
    else:
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import GridSearchCV
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression
        from sklearn.neighbors import KNeighborsClassifier
        from xgboost import XGBClassifier
        from sklearn.svm import LinearSVC, SVC


        search_space = [{'classifier': [LogisticRegression()],
                         'classifier__penalty': ['l1', 'l2']},
                        {'classifier': [RandomForestClassifier()],
                         'classifier__n_estimators': [10, 15],
                         'classifier__max_features': [ 2, 3]}]

        search_space2 = [{'classifier': [MultinomialNB()]},
                        {'classifier': [RandomForestClassifier()],
                         'classifier__n_estimators': [1000, 5000],
                         'classifier__max_features': [ 2, 3]}]

        k = 5000
        loads_of_classifiers = [
                            {'classifier': [RandomForestClassifier()],
                             'classifier__n_estimators': [2000]},
                            {'classifier': [KNeighborsClassifier()]},
                            {'classifier': [XGBClassifier()],
                            'classifier__n_estimators': [2000]},
                           ]

        pipe = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words, max_features=20000)),
                         ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=2000, class_weight='balanced_subsample'))])

        clf = GridSearchCV(pipe, loads_of_classifiers, cv=5, verbose=0, n_jobs=6)

        from sklearn.feature_selection import SelectFromModel
        from sklearn.feature_selection import f_classif

        X = X_train
        y =  y_train
        X_val = X_test
        y_val = y_test

        cleanner = PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)
        X_clean = cleanner.transform(X)
        X_val_clean =cleanner.transform(X_val)

        best_model = clf.fit(X_clean, y)
        print(best_model.best_estimator_.get_params()['classifier'])

        cv_score = clf.cv_results_['mean_test_score']

        print("Grid search cv scores:", cv_score)

        predicted = best_model.predict(X_val_clean)
        print("Acuracy on validation:", np.mean(predicted == y_val))

        def get_cv_summary(grid_clf):
            return pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_score')

        scores_df = get_cv_summary(clf)
        print(scores_df)


In [15]:
def get_cv_summary(grid_clf):
        return pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_score')

In [16]:
#scores_df

## Submission

In [17]:
df = df.dropna()
df.head()


Unnamed: 0,Request,Label
0,Adicionar música tomohisa yamashita à minha li...,add_to_playlist
1,Eu quero adicionando Aprite le finestre à minh...,add_to_playlist
2,Que curta-metragens estão tocando às 11 da man...,search_screening_event
3,precisa de uma sala em um que serve foie gras ...,book_restaurant
4,"Atribuir 4 estrelas de 6 para a crônica, Deus ...",rate_book


In [18]:
df.describe()

Unnamed: 0,Request,Label
count,26527,26527
unique,25752,8
top,Encontre o horário do filme.,no_intent
freq,5,12515


In [19]:
if 0 == 1:
    pipe = Pipeline([('stemm', PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)),
                     ('vect', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words, max_features=20000)),
                     ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=2000, class_weight='balanced_subsample'))])
    # Train the classifier
    pipe.fit(df.Request, df.Label)

    predicted = pipe.predict(test.Request)

In [23]:
!python -m spacy download pt_core_news_sm

Collecting https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.0.0/pt_core_news_sm-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.0.0/pt_core_news_sm-2.0.0.tar.gz (38.7MB)
[K    100% |████████████████████████████████| 38.7MB 19.7MB/s ta 0:00:01% |                                | 92kB 92kB/s eta 0:06:57    4% |█▎                              | 1.6MB 560kB/s eta 0:01:07    4% |█▌                              | 1.8MB 916kB/s eta 0:00:41    7% |██▍                             | 2.9MB 920kB/s eta 0:00:39    14% |████▋                           | 5.6MB 5.4MB/s eta 0:00:07    15% |█████                           | 6.1MB 1.3MB/s eta 0:00:26    29% |█████████▍                      | 11.3MB 8.8MB/s eta 0:00:04    30% |█████████▋                      | 11.7MB 4.9MB/s eta 0:00:06    33% |██████████▊                     | 13.0MB 4.8MB/s eta 0:00:06    40% |█████████████                   | 15.7MB 8.8MB/s eta

In [24]:
import spacy
nlp = spacy.load('pt_core_news_sm')

In [51]:
labels = list(df.Label.unique())

In [79]:
labels

['add_to_playlist',
 'search_screening_event',
 'book_restaurant',
 'rate_book',
 'get_weather',
 'play_music',
 'search_creative_work',
 'no_intent']

In [93]:
#!/usr/bin/env python
# coding: utf8
"""Train a convolutional neural network text classifier using the TextCategorizer component. 
The model is added to spacy.pipeline, and predictions are available via `doc.cats`. For more details,
see the documentation:
* Training: https://spacy.io/usage/training

Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.util import minibatch, compounding


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int))
def main(_train_df, model, output_dir, n_iter, n_texts, label_column_name, text_column_name, labels):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    for label in labels:
        textcat.add_label(label.upper())

    # load the IMDB dataset
    print("Loading data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(_train_df, label_column_name, text_column_name, labels, limit=n_texts)
    
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'A', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_a'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "Quero uma musiquinha"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
        
    return nlp


def load_data(_train_df, text_column_name, label_column_name, label_names, limit=0, split=0.95):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    #train_data, _ = thinc.extra.datasets.imdb()
    train_data = [(row[text_column_name], row[label_column_name]) for index, row in _train_df.iterrows()]
    random.shuffle(train_data)
    #print(train_data[0:10])
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = []
    for label in labels:
        cats.append({y.upper(): bool(y == label) for y in label_names})
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])


def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    accuracy = (tp + tn) /(tp + tn + fp + fn)
    return {'textcat_a' : accuracy, 'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

def spacy_predict(_nlp, _test_data, label, predict_proba=True):
    predictions = []
    if predict_proba:
        for doc in _test_data.values:
            predictions.append(_nlp(doc).cats[label])
    else:
        for doc in _test_data.values:
            predictions.append(int((_nlp(doc).cats[label]) > 0.5))
    return pd.Series(predictions)




labels = list(df.Label.unique())

#nlp = main(df, 'pt_core_news_sm', '/tmp/dockerwd/spacy_models/hackmodel1', 10, 0, 'Request', 'Label', labels)


In [95]:
def spacy_predict_multi(_nlp, _test_data, predict_proba=True):
    predictions = []
    for doc in _test_data.values:
        predictions.append(pd.Series(_nlp(doc).cats).idxmax().lower())
    return predictions
    
#predictions = spacy_predict_multi(nlp, test['Request'])

In [66]:
predictions

['search_screening_event',
 'no_intent',
 'search_screening_event',
 'book_restaurant',
 'no_intent',
 'no_intent',
 'add_to_playlist',
 'rate_book',
 'search_screening_event',
 'no_intent',
 'no_intent',
 'no_intent',
 'play_music',
 'no_intent',
 'no_intent',
 'add_to_playlist',
 'no_intent',
 'no_intent',
 'no_intent',
 'no_intent',
 'no_intent',
 'no_intent',
 'no_intent',
 'play_music',
 'play_music',
 'search_screening_event',
 'no_intent',
 'rate_book',
 'no_intent',
 'play_music',
 'get_weather',
 'no_intent',
 'book_restaurant',
 'no_intent',
 'no_intent',
 'no_intent',
 'search_screening_event',
 'rate_book',
 'book_restaurant',
 'rate_book',
 'no_intent',
 'no_intent',
 'book_restaurant',
 'play_music',
 'no_intent',
 'get_weather',
 'no_intent',
 'play_music',
 'no_intent',
 'no_intent',
 'play_music',
 'no_intent',
 'no_intent',
 'book_restaurant',
 'search_creative_work',
 'get_weather',
 'no_intent',
 'add_to_playlist',
 'no_intent',
 'no_intent',
 'get_weather',
 'get_w

In [83]:
test['Label'] = predictions

In [84]:
test = test.set_index('Request')

In [85]:
test.to_csv("../Submission3.csv", encoding="utf-8")

In [81]:
test.isnull().sum()

Label    0
dtype: int64

In [86]:
test.head()

Unnamed: 0_level_0,Label
Request,Unnamed: 1_level_1
Quais filmes estão atualmente em cartaz no Speakeasy Theatres,search_screening_event
Faça uma lista do que está faltando.,no_intent
Dá-me os horários do filme para filmes exibidos no bairro,search_screening_event
"reservar um restaurante em Clawson, MS para um",book_restaurant
É sobre o tempo que os franceses aprenderam da América sobre o assunto.,no_intent


In [96]:
## Second iteration training


nlp2 = main(df, '/tmp/dockerwd/spacy_models/hackmodel1', '/tmp/dockerwd/spacy_models/hackmodel2', 10, 0, 'Request', 'Label', labels)

Loaded model '/tmp/dockerwd/spacy_models/hackmodel1'
Loading data...
Using 0 examples (25200 training, 1327 evaluation)
Training the model...
LOSS 	  A  	  P  	  R  	  F  
241.094	0.985	0.943	0.936	0.939
200.008	0.986	0.948	0.939	0.943
181.317	0.985	0.942	0.935	0.939
165.484	0.984	0.941	0.931	0.936
158.517	0.985	0.942	0.934	0.938
152.899	0.984	0.941	0.934	0.938
147.879	0.984	0.939	0.935	0.937
144.683	0.984	0.941	0.931	0.936
145.314	0.984	0.944	0.931	0.937
139.822	0.984	0.938	0.931	0.935
Quero uma musiquinha {'GET_WEATHER': 0.0003231181181035936, 'ADD_TO_PLAYLIST': 6.190120620885864e-05, 'RATE_BOOK': 9.423041046829894e-05, 'SEARCH_SCREENING_EVENT': 0.00027399894315749407, 'SEARCH_CREATIVE_WORK': 0.6962063908576965, 'BOOK_RESTAURANT': 4.539787187241018e-05, 'PLAY_MUSIC': 0.00031958604813553393, 'NO_INTENT': 0.21093034744262695}
Saved model to /tmp/dockerwd/spacy_models/hackmodel2
Loading from /tmp/dockerwd/spacy_models/hackmodel2
Quero uma musiquinha {'GET_WEATHER': 0.0003231181181035936

In [98]:
predictions = spacy_predict_multi(nlp2, test['Request'])


In [99]:
test['Label'] = predictions
test = test.set_index('Request')
test.to_csv("../Submission5.csv", encoding="utf-8")