## baseline

In [1]:
import nltk

# Fetch the NLTK resources this notebook relies on:
#   - 'rslp'      : rule files used by RSLPStemmer (Portuguese stemmer)
#   - 'stopwords' : corpus read by stopwords.words('portuguese') in the imports cell;
#                   without it that call raises LookupError on a fresh machine.
nltk.download('rslp')
nltk.download('stopwords')

[nltk_data] Downloading package rslp to
[nltk_data]     /Users/manuelcostareis/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np

import re
import string
from numpy import inf

# Baseline 
from collections import Counter, OrderedDict
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
stop_words = stopwords.words('portuguese')
import warnings; warnings.simplefilter('ignore')


# SKLearn related imports
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


random_state = 42

In [26]:
# Read the labelled training set and the unlabelled test set in one go.
df, test = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

In [4]:
# Preview the first rows of the training data (Request text and its intent Label).
df.head()

Unnamed: 0,Request,Label
0,Adicionar música tomohisa yamashita à minha li...,add_to_playlist
1,Eu quero adicionando Aprite le finestre à minh...,add_to_playlist
2,Que curta-metragens estão tocando às 11 da man...,search_screening_event
3,precisa de uma sala em um que serve foie gras ...,book_restaurant
4,"Atribuir 4 estrelas de 6 para a crônica, Deus ...",rate_book


In [5]:
# Summary statistics. Request has a lower count than Label, i.e. some rows have no request text.
df.describe()

Unnamed: 0,Request,Label
count,26527,26799
unique,25752,8
top,Encontre o horário do filme.,no_intent
freq,5,12645


In [6]:
# Look at a single raw request.
df.Request.iloc[10]

'Toca a melhor música de Phoebe Snow'

In [7]:
# List the distinct intent labels the classifier has to predict.
df.Label.unique()

array(['add_to_playlist', 'search_screening_event', 'book_restaurant',
       'rate_book', 'get_weather', 'play_music', 'search_creative_work',
       'no_intent'], dtype=object)

## Train Test Split

In [8]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [9]:
# Hold out 33% of the labelled requests for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Request'],
    df['Label'],
    random_state=random_state,
    test_size=0.33,
)

In [10]:
# Shared text-processing components passed to PortugueseCleaner when the pipelines are built.
regex_list = [("<[^>]*>", "")]  # (pattern, replacement) pairs; this one matches HTML-like tags
stemmer = RSLPStemmer()
tokenizer = WordPunctTokenizer()

## Tokenize and Stem

### Tokenize

In [11]:
# Example training request, before any cleaning.
X_train.iloc[0]

'Vai ser parte do mistério o tempo todo, porque se nasceu em Valparaíso e foi inscrito em Santiago, vai constar na certidão como Santiago.'

In [12]:
def tokenize(text):
    """Tokenize the ``Request`` field of every row of ``text``.

    ``text`` is processed row-wise (``apply(..., axis=1)``) and each row's
    ``Request`` attribute is read, so it must be a DataFrame with a
    ``Request`` column rather than a plain string.

    Returns whatever ``apply`` produces: one token list per row.
    """
    word_punct = WordPunctTokenizer()

    def _tokenize_row(row):
        return word_punct.tokenize(row.Request)

    return text.apply(_tokenize_row, axis=1)

In [15]:
# Custom transformer to implement sentence cleaning
class PortugueseCleaner(TransformerMixin):
    """Pipeline step that turns raw sentences into cleaned, space-joined strings.

    For every sentence the cleaner optionally lower-cases it, splits it into
    tokens, optionally drops punctuation tokens, optionally stems the remaining
    tokens, and joins the result back into one string.

    Parameters
    ----------
    tokenizer : object
        Must provide ``tokenize(str)`` returning a list of string tokens.
    stemmer : object or None
        Must provide ``stem(str)``; any falsy value disables stemming.
    regex_list : list of (pattern, replacement) tuples
        Stored on the instance but currently NOT applied (see ``_clean_sentence``).
    lower : bool, default True
        Lower-case the sentence before tokenizing.
    remove_punct : bool, default True
        Drop tokens that consist only of ``string.punctuation`` characters.
    """

    # Punctuation characters as a set, for per-character membership tests.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self, tokenizer, stemmer, regex_list,
                 lower=True, remove_punct=True):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct

    def fit(self, *_, **__):
        """Nothing to learn; returns ``self`` so the step can be chained in a Pipeline."""
        return self

    def transform(self, X, *_):
        """Clean every sentence in the iterable ``X`` and return them as a list of strings."""
        return [self._clean_sentence(sentence) for sentence in X]

    @classmethod
    def _is_punctuation(cls, token):
        """Return True when every character of ``token`` is a punctuation character.

        A plain ``token in string.punctuation`` is a *substring* test: it lets
        multi-character tokens such as '...' or '?!' through and only removes
        runs that happen to appear contiguously in ``string.punctuation``.
        """
        return all(char in cls._PUNCTUATION for char in token)

    def _clean_sentence(self, sentence):
        """Apply the configured cleaning steps to one sentence and return the new string."""
        # NOTE(review): regex replacement is disabled, so self.regex_list is never used.
        # To re-enable it, apply each pair here:
        #     for pattern, replacement in self.regex_list:
        #         sentence = re.sub(pattern, replacement, sentence)

        # lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)

        # Remove punctuation-only tokens
        if self.remove_punct:
            words = [word for word in words if not self._is_punctuation(word)]

        # Stem words
        if self.stemmer:
            words = [self.stemmer.stem(word) for word in words]

        # Join list elements into string
        return " ".join(words)

## Pipeline

In [22]:
# Build, train and evaluate the baseline pipeline.
#
# RUN_GRID_SEARCH = False (default) trains the single baseline model.
# Set it to True to compare several classifiers with a 5-fold grid search instead.
RUN_GRID_SEARCH = False


def build_baseline_pipeline():
    """Return an unfitted cleaner -> TF-IDF -> random-forest pipeline.

    The forest is seeded with the notebook-level ``random_state`` so that
    repeated runs produce the same model and the same score.

    NOTE(review): ``stop_words`` holds unstemmed words while the 'stemm' step
    stems the tokens before the vectorizer sees them, so some stop words may
    no longer match -- verify whether that is intended.
    """
    return Pipeline([
        ('stemm', PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)),
        ('vect', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words, max_features=20000)),
        ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=2000,
                                              class_weight='balanced_subsample',
                                              random_state=random_state)),
    ])


if not RUN_GRID_SEARCH:
    pipe = build_baseline_pipeline()
    # Train the classifier
    pipe.fit(X_train, y_train)

    predicted = pipe.predict(X_test)
    # Accuracy on the held-out split
    mean = np.mean(predicted == y_test)
else:
    # Imported here because they are only needed by the optional search
    # (xgboost in particular is an extra dependency).
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from xgboost import XGBClassifier
    from sklearn.svm import LinearSVC

    # Alternative grids kept for experimentation; they are not used by the search below.
    search_space = [{'classifier': [LogisticRegression()],
                     'classifier__penalty': ['l1', 'l2']},
                    {'classifier': [RandomForestClassifier()],
                     'classifier__n_estimators': [10, 15],
                     'classifier__max_features': [ 2, 3]}]

    search_space2 = [{'classifier': [MultinomialNB()]},
                    {'classifier': [RandomForestClassifier()],
                     'classifier__n_estimators': [1000, 5000],
                     'classifier__max_features': [ 2, 3]}]

    # Each entry swaps a different estimator into the pipeline's 'classifier' step.
    loads_of_classifiers = [
        {'classifier': [RandomForestClassifier(random_state=random_state)],
         'classifier__n_estimators': [2000]},
        {'classifier': [KNeighborsClassifier()]},
        {'classifier': [LogisticRegression(multi_class='multinomial')]},
        {'classifier': [XGBClassifier(random_state=random_state)],
         'classifier__n_estimators': [2000]},
        {'classifier': [LinearSVC(multi_class='crammer_singer', random_state=random_state)]},
    ]

    pipe = build_baseline_pipeline()
    clf = GridSearchCV(pipe, loads_of_classifiers, cv=5, verbose=0, n_jobs=6)

    best_model = clf.fit(X_train, y_train)
    print(best_model.best_estimator_.get_params()['classifier'])

    cv_score = clf.cv_results_['mean_test_score']
    print("Grid search cv scores:", cv_score)

    predicted = best_model.predict(X_test)
    # Also expose the hold-out accuracy as `mean`, which the next cell displays.
    mean = np.mean(predicted == y_test)
    print("Acuracy on validation:", mean)

    def get_cv_summary(grid_clf):
        """Return the search's cv_results_ as a DataFrame, best-ranked candidate first."""
        return pd.DataFrame(grid_clf.cv_results_).sort_values(by='rank_test_score')

    scores_df = get_cv_summary(clf)
    print(scores_df)


In [23]:
# Accuracy of the baseline pipeline on the held-out split (computed in the previous cell).
mean

0.8840530043408728

## Submission

In [30]:
# Make sure no row with a missing value reaches the final fit, then preview the frame.
df = df.dropna(axis=0, how='any')
df.head(n=5)


Unnamed: 0,Request,Label
0,Adicionar música tomohisa yamashita à minha li...,add_to_playlist
1,Eu quero adicionando Aprite le finestre à minh...,add_to_playlist
2,Que curta-metragens estão tocando às 11 da man...,search_screening_event
3,precisa de uma sala em um que serve foie gras ...,book_restaurant
4,"Atribuir 4 estrelas de 6 para a crônica, Deus ...",rate_book


In [31]:
# Summary statistics of the training frame after dropping rows with missing values.
df.describe()

Unnamed: 0,Request,Label
count,26527,26527
unique,25752,8
top,Encontre o horário do filme.,no_intent
freq,5,12515


In [None]:
# Final model for the submission: the same cleaner -> TF-IDF -> random-forest pipeline
# as the baseline, this time fitted on all labelled data. The forest is seeded with
# random_state so the submission can be regenerated exactly.
pipe = Pipeline([('stemm', PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)),
                 ('vect', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words, max_features=20000)),
                 ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=2000,
                                                       class_weight='balanced_subsample',
                                                       random_state=random_state))])
# Train the classifier
pipe.fit(df.Request, df.Label)

# NOTE(review): train.csv contained rows with a missing Request; if test.csv does too,
# the cleaner will fail on them because it calls string methods on every request.
# Confirm the test set has no missing values (or fill them) before predicting.
predicted = pipe.predict(test.Request)

In [None]:
# Predicted intent labels for the test set.
predicted