## baseline

In [6]:
import nltk
nltk.download('rslp')

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [7]:
import pandas as pd
import numpy as np

import re
import string
from numpy import inf

# Baseline 
from collections import Counter, OrderedDict
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
stop_words = stopwords.words('portuguese')
import warnings; warnings.simplefilter('ignore')


# SKLearn related imports
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


random_state = 42

In [8]:
df = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [9]:
df.head()

Unnamed: 0,Request,Label
0,Adicionar música tomohisa yamashita à minha li...,add_to_playlist
1,Eu quero adicionando Aprite le finestre à minh...,add_to_playlist
2,Que curta-metragens estão tocando às 11 da man...,search_screening_event
3,precisa de uma sala em um que serve foie gras ...,book_restaurant
4,"Atribuir 4 estrelas de 6 para a crônica, Deus ...",rate_book


In [10]:
df.describe()

Unnamed: 0,Request,Label
count,26527,26799
unique,25752,8
top,Encontre o horário do filme.,no_intent
freq,5,12645


In [11]:
df.Request.iloc[10]

'Toca a melhor música de Phoebe Snow'

In [12]:
df.Label.unique()

array(['add_to_playlist', 'search_screening_event', 'book_restaurant',
       'rate_book', 'get_weather', 'play_music', 'search_creative_work',
       'no_intent'], dtype=object)

## Train Test Split

In [13]:
df = df.dropna()

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    df.Request, df.Label, test_size=0.33, random_state=random_state)

In [15]:
tokenizer = WordPunctTokenizer()
stemmer = RSLPStemmer()
regex_list = [("<[^>]*>", "")]

## Tokenize and Stem

### Tokenize

In [16]:
X_train.iloc[0]

'Vai ser parte do mistério o tempo todo, porque se nasceu em Valparaíso e foi inscrito em Santiago, vai constar na certidão como Santiago.'

In [17]:
def tokenize(text):
    tokenizer = WordPunctTokenizer()
    token_text = text.apply(lambda y: tokenizer.tokenize(y.Request), axis=1)
    
    return token_text

In [18]:
# Custom transformer to implement sentence cleaning
class PortugueseCleaner(TransformerMixin):
    def __init__(self, tokenizer, stemmer, regex_list,
                 lower=True, remove_punct=True):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        
    def transform(self, X, *_):
        X = list(map(self._clean_sentence, X))
        return X
    
    def _clean_sentence(self, sentence):
        
        # Replace given regexes
        #for regex in self.regex_list:
        #    sentence = re.sub(regex[0], regex[1], sentence)
            
        # lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)
            
        # Remove punctuation
        if self.remove_punct:
            words = list(filter(lambda x: x not in string.punctuation, words))

        # Stem words
        if self.stemmer:
            words = map(self.stemmer.stem, words)

        # Join list elements into string
        sentence = " ".join(words)
        
        return sentence
    
    def fit(self, *_):
        return self

## Pipeline

In [19]:
# Build the pipeline

if 1 == 1:
    pipe = Pipeline([('stemm', PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)),
                     ('vect', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words, max_features=20000)),
                     ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=2000, class_weight='balanced_subsample'))])
    # Train the classifier
    pipe.fit(X_train, y_train)

    predicted = pipe.predict(X_test)
    mean = np.mean(predicted == y_test)
else:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from xgboost import XGBClassifier
    from sklearn.svm import LinearSVC, SVC
    
    
    search_space = [{'classifier': [LogisticRegression()],
                     'classifier__penalty': ['l1', 'l2']},
                    {'classifier': [RandomForestClassifier()],
                     'classifier__n_estimators': [10, 15],
                     'classifier__max_features': [ 2, 3]}]

    search_space2 = [{'classifier': [MultinomialNB()]},
                    {'classifier': [RandomForestClassifier()],
                     'classifier__n_estimators': [1000, 5000],
                     'classifier__max_features': [ 2, 3]}]

    k = 5000
    loads_of_classifiers = [
                        {'classifier': [RandomForestClassifier()],
                         'classifier__n_estimators': [2000]},
                        {'classifier': [KNeighborsClassifier()]},
                        {'classifier': [LogisticRegression(multi_class='multinomial')]},
                        {'classifier': [XGBClassifier()],
                        'classifier__n_estimators': [2000]},
                       {'classifier': [LinearSVC(multi_class='crammer_singer')]},]
    
    pipe = Pipeline([('stemm', PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)),
                     ('vect', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words, max_features=20000)),
                     ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=2000, class_weight='balanced_subsample'))])
    
    clf = GridSearchCV(pipe, loads_of_classifiers, cv=5, verbose=0, n_jobs=6)
    
    from sklearn.feature_selection import SelectFromModel
    from sklearn.feature_selection import f_classif

    X = X_train
    y =  y_train
    X_val = X_test
    y_val = y_test

    best_model = clf.fit(X, y)
    print(best_model.best_estimator_.get_params()['classifier'])

    cv_score = clf.cv_results_['mean_test_score']

    print("Grid search cv scores:", cv_score)

    predicted = best_model.predict(X_val)
    print("Acuracy on validation:", np.mean(predicted == y_val))
    
    def get_cv_summary(grid_clf):
        return pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_score')

    scores_df = get_cv_summary(clf)
    print(scores_df)


KeyboardInterrupt: 

In [20]:
mean

NameError: name 'mean' is not defined

## Submission

In [21]:
df = df.dropna()
df.head()


Unnamed: 0,Request,Label
0,Adicionar música tomohisa yamashita à minha li...,add_to_playlist
1,Eu quero adicionando Aprite le finestre à minh...,add_to_playlist
2,Que curta-metragens estão tocando às 11 da man...,search_screening_event
3,precisa de uma sala em um que serve foie gras ...,book_restaurant
4,"Atribuir 4 estrelas de 6 para a crônica, Deus ...",rate_book


In [22]:
df.describe()

Unnamed: 0,Request,Label
count,26527,26527
unique,25752,8
top,Encontre o horário do filme.,no_intent
freq,5,12515


In [23]:
pipe = Pipeline([('stemm', PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)),
                 ('vect', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words, max_features=20000)),
                 ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=2000, class_weight='balanced_subsample'))])
# Train the classifier
pipe.fit(df.Request, df.Label)

predicted = pipe.predict(test.Request)

KeyboardInterrupt: 

In [None]:
predicted

# Part-of-Speech



In [84]:
import nltk
#from nltk.corpus import mac_morpho
import spacy
#nltk.download('mac_morpho')
#nltk.download('averaged_perceptron_tagger')
#nlt = nltk.pos_tag(mac_morpho.sents()[0])
nlp = spacy.load('pt_core_news_sm')

# Funcao para adicionar features POS

In [105]:

def create_aditional_POS_features(_df, text_column_name,_nlp, return_only_new_feats = False):
    n_adj, n_verbs, len_message= [], [], []
    _df_copy = _df.copy()
    n_adv = []
    n_noun = [] 
    
    for doc in _nlp.pipe(_df_copy[text_column_name]):
        n_adj.append(len([token for token in doc if token.pos_ == 'ADJ']))
        if len([token for token in doc if token.pos_ == 'ADJ']) is np.nan:
            print(doc)
            
        n_verbs.append(len([token for token in doc if token.pos_ == 'VERB']))
        
        # Acrescentados adverbs ADV , nouns NOUN
        n_adv.append(len([token for token in doc if token.pos_ == 'ADV']))
        n_noun.append(len([token for token in doc if token.pos_ == 'NOUN']))
        
        len_message.append(len(doc))
    _df_copy['n_adj'] = pd.Series(n_adj)
    _df_copy['n_verbs'] = pd.Series(n_verbs)
    _df_copy['n_adv'] = pd.Series(n_adv)
    _df_copy['n_noun'] = pd.Series(n_noun)
    _df_copy['len_text'] = pd.Series(len_message)
    
    if return_only_new_feats:
        return _df_copy.drop(text_column_name, axis=1)
    else:
        return _df_copy

# Text and number column selectors

In [87]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    




# Pipeline with feature union to combine features and simple model

In [122]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler


text = Pipeline([
                ('selector', TextSelector(key='Request')),
                # Acrescentar PortugueseCleaner
                ('stemm', PortugueseCleaner(tokenizer, stemmer, regex_list, lower=False)),
                ('tfidf', TfidfVectorizer(ngram_range=(1,2), stop_words=stop_words))
            ])

adj =  Pipeline([
                ('selector', NumberSelector(key='n_adj')),
                ('standard', StandardScaler())
            ])

verbs =  Pipeline([
                ('selector', NumberSelector(key='n_verbs')),
                ('standard', StandardScaler())
            ])

len_text =  Pipeline([
                ('selector', NumberSelector(key='len_text')),
                ('standard', StandardScaler())
            ])

feats = FeatureUnion([('text', text), 
                      ('adj', adj),
                      ('verbs', verbs),
                      ('len_text', len_text)
                       ])


final_pipe = Pipeline([('features',feats),
                     ('classifier', RandomForestClassifier())])



In [106]:
df2 = create_aditional_POS_features(df, "Request",nlt, return_only_new_feats = False)

In [123]:
df3 = df2.dropna()
df3.isnull().values.any()

False

In [124]:
#You can pass the label to the classifier because of the feature selectors

# X = train_data.drop('label', axis=1)
# y =  train_data.label
# X_val = test_data
# y_val = test_data.label

X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(
    df3.drop('Label', axis=1), df3.Label, test_size=0.33, random_state=random_state)

# X_train, X_test, y_train, y_test

final_pipe.fit(X_train_pos, y_train_pos)

preds = final_pipe.predict(X_test_pos)
accuracy = np.mean(preds == y_test_pos)

print("Accuracy: {:.4f}".format(accuracy))

Accuracy: 0.8599


In [102]:
for coluna in df2.columns:
    print("Coluna :", coluna)
    print(df2[coluna].isnull().values.sum()

Coluna : Request
0
Coluna : Label
0
Coluna : n_adj
268
Coluna : n_verbs
268
Coluna : n_adv
268
Coluna : n_noun
268
Coluna : len_text
268
