In [5]:
import os
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fastText import load_model
from imblearn.combine import SMOTETomek
from nltk.corpus import stopwords
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from textacy.preprocess import preprocess_text as textacy_preprocess

warnings.filterwarnings("ignore")

In [6]:
# Spanish stop words from NLTK, stored as a real set so that the
# `word not in STOPWORDS_SET` test in remove_stopwords is a hash lookup
# instead of a scan over a list.
STOPWORDS_SET = set(stopwords.words('spanish'))

# Character class covering four emoji code-point ranges; one or more
# consecutive matches are treated as a single run.
EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)


def remove_stopwords(text):
    """Drop stop words from a whitespace-separated string.

    ``text`` is split on whitespace, every token found in the module-level
    ``STOPWORDS_SET`` is discarded, and the surviving tokens are joined
    back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in STOPWORDS_SET:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


def remove_emoticons(text):
    """Return ``text`` with every match of ``EMOJI_PATTERN`` deleted."""
    return re.sub(EMOJI_PATTERN, '', text)


def to_lowercase(text):
    """Return the result of calling ``lower()`` on ``text``."""
    lowered = text.lower()
    return lowered


def preprocess_text(text, fix_unicode=True, lowercase=True,
                    no_urls=True, no_emails=True,
                    no_phone_numbers=True,
                    no_numbers=True, no_currency_symbols=True,
                    no_punct=True, no_accents=True):
    """Clean one raw text value.

    Emoji matched by ``remove_emoticons`` are stripped, the text is
    optionally lower-cased, and the result is handed to textacy's
    ``preprocess_text``.

    Args:
        text: The raw value. ``None``, an empty string, any other falsy
            value and float NaN (how pandas represents a missing cell) are
            all treated as "no text".
        lowercase: When true the text is lower-cased here before the
            textacy call; the flag is also forwarded to textacy.
        fix_unicode, no_urls, no_emails, no_phone_numbers, no_numbers,
        no_currency_symbols, no_punct, no_accents: Forwarded unchanged to
            textacy's ``preprocess_text``.

    Returns:
        The cleaned string, or ``""`` when there is no text to clean.
    """
    # NaN is truthy, so a plain `if text:` lets it through and the regex
    # substitution then fails with a TypeError; x != x is true only for NaN.
    is_missing = isinstance(text, float) and text != text
    if is_missing or not text:
        return ""

    clean_text = remove_emoticons(text)
    if lowercase:
        clean_text = to_lowercase(clean_text)
    return textacy_preprocess(clean_text, fix_unicode=fix_unicode,
                              lowercase=lowercase,
                              no_urls=no_urls, no_emails=no_emails,
                              no_phone_numbers=no_phone_numbers,
                              no_numbers=no_numbers,
                              no_currency_symbols=no_currency_symbols,
                              no_punct=no_punct,
                              no_accents=no_accents)

## Data loading and fastText corpus

In [7]:
# Load the '|'-separated training file and name its columns.
# NOTE(review): assigning df.columns replaces whatever header read_csv parsed
# from the first row — confirm train.csv really starts with a header line,
# otherwise the first sample is consumed as column names.
df = pd.read_csv('train.csv',sep = '|')
df.columns = ['Pregunta', 'Intencion']

In [8]:
# Cleaned version of every training question (default preprocessing flags).
df["clean_txt"] = df["Pregunta"].apply(preprocess_text)

In [9]:
# Load the evaluation questions.
# NOTE(review): this file is read with the default ',' separator while
# train.csv is read with '|' — confirm the two files really differ in format.
df_test = pd.read_csv('test_santander.csv')

In [10]:
# Cleaned version of every evaluation question (default preprocessing flags).
df_test["clean_txt"] = df_test["Pregunta"].apply(preprocess_text)

In [11]:
# Corpus for the unsupervised fastText runs: cleaned training questions
# followed by cleaned evaluation questions.
sentences = [*df["clean_txt"].values, *df_test["clean_txt"].values]
len(sentences)

26806

In [12]:
# Peek at the first two cleaned sentences.
sentences[:2]

['como puedo trabajar en santander rio', 'pagar tarjeta visa querer reintegro']

In [24]:
# Write one cleaned sentence per line as input for the unsupervised fastText
# runs. The encoding is pinned to UTF-8 (what fastText expects) instead of
# relying on the platform default.
with open("ft_train.txt", 'w', encoding="utf-8") as ofile:
    ofile.writelines(sent + '\n' for sent in sentences)

In [13]:
# The 20 cleaned questions that appear most often in the training data,
# with the number of rows each one has.
(
    df.groupby(["clean_txt"])
    .agg({"Intencion": 'count'})
    .reset_index()
    .sort_values(["Intencion"], ascending=False)
    .head(20)
)

Unnamed: 0,clean_txt,Intencion
1021,comision custodio,4
9641,pagar afip,4
9429,no veo debitada la factura de telecom que en l...,4
5438,estoy adherido al debito automatico de los ser...,4
9597,olvide mi contrasena,4
1039,comision por extraccion en el exterior,4
10986,que beneficios tiene la tarjeta,4
4572,donde esta mi tarjeta,4
13690,quiero saber si puedo sacar un prestamo,3
3769,cuanto dinero puedo sacar por cajero automatico,3


In [None]:
# Train skipgram vectors on ft_train.txt (dim 50, character n-grams of
# length 3 to 7, window 2); the model is saved under the ft_lm_skipgram prefix.
! fasttext skipgram -input ft_train.txt -epoch 100 -lr 0.1 -lrUpdateRate 50 -loss ns -minn 3 -maxn 7 -ws 2 -minCount 1 -dim 50 -neg 5 -verbose 2 -thread 12 -output ft_lm_skipgram

In [None]:
! fasttext cbow -input ft_train.txt -epoch 100 -lr 0.1 -lrUpdateRate 50 -loss ns -minn3 -maxn 7 -ws 2 -minCount 1 -dim 50 -neg 5 -verbose 2 -thread 12 -output ft_lm_cbow

## Load the fastText models

In [14]:
# Load the two unsupervised models produced by the fasttext commands.
# The "skigram" spelling (sic) is the name the later cells reference, so it
# must stay in sync with them if it is ever corrected.
ft_model_cbow = load_model("ft_lm_cbow.bin")
ft_model_skigram = load_model("ft_lm_skipgram.bin")

In [15]:
# One sentence vector per training question from each embedding model.
df["feats_cbow"] = df["clean_txt"].apply(ft_model_cbow.get_sentence_vector)
df["feats_skigram"] = df["clean_txt"].apply(ft_model_skigram.get_sentence_vector)

In [16]:
# Inspect the first rows, including the two new feature columns.
df.head()

Unnamed: 0,Pregunta,Intencion,clean_txt,feats_cbow,feats_skigram
0,como puedo trabajar en santander rio,Cat_102,como puedo trabajar en santander rio,"[-0.047589377, 0.0447164, -0.07848723, 0.14958...","[0.010053147, -0.009651233, -0.10590075, -0.00..."
1,pagar tarjeta visa querer reintegro,Cat_350,pagar tarjeta visa querer reintegro,"[-0.011047299, 0.21139987, 0.039497666, -0.048...","[0.020344956, 0.20709991, -0.065466926, -0.103..."
2,pagar tarjeta naranja sistema,Cat_132,pagar tarjeta naranja sistema,"[-0.00039441325, 0.16295655, -0.0021106154, 0....","[0.02834517, 0.09324221, -0.028042024, -0.0284..."
3,no se debitó la primera cuota del plan de bien...,Cat_129,no se debito la primera cuota del plan de bien...,"[0.018735666, 0.025599986, 0.013943253, 0.0835...","[0.06818043, 0.02883004, 0.011213178, 0.062340..."
4,abonar tarjeta credito,Cat_342,abonar tarjeta credito,"[0.055869073, 0.15869121, 0.07427165, 0.027957...","[-0.00225306, 0.057065807, -0.017361669, -0.11..."


In [17]:
# One sentence vector per evaluation question from each embedding model.
df_test["feats_cbow"] = df_test["clean_txt"].apply(ft_model_cbow.get_sentence_vector)
df_test["feats_skigram"] = df_test["clean_txt"].apply(ft_model_skigram.get_sentence_vector)

## Train Model Cbow

In [102]:
# Feature matrix built from the per-row CBOW sentence vectors, with the
# intent label as target.
X = np.array(list(df["feats_cbow"]))
y = df["Intencion"]

# Hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)

In [106]:
# Linear-kernel SVM fitted on the CBOW training features.
clf = SVC(C=10, kernel="linear")
clf.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [107]:
# Predicted intent for every held-out row.
preds = clf.predict(X_test)

In [108]:
# Report accuracy and balanced accuracy on the held-out split, rounded to
# three decimals.
print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y_test, preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y_test, preds),3)))

El valor de Accuracy en test es de: 0.662
El valor de Balanced Accuracy en test es de: 0.497


In [110]:
# Display the predicted labels.
preds

array(['Cat_132', 'Cat_1', 'Cat_258', ..., 'Cat_163', 'Cat_223',
       'Cat_173'], dtype=object)

## Train Model Skipgram

In [18]:
# Feature matrix built from the per-row skipgram sentence vectors, with the
# intent label as target.
X = np.array(list(df["feats_skigram"]))
y = df["Intencion"]

# Hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)

In [19]:
# Grid over the regularisation strength C for a linear-kernel SVM.
# NOTE(review): the saved best_estimator_ output reports C=1500, the
# smallest value in this grid, so smaller values may be worth trying.
parameters = dict(
    kernel=['linear'],
    C=[1500, 3000, 7000, 10000, 50000],
)
clf = GridSearchCV(estimator=SVC(), param_grid=parameters, n_jobs=-1, verbose=2)

In [130]:
# Run the grid search on the training split (about 5 minutes according to
# the saved log below).
clf.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  3.7min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  5.1min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'kernel': ['linear'], 'C': [1500, 3000, 7000, 10000, 50000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [132]:
# Show the estimator with the winning hyper-parameters.
clf.best_estimator_

SVC(C=1500, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [133]:
# Refit a plain SVC with the C chosen by the grid search.
# NOTE(review): the grid search ran with refit=True (see its saved repr), so
# its best_estimator_ is presumably already this model fitted on X_train —
# this extra fit looks redundant.
clf = SVC(C=1500, kernel="linear")
clf.fit(X_train, y_train)

SVC(C=1500, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [134]:
# Predicted intent for every held-out row.
preds = clf.predict(X_test)

In [135]:
# Display the predicted labels.
preds

array(['Cat_132', 'Cat_1', 'Cat_258', ..., 'Cat_163', 'Cat_223',
       'Cat_173'], dtype=object)

In [123]:
# Report accuracy and balanced accuracy on the held-out split.
# NOTE(review): this cell's execution count (123) is lower than those of the
# fit/predict cells above (130-134), so its saved output comes from an
# earlier kernel state; the identical cell that follows has the current numbers.
print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y_test, preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y_test, preds),3)))

El valor de Accuracy en test es de: 0.71
El valor de Balanced Accuracy en test es de: 0.608


In [136]:
# Report accuracy and balanced accuracy of the C=1500 linear SVM on the
# held-out split, rounded to three decimals.
print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y_test, preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y_test, preds),3)))

El valor de Accuracy en test es de: 0.709
El valor de Balanced Accuracy en test es de: 0.608


## Resampling with SMOTE + Tomek links

In [23]:
# Drop every class with fewer than 6 training samples before resampling.
# Presumably this is because SMOTE's default k_neighbors=5 needs at least
# 6 samples per class — confirm against the installed imbalanced-learn version.
# NOTE(review): the dropped classes may still occur in X_test / y_test, and a
# model trained on the resampled data can never predict them.
cat_to_delete = [ cat for cat, occ in sorted(Counter(y_train).items(), key=lambda x: x[1]) if occ < 6]
rare_categories = set(cat_to_delete)  # hash lookup instead of scanning the list per row
keep = [(x, y) for x, y in zip(X_train, y_train) if y not in rare_categories]
X = np.array([ x for x,_ in keep])
y = np.array([y for _,y in keep])

# Seed the resampler (same seed as the train/test split) so the synthetic
# samples, and therefore the reported metrics, are reproducible.
smote_tomek = SMOTETomek(n_jobs=-1, random_state=13571113)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

In [24]:
# Same linear SVM configuration as before, fitted on the resampled data.
clf = SVC(C=1500, kernel="linear")
clf.fit(X_resampled, y_resampled)

SVC(C=1500, kernel='linear')

In [25]:
# Score the model trained on resampled data against the untouched hold-out
# split.
# NOTE(review): the variable name says "smote_enn", but the resampler used
# above is SMOTETomek.
clf_smote_enn_preds = clf.predict(X_test)

print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y_test, clf_smote_enn_preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y_test, clf_smote_enn_preds),3)))


El valor de Accuracy en test es de: 0.708
El valor de Balanced Accuracy en test es de: 0.584


## Supervised fastText classifier

In [57]:
# Marker that replaces "Cat_" in the intent names when building the label
# tokens for the supervised fastText files.
prefix = "__label__"

In [58]:
# The supervised fastText model works on raw cleaned text, not on vectors.
X = df["clean_txt"]
y = df["Intencion"]

# Hold-out split with the same fixed seed as the SVM experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)

In [59]:
# Rewrite "Cat_<n>" intent names as "<prefix><n>" label tokens.
y_train = list(map(lambda label: label.replace("Cat_", prefix), y_train))
y_test = list(map(lambda label: label.replace("Cat_", prefix), y_test))

In [60]:
# One line per example: the cleaned question, a space, then its label token.
sentences_train = [f'{question} {label}' for question, label in zip(X_train, y_train)]
sentences_test = [f'{question} {label}' for question, label in zip(X_test, y_test)]

In [151]:
# Write the labelled training and validation files for supervised fastText.
# The output folder is created first so open() does not fail on a fresh
# checkout, and the encoding is pinned to UTF-8 (what fastText expects)
# instead of relying on the platform default.
os.makedirs("supervised", exist_ok=True)
with open("supervised/ft_train.txt", 'w', encoding="utf-8") as ofile:
    ofile.writelines(sent + '\n' for sent in sentences_train)
with open("supervised/ft_val.txt", 'w', encoding="utf-8") as ofile:
    ofile.writelines(sent + '\n' for sent in sentences_test)

In [None]:
# Train a supervised fastText classifier on the labelled training file
# (10 epochs, hierarchical-softmax loss, unigram features, dim 50); the
# model is saved under the supervised/ft_sup prefix.
! fasttext supervised -input supervised/ft_train.txt -epoch 10 -lr 0.4 -lrUpdateRate 100 -loss hs -wordNgrams 1 -verbose 2 -minCount 1 -dim 50 -output supervised/ft_sup

In [149]:
P@1     0.574
R@1     0.574
Number of examples: 5026

In [None]:
# Let fastText pick its own hyper-parameters, using ft_val.txt as the
# validation set; the model is saved under the supervised/ft_sup_auto prefix.
# NOTE(review): ft_val.txt is written from X_test / y_test, the same split
# that is scored afterwards, so those metrics are measured on data the tuning
# has already seen and are likely optimistic. A separate validation split
# would be needed for an unbiased number.
# NOTE(review): this cell calls fastText/./fasttext while the other cells
# call plain `fasttext` — confirm both refer to the same build.
! fastText/./fasttext supervised -input supervised/ft_train.txt -output supervised/ft_sup_auto -autotune-validation supervised/ft_val.txt

In [9]:
# Load the autotuned supervised model.
ft_sup_auto = load_model("supervised/ft_sup_auto.bin")

In [66]:
# For each held-out question keep element [0][0] of predict()'s result
# (presumably the top-ranked label string — confirm against the fastText
# Python API).
preds = [ft_sup_auto.predict(x)[0][0] for x in X_test]

In [71]:
# Report accuracy and balanced accuracy of the autotuned fastText model on
# the held-out split; y_test already holds the prefixed label tokens that
# the predictions are compared against.
print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y_test, preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y_test, preds),3)))

El valor de Accuracy en test es de: 0.775
El valor de Balanced Accuracy en test es de: 0.629


In [52]:
# List the attribute names of the model's argument object. This reaches into
# the wrapper's instance dict for its 'f' entry, which looks like an internal
# detail rather than public API.
ft_sup_auto.__dict__['f'].getArgs().__dir__()

['__init__',
 '__doc__',
 '__module__',
 'input',
 'output',
 'lr',
 'lrUpdateRate',
 'dim',
 'ws',
 'epoch',
 'minCount',
 'minCountLabel',
 'neg',
 'wordNgrams',
 'loss',
 'model',
 'bucket',
 'minn',
 'maxn',
 'thread',
 't',
 'label',
 'verbose',
 'pretrainedVectors',
 'saveOutput',
 'qout',
 'retrain',
 'qnorm',
 'cutoff',
 'dsub',
 '__new__',
 '__repr__',
 '__hash__',
 '__str__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__reduce_ex__',
 '__reduce__',
 '__subclasshook__',
 '__init_subclass__',
 '__format__',
 '__sizeof__',
 '__dir__',
 '__class__']

In [43]:
# Read the `maxn` value stored in the autotuned model's arguments.
ft_sup_auto.__dict__['f'].getArgs().maxn

6