In [74]:
!pip install joblib



In [75]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from textacy.preprocess import preprocess_text as textacy_preprocess

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.auto import tqdm
from googletrans import Translator

import warnings
warnings.filterwarnings("ignore")

# Ejemplo de uso de google translate

In [12]:
translator = Translator()
sentences_es = ["necesito pagar las cuentas", "ola k ace"]
translations_en = translator.translate(sentences_es, src="es", dest="en")
for translation in translations_en:
    print(translation.origin, translation.text)

necesito pagar las cuentas I need to pay the bills
ola k ace Hi, what are you doing


# Load Data

In [27]:
df = pd.read_csv('data/train.csv',sep = '|')
df.columns = ['Pregunta', 'Intencion']
print(f'df shape: {df.shape}')
df.head(2)

df shape: (20104, 2)


Unnamed: 0,Pregunta,Intencion
0,como puedo trabajar en santander rio,Cat_102
1,pagar tarjeta visa querer reintegro,Cat_350


In [115]:
df_test = pd.read_csv('data/test_santander.csv')
print(f'df shape: {df_test.shape}')
df_test.head(2)

df shape: (6702, 2)


Unnamed: 0,id,Pregunta
0,0,querer saber tarjeta sin limite
1,1,¿cuál es el límite de mi tarjeta santander?


# translate es --> en --> es

In [28]:
sentences_es = list(df.Pregunta.values)
print(f'Amount of sentences {len(sentences_es)}')
sentences_es[:2]

Amount of sentences 20104


['como puedo trabajar en santander rio', 'pagar tarjeta visa querer reintegro']

In [29]:
translator = Translator()

### Translate es --> en

In [56]:
translations_en = []
for sent in tqdm(sentences_es):
    translation = translator.translate(sent, src="es", dest="en")
    translations_en.append(translation)
print(f'Amount sentences en: {len(translations_en)}')
translations_en[:2]

HBox(children=(IntProgress(value=0, max=20104), HTML(value='')))

Amount sentences en: 20104


[<googletrans.models.Translated at 0x10c8e6490>,
 <googletrans.models.Translated at 0x121bf2ed0>]

In [57]:
sentences_en = [ translation.text for translation in translations_en]

In [58]:
translations_es_back = []
for sent in tqdm(sentences_en):
    translation = translator.translate(sent, src="en", dest="es")
    translations_es_back.append(translation)
print(f'Amount sentences en: {len(translations_es_back)}')
translations_es_back[:2]

HBox(children=(IntProgress(value=0, max=20104), HTML(value='')))

Amount sentences en: 20104


[<googletrans.models.Translated at 0x122408bd0>,
 <googletrans.models.Translated at 0x122408fd0>]

In [59]:
sentences_es_back = [ translation.text for translation in translations_es_back]

In [60]:
sentences_es_back[:2]

['¿Cómo puedo trabajar en Santander Rio?',
 'pagar tarjeta visa quiere reembolso']

In [62]:
df["Pregunta_T1"] = sentences_es_back

In [72]:
df.to_csv("data/train_with_translations.csv", index=False)

In [119]:
test_sentences_es = list(df_test.Pregunta.values)
test_translation_en = []
for sent in tqdm(test_sentences_es):
    translation = translator.translate(sent, src="es", dest="en")
    test_translation_en.append(translation)
print(f'Amount sentences en: {len(test_translation_en)}')
test_translation_en[:2]

HBox(children=(IntProgress(value=0, max=6702), HTML(value='')))

Amount sentences en: 6702


[<googletrans.models.Translated at 0x12cf9f450>,
 <googletrans.models.Translated at 0x12cf9fc50>]

In [120]:
test_sentences_en = [ translation.text for translation in test_translation_en]

In [126]:
test_translations_es_back = []
for sent in tqdm(test_sentences_en):
    translation = translator.translate(sent, src="en", dest="es")
    translations_es_back.append(translation)
print(f'Amount sentences en: {len(test_translations_es_back)}')
test_translations_es_back[:2]

HBox(children=(IntProgress(value=0, max=6702), HTML(value='')))

Amount sentences en: 0


[]

In [132]:
test_sentences_es_back = [ translation.text for translation in translations_es_back]

In [127]:
test_sentences_es_back = [ translation.text for translation in test_translations_es_back]

In [136]:
df_test["Pregunta_T1"] = test_sentences_es_back[-6702:]
df_test.to_csv("data/test_with_translations.csv", index=False)

## Preprocess

In [76]:
STOPWORDS_SET = stopwords.words('spanish')

EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)


def remove_stopwords(text):
    """Remove stop words from list of tokenized words."""
    new_words = [word for word in text.split() if word not in STOPWORDS_SET]
    return ' '.join(new_words)


def remove_emoticons(text):
    """Remove emoticos."""
    return EMOJI_PATTERN.sub('', text)


def to_lowercase(text):
    """To lowercase."""
    return text.lower()


def preprocess_text(text, fix_unicode=True, lowercase=True,
                    no_urls=True, no_emails=True,
                    no_phone_numbers=True,
                    no_numbers=True, no_currency_symbols=True,
                    no_punct=True, no_accents=True):
    """Preprocess text."""
    clean_text = ""
    if text:
        clean_text = remove_emoticons(text)
        if lowercase:
            clean_text = to_lowercase(clean_text)
#        clean_text = remove_stopwords(clean_text)
        clean_text = textacy_preprocess(clean_text, fix_unicode=fix_unicode,
                                        lowercase=lowercase,
                                        no_urls=no_urls, no_emails=no_emails,
                                        no_phone_numbers=no_phone_numbers,
                                        no_numbers=no_numbers,
                                        no_currency_symbols=no_currency_symbols,
                                        no_punct=no_punct,
                                        no_accents=no_accents)
    return clean_text

In [77]:
df["Pregunta_clean_txt"] = df["Pregunta"].apply(lambda x: preprocess_text(x))
df["Pregunta_T1_clean_txt"] = df["Pregunta_T1"].apply(lambda x: preprocess_text(x))

## Split Dataset

In [94]:
X_origin = df.Pregunta_clean_txt
X_translated = df.Pregunta_T1_clean_txt
y = df.Intencion

train_index, test_index = train_test_split(list(X_origin.index), random_state = 13571113)

# Model base origin

In [95]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_origin[train_index])
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [97]:
X_test_counts = count_vect.transform(X_origin[test_index])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [98]:
clf = SVC(kernel="linear", C=10)
clf.fit(X_train_tfidf, y[train_index])
preds = clf.predict(X_test_tfidf)

In [99]:
print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y[test_index], preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y[test_index], preds),3)))

El valor de Accuracy en test es de: 0.789
El valor de Balanced Accuracy en test es de: 0.674


# Model base translated

In [None]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_translated[train_index])
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = SVC(kernel="linear", C=10)
clf.fit(X_train_tfidf, y[train_index])

In [111]:
X_test_counts = count_vect.transform(X_origin[test_index])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
preds = clf.predict(X_test_tfidf)
print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y[test_index], preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y[test_index], preds),3)))

El valor de Accuracy en test es de: 0.686
El valor de Balanced Accuracy en test es de: 0.54


## BOTH DATA: ORIGIN + TRANSLATED

In [112]:
X_train = list(X_translated[train_index].values) +  list(X_origin[train_index].values)
y_train = list(y[train_index].values) + list(y[train_index].values)

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = SVC(kernel="linear", C=10)
clf.fit(X_train_tfidf, y_train)

SVC(C=10, kernel='linear')

In [113]:
X_test_counts = count_vect.transform(X_origin[test_index])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
preds = clf.predict(X_test_tfidf)
print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y[test_index], preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y[test_index], preds),3)))

El valor de Accuracy en test es de: 0.79
El valor de Balanced Accuracy en test es de: 0.686
