In [2]:
!pip install joblib



In [3]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from textacy.preprocess import preprocess_text as textacy_preprocess

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Spanish stop words from NLTK, stored as a real set so that the membership
# test in remove_stopwords is constant-time and the container matches its name.
STOPWORDS_SET = set(stopwords.words('spanish'))

# Compiled character class matching one or more consecutive code points taken
# from the four emoji ranges listed below.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)


def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in STOPWORDS_SET.

    ``text`` is a string (not a token list); the surviving tokens are joined
    back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in STOPWORDS_SET:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


def remove_emoticons(text):
    """Return ``text`` with every match of EMOJI_PATTERN deleted."""
    without_emoji = EMOJI_PATTERN.sub('', text)
    return without_emoji


def to_lowercase(text):
    """Return the result of calling ``text.lower()``."""
    lowered = text.lower()
    return lowered


def preprocess_text(text, fix_unicode=True, lowercase=True,
                    no_urls=True, no_emails=True,
                    no_phone_numbers=True,
                    no_numbers=True, no_currency_symbols=True,
                    no_punct=True, no_accents=True):
    """Clean one raw text string.

    Steps, in order: strip emoji with ``remove_emoticons``, optionally
    lower-case with ``to_lowercase``, then pass the result to textacy's
    ``preprocess_text`` with all keyword flags forwarded unchanged.
    ``remove_stopwords`` is not applied.

    Args:
        text: the raw text. Anything that is not a non-empty ``str``
            (``None``, ``""``, a pandas NaN, ...) is treated as missing.
        fix_unicode, lowercase, no_urls, no_emails, no_phone_numbers,
        no_numbers, no_currency_symbols, no_punct, no_accents: forwarded to
            textacy; ``lowercase`` additionally controls the local
            lower-casing step.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # NaN is a truthy float, so a bare ``if text:`` would let it reach the
    # regex substitution and raise TypeError; require an actual string.
    if not isinstance(text, str) or not text:
        return ""

    clean_text = remove_emoticons(text)
    if lowercase:
        clean_text = to_lowercase(clean_text)
    return textacy_preprocess(clean_text, fix_unicode=fix_unicode,
                              lowercase=lowercase,
                              no_urls=no_urls, no_emails=no_emails,
                              no_phone_numbers=no_phone_numbers,
                              no_numbers=no_numbers,
                              no_currency_symbols=no_currency_symbols,
                              no_punct=no_punct,
                              no_accents=no_accents)

## Load Dataset

In [5]:
# Read the pipe-separated training file and replace its column labels with the
# two names used throughout the notebook (the file must have exactly two columns).
df = pd.read_csv('train.csv',sep = '|')
df.columns = ['Pregunta', 'Intencion']

## Pre-process data

In [6]:
# Store the cleaned version of every question in a new column.
df["clean_txt"] = df["Pregunta"].apply(preprocess_text)

## Split Dataset

In [7]:
# Features are the cleaned questions; targets are the intent labels.
X = df["clean_txt"]
y = df["Intencion"]

# Fixed random_state keeps the split identical across runs; no test_size is
# given, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)

In [8]:
# Display the sizes of the four pieces of the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15078,), (5026,), (15078,), (5026,))

In [27]:
# Learn the vocabulary (token counts) and the TF-IDF weighting from the
# training split only; sublinear_tf=True is passed to TfidfTransformer.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [28]:
# Linear-kernel SVC with C=10, fitted on the TF-IDF training matrix.
clf = SVC(kernel="linear", C=10)
clf.fit(X_train_tfidf, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
# Transform the test split with the already-fitted vectorizer/transformer
# (transform only, no refit) and predict its labels.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
preds = clf.predict(X_test_tfidf)

In [30]:
# Print accuracy and balanced accuracy of `preds` against `y_test`,
# each rounded to three decimals.
for message, metric_fn in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(metric_fn(y_test, preds), 3)))

El valor de Accuracy en test es de: 0.789
El valor de Balanced Accuracy en test es de: 0.674


## Submit

In [31]:
# Load the unlabeled submission questions and preview the first two rows.
df_test = pd.read_csv('test_santander.csv')
df_test.head(2)

Unnamed: 0,id,Pregunta
0,0,querer saber tarjeta sin limite
1,1,¿cuál es el límite de mi tarjeta santander?


In [32]:
# Refit the whole pipeline on every labeled row for the submission.
# NOTE: this rebinds X_train / y_train (and count_vect, tfidf_transformer, clf)
# to objects built from the full dataset, replacing the ones from the split.
X_train = df.clean_txt
y_train = df.Intencion

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = SVC(kernel="linear", C=10)
clf.fit(X_train_tfidf, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [33]:
# Apply the same cleaning to the submission questions as to the training data.
df_test["clean_txt"] = df_test["Pregunta"].apply(preprocess_text)

In [35]:
# Vectorize the submission questions with the fitted vectorizer/transformer.
# NOTE: X_test now refers to the submission data, not the validation split.
X_test = df_test["clean_txt"]
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [36]:
# Predict an intent label for every submission question.
preds = clf.predict(X_test_tfidf)

In [37]:
# Turn labels such as 'Cat_303' into the integer after the underscore (303).
# str() + rpartition keep the cell idempotent: values that were already
# converted to int pass through unchanged instead of failing when re-run.
preds = [int(str(pred).rpartition("_")[2]) for pred in preds]

In [38]:
# Attach the numeric category predictions to the submission frame.
df_test["cat"] = preds

In [39]:
# Keep only the two columns written to the submission file and preview them.
submit_df = df_test[["id", "cat"]]
submit_df.head(2)

Unnamed: 0,id,cat
0,0,303
1,1,303


In [40]:
# Write the submission as a bare CSV: no header row and no index column.
submit_df.to_csv('preprocess_countv_tfidf_svc_df.csv', header=False, index=False)

## CountVectorizer fitted on train + test --> preprocessing + CountVectorizer + TF-IDF + SVC

In [67]:
# Features are the cleaned questions; targets are the intent labels.
X = df["clean_txt"]
y = df["Intencion"]

# Same seed as the other sections, so the split is identical; the last line
# displays the sizes of the four pieces.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15078,), (5026,), (15078,), (5026,))

In [69]:
# Fit the vocabulary and the TF-IDF weights on ALL labeled rows (X), then
# only transform the training split with them.
# NOTE(review): X also contains the rows that end up in X_test, so the metrics
# of this section are not a clean held-out estimate (vocabulary/IDF leakage).
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_tfidf = tfidf_transformer.fit_transform(X_counts)

X_train_counts = count_vect.transform(X_train)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [70]:
# Transform the test split with the vectorizer/transformer fitted above.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [75]:
# Grid-search the SVC penalty C over four values (linear kernel only) with
# n_jobs=-1; the last line displays the best estimator found.
parameters = {
    'kernel':['linear'],
    'C':[5, 10, 15, 20],
}
clf = GridSearchCV(SVC(), parameters, n_jobs=-1, verbose=2)
clf.fit(X_train_tfidf, y_train)
clf.best_estimator_

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:  4.7min remaining: 14.1min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  4.8min remaining:   57.6s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  4.8min finished


SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [76]:
# Fit a fresh linear SVC with C=5 on the training matrix.
clf = SVC(kernel="linear", C=5)
clf.fit(X_train_tfidf, y_train)

SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [77]:
# Predict labels for the test split.
preds = clf.predict(X_test_tfidf)

In [73]:
# Print accuracy and balanced accuracy of `preds` against `y_test`,
# each rounded to three decimals.
# NOTE(review): this cell carries execution count 73 while the fit/predict
# cells above carry 76/77, so its recorded output predates them.
for message, metric_fn in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(metric_fn(y_test, preds), 3)))

El valor de Accuracy en test es de: 0.784
El valor de Balanced Accuracy en test es de: 0.662


In [78]:
# Print accuracy and balanced accuracy of `preds` against `y_test`,
# each rounded to three decimals.
for message, metric_fn in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(metric_fn(y_test, preds), 3)))

El valor de Accuracy en test es de: 0.785
El valor de Balanced Accuracy en test es de: 0.665


## Gradient Boosting (scikit-learn `GradientBoostingClassifier`)

In [42]:
# Features are the cleaned questions; targets are the intent labels.
X = df["clean_txt"]
y = df["Intencion"]

# Same seed as the other sections, so the split is identical.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)

In [43]:
# Display the sizes of the four pieces of the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15078,), (5026,), (15078,), (5026,))

In [44]:
# Learn the vocabulary and the TF-IDF weighting from the training split only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [45]:
# Transform the test split with the fitted vectorizer/transformer.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [58]:
# Compare the full training matrix with the first-5000-row subset used for tuning.
X_train_tfidf.shape, X_train_tfidf[:5000].shape, y_train[:5000].shape

((15078, 5200), (5000, 5200), (5000,))

In [59]:
# Small hyper-parameter grid for gradient boosting.
parameters = {
    "max_features":["log2","sqrt"],
    "subsample":[0.5],
    "n_estimators":[10, 50]
    }

# subsample < 1 and max_features make the boosting stochastic, so the base
# estimator is seeded (same seed as the split) to make the search repeatable.
clf = GridSearchCV(GradientBoostingClassifier(random_state=13571113), parameters, n_jobs=-1, verbose=2)

# Only the first 5000 training rows are used for the search.
clf.fit(X_train_tfidf[:5000], y_train[:5000])

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:  2.4min remaining:  7.1min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  6.9min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  7.0min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': ['log2', 'sqrt'], 'subsample': [0.5], 'n_estimators': [10, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [61]:
# Score of the search's refitted estimator on the same 5000 rows it was fitted
# on (a training-set score, not a held-out one), then the winning parameters.
print(clf.score(X_train_tfidf[:5000], y_train[:5000]))
print(clf.best_params_)

0.1656
{'max_features': 'log2', 'n_estimators': 10, 'subsample': 0.5}


In [62]:
# Refit gradient boosting on the full training split with the selected
# parameters; seeded because subsample=0.5 and max_features are stochastic.
clf = GradientBoostingClassifier(n_estimators=10, subsample=0.5, max_features="log2",
                                 random_state=13571113)
clf.fit(X_train_tfidf, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='log2', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=0.5, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [63]:
# Predict labels for the test split.
preds = clf.predict(X_test_tfidf)

In [64]:
# Print accuracy and balanced accuracy of `preds` against `y_test`,
# each rounded to three decimals.
for message, metric_fn in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(metric_fn(y_test, preds), 3)))

El valor de Accuracy en test es de: 0.098
El valor de Balanced Accuracy en test es de: 0.059


In [65]:
# Display the predicted labels.
preds

array(['Cat_0', 'Cat_132', 'Cat_132', ..., 'Cat_166', 'Cat_248',
       'Cat_132'], dtype=object)

## MLP

In [9]:
# Features are the cleaned questions; targets are the intent labels.
X = df["clean_txt"]
y = df["Intencion"]

# Same seed as the other sections, so the split is identical.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)

In [10]:
# Learn the vocabulary and the TF-IDF weighting from the training split only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [11]:
# Transform the test split with the fitted vectorizer/transformer.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [12]:
from sklearn.neural_network import MLPClassifier

In [13]:
# Default multi-layer perceptron. Its weight initialisation and batch
# shuffling are random, so it is seeded (same seed as the split) to make the
# reported metrics repeatable.
clf = MLPClassifier(random_state=13571113)
clf.fit(X_train_tfidf, y_train)

MLPClassifier()

In [14]:
# Predict labels for the test split.
preds = clf.predict(X_test_tfidf)

In [15]:
# Display the predicted labels.
preds

array(['Cat_132', 'Cat_1', 'Cat_258', ..., 'Cat_163', 'Cat_249',
       'Cat_174'], dtype='<U7')

In [16]:
# Print accuracy and balanced accuracy of `preds` against `y_test`,
# each rounded to three decimals.
for message, metric_fn in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(metric_fn(y_test, preds), 3)))

El valor de Accuracy en test es de: 0.79
El valor de Balanced Accuracy en test es de: 0.677


## Submit MLP

In [17]:
# Load the unlabeled submission questions and preview the first two rows.
df_test = pd.read_csv('test_santander.csv')
df_test.head(2)

Unnamed: 0,id,Pregunta
0,0,querer saber tarjeta sin limite
1,1,¿cuál es el límite de mi tarjeta santander?


In [18]:
# Refit the whole pipeline on every labeled row for the submission.
# NOTE: this rebinds X_train / y_train (and count_vect, tfidf_transformer, clf)
# to objects built from the full dataset, replacing the ones from the split.
X_train = df.clean_txt
y_train = df.Intencion

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Seeded so the submitted model can be reproduced; MLP training is random
# (weight initialisation, batch shuffling).
clf = MLPClassifier(random_state=13571113)
clf.fit(X_train_tfidf, y_train)

MLPClassifier()

In [19]:
# Apply the same cleaning to the submission questions as to the training data.
df_test["clean_txt"] = df_test["Pregunta"].apply(preprocess_text)

In [20]:
# Vectorize the submission questions with the fitted vectorizer/transformer.
# NOTE: X_test now refers to the submission data, not the validation split.
X_test = df_test["clean_txt"]
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [21]:
# Predict an intent label for every submission question.
preds = clf.predict(X_test_tfidf)

In [22]:
# Turn labels such as 'Cat_303' into the integer after the underscore (303).
# str() + rpartition keep the cell idempotent: values that were already
# converted to int pass through unchanged instead of failing when re-run.
preds = [int(str(pred).rpartition("_")[2]) for pred in preds]

In [23]:
# Attach the numeric category predictions to the submission frame.
df_test["cat"] = preds

In [24]:
# Keep only the two columns written to the submission file and preview them.
submit_df = df_test[["id", "cat"]]
submit_df.head(2)

Unnamed: 0,id,cat
0,0,303
1,1,303


In [25]:
# Write the submission as a bare CSV: no header row and no index column.
submit_df.to_csv('preprocess_countv_tfidf_mlp_df.csv', header=False, index=False)

## MLP + imbalanced-learn (SMOTEENN resampling)

In [34]:
# Features are the cleaned questions; targets are the intent labels.
X = df["clean_txt"]
y = df["Intencion"]

# Same seed as the other sections, so the split is identical.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)

In [35]:
# Learn the vocabulary and the TF-IDF weighting from the training split only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [36]:
# Transform the test split with the fitted vectorizer/transformer.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [37]:
# Default multi-layer perceptron; its first-layer weights are reused below as
# a feature projection. Seeded (same seed as the split) because MLP training
# is random and the derived features would otherwise change on every run.
clf = MLPClassifier(random_state=13571113)
clf.fit(X_train_tfidf, y_train)

MLPClassifier()

In [38]:
# Predict labels for the test split.
preds = clf.predict(X_test_tfidf)

In [39]:
# Print accuracy and balanced accuracy of `preds` against `y_test`,
# each rounded to three decimals.
for message, metric_fn in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(metric_fn(y_test, preds), 3)))

El valor de Accuracy en test es de: 0.796
El valor de Balanced Accuracy en test es de: 0.684


In [45]:
# Project the TF-IDF vectors through the first weight matrix of the fitted
# MLP (clf.coefs_[0]); no bias term or activation is applied here.
# `@` states the matrix product explicitly instead of relying on `*` meaning
# matrix multiplication, which only holds for scipy.sparse *matrix* objects.
x_train_feat = X_train_tfidf @ clf.coefs_[0]
x_test_feat = X_test_tfidf @ clf.coefs_[0]

In [46]:
# Classes with fewer than 6 training samples, ordered by ascending frequency.
# NOTE(review): the threshold 6 presumably matches SMOTE's default
# k_neighbors=5 (needs at least 6 samples per class) - confirm.
cat_to_delete = [cat for cat, occ in sorted(Counter(y_train).items(), key=lambda item: item[1]) if occ < 6]

# Set copy for constant-time membership; the original list is kept as-is.
rare_cats = set(cat_to_delete)

# Keep the (projected features, label) pairs whose label is not rare, then
# rebuild the feature matrix and the label vector as NumPy arrays.
# NOTE: this rebinds the notebook-level names X and y.
keep = [(feat, label) for feat, label in zip(x_train_feat, y_train) if label not in rare_cats]
X = np.array([feat for feat, _ in keep])
y = np.array([label for _, label in keep])

In [49]:
from imblearn.combine import SMOTEENN
# Over-sample with SMOTE and clean with Edited Nearest Neighbours.
# Seeded (same seed as the split) so the resampled set is the same on every run.
smote_enn = SMOTEENN(random_state=13571113, n_jobs=-1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [54]:
from sklearn.svm import LinearSVC
# Grid-search the LinearSVC penalty C on the resampled data with n_jobs=-1;
# the last line displays the best estimator found.
parameters = {
    'C':[1, 10, 100, 1000],
}
# LinearSVC's solver uses a random number generator, so it is seeded to make
# the search repeatable.
clf = GridSearchCV(LinearSVC(random_state=13571113), parameters, n_jobs=-1, verbose=2)
clf.fit(X_resampled, y_resampled)
clf.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:  8.4min remaining: 12.6min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 10.7min finished


LinearSVC(C=1)

In [51]:
# Fit LinearSVC with C=1 on the resampled data; seeded because the solver
# uses a random number generator.
# NOTE(review): this cell's execution count (51) is lower than that of the
# cell importing LinearSVC (54); on a fresh kernel, run the import first.
clf_smote_enn = LinearSVC(C=1, random_state=13571113).fit(X_resampled, y_resampled)

In [58]:
# Predict on the projected test features, then print accuracy and balanced
# accuracy against `y_test`, each rounded to three decimals.
clf_smote_enn_preds = clf_smote_enn.predict(x_test_feat)
for message, metric_fn in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(metric_fn(y_test, clf_smote_enn_preds), 3)))

El valor de Accuracy en test es de: 0.734
El valor de Balanced Accuracy en test es de: 0.617
