In [2]:
!pip3 install joblib



In [3]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords
from textacy.preprocess import preprocess_text as textacy_preprocess

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE, ADASYN

import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
import warnings
# NOTE(review): with no category/module argument this suppresses every warning
# for the rest of the session, including any deprecation or convergence
# warnings the libraries below may emit.
warnings.filterwarnings("ignore")

# Load & Transform Data

In [4]:
# stopwords.words() returns a list; wrap it in a set so the per-word membership
# test in remove_stopwords is a constant-time lookup and the name is accurate.
STOPWORDS_SET = set(stopwords.words('spanish'))

# Character class built from four emoji code-point ranges; a run of one or more
# such characters is matched as a single hit.
EMOJI_PATTERN = re.compile(
    "[" + "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )) + "]+",
    flags=re.UNICODE,
)


def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in STOPWORDS_SET.

    The surviving tokens are re-joined with single spaces and returned as one
    string.
    """
    kept = []
    for token in text.split():
        if token in STOPWORDS_SET:
            continue
        kept.append(token)
    return ' '.join(kept)


def remove_emoticons(text):
    """Return ``text`` with every match of EMOJI_PATTERN deleted."""
    return re.sub(EMOJI_PATTERN, '', text)


def to_lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


def preprocess_text(text, fix_unicode=True, lowercase=True,
                    no_urls=True, no_emails=True,
                    no_phone_numbers=True,
                    no_numbers=True, no_currency_symbols=True,
                    no_punct=True, no_accents=True):
    """Clean one raw text value before vectorization.

    Steps, in order: strip emoji (remove_emoticons), optionally lower-case
    (to_lowercase), then hand the result to textacy's ``preprocess_text``
    with the remaining flags forwarded unchanged. Stop-word removal
    (remove_stopwords) is currently not applied.

    Args:
        text: The value to clean. Anything that is not a non-empty ``str``
            (``None``, ``""``, a pandas ``NaN`` float, ...) yields ``""``.
        lowercase: Lower-case the text before and inside the textacy call.
        fix_unicode, no_urls, no_emails, no_phone_numbers, no_numbers,
        no_currency_symbols, no_punct, no_accents: Passed straight through
            to textacy's ``preprocess_text``.

    Returns:
        The cleaned string, or ``""`` when there is nothing to clean.
    """
    # Missing cells read by pandas arrive as float NaN, which is truthy: a bare
    # `if text:` would let it reach the regex substitution and raise TypeError.
    if not isinstance(text, str) or not text:
        return ""

    clean_text = remove_emoticons(text)
    if lowercase:
        clean_text = to_lowercase(clean_text)
    return textacy_preprocess(clean_text, fix_unicode=fix_unicode,
                              lowercase=lowercase,
                              no_urls=no_urls, no_emails=no_emails,
                              no_phone_numbers=no_phone_numbers,
                              no_numbers=no_numbers,
                              no_currency_symbols=no_currency_symbols,
                              no_punct=no_punct,
                              no_accents=no_accents)

In [5]:
# Read the pipe-delimited training file, then overwrite its column labels with
# the two names used throughout the notebook.
df = pd.read_csv("train.csv", sep="|")
df.columns = ["Pregunta", "Intencion"]

In [6]:
# Run every question through preprocess_text with its default flags.
df["clean_txt"] = df["Pregunta"].apply(preprocess_text)

In [7]:
# Features are the cleaned text, targets are the intent labels.
X, y = df["clean_txt"], df["Intencion"]

# Seeded split with scikit-learn's default train/test proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13571113
)

In [8]:
# Show the shape of each of the four split pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((15078,), (5026,), (15078,), (5026,))

In [9]:
# Two-stage text featurization: raw token counts, then TF-IDF weighting with
# sublinear term-frequency scaling. Both stages are fitted on the training text only.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer(sublinear_tf=True)

X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [10]:
# Only transform (no fit) the test text, so it is encoded with the vocabulary
# and IDF weights learned from the training split.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [11]:
# Both matrices must share the same number of feature columns.
tuple(matrix.shape for matrix in (X_train_tfidf, X_test_tfidf))

((15078, 5200), (5026, 5200))

## Base

In [65]:
# Baseline: linear SVM (C=10) trained directly on the TF-IDF training features.
clf_base = LinearSVC(C=10).fit(X_train_tfidf, y_train)

In [67]:
# Predict intents for the TF-IDF encoded test split with the baseline model.
preds_base = clf_base.predict(X_test_tfidf)

In [68]:
# Print plain and class-balanced accuracy of the baseline, rounded to 3 decimals.
for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, preds_base), 3)))

El valor de Accuracy en test es de: 0.767
El valor de Balanced Accuracy en test es de: 0.668


## Base + PCA

In [18]:
from sklearn.decomposition import PCA

In [49]:
# Project the TF-IDF features onto 100 principal components. The sparse matrix
# is densified first because it is passed to PCA as a dense array here.
# random_state is fixed (as for the other seeded estimators in this notebook)
# so the projection is reproducible if a randomized solver is selected.
pca = PCA(n_components=100, random_state=0)
X_train_pca = pca.fit_transform(X_train_tfidf.toarray())

In [50]:
# Apply the projection fitted on the training data to the (densified) test features.
X_test_pca = pca.transform(X_test_tfidf.toarray())

In [51]:
# Same linear SVM configuration as the baseline, trained on the PCA-reduced features.
clf_base_pca = LinearSVC(C=10).fit(X_train_pca, y_train)

In [52]:
# Predict intents for the PCA-reduced test split.
preds_base_pca = clf_base_pca.predict(X_test_pca)

In [53]:
# Display the predicted labels (the recorded output is an abbreviated object array).
preds_base_pca

array(['Cat_132', 'Cat_1', 'Cat_258', ..., 'Cat_163', 'Cat_223', 'Cat_11'],
      dtype=object)

In [54]:
# Print plain and class-balanced accuracy of the PCA model, rounded to 3 decimals.
for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, preds_base_pca), 3)))


El valor de Accuracy en test es de: 0.49
El valor de Balanced Accuracy en test es de: 0.278


In [59]:
# Classes with fewer than 6 training samples are excluded before resampling
# (presumably because the samplers' default 5-nearest-neighbour search needs at
# least 6 samples per class — TODO confirm).
# The list is built in this cell so it no longer depends on `cat_to_delete`
# having been defined by a cell that appears further down the notebook.
cat_to_delete = [cat for cat, occ in Counter(y_train).items() if occ < 6]

# Boolean mask over the training rows: True for rows whose class is kept.
labels_train = np.asarray(y_train)
keep_mask = ~np.isin(labels_train, cat_to_delete)

# NOTE: X and y are rebound here to the filtered PCA features / labels.
X = X_train_pca[keep_mask]
y = np.array(labels_train[keep_mask].tolist())

In [61]:
# Imported in this cell because SMOTEENN is used here before the later cell
# that imports it, which fails with NameError on a fresh kernel.
from imblearn.combine import SMOTEENN

# Over-sample with SMOTE, then clean with Edited Nearest Neighbours.
# random_state is fixed so the resampled set is reproducible.
smote_enn = SMOTEENN(random_state=0, n_jobs=-1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [None]:
# Train the linear SVM on whatever X_resampled / y_resampled currently hold.
# NOTE(review): this cell has no execution count in the recorded run.
clf_smote_enn = LinearSVC(C=10).fit(X_resampled, y_resampled)


In [65]:
# Score the SMOTEENN-trained model on the PCA-reduced test split.
clf_smote_enn_preds = clf_smote_enn.predict(X_test_pca)

for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, clf_smote_enn_preds), 3)))


El valor de Accuracy en test es de: 0.324
El valor de Balanced Accuracy en test es de: 0.314


In [66]:
from imblearn.combine import SMOTETomek

# Over-sample with SMOTE, then remove Tomek links.
# random_state is fixed so the resampled set is reproducible, matching the
# seeded split and seeded ensemble classifiers elsewhere in this notebook.
smote_tomek = SMOTETomek(random_state=0, n_jobs=-1)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

In [67]:
# Train the linear SVM (C=10) on the SMOTETomek-resampled data.
clf_smote_tomek = LinearSVC(C=10).fit(X_resampled, y_resampled)

In [68]:
# Score the SMOTETomek-trained model on the PCA-reduced test split.
clf_smote_tomek_preds = clf_smote_tomek.predict(X_test_pca)

for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, clf_smote_tomek_preds), 3)))


El valor de Accuracy en test es de: 0.358
El valor de Balanced Accuracy en test es de: 0.313


## Imbalance Learning

In [12]:
# NOTE: from this cell on, X_train / X_test are rebound to the TF-IDF matrices
# and no longer refer to the objects produced by train_test_split.
X_train, X_test = X_train_tfidf, X_test_tfidf

## Over Sampling

In [56]:
# Labels that occur fewer than 6 times in the training targets, listed in
# ascending order of frequency.
class_counts = Counter(y_train)
cat_to_delete = [
    label
    for label, n_samples in sorted(class_counts.items(), key=lambda item: item[1])
    if n_samples < 6
]

In [14]:
# Pair each training row (converted to a dense 1-D array) with its label,
# skipping rows whose label is in cat_to_delete.
keep = [
    (row.toarray()[0], label)
    for row, label in zip(X_train, y_train)
    if label not in cat_to_delete
]

In [15]:
# Split the kept (features, label) pairs back into two NumPy arrays.
# NOTE: this rebinds X and y.
X = np.array([features for features, _label in keep])
y = np.array([label for _features, label in keep])

### SMOTE

In [59]:
# SMOTE synthesizes minority-class samples at random; fix the seed so the
# resampled set (and every score derived from it) is reproducible, matching
# the seeded split and seeded ensemble classifiers elsewhere in this notebook.
X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X, y)

In [61]:
# Train the linear SVM (C=10) on the SMOTE-resampled data.
clf_smote = LinearSVC(C=10).fit(X_resampled, y_resampled)

In [62]:
# Predict on X_test (expected to be the TF-IDF test matrix at this point).
preds = clf_smote.predict(X_test)

In [64]:
# Print plain and class-balanced accuracy of the SMOTE model, rounded to 3 decimals.
for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, preds), 3)))

El valor de Accuracy en test es de: 0.751
El valor de Balanced Accuracy en test es de: 0.632


### BorderlineSMOTE

In [77]:
from imblearn.over_sampling import BorderlineSMOTE

In [78]:
# BorderlineSMOTE generates synthetic samples at random; fix the seed so the
# resampled set is reproducible, matching the other seeded steps in this notebook.
X_resampled, y_resampled = BorderlineSMOTE(random_state=0).fit_resample(X, y)

In [79]:
# Train on the BorderlineSMOTE-resampled data, then score on the test split.
clf_bordersmote = LinearSVC(C=10).fit(X_resampled, y_resampled)
bordersmote_preds = clf_bordersmote.predict(X_test)

for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, bordersmote_preds), 3)))

El valor de Accuracy en test es de: 0.751
El valor de Balanced Accuracy en test es de: 0.632


### ADASYN

In [80]:
# NOTE(review): in the recorded run this call raised
# "ValueError: No samples will be generated with the provided ratio settings.",
# so X_resampled / y_resampled were NOT replaced by ADASYN output here and the
# ADASYN cells that follow were never executed.
X_resampled, y_resampled = ADASYN().fit_resample(X, y)

ValueError: No samples will be generated with the provided ratio settings.

In [None]:
# NOTE(review): no execution count recorded. If run after a failed ADASYN cell,
# this trains on whatever X_resampled / y_resampled an earlier sampler left behind.
clf_adasyn = LinearSVC(C=10).fit(X_resampled, y_resampled)

In [None]:
# Predict on X_test with the ADASYN-trained model (not executed in the recorded run).
preds_adasyn = clf_adasyn.predict(X_test)

In [None]:
# Print plain and class-balanced accuracy of the ADASYN model, rounded to 3 decimals.
for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, preds_adasyn), 3)))

## Under Sampling

### Prototype generation: ClusterCentroids

In [81]:
from imblearn.under_sampling import ClusterCentroids

# Under-sample with ClusterCentroids. random_state is fixed so the resampled
# set is reproducible, matching the other seeded steps in this notebook.
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(X, y)

# Train on the under-sampled data and score on the test split.
clf_cluster = LinearSVC(C=10).fit(X_resampled, y_resampled)
clf_cluster_preds = clf_cluster.predict(X_test)

print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y_test, clf_cluster_preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y_test, clf_cluster_preds),3)))

El valor de Accuracy en test es de: 0.646
El valor de Balanced Accuracy en test es de: 0.629


### TomekLinks

In [82]:
from imblearn.under_sampling import TomekLinks

# Under-sample by removing Tomek links, with the sampler's default settings.
tl = TomekLinks()
X_res, y_res = tl.fit_resample(X, y)

# Train on the cleaned data and score on the test split.
clf_t1 = LinearSVC(C=10).fit(X_res, y_res)
clf_t1_preds = clf_t1.predict(X_test)

for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, clf_t1_preds), 3)))

El valor de Accuracy en test es de: 0.755
El valor de Balanced Accuracy en test es de: 0.621


## Combination of over- and under-sampling

### SMOTEENN

In [16]:
from imblearn.combine import SMOTEENN

# random_state is fixed so the SMOTE step of the combined sampler is
# reproducible, matching the other seeded steps in this notebook.
smote_enn = SMOTEENN(random_state=0, n_jobs=-1)

In [17]:
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so no
# SMOTEENN-resampled data was produced for the full TF-IDF features.
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

KeyboardInterrupt: 

In [None]:
# Train on the SMOTEENN-resampled data, then score on the test split.
clf_smote_enn = LinearSVC(C=10).fit(X_resampled, y_resampled)
clf_smote_enn_preds = clf_smote_enn.predict(X_test)

for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, clf_smote_enn_preds), 3)))


### SMOTETomek

In [None]:
from imblearn.combine import SMOTETomek

# Over-sample with SMOTE, then remove Tomek links. random_state is fixed so the
# resampled set is reproducible, matching the other seeded steps in this notebook.
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# Train on the resampled data and score on the test split.
clf_smote_tomek = LinearSVC(C=10).fit(X_resampled, y_resampled)
clf_smote_tomek_preds = clf_smote_tomek.predict(X_test)

print('El valor de Accuracy en test es de: {}'.format(round(accuracy_score(y_test, clf_smote_tomek_preds),3)))
print('El valor de Balanced Accuracy en test es de: {}'.format(round(balanced_accuracy_score(y_test, clf_smote_tomek_preds),3)))


## Ensemble of samplers

### BalancedBaggingClassifier

In [16]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Bagging of decision trees where each bootstrap is re-balanced by the sampler.
# random_state is fixed for reproducibility, consistent with the seeded
# BalancedRandomForestClassifier / EasyEnsembleClassifier cells below.
# NOTE(review): newer imbalanced-learn releases may expect `estimator=` instead
# of `base_estimator=` — confirm against the installed version.
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

In [18]:
# Fit on the full training data (no rare-class filtering); X_train is expected
# to be the TF-IDF matrix at this point. The estimator repr is the cell output.
bbc.fit(X_train, y_train)

BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier())

In [19]:
# Predict on the test split with the balanced bagging ensemble.
y_pred = bbc.predict(X_test)

In [20]:
# Print plain and class-balanced accuracy for the current y_pred, rounded to 3 decimals.
for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, y_pred), 3)))


El valor de Accuracy en test es de: 0.086
El valor de Balanced Accuracy en test es de: 0.13


### BalancedRandomForestClassifier

In [21]:
from imblearn.ensemble import BalancedRandomForestClassifier
# Balanced random forest with 100 trees and a fixed seed.
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)

In [22]:
# Fit on the full training data; the estimator repr is the cell output.
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=0)

In [23]:
# Predict on the test split; this rebinds y_pred from the previous model.
y_pred = brf.predict(X_test)

In [24]:
# Print plain and class-balanced accuracy for the current y_pred, rounded to 3 decimals.
for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, y_pred), 3)))

El valor de Accuracy en test es de: 0.199
El valor de Balanced Accuracy en test es de: 0.32


In [25]:
from imblearn.ensemble import EasyEnsembleClassifier
# EasyEnsemble with default settings and a fixed seed, fitted on the full
# training data; the estimator repr is the cell output.
eec = EasyEnsembleClassifier(random_state=0)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(random_state=0)

In [26]:
# Predict on the test split; this rebinds y_pred from the previous model.
y_pred = eec.predict(X_test)

In [27]:
# Print plain and class-balanced accuracy for the current y_pred, rounded to 3 decimals.
for message, scorer in (
    ('El valor de Accuracy en test es de: {}', accuracy_score),
    ('El valor de Balanced Accuracy en test es de: {}', balanced_accuracy_score),
):
    print(message.format(round(scorer(y_test, y_pred), 3)))


El valor de Accuracy en test es de: 0.013
El valor de Balanced Accuracy en test es de: 0.025
