In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from lazypredict.Supervised import LazyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import text_normalizer
from text_normalizer import tokenizer

## 1. Train/test split

In [None]:
# Hold out 20% of the samples for evaluation. Stratifying on the labels keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
# NOTE(review): X and y are not defined in the visible cells; confirm they are
# loaded before this point.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

## 2. Normalization

In [None]:
# Both splits must go through exactly the same cleaning steps, so the options
# are declared once and shared instead of being copied into each call.
NORMALIZATION_OPTIONS = dict(
    html_stripping=True,
    contraction_expansion=True,
    accented_char_removal=True,
    text_lower_case=True,
    text_stemming=True,  # stemming is enabled, lemmatization is disabled
    text_lemmatization=False,
    special_char_removal=True,
    remove_digits=True,
    stopword_removal=True,
    stopwords=text_normalizer.stopword_list,
)

# NOTE(review): X_train / X_test are rebound to their normalized versions, so
# re-running this cell normalizes already-normalized text.
X_train = text_normalizer.normalize_corpus(X_train, **NORMALIZATION_OPTIONS)
X_test = text_normalizer.normalize_corpus(X_test, **NORMALIZATION_OPTIONS)

## 3. Feature Engineering

In [None]:
# Option 1: TF-IDF, better suited to titles.
# The vocabulary is capped at 1000 terms and is learned from the training
# split only; the test split is projected onto that same vocabulary.
tfid_vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec, X_test_vec = (
    tfid_vectorizer.fit_transform(X_train),  # evaluated first: fits the vectorizer
    tfid_vectorizer.transform(X_test),
)

In [None]:
# Option 2: Word2Vec, intended for descriptions.
# Split every normalized document into its token list.
X_train_tok = list(map(tokenizer.tokenize, X_train))
X_test_tok = list(map(tokenizer.tokenize, X_test))

# The embeddings are trained on the training tokens only, with 100 dimensions.
model_w2v = Word2Vec(sentences=X_train_tok, vector_size=100)

In [None]:
def vectorizer(corpus, model, num_features=100):
    """Represent each tokenized document as the mean of its word vectors.

    Tokens that are not in the model vocabulary are skipped. A document with
    no in-vocabulary token (including an empty document) is mapped to a zero
    vector so that every returned vector has the same length.

    Args:
        corpus: Iterable of documents, each an iterable of tokens.
        model: Trained model exposing ``model.wv.index_to_key`` and
            ``model.wv[word]`` lookups.
        num_features: Length of the zero vector used for documents without any
            known token; it should match the size of the model's vectors.

    Returns:
        A list with one numpy array per document, in corpus order.
    """
    # A set gives constant-time membership tests; the key list would be
    # scanned once per token.
    vocabulary = set(model.wv.index_to_key)
    corpus_vectors = []

    for doc in corpus:
        # Only known words contribute; unknown ones must not re-use the vector
        # of the previously seen word.
        doc_vec = [model.wv[word] for word in doc if word in vocabulary]
        if doc_vec:
            avg_vec = np.mean(doc_vec, axis=0)
        else:
            avg_vec = np.zeros(num_features)
        corpus_vectors.append(avg_vec)

    return corpus_vectors

In [None]:
# Embed the train and test token lists (in that order) with the same trained
# Word2Vec model.
X_train_w2v, X_test_w2v = (
    vectorizer(tokens, model_w2v) for tokens in (X_train_tok, X_test_tok)
)

## 4. Modeling

In [None]:
# Wrap the Word2Vec features and the labels in DataFrames before handing them
# to LazyClassifier. Distinct names are used on purpose: rebinding `X` / `y`
# here would clobber the dataset variables that the train/test split cell
# reads, so re-running that cell would silently split the wrong data.
# To use TF-IDF instead, swap in X_train_vec / X_test_vec.
# NOTE(review): TfidfVectorizer returns sparse matrices; they probably need to
# be densified before being wrapped in a DataFrame - confirm.
features_train = pd.DataFrame(X_train_w2v)
features_test = pd.DataFrame(X_test_w2v)
labels_train = pd.DataFrame(y_train)
labels_test = pd.DataFrame(y_test)

lazy = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy.fit(
    features_train, features_test, labels_train, labels_test
)
models