In [1]:
import pandas as pd
df3 = pd.read_csv('elo7_recruitment_dataset.csv')

In [2]:
from sklearn.model_selection import train_test_split

def split_stratified_into_train_val_test(df_input, stratify_colname='y',
                                         frac_train=0.6, frac_val=0.15, frac_test=0.25,
                                         random_state=None):
    '''

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    '''

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' % \
                         (frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError('%s is not a column in the dataframe' % (stratify_colname))

    X = df_input # Contains all columns.
    # y = category 
    y = df_input[[stratify_colname]] # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(X,
                                                          y,
                                                          stratify=y,
                                                          test_size=(1.0 - frac_train),
                                                          random_state=random_state)

    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(df_temp,
                                                      y_temp,
                                                      stratify=y_temp,
                                                      test_size=relative_frac_test,
                                                      random_state=random_state)

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test

In [3]:
train, validate, test = split_stratified_into_train_val_test(df3, stratify_colname='category')

In [4]:
# Rodar apenas uma vez
#!python -m spacy download pt_core_news_sm

In [5]:
import spacy
nlp = spacy.load("pt_core_news_sm")

In [6]:
# https://www.elo7.com.br/bolsa-de-praia-pink-de-tela-barca/dp/195E09?e7src=home&e7mdm=card#bn=1
textos = ["Sapatinho Corujinha", "Kit de Sapatinhos com luva", "Sapatinho Allstar Tenis"]
texto = "Sapatinho Corujinha"

In [7]:
doc = nlp(texto)

In [8]:
treatment_title = (title.lower() for title in df3["title"])

In [9]:
treatment_title

<generator object <genexpr> at 0x7fb4f01a8b48>

In [10]:
# remove stopwords, alphanumber and phrases with 3 words
def treat_words(doc):
    valid_tokens = []
    for token in doc:
        e_valid = not token.is_stop and token.is_alpha
        if e_valid:
            valid_tokens.append(token.text)
    if len(valid_tokens) > 1:
        return " ".join(valid_tokens)

In [11]:
from time import time

time_initial= time()

treated_text = [treat_words(doc) for doc in nlp.pipe(treatment_title, batch_size=1000)]

time_process = time() - time_initial
print(time_process/60)

0.9595110654830933


In [12]:
print(len(treated_text))

38507


In [13]:
import pandas as pd
treated_titles = pd.DataFrame({"title": treated_text}).dropna().drop_duplicates()
print(len(treated_titles))

22364


In [14]:
from gensim.models import Word2Vec

In [15]:
# hiperparameters
#sg = continuos bag 0 of words and skip-gram 1
# window = context representation word2vec
# min_count = typing error
# neural network with layer
# alpha = learning rate , leave the maximum point and go to the minimum point 
# min_alpha = decay
w2v_model = Word2Vec(sg=0, window = 2, size=300, min_count=5, alpha=0.03, min_alpha=0.007 )

In [16]:
w2v_model

<gensim.models.word2vec.Word2Vec at 0x7fb4f0111b70>

In [17]:
# creat sentences (list of list of tokens -> title)
sentences = list_list_tokens = [title.split(" ") for title in treated_titles.title]

In [18]:
import logging
logging.basicConfig(format="%(asctime)s : - %(message)s", level=logging.INFO)
# building vocabulary

w2v_model.build_vocab(sentences, progress_per=5000)
# mensage log


2021-03-01 04:12:48,623 : - collecting all words and their counts
2021-03-01 04:12:48,624 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-01 04:12:48,630 : - PROGRESS: at sentence #5000, processed 19327 words, keeping 3019 word types
2021-03-01 04:12:48,635 : - PROGRESS: at sentence #10000, processed 39196 words, keeping 4275 word types
2021-03-01 04:12:48,641 : - PROGRESS: at sentence #15000, processed 59411 words, keeping 5313 word types
2021-03-01 04:12:48,646 : - PROGRESS: at sentence #20000, processed 79987 words, keeping 6094 word types
2021-03-01 04:12:48,650 : - collected 6466 word types from a corpus of 89778 raw words and 22364 sentences
2021-03-01 04:12:48,651 : - Loading a fresh vocabulary
2021-03-01 04:12:48,657 : - min_count=5 retains 2057 unique words (31% of original 6466, drops 4409)
2021-03-01 04:12:48,657 : - min_count=5 leaves 82681 word corpus (92% of original 89778, drops 7097)
2021-03-01 04:12:48,663 : - deleting the raw counts dicti

In [19]:
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec

class callback(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0
    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print("Loss após a época {}:{}".format(self.epoch, loss))
        else:
            print("Loss após a época {}:{}".format(self.epoch, loss-self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [20]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, compute_loss=True, callbacks=[callback()])

2021-03-01 04:12:49,014 : - training model with 3 workers on 2057 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2
2021-03-01 04:12:49,064 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,070 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,073 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:49,074 : - EPOCH - 1 : training on 89778 raw words (69220 effective words) took 0.0s, 1514074 effective words/s
2021-03-01 04:12:49,124 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,127 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,133 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:49,134 : - EPOCH - 2 : training on 89778 raw words (69236 effective words) took 0.1s, 1330533 effective words/s
2021-03-01 04:12:49,183 : - worker thread finished; awaiting finish of 2 more thre

Loss após a época 0:94297.765625
Loss após a época 1:71480.609375
Loss após a época 2:65628.875
Loss após a época 3:60562.5625


2021-03-01 04:12:49,304 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,307 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,314 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:49,315 : - EPOCH - 5 : training on 89778 raw words (69312 effective words) took 0.0s, 1434331 effective words/s
2021-03-01 04:12:49,364 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,367 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,373 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:49,374 : - EPOCH - 6 : training on 89778 raw words (69266 effective words) took 0.1s, 1340464 effective words/s
2021-03-01 04:12:49,429 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,433 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,438 : - worker thread finished; aw

Loss após a época 4:55407.21875
Loss após a época 5:51215.46875
Loss após a época 6:47605.15625
Loss após a época 7:44426.21875


2021-03-01 04:12:49,555 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,563 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,565 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:49,566 : - EPOCH - 9 : training on 89778 raw words (69326 effective words) took 0.0s, 1560934 effective words/s
2021-03-01 04:12:49,620 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,627 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,630 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:49,630 : - EPOCH - 10 : training on 89778 raw words (69359 effective words) took 0.1s, 1238405 effective words/s
2021-03-01 04:12:49,685 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,692 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,695 : - worker thread finished; a

Loss após a época 8:42088.625
Loss após a época 9:39953.9375
Loss após a época 10:37947.125
Loss após a época 11:36220.0625


2021-03-01 04:12:49,813 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,821 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,824 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:49,825 : - EPOCH - 13 : training on 89778 raw words (69305 effective words) took 0.1s, 1188431 effective words/s
2021-03-01 04:12:49,880 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,886 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,889 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:49,890 : - EPOCH - 14 : training on 89778 raw words (69331 effective words) took 0.1s, 1251052 effective words/s
2021-03-01 04:12:49,941 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:49,948 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:49,950 : - worker thread finished; 

Loss após a época 12:35138.5
Loss após a época 13:33786.875
Loss após a época 14:32735.625
Loss após a época 15:31912.6875


2021-03-01 04:12:50,060 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,063 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,070 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:50,070 : - EPOCH - 17 : training on 89778 raw words (69390 effective words) took 0.0s, 1500648 effective words/s
2021-03-01 04:12:50,119 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,124 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,128 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:50,128 : - EPOCH - 18 : training on 89778 raw words (69162 effective words) took 0.0s, 1996475 effective words/s
2021-03-01 04:12:50,181 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,186 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,188 : - worker thread finished; 

Loss após a época 16:30858.5625
Loss após a época 17:40287.9375
Loss após a época 18:29268.0
Loss após a época 19:28581.125


2021-03-01 04:12:50,301 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,307 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,308 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:50,309 : - EPOCH - 21 : training on 89778 raw words (69439 effective words) took 0.1s, 1384345 effective words/s
2021-03-01 04:12:50,363 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,370 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,373 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:50,373 : - EPOCH - 22 : training on 89778 raw words (69341 effective words) took 0.1s, 1377320 effective words/s
2021-03-01 04:12:50,425 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,430 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,434 : - worker thread finished; 

Loss após a época 20:28154.5625
Loss após a época 21:27649.6875
Loss após a época 22:27063.875
Loss após a época 23:35630.0625


2021-03-01 04:12:50,548 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,553 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,559 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:50,559 : - EPOCH - 25 : training on 89778 raw words (69276 effective words) took 0.1s, 1211609 effective words/s
2021-03-01 04:12:50,613 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,619 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,622 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:50,623 : - EPOCH - 26 : training on 89778 raw words (69348 effective words) took 0.1s, 1223716 effective words/s
2021-03-01 04:12:50,676 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,684 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,686 : - worker thread finished; 

Loss após a época 24:26290.625
Loss após a época 25:25607.625
Loss após a época 26:25243.0
Loss após a época 27:24932.25


2021-03-01 04:12:50,797 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,801 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,807 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:50,808 : - EPOCH - 29 : training on 89778 raw words (69207 effective words) took 0.1s, 1296689 effective words/s
2021-03-01 04:12:50,860 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:50,863 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:50,869 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:50,870 : - EPOCH - 30 : training on 89778 raw words (69166 effective words) took 0.0s, 1545692 effective words/s
2021-03-01 04:12:50,870 : - training on a 2693340 raw words (2078845 effective words) took 1.9s, 1120086 effective words/s


Loss após a época 28:25052.0
Loss após a época 29:24456.625


(2078845, 2693340)

In [21]:
# analyse the result, quanlitative
w2v_model.most_similar("bebe")

  
2021-03-01 04:12:50,877 : - precomputing L2-norms of word weight vectors


[('bebê', 0.7158752679824829),
 ('cilios', 0.5748425126075745),
 ('ovelhinhas', 0.5430614948272705),
 ('ursinhas', 0.5115598440170288),
 ('soneca', 0.5050129890441895),
 ('zoe', 0.48289942741394043),
 ('bebês', 0.4789409637451172),
 ('coroas', 0.47468453645706177),
 ('ursa', 0.46888861060142517),
 ('chevron', 0.4648624658584595)]

In [22]:
w2v_model.most_similar("casamento")

  """Entry point for launching an IPython kernel.


[('noivado', 0.6011864542961121),
 ('debutante', 0.5665645599365234),
 ('consagração', 0.5459885001182556),
 ('madrinhas', 0.5355975031852722),
 ('madrinha', 0.5047541260719299),
 ('marsala', 0.48429062962532043),
 ('padre', 0.48323777318000793),
 ('padrinho', 0.4812665283679962),
 ('monograma', 0.4706394672393799),
 ('batismo', 0.46959495544433594)]

In [23]:
w2v_model.most_similar("decoracao")

  """Entry point for launching an IPython kernel.


[('jantar', 0.8123608827590942),
 ('moderna', 0.7328689098358154),
 ('urbana', 0.6977060437202454),
 ('cachoeira', 0.6966215968132019),
 ('aula', 0.6833953857421875),
 ('moderno', 0.6817888617515564),
 ('cidade', 0.6806402206420898),
 ('painéis', 0.6763515472412109),
 ('fofinho', 0.6714497804641724),
 ('abstrato', 0.6564012765884399)]

## skip-gram 

In [24]:
# train
w2v_model_skipgram = Word2Vec(sg=1, window = 5, size=300, min_count=5, alpha=0.03, min_alpha=0.007 )

w2v_model_skipgram.build_vocab(sentences, progress_per=5000)

w2v_model_skipgram.train(sentences, total_examples=w2v_model_skipgram.corpus_count, epochs=30)

2021-03-01 04:12:50,907 : - collecting all words and their counts
2021-03-01 04:12:50,908 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-01 04:12:50,913 : - PROGRESS: at sentence #5000, processed 19327 words, keeping 3019 word types
2021-03-01 04:12:50,919 : - PROGRESS: at sentence #10000, processed 39196 words, keeping 4275 word types
2021-03-01 04:12:50,924 : - PROGRESS: at sentence #15000, processed 59411 words, keeping 5313 word types
2021-03-01 04:12:50,930 : - PROGRESS: at sentence #20000, processed 79987 words, keeping 6094 word types
2021-03-01 04:12:50,934 : - collected 6466 word types from a corpus of 89778 raw words and 22364 sentences
2021-03-01 04:12:50,935 : - Loading a fresh vocabulary
2021-03-01 04:12:50,941 : - min_count=5 retains 2057 unique words (31% of original 6466, drops 4409)
2021-03-01 04:12:50,942 : - min_count=5 leaves 82681 word corpus (92% of original 89778, drops 7097)
2021-03-01 04:12:50,949 : - deleting the raw counts dicti

2021-03-01 04:12:52,911 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:52,912 : - EPOCH - 18 : training on 89778 raw words (69270 effective words) took 0.1s, 1018687 effective words/s
2021-03-01 04:12:52,989 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:53,000 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:53,003 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:53,003 : - EPOCH - 19 : training on 89778 raw words (69301 effective words) took 0.1s, 921734 effective words/s
2021-03-01 04:12:53,080 : - worker thread finished; awaiting finish of 2 more threads
2021-03-01 04:12:53,091 : - worker thread finished; awaiting finish of 1 more threads
2021-03-01 04:12:53,093 : - worker thread finished; awaiting finish of 0 more threads
2021-03-01 04:12:53,094 : - EPOCH - 20 : training on 89778 raw words (69357 effective words) took 0.1s, 839264 effective words/s
2021-03-01

(2078590, 2693340)

In [25]:
w2v_model_skipgram.wv.most_similar("bebe")

2021-03-01 04:12:54,010 : - precomputing L2-norms of word weight vectors


[('bebê', 0.5451101660728455),
 ('ursa', 0.5250155925750732),
 ('conforto', 0.4950309097766876),
 ('ursinhas', 0.4945724606513977),
 ('salmão', 0.493078351020813),
 ('molhadeira', 0.47946828603744507),
 ('cilios', 0.4748899042606354),
 ('termica', 0.4741533696651459),
 ('cavalinho', 0.4691942036151886),
 ('detalhes', 0.4610801041126251)]

In [26]:
w2v_model_skipgram.wv.most_similar("casamento")

[('noivado', 0.6201376914978027),
 ('debutante', 0.49689939618110657),
 ('madrinha', 0.4903338551521301),
 ('marsala', 0.4892551898956299),
 ('padre', 0.48681971430778503),
 ('pastor', 0.48036718368530273),
 ('chanfrada', 0.47239524126052856),
 ('noivinhos', 0.47019630670547485),
 ('noivos', 0.4700844883918762),
 ('sachês', 0.46872368454933167)]

# Conclusão
##### acesso: https://www.elo7.com.br

No caso de casamento ele retornou anatômica, diamantada, chanfrada e banhada no modelo skip gram que tem uma grande relacao visto que estão conectadas com a palavra alianca. E salmão existem categorias de lembracas para bebes que tb há cor slamao e muito presente. Assim devido a alta associacao (analogia) da cor com o produto para bebe, o mesmo é coerente apresentando-se como similar.
No caso de cílios 


In [27]:
# save model cbow
w2v_model.wv.save_word2vec_format("model_cbow.txt", binary=False)

2021-03-01 04:04:56,630 : - storing 2057x300 projection weights into model_cbow.txt


In [28]:
#save model skipgram
w2v_model.wv.save_word2vec_format("model_skip.txt", binary=False)

2021-03-01 04:04:57,016 : - storing 2057x300 projection weights into model_skip.txt


## Classificacao 

In [27]:
import spacy
import numpy as np
from gensim.models import KeyedVectors

w2v_model_cbow = KeyedVectors.load_word2vec_format("model_cbow.txt")
w2v_model_skipgram = KeyedVectors.load_word2vec_format("model_skip.txt")
nlp = spacy.load("pt_core_news_sm", desable=["parser", "ner", "tagger", "textcat"])

2021-03-01 04:12:54,028 : - loading projection weights from model_cbow.txt
2021-03-01 04:12:54,565 : - loaded (2057, 300) matrix from model_cbow.txt
2021-03-01 04:12:54,566 : - loading projection weights from model_skip.txt
2021-03-01 04:12:55,086 : - loaded (2057, 300) matrix from model_skip.txt


In [28]:
def tokenizator(text):
    doc=nlp(text)
    valid_tokens = []
    for token in doc:
        e_valid = not token.is_stop and token.is_alpha
        if e_valid:
            valid_tokens.append(token.text.lower())
    return valid_tokens
text = "Lembrancinha de bebe"
tokenizator(text)

['lembrancinha', 'bebe']

In [29]:
import numpy as np

def combinator_vector_sum(words, model=w2v_model_cbow):
    result_vector = np.zeros((1, 300))
    for w in words:
        try:
            result_vector += model.get_vector(w)
        except KeyError:
            pass
    return result_vector

text = "Lembrancinha de bebe"
words = combinator_vector_sum(tokenizator(text))
print(words)
words = combinator_vector_sum(tokenizator(text), model=w2v_model_skipgram)
print(words)

[[-0.48902056  0.15874362  0.71528174  0.12056345 -0.28467058  0.38668891
  -0.79554945 -1.15319693  0.00908354  0.46041317 -0.52874875 -0.17413667
   0.96144239 -0.03772259  0.21295898  0.49685426 -0.03949238  0.02232021
  -0.52251077 -0.17097783  0.26269599 -0.91722532  0.88429809  0.91393304
  -0.38189344  0.6398657   0.20920087  0.26544115  0.71789706 -0.64221151
   0.07061724  0.23736247  0.73625942 -0.20027842 -0.69149977 -0.0625457
  -0.38376166 -0.78147647  0.70572029  0.87022521 -0.39797583 -0.22129533
  -0.99756154 -0.68547675 -0.74170688 -0.49795833  0.13850313 -0.0627453
   0.48682062 -0.96437243 -0.04397185  0.51269831 -0.17870842 -0.19219952
  -0.6738137   1.09459797 -0.10180173 -0.3089267   0.51626769  0.54721735
  -1.19467172  0.13622061  1.43469761  0.92820504  0.96610698  0.17531562
  -0.14196912  0.11893156 -0.88577314 -0.3827794  -0.45923327  0.65416265
  -0.13287787  0.12161149  1.09356011  0.53783652  0.76647489 -0.66384074
  -0.10909276  0.28180647 -0.43026483 -0

In [31]:
def vector_matrix(texts, model):
    x = len(texts)
    matrix = np.zeros((x, 300))
    
    for i in range(x):
        words = tokenizator(texts.iloc[i])
        matrix[i] = combinator_vector_sum(words, model)
    return matrix
#cbow
matrix_train_c = vector_matrix(train.title, w2v_model_cbow)
matrix_test_c = vector_matrix(test.title, w2v_model_cbow)

#skip-gram
matrix_train_s = vector_matrix(train.title, w2v_model_skipgram)
matrix_test_s = vector_matrix(test.title, w2v_model_skipgram)


In [36]:
matrix_train_c


array([[-0.03865575, -0.44828025,  0.64353627, ..., -0.56104386,
         0.79147726, -0.40429975],
       [-1.86183476,  0.25833577,  0.03335893, ...,  2.09046109,
         1.48261529,  1.24519221],
       [-0.48368124,  0.48375386,  0.15157415, ..., -0.94684323,
        -0.35266843, -0.47060202],
       ...,
       [-2.36711238, -1.12889016,  0.21085365, ..., -0.87698052,
         1.4759222 ,  0.78756371],
       [-0.73387601,  0.02592593,  0.05637543, ...,  0.06413649,
        -0.26428528,  0.67564253],
       [-1.04010919, -0.15677229,  0.39932306, ..., -0.48496409,
         0.50353448, -0.92219686]])

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


def classification(model, x_train, y_train, x_test, y_test):
    LR = LogisticRegression(max_iter=4000)
    LR.fit(x_train, y_train)
    category = LR.predict(x_test)
    result = classification_report(y_test, category)
    print(result)
    return LR

LR_Cbow = classification(w2v_model_cbow, matrix_train_c, train.category, matrix_test_c, test.category)


                    precision    recall  f1-score   support

              Bebê       0.87      0.79      0.82      1756
Bijuterias e Jóias       0.86      0.89      0.88       238
         Decoração       0.84      0.87      0.85      2212
     Lembrancinhas       0.84      0.92      0.88      4440
            Outros       0.69      0.33      0.45       287
       Papel e Cia       0.75      0.56      0.64       694

       avg / total       0.84      0.84      0.83      9627



In [43]:
LR_skip = classification(w2v_model_skipgram, matrix_train_s, train.category, matrix_test_s, test.category)

                    precision    recall  f1-score   support

              Bebê       0.87      0.79      0.82      1756
Bijuterias e Jóias       0.86      0.89      0.88       238
         Decoração       0.84      0.87      0.85      2212
     Lembrancinhas       0.84      0.92      0.88      4440
            Outros       0.69      0.33      0.45       287
       Papel e Cia       0.75      0.56      0.64       694

       avg / total       0.84      0.84      0.83      9627



In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
def classification(model, x_train, y_train, x_test, y_test):
    SVC_c = SVC(kernel='linear')
    SVC_c.fit(x_train, y_train)
    category = SVC_c.predict(x_test)
    result = classification_report(y_test, category)
    c_m = confusion_matrix(validation.category, category)
    print(result)
    print(c_m)
    return SVC_c

SVC_C_Cbow = classification(w2v_model_cbow, matrix_train_c, train.category, matrix_test_c, test.category)




# Aqui temos um recall muito semmelhante a base de dados do ICMC-USP, e com uma base muito menor e ainda com menos dados

In [None]:
import pickle

with open("LR_cbow.pkl", "wb") as f:
    pickle.dump(LR_Cbow)
    
with open("LR_skip.pkl", "wb") as f:
    pickle.dump(LR_skip)    