In [1]:
import numpy as np
import pandas as pd
import re

import sentencepiece as spm

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim import models

Using TensorFlow backend.


In [3]:
# SentencePiece BPE
#
# Loads the BPE model, re-writes the 'content' column of the three CSV splits
# as space-separated SentencePiece pieces, fits a Keras Tokenizer on
# train + pretrain, and produces padded index matrices (X_*) and 0/1 labels (y_*).

modelname = 'embeddings/embeddings/bpe4k.model'

sp = spm.SentencePieceProcessor()
sp.Load(modelname)


def _to_pieces(text):
    """Return the SentencePiece pieces of lower-cased ``text`` joined by spaces.

    ``text`` goes through ``str()`` first, so non-string cells are encoded
    via their string form. Lower-casing happens exactly once here.
    """
    return ' '.join(sp.EncodeAsPieces(str(text).lower()))


def _load_sentences(path):
    """Read the CSV at ``path`` and replace its 'content' column with BPE pieces."""
    frame = pd.read_csv(path)
    frame['content'] = frame['content'].map(_to_pieces)
    return frame


pretrain_sent = _load_sentences("data/p_pretrain_nostem.csv")
train_sent = _load_sentences("data/p_train_nostem.csv")
test_sent = _load_sentences("data/p_test_nostem.csv")

max_features = 4000
maxlen = 100

# First positional argument is the vocabulary cap; characters listed in
# `filters` are handed to the Tokenizer for removal.
tokenizer = Tokenizer(max_features,
                      filters = '!"#$%&()*+,-./:;<=>?[\\]^_`{|}~\t\n')
# The vocabulary is fitted on train + pretrain only; the test split is not used.
tokenizer.fit_on_texts(list(train_sent['content']) + list(pretrain_sent['content']))
print('Уникальных слов - ', len(tokenizer.word_index))


def _to_padded_sequences(texts):
    """Turn ``texts`` into Tokenizer index sequences and pass them to ``pad_sequences`` with ``maxlen``."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen)


def _to_labels(frame):
    """Map the 'published' column from 't'/'f' to 1/0."""
    return frame['published'].map({'t' : 1, 'f' : 0})


X_train = _to_padded_sequences(train_sent['content'])
X_pretrain = _to_padded_sequences(pretrain_sent['content'])
X_test = _to_padded_sequences(test_sent['content'])

y_train = _to_labels(train_sent)
y_pretrain = _to_labels(pretrain_sent)
y_test = _to_labels(test_sent)

Уникальных слов -  3868


In [5]:
# fastText embeddings
#
# Builds `embeddings_index` (word -> float32 vector) from the word2vec-format
# file and fills `embedding_matrix`, whose row i is the vector of the token
# with Tokenizer index i.
# NOTE: the GloVe cell assigns the same two names, so whichever of the two
# cells ran last determines their contents.

embname = 'embeddings/embeddings/ftmodel.vec'
embedding_dim = 200

w2v = models.KeyedVectors.load_word2vec_format(embname)
keys = list(w2v.vocab.keys())

# Fetch each vector by its word instead of pairing keys[i] with w2v.syn0[i]:
# the lookup does not depend on the iteration order of `w2v.vocab` matching
# the row order of the vector matrix, and it avoids round-tripping every
# float through a string-typed DataFrame.
embeddings_index = {
    word: np.asarray(w2v[word], dtype='float32')
    for word in keys
}

print('Found %s word vectors.' % len(embeddings_index))

# Rows stay all-zero for indices that receive no pretrained vector.
embedding_matrix = np.zeros((max_features, embedding_dim))

for word, i in tokenizer.word_index.items():
    # Indices at or beyond max_features have no row in the matrix.
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Found 117529 word vectors.


In [6]:
# GloVe embeddings
#
# Reads a header-less CSV whose first column is used as the token and whose
# remaining columns are used as its vector, then fills `embedding_matrix`.
# NOTE: this rebinds `embeddings_index` and `embedding_matrix`, replacing
# whatever the fastText cell stored under those names.

embname = 'embeddings/embeddings/glove4k200.csv'
embedding_dim = 200

tmp = pd.read_csv(embname,header=None)
print('Indexing word vectors.')
# One entry per CSV row: column 0 -> key, columns 1.. -> float32 vector.
embeddings_index = {
    row[0]: np.asarray(row[1:], dtype='float32')
    for row in tmp.values
}

print('Found %s word vectors.' % len(embeddings_index))

# Prepare the embedding matrix; tokens missing from the index keep all-zero rows.
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in tokenizer.word_index.items():
    # Only indices below max_features have a row in the matrix.
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Indexing word vectors.
Found 4000 word vectors.
