# 文章のベクトル化のチュートリアル

以下の4つを軽くやってみる

- BoW + SVD
- TF-IDF + word2vec
- SWEM (fasttext)
- Doc2Vec

In [1]:
import nltk
import warnings
import numpy as np
import pandas as pd
from nltk.corpus import brown
from gensim.models import Word2Vec, FastText
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the 'punkt' tokenizer data; nltk.word_tokenize is used throughout this notebook.
nltk.download('punkt')
# Silence every warning for the rest of the notebook (this also hides deprecation notices).
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nishikawadaiki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the training data
train = pd.read_csv("../dataset/raw/train.csv")

In [3]:
# Preview the first rows to see which columns are available
train.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [4]:
# Texts to vectorize: the first 100 question bodies
sentence = train['question_body'][0:100]
sentence

0     After playing around with macro photography on...
1     I am trying to understand what kinds of places...
2     I'm working on a PCB that has through-hole com...
3     An affidavit, from what i understand, is basic...
4     I am trying to make a binary image. I want mor...
                            ...                        
95    Like most people, I have a habit, and that hab...
96    I'm working on a ERP / Accounting (lots of tab...
97    There was the following passage in the New Yor...
98    Ella Mental has $600$ ft of fencing to enclose...
99    Have 12 excel Sheets in a workbook for each mo...
Name: question_body, Length: 100, dtype: object

## BoW + SVD

In [5]:
# scikit-learn's CountVectorizer is a convenient way to build the BoW matrix
count_vectorizer = CountVectorizer()
bow = count_vectorizer.fit_transform(sentence)

# ngram_range: also counts bi-grams / tri-grams; used fairly often
# count_vectorizer = CountVectorizer(ngram_range=(1, 3))

In [6]:
# One row per document, one column per vocabulary term
print('BoW (Bag of Words) shape : ', bow.shape)

BoW (Bag of Words) shape :  (100, 3498)


In [7]:
# Show the BoW counts of the first 5 documents, one column per vocabulary term.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead.
df = pd.DataFrame(data=bow[0:5].toarray(),
                  columns=count_vectorizer.get_feature_names())
df

Unnamed: 0,000,03,04,062,0x0000,0x000505b8,0x000dbfc6,0x6784fc4b,0x950e3b8d,0xb7032c3f,...,здоровье,карьера,личностный,окружение,отношения,рост,спорт,творчество,финансы,яркость
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Compress the BoW matrix to 50 dimensions with TruncatedSVD
tsvd = TruncatedSVD(n_components=50, random_state=1234)
truncated_bow = tsvd.fit_transform(bow)

In [9]:
# Shape after the dimensionality reduction
print('BoW (Bag of Words) shape : ', truncated_bow.shape)

BoW (Bag of Words) shape :  (100, 50)


In [10]:
# Summary: the BoW + SVD steps wrapped in a reusable class

class BoWVectorizerWithSVD(object):
    """Bag-of-words counts reduced to ``dims`` columns with TruncatedSVD.

    Args:
        dims: Number of SVD components, i.e. columns of the returned frame.
        random_state: Seed forwarded to ``TruncatedSVD``.
    """

    def __init__(self, dims=50, random_state=1234):
        self.dims = dims
        self.random_state = random_state
        # Both estimators are created by fit_transform().
        self.count_vectorizer = None
        self.tsvd = None

    def _as_frame(self, reduced):
        """Wrap a reduced matrix in a DataFrame with 'BoW-WithSVD-<i>' columns."""
        names = ['BoW-WithSVD-' + str(idx) for idx in range(self.dims)]
        return pd.DataFrame(data=reduced, columns=names)

    def fit_transform(self, X):
        """Fit the vectorizer and the SVD on ``X`` and return the reduced frame."""
        self.count_vectorizer = CountVectorizer()
        counts = self.count_vectorizer.fit_transform(X)
        self.tsvd = TruncatedSVD(n_components=self.dims, random_state=self.random_state)
        return self._as_frame(self.tsvd.fit_transform(counts))

    def transform(self, X):
        """Project ``X`` with the estimators fitted by ``fit_transform``."""
        counts = self.count_vectorizer.transform(X)
        return self._as_frame(self.tsvd.transform(counts))

In [11]:
# Run the BoW + SVD pipeline through the wrapper class (default: 50 dimensions)
converter = BoWVectorizerWithSVD()
truncated_bow = converter.fit_transform(sentence)
print('BoW (Bag of Words) shape : ', truncated_bow.shape)

BoW (Bag of Words) shape :  (100, 50)


## TF-IDF + word2vec

In [12]:
# TF-IDF is also commonly computed with scikit-learn
tfidf = TfidfVectorizer()
tfidf_vector = tfidf.fit_transform(sentence)

In [13]:
# One row per document, one column per vocabulary term
print('TF-IDF shape : ', tfidf_vector.shape)

TF-IDF shape :  (100, 3498)


In [14]:
# Show the TF-IDF weights of the first 5 documents, one column per vocabulary term.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions need get_feature_names_out() instead.
df = pd.DataFrame(data=tfidf_vector[0:5].toarray(),
                  columns=tfidf.get_feature_names())
df

Unnamed: 0,000,03,04,062,0x0000,0x000505b8,0x000dbfc6,0x6784fc4b,0x950e3b8d,0xb7032c3f,...,здоровье,карьера,личностный,окружение,отношения,рост,спорт,творчество,финансы,яркость
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.097332,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# How to use word2vec (Word2Vec keyword arguments, defaults in parentheses)
# size (100): dimensionality of the word vectors
# min_count (5): drop words that occur fewer times than this
# window (5): how many neighbouring words on each side are used as context
# iter: number of training iterations; tune it when the model is under-trained

# Tokenize every document into a list of words
words = [nltk.word_tokenize(val) for val in sentence] 
# Train the embeddings on some corpus (here: the documents themselves)
model = Word2Vec(words)

# To train on the Brown corpus instead:
# model = Word2Vec(brown.sents())

In [16]:
# Build a dict that maps each vocabulary word to its vector.
# NOTE(review): index2word / syn0 are the pre-4.0 gensim attribute names — confirm the installed version.
word2vector = dict(zip(model.wv.index2word, model.wv.syn0))

In [17]:
from collections import defaultdict

# Build a dict that maps each TF-IDF vocabulary term to its IDF weight.
# Terms the vectorizer has never seen fall back to the largest IDF, i.e. they
# are treated as at least as rare as the rarest known term. max_idf has to be
# bound here because the default factory reads it lazily on the first miss.
max_idf = max(tfidf.idf_)
word2weight = defaultdict(lambda: max_idf,
                          [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

In [18]:
def get_word_embedding(sentence, word2vector, word2weight, dims=None):
    """Return the weighted average of the word vectors found in ``sentence``.

    Args:
        sentence: Raw text; it is tokenized with ``nltk.word_tokenize``.
        word2vector: Mapping from token to its embedding vector.
        word2weight: Mapping from token to its weight (e.g. an IDF value).
        dims: Embedding size. When omitted it is read from the first vector in
            ``word2vector`` (100 if the mapping is empty).

    Returns:
        A 1-D array of length ``dims`` holding sum(vector * weight) / sum(weight)
        over the tokens that have both a vector and a weight. The array is all
        zeros when no token qualifies, so the result never contains NaN from a
        0 / 0 division.
    """
    if dims is None:
        dims = len(next(iter(word2vector.values()))) if word2vector else 100

    words = nltk.word_tokenize(sentence)
    vectors = np.zeros((len(words), dims))

    total_weight = 0.0
    for i, val in enumerate(words):
        try:
            # Look the vector up first so that tokens without an embedding never
            # touch word2weight (which may be a defaultdict that grows on access).
            vector = word2vector[val]
            weight = word2weight[val]
        except Exception:
            # Best effort: a token without a usable vector or weight (missing
            # key, or a default factory that fails) keeps its all-zero row and
            # adds nothing to the total weight.
            continue
        vectors[i] = vector * weight
        total_weight += weight

    summed = np.sum(vectors, axis=0)
    if total_weight == 0:
        return summed
    return summed / total_weight

# Try it on the first document; the result is one fixed-size vector
vector = get_word_embedding(sentence[0], word2vector, word2weight)
vector.shape

(100,)

In [19]:
class TfidfEmbeddingVectorizer(object):
    """Turn documents into IDF-weighted averages of their word embeddings.

    Args:
        word2vector: Optional pre-trained mapping ``token -> vector``. When it
            is None, embeddings are trained on the documents passed to
            ``fit_transform``.
        dims: Embedding size used when the embeddings are trained here.
        use_word2vec: Train ``Word2Vec`` when True, ``FastText`` otherwise.
    """

    def __init__(self, word2vector=None, dims=100, use_word2vec=True):
        # receive pre-trained model
        self.word2vector = word2vector
        self.word2weight = None
        self.dims = dims
        self.use_word2vec = use_word2vec

    def fit_transform(self, X):
        """Learn IDF weights (and embeddings if needed) from ``X`` and vectorize it.

        Returns:
            DataFrame with one row per document and one
            'Tfidf-with-wordvec<i>' column per embedding dimension.
        """
        # Materialize once: X is iterated by both the tokenizer and TF-IDF.
        documents = list(X)
        # Tokenize up front; the tokens are needed whether or not embeddings
        # are trained below.
        words_list = [nltk.word_tokenize(sentence) for sentence in documents]

        # calculate tf-idf
        # lowercase=False keeps the TF-IDF vocabulary identical to the tokens
        # used for the embeddings; with the default lowercasing every token
        # containing a capital letter would have no weight and be dropped.
        tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize, lowercase=False)
        tfidf.fit(documents)
        # create word2weight dict
        self.word2weight = {w: tfidf.idf_[i] for w, i in tfidf.vocabulary_.items()}

        # create word2vec dict
        if self.word2vector is None:
            trainer = Word2Vec if self.use_word2vec else FastText
            model = trainer(words_list, size=self.dims, seed=1234)
            self.word2vector = dict(zip(model.wv.index2word, model.wv.syn0))

        return self._to_frame(words_list)

    def transform(self, X):
        """Vectorize ``X`` with the weights and embeddings learned earlier.

        Raises:
            RuntimeError: if no weights/embeddings are available yet.
        """
        if self.word2vector is None or self.word2weight is None:
            raise RuntimeError('fit_transform() must be called before transform()')
        words_list = [nltk.word_tokenize(sentence) for sentence in X]
        return self._to_frame(words_list)

    def get_sentence_vector(self, words):
        """Return sum(vector * idf) / sum(idf) over the known tokens in ``words``.

        Tokens without an embedding or without a weight are ignored. The result
        is all zeros (not NaN) when no token is known or ``words`` is empty.
        """
        dims = self._embedding_dims()
        weights = self.word2weight or {}
        vectors = np.zeros((len(words), dims))
        total_weight = 0.0
        for i, word in enumerate(words):
            try:
                weight = weights[word]
                vectors[i] = self.word2vector[word] * weight
            except KeyError:
                # doesn't find the vector or the weight: leave the zero row
                continue
            total_weight += weight

        summed = np.sum(vectors, axis=0)
        if total_weight == 0:
            return summed
        return summed / total_weight

    def _embedding_dims(self):
        """Size of the embedding vectors (falls back to ``self.dims`` if none exist)."""
        if self.word2vector:
            return len(next(iter(self.word2vector.values())))
        return self.dims

    def _to_frame(self, words_list):
        """Stack the sentence vectors of ``words_list`` into the output DataFrame."""
        dims = self._embedding_dims()
        feature = np.array([self.get_sentence_vector(words) for words in words_list])
        # Keeps a (0, dims) shape when there are no documents at all.
        feature = feature.reshape(len(words_list), dims)
        return pd.DataFrame(data=feature,
                            columns=['Tfidf-with-wordvec' + str(i) for i in range(dims)])

In [20]:
# No pre-trained vectors are passed, so 50-dimensional Word2Vec embeddings are trained on `sentence`
converter = TfidfEmbeddingVectorizer(dims=50)
tfidf_word2vec_vector = converter.fit_transform(sentence)
print('tfidf + word2vec shape : ', tfidf_word2vec_vector.shape)

tfidf + word2vec shape :  (100, 50)


## SWEM + FastText

慣れてきたので、いきなりクラスで実装する。MAXよりMEANの方が良さそう....

In [21]:
class SWEMEmbeddingVectorizer(object):
    """SWEM sentence vectors: element-wise pooling over the word embeddings.

    Args:
        word2vector: Optional pre-trained mapping ``token -> vector``. When it
            is None, embeddings are trained on the documents passed to
            ``fit_transform``.
        dims: Embedding size used when the embeddings are trained here.
        use_word2vec: Train ``Word2Vec`` when True, ``FastText`` otherwise.
        pooling: 'max', 'min' or 'mean'; any other value behaves like 'mean'.
    """

    def __init__(self, word2vector=None, dims=100, use_word2vec=True, pooling='mean'):
        # receive pre-trained model
        self.word2vector = word2vector
        # Not used by this class; kept so existing attribute access keeps working.
        self.word2weight = None
        self.dims = dims
        self.use_word2vec = use_word2vec
        self.pooling = pooling

    def fit_transform(self, X):
        """Train embeddings on ``X`` if none were supplied, then vectorize ``X``."""
        # Tokenize up front; the tokens are needed whether or not embeddings
        # are trained below.
        words_list = [nltk.word_tokenize(sentence) for sentence in X]
        # word2vec
        if self.word2vector is None:
            trainer = Word2Vec if self.use_word2vec else FastText
            model = trainer(words_list, size=self.dims, seed=1234)
            self.word2vector = dict(zip(model.wv.index2word, model.wv.syn0))
        # create features
        return self._to_frame(words_list)

    def transform(self, X):
        """Vectorize ``X`` with the embeddings that are already available."""
        words_list = [nltk.word_tokenize(sentence) for sentence in X]
        return self._to_frame(words_list)

    def get_sentence_vector(self, words):
        """Pool the embeddings of ``words`` into a single vector.

        Tokens without an embedding contribute an all-zero row. An empty token
        list yields an all-zero vector.
        """
        dims = self._embedding_dims()
        if len(words) == 0:
            # Pooling over zero rows is undefined (max/min raise, mean is NaN).
            return np.zeros(dims)

        vectors = np.zeros((len(words), dims))
        for i, word in enumerate(words):
            try:
                vectors[i] = self.word2vector[word]
            except KeyError:
                # doesn't find the vector: keep the zero row
                # NOTE(review): zero rows take part in the pooling (they cap
                # 'max' at >= 0 and dilute 'mean') — confirm this is intended.
                continue

        # only max, min, mean pooling; anything else falls back to mean
        pool = {'max': np.max, 'min': np.min}.get(self.pooling, np.mean)
        return pool(vectors, axis=0)

    def _embedding_dims(self):
        """Size of the embedding vectors (falls back to ``self.dims`` if none exist)."""
        if self.word2vector:
            return len(next(iter(self.word2vector.values())))
        return self.dims

    def _to_frame(self, words_list):
        """Stack the sentence vectors of ``words_list`` into the output DataFrame."""
        dims = self._embedding_dims()
        feature = np.array([self.get_sentence_vector(words) for words in words_list])
        # Keeps a (0, dims) shape when there are no documents at all.
        feature = feature.reshape(len(words_list), dims)
        return pd.DataFrame(data=feature,
                            columns=['SWEM-' + str(i) for i in range(dims)])

In [22]:
# use_word2vec=False selects FastText; pooling is left at its default ('mean')
converter = SWEMEmbeddingVectorizer(dims=50, use_word2vec=False)
swem_vector = converter.fit_transform(sentence)
print('SWEM shape : ', swem_vector.shape)

SWEM shape :  (100, 50)


## Doc2Vec

In [23]:
class Doc2VecVectorizer(object):
    """Document vectors from a gensim ``Doc2Vec`` model.

    Args:
        dims: Size of the document vectors (``vector_size`` of the model).
    """

    def __init__(self, dims=100):
        self.dims = dims
        # Set by fit_transform().
        self.model = None

    def _frame(self, rows):
        """Stack ``rows`` into a DataFrame with 'doc2vec-<i>' columns."""
        names = ['doc2vec-' + str(k) for k in range(self.dims)]
        return pd.DataFrame(data=np.array(rows), columns=names)

    def fit_transform(self, X):
        """Train Doc2Vec on the lower-cased, tokenized ``X`` and return its vectors."""
        corpus = []
        for idx, text in enumerate(X):
            tokens = nltk.word_tokenize(text.lower())
            # Each document is tagged with its position, as a string.
            corpus.append(TaggedDocument(words=tokens, tags=[str(idx)]))
        self.model = Doc2Vec(corpus, vector_size=self.dims)
        learned = [self.model.docvecs[idx] for idx in range(len(X))]
        return self._frame(learned)

    def transform(self, X):
        """Infer vectors for ``X`` with the model trained by ``fit_transform``."""
        token_lists = []
        for text in X:
            token_lists.append(nltk.word_tokenize(text.lower()))
        inferred = list(map(self.model.infer_vector, token_lists))
        return self._frame(inferred)

In [24]:
# Default settings: 100-dimensional document vectors
converter = Doc2VecVectorizer()
doc2vec_vector = converter.fit_transform(sentence)
print('doc2vec shape : ', doc2vec_vector.shape)

doc2vec shape :  (100, 100)
