In [69]:
import gensim
import glob
import MeCab
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import torch
import torch.nn as nn
import urllib.request

In [67]:
# Build article vectors.

# Seed NumPy's global RNG.
np.random.seed(0)

# glob.glob() returns paths in a filesystem-dependent order, so sort them:
# downstream code takes a fixed-size prefix of this list and would otherwise
# pick different articles on different machines.
# The pattern is used without recursive=True, so '**' behaves like '*' and
# matches exactly one directory level below text/.
# NOTE(review): the count previously noted here (7376) appears to include one
# LICENSE.txt per category directory, which is not an article; those files are
# excluded below -- confirm against the corpus layout.
text_paths = sorted(
    path
    for path in glob.glob('livedoor_news_corpus/text/**/*.txt')
    if not path.endswith('LICENSE.txt')
)
# print(len(text_paths))  # was 7376 before LICENSE.txt files were excluded
model = gensim.models.Word2Vec.load('ja.bin')


def analyzer(text, mecab, stopwords=(), target_part_of_speech=('proper_noun', 'noun', 'verb', 'adjective')):
    """Extract content words from ``text`` using a MeCab tagger.

    Each node returned by ``mecab.parseToNode(text)`` is inspected through its
    comma-separated ``feature`` string. Field 0 is read as the part of speech,
    field 1 as its sub-category and field 6 as the word that is emitted
    (presumably the base/dictionary form in IPADIC-style output -- confirm for
    the dictionary in use).

    Args:
        text: The string to analyse.
        mecab: Object exposing ``parseToNode(text)`` that returns the head of a
            linked list of nodes with ``feature`` and ``next`` attributes.
        stopwords: Container of words to drop.
        target_part_of_speech: Container with any of ``'proper_noun'``,
            ``'noun'``, ``'verb'``, ``'adjective'`` selecting what to keep.
            ``'noun'`` also covers proper nouns.

    Returns:
        list[str]: The selected words in document order. Words equal to ``'*'``,
        shorter than two characters, or present in ``stopwords`` are skipped, as
        are nodes whose feature string has fewer than seven fields.
    """
    # Membership is tested once per token, so use a hashed container when the
    # caller passed a plain sequence. Strings/sets/dicts are used as-is to keep
    # their original `in` semantics.
    if stopwords is None:
        stopword_lookup = frozenset()
    elif isinstance(stopwords, (str, set, frozenset, dict)):
        stopword_lookup = stopwords
    else:
        stopword_lookup = frozenset(stopwords)

    # The requested categories do not change per token; resolve them up front.
    want_proper_noun = 'proper_noun' in target_part_of_speech
    want_noun = 'noun' in target_part_of_speech
    want_verb = 'verb' in target_part_of_speech
    want_adjective = 'adjective' in target_part_of_speech

    words = []
    node = mecab.parseToNode(text)
    while node:
        features = node.feature.split(',')
        node = node.next  # advance first so every `continue` below is safe

        # Guard against feature strings that are too short to index.
        if len(features) < 7:
            continue

        pos, pos_detail, base_form = features[0], features[1], features[6]
        if base_form == '*' or len(base_form) < 2 or base_form in stopword_lookup:
            continue

        is_noun = pos == '名詞'
        is_proper_noun = is_noun and pos_detail == '固有名詞'
        is_verb = pos == '動詞' and pos_detail == '自立'
        is_adjective = pos == '形容詞' and pos_detail == '自立'

        if (
            (want_proper_noun and is_proper_noun)
            or (want_noun and is_noun)
            or (want_verb and is_verb)
            or (want_adjective and is_adjective)
        ):
            words.append(base_form)
    return words


# Download the Japanese stop-word list (one word per line) from the SlothLib
# repository. A timeout keeps the cell from hanging forever if the host is
# unreachable.
req = urllib.request.Request('http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt')
with urllib.request.urlopen(req, timeout=30) as res:
    raw_stopwords = res.read().decode('utf-8')

# splitlines() copes with both CRLF and LF line endings; blank lines and
# surrounding whitespace are dropped in a single pass.
stopwords = [line.strip() for line in raw_stopwords.splitlines() if line.strip()]

# Tokenise the first 15 articles into space-joined strings of in-vocabulary
# nouns / proper nouns.

# The tagger does not depend on the article, so build it once instead of once
# per file.
mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

words_list = []
for tp in text_paths[:15]:
    # Close the file deterministically and do not rely on the platform default
    # encoding (assumes the corpus files are UTF-8 -- TODO confirm).
    with open(tp, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    # title = lines[2]
    # Everything from line index 3 onwards is treated as the article body;
    # presumably the lines before the title are metadata -- verify.
    body = ' '.join(lines[3:])
    words = analyzer(body, mecab, stopwords=stopwords, target_part_of_speech=['noun', 'proper_noun'])
    # Keep only words the Word2Vec model has a vector for.
    words_list.append(' '.join(w for w in words if w in model.wv.vocab))

docs = np.asarray(words_list)

# The documents are already tokenised (space-separated) and every token was
# checked against the Word2Vec vocabulary. CountVectorizer's defaults would
# lowercase the text and re-tokenise it with its own regex, which can produce
# features that are no longer in the vocabulary and make the vector lookup
# below raise KeyError. Split on whitespace only and keep case instead.
count = CountVectorizer(lowercase=False, token_pattern=r'\S+')
bags = count.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
get_feature_names = getattr(count, 'get_feature_names_out', None) or count.get_feature_names
features = np.asarray(get_feature_names())

# features: every distinct word in the data, shape (F,)
# docs: each document's words joined with ' '
# bags: per-document occurrence counts indexed like `features`, shape (N, F)
#
# model.wv[words_list[0]]
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
np.set_printoptions(precision=2)
tf_idf = tfidf.fit_transform(bags).toarray()
# print(np.take_along_axis(tf_idf, tf_idf.argsort(axis=1), axis=1))

# Top-10 tf-idf words per document, in ascending score order. Slice the index
# array before fancy-indexing so only an (N, 10) string array is materialised.
# NOTE: a document with fewer than 10 distinct words gets its remaining slots
# filled with zero-score words from other documents.
top_idx = tf_idf.argsort(axis=1)[:, -10:]
sorted_features = features[top_idx]

# Look the words up through `model.wv`, consistent with the vocabulary check
# done when the documents were built; indexing the model object directly is
# deprecated in gensim.
item_vecs = [model.wv[f] for f in sorted_features]
print(np.asarray(item_vecs).shape)
# item_vecs  # article vectors

(15, 10, 300)




In [88]:
# Embedding lookup demo: a table of 10 rows with 3 dimensions each, indexed by
# a (2, 4) batch of integer ids -> result has shape (2, 4, 3).
# NOTE: `input` shadows the Python builtin of the same name.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
input = torch.tensor(
    [
        [1, 2, 4, 5],
        [4, 3, 2, 9],
    ],
    dtype=torch.long,
)
embedding(input)

tensor([[[ 1.7105,  0.7686, -0.2016],
         [ 0.6952, -0.2221, -0.5135],
         [ 2.4392,  0.9282, -0.2792],
         [ 0.3483,  1.9796, -1.8862]],

        [[ 2.4392,  0.9282, -0.2792],
         [-2.4020, -0.7471, -0.2395],
         [ 0.6952, -0.2221, -0.5135],
         [ 0.7127,  0.9713,  1.0156]]], grad_fn=<EmbeddingBackward>)

In [96]:
# Linear layer mapping 1 input feature to 10 outputs, plus a (2, 1) float
# batch of random integers in [0, 10) to feed it in the cells below.
# NOTE: `input` shadows the Python builtin; the following cells read this name.
ff = nn.Linear(in_features=1, out_features=10)
input = torch.randint(low=0, high=10, size=(2, 1), dtype=torch.float)

In [97]:
# Shape after the linear layer: the trailing feature dimension goes 1 -> 10.
ff(input).size()

torch.Size([2, 10])

In [98]:
# Shape of the batch fed to the linear layer.
input.size()

torch.Size([2, 1])

In [103]:
# Swap dimensions 0 and 1 of the batch.
input.transpose(0, 1)

tensor([[7., 2.]])

In [114]:
# Batched matmul: a 3-D tensor times a 2-D matrix; the matrix is applied to
# each of the leading batch entries, contracting the last dimension of `a`.
a = torch.ones(333, 3, 100)
b = torch.ones(100, 1500)
(a @ b).shape

torch.Size([333, 3, 1500])

In [141]:
# Averaging over dimension 1 removes that dimension from the shape.
a = torch.ones((333, 3, 100))
a.mean(dim=1).shape

torch.Size([333, 100])

In [151]:
# Small hand-written example: four identical (2, 3) matrices stacked into a
# (4, 2, 3) tensor, then averaged over dimension 1.
pair = [[1, 2, 3], [3, 4, 5]]
c = torch.tensor([pair, pair, pair, pair], dtype=torch.float)
print(c.shape)
c.mean(dim=1).shape

torch.Size([4, 2, 3])


torch.Size([4, 3])

In [153]:
# Method form of the same reduction over dimension 1.
c.mean(dim=1).size()

torch.Size([4, 3])