In [64]:
import sys
import re
import pickle
from janome.tokenizer import Tokenizer
import numpy as np
import collections

with open("/Users/ryozawau/css_nlp/notebook/Data/dokujo-tsushin.txt", mode="r",encoding="utf-8") as f: # 注1）
    original_corpus = f.readlines()

text = re.sub("http://news.livedoor.com/article/detail/[0-9]{7}/","", original_corpus) # 注2）
text = re.sub("[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\+[0-9]{4}","", text) # 注3）
text = re.sub("[\f\n\r\t\v]","", text)
text = re.sub("　","", text)
text = re.sub("[「」]","", text)
text = [re.sub("[（）]","", text)]

# ＜ポイント＞
t = Tokenizer()

words_list = []
for word in text:
    words_list.append(t.tokenize(word, wakati=True))


TypeError: expected string or bytes-like object

In [65]:
with open("/Users/ryozawau/css_nlp/notebook/Data/dokujo-tsushin.txt", mode="r",encoding="utf-8") as f: # 注1）
    original_corpus = f.readlines()

In [66]:
import MeCab
from tqdm.notebook import tqdm
def tokenize_with_mecab(sentences):
    # Initialize MeCab with the specified dictionary
    corpus = []
    for sentence in sentences:
        sentence = re.sub("http://news.livedoor.com/article/detail/[0-9]{7}/","", sentence) # 注2）
        sentence = re.sub("[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\+[0-9]{4}","", sentence) # 注3）
        # Parse the sentence
        node = mecab.parseToNode(sentence)
        # Iterate over all nodes
        while node:
            # Extract the surface form of the word
            word = node.surface
            # Skip empty words and add to the corpus
            if word:
                corpus.append(word)
            node = node.next
    return corpus


# Initialize the MeCab tokenizer
#mecab = MeCab.Tagger()
path = "-d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd"
mecab = MeCab.Tagger(path)
corpus = tokenize_with_mecab(original_corpus)

In [67]:
words

['友人',
 '代表',
 'の',
 'スピーチ',
 '、',
 '独女',
 'は',
 'どう',
 'こなし',
 'て',
 'いる',
 '？',
 '\u3000',
 'もうすぐ',
 'ジューン・ブライド',
 'と',
 '呼ば',
 'れる',
 '６月',
 '。',
 '独女',
 'の',
 '中',
 'に',
 'は',
 '自分',
 'の',
 '式',
 'は',
 'まだ',
 'な',
 'のに',
 '呼ば',
 'れ',
 'て',
 'ばかり',
 '…',
 '…',
 'という',
 '「',
 'お祝い',
 '貧乏',
 '」',
 '状態',
 'の',
 '人',
 'も',
 '多い',
 'の',
 'で',
 'は',
 'ない',
 'だろ',
 'う',
 'か',
 '？',
 '\u3000',
 'さらに',
 '出席',
 '回数',
 'を',
 '重ね',
 'て',
 'いく',
 'と',
 '、',
 'こんな',
 'お願い',
 'ごと',
 'を',
 'さ',
 'れる',
 'こと',
 'も',
 '少なく',
 'ない',
 '。',
 '\u3000',
 '「',
 'お願い',
 'が',
 'ある',
 'ん',
 'だ',
 'けど',
 '…',
 '…',
 '友人',
 '代表',
 'の',
 'スピーチ',
 '、',
 'やっ',
 'て',
 'くれ',
 'ない',
 'か',
 'な',
 '？',
 '」',
 '\u3000',
 'さて',
 'そんな',
 'とき',
 '、',
 '独女',
 'は',
 'どう',
 '対応',
 'し',
 'たら',
 'いい',
 'か',
 '？',
 '\u3000',
 '最近',
 'だ',
 'と',
 'インターネット',
 '等',
 'で',
 '検索',
 'すれ',
 'ば',
 '友人',
 '代表',
 'スピーチ',
 '用',
 'の',
 '例文',
 'サイト',
 'が',
 'たくさん',
 '出',
 'て',
 'くる',
 'ので',
 '、',
 'それら',
 'を',
 '参考',
 'に',
 'すれ',
 'ば',
 '、',
 '

In [68]:
word_to_id = {}
id_to_word = {}

for word in words:
    if word not in word_to_id:
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word
        
print('id_to_word[0]:', id_to_word[0])
print('id_to_word[1]:', id_to_word[1])
print('id_to_word[2]:', id_to_word[2])
print()
print("word_to_id['女']:", word_to_id['女'])
print("word_to_id['結婚']:", word_to_id['結婚'])
print("word_to_id['夫']:", word_to_id['夫'])

id_to_word[0]: 友人
id_to_word[1]: 代表
id_to_word[2]: の

word_to_id['女']: 3211
word_to_id['結婚']: 449
word_to_id['夫']: 1449


In [69]:
# 共起行列の作成
def create_co_matrix(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix

# ベクトル間の類似度（cos類似度）判定
def cos_similarity(x, y, eps=1e-8):
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
    return np.dot(nx, ny)

# ベクトル間の類似度をランキング
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    vocab_size = len(id_to_word)

    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return

# 正の相互情報量（PPMI）を使用した単語の関連性指標の改善
def ppmi(C, verbose=False, eps = 1e-8):
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % (total//100) == 0:
                    print('%.1f%% done' % (100*cnt/total))
    return M

In [70]:
window_size = 2
wordvec_size = 100
vocab_size = len(word_to_id)


In [71]:
# リストに変換
corpus = [word_to_id[word] for word in words]

# NumPy配列に変換
corpus = np.array(corpus)

In [72]:
vocab_size

31354

In [73]:
print('counting  co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)

counting  co-occurrence ...


In [49]:
print('calculating PPMI ...')
W = ppmi(C, verbose=True)

calculating PPMI ...
1.0% done
2.0% done
3.0% done
4.0% done
5.0% done
6.0% done
7.0% done
8.0% done
9.0% done
10.0% done
11.0% done
12.0% done
13.0% done
14.0% done
15.0% done
16.0% done
17.0% done
18.0% done
19.0% done
20.0% done
21.0% done
22.0% done
23.0% done
24.0% done
25.0% done
26.0% done
27.0% done
28.0% done
29.0% done
30.0% done
31.0% done
32.0% done
33.0% done
34.0% done
35.0% done
36.0% done
37.0% done
38.0% done
39.0% done
40.0% done
41.0% done
42.0% done
43.0% done
44.0% done
45.0% done
46.0% done
47.0% done
48.0% done
49.0% done
50.0% done
51.0% done
52.0% done
53.0% done
54.0% done
55.0% done
56.0% done
57.0% done
58.0% done
59.0% done
60.0% done
61.0% done
62.0% done
63.0% done
64.0% done
65.0% done
66.0% done
67.0% done
68.0% done
69.0% done
70.0% done
71.0% done
72.0% done
73.0% done
74.0% done
75.0% done
76.0% done
77.0% done
78.0% done
79.0% done
80.0% done
81.0% done
82.0% done
83.0% done
84.0% done
85.0% done
86.0% done
87.0% done
88.0% done
89.0% done
90.0% don

In [74]:
W = np.load("/Users/ryozawau/css_nlp/notebook/Data/W.npy")

In [75]:
from sklearn.utils.extmath import randomized_svd
U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=None)

In [51]:
word_vecs = U[:, :wordvec_size]


In [63]:
querys = ['女性', '結婚', '彼', "秋"]

for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)



[query] 女性
 男性: 0.813703715801239
 世代: 0.6217721700668335
 女子: 0.6008188128471375
 独女: 0.5742473602294922
 層: 0.5740475654602051

[query] 結婚
 交際: 0.6586401462554932
 離婚: 0.6414052844047546
 退職: 0.5841768383979797
 歴: 0.5604287385940552
 同棲: 0.5578230023384094

[query] 彼
 彼女: 0.8239361643791199
 相手: 0.6697133183479309
 彼氏: 0.6597174406051636
 夫: 0.633521556854248
 矢野: 0.6003598570823669

[query] 秋
 春: 0.7461285591125488
 今年: 0.7142464518547058
 季節: 0.7044225335121155
 冬: 0.6923420429229736
 夏: 0.6913642287254333


In [None]:
import pickle


In [58]:
W

array([[ 0.       ,  5.6548448,  1.3778337, ...,  0.       ,  0.       ,
         0.       ],
       [ 5.6548448,  0.       ,  0.7999536, ...,  0.       ,  0.       ,
         0.       ],
       [ 1.3778337,  0.7999536,  0.       , ...,  0.       ,  2.4755187,
         2.4755187],
       ...,
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.       ,  0.       ,  2.4755187, ...,  0.       ,  0.       ,
        17.520851 ],
       [ 0.       ,  0.       ,  2.4755187, ...,  0.       , 17.520851 ,
         0.       ]], dtype=float32)

In [59]:
np.save('./Data/W.npy', W)


In [None]:
torch.softmax(s, dim=1)
t = torch.tensor([[0, 1, 0, 0, 0, 0, 0]], dtype=torch.float32)
loss = nn.CrossEntropyLoss()
loss(s,t)

In [2]:
import re
import nltk
nltk.download('brown')
from nltk.corpus import brown
import itertools
corpus = []

for cat in ['news']:
    for text_id in brown.fileids(cat):
        raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
        text = ' '.join(raw_text)
        text = text.lower()
        text.replace('\n', ' ')
        text = re.sub('[^a-z ]+', '', text)
        corpus.append([w for w in text.split() if w != ''])

[nltk_data] Downloading package brown to /home/lyuzeyu/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [4]:
len(corpus)

44

In [6]:
len(corpus[0])

1959