## Подготовка к работе с Википедией

Скачайте архив 'enwiki-20200401-pages-articles.xml.bz2' по ссылке https://meta.wikimedia.org/wiki/Data_dump_torrents — он весит порядка 16 ГБ.

Скачайте 'wiki.corpus' по ссылке https://yadi.sk/d/TVo-KPUbgx4vPA — это сохранённый на диск (сериализованный) объект WikiCorpus для работы с нелемматизированной (!) Википедией.


In [1]:
from gensim.corpora.wikicorpus import WikiCorpus

In [2]:
# Make progress messages visible: set the log line format, then lower the
# root logger threshold to INFO.
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)

In [3]:
# Restore the previously saved WikiCorpus object from the file 'wiki.corpus'
wiki = WikiCorpus.load('wiki.corpus')

2020-11-23 04:38:51,654: INFO: loading WikiCorpus object from wiki.corpus
2020-11-23 04:38:53,799: INFO: loading dictionary recursively from wiki.corpus.dictionary.* with mmap=None
2020-11-23 04:38:53,800: INFO: loaded wiki.corpus


## Построим word2vec вручную средствами gensim

За основу взят код из https://gist.github.com/maxbellec/85d90d3d7f2f96589f1517e5c4567dc3

In [None]:
import multiprocessing
from gensim.models.word2vec import Word2Vec

class MySentences(object):
    """Re-iterable view over the token lists returned by ``wiki.get_texts()``.

    Each call to ``iter()`` asks the module-level ``wiki`` corpus for a new
    stream of texts, so the same instance can be iterated more than once.
    """

    def __iter__(self):
        # Every text is passed through unchanged.
        yield from wiki.get_texts()
# Re-iterable stream of tokenised texts fed to the trainer.
sentences = MySentences()
# Training settings; the worker count leaves one CPU core free but never
# drops below one.
# NOTE(review): these keyword names match the gensim 3.x API (gensim 4
# renamed `size` to `vector_size`) - confirm against the installed version.
params = dict(
    size=300,
    window=10,
    min_count=40,
    workers=max(1, multiprocessing.cpu_count() - 1),
    sample=1e-3,
)
word2vec = Word2Vec(sentences, **params)
# Persist the trained model so that it can be reloaded without retraining.
word2vec.save('wiki.word2vec.model')

2020-11-20 19:38:17,061: INFO: collecting all words and their counts
2020-11-20 19:38:35,687: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


## Модель посчиталась, пора оценить её качество

Предпосчитанную модель можно скачать по ссылкам ниже; она состоит из трёх файлов (все три должны лежать рядом, в одной папке):

- 'wiki.word2vec.model' - https://yadi.sk/d/LTFU0Ukc2Bp2MA

- 'wiki.word2vec.model.trainables.syn1neg.npy' - https://yadi.sk/d/g7oWXFwga8l9OA

- 'wiki.word2vec.model.wv.vectors.npy' - https://yadi.sk/d/nGaMaQT_FkqnLQ

In [5]:
from gensim.models.word2vec import Word2Vec
# Load the trained model that was saved as 'wiki.word2vec.model'
word2vec = Word2Vec.load('wiki.word2vec.model')
# Vocabulary size of the loaded model (displayed as the cell result)
len(word2vec.wv.vocab)

2020-11-23 04:39:18,217: INFO: loading Word2Vec object from wiki.word2vec.model
2020-11-23 04:39:20,465: INFO: loading wv recursively from wiki.word2vec.model.wv.* with mmap=None
2020-11-23 04:39:20,466: INFO: loading vectors from wiki.word2vec.model.wv.vectors.npy with mmap=None
2020-11-23 04:39:28,183: INFO: setting ignored attribute vectors_norm to None
2020-11-23 04:39:28,185: INFO: loading vocabulary recursively from wiki.word2vec.model.vocabulary.* with mmap=None
2020-11-23 04:39:28,186: INFO: loading trainables recursively from wiki.word2vec.model.trainables.* with mmap=None
2020-11-23 04:39:28,186: INFO: loading syn1neg from wiki.word2vec.model.trainables.syn1neg.npy with mmap=None
2020-11-23 04:39:34,573: INFO: setting ignored attribute cum_table to None
2020-11-23 04:39:34,575: INFO: loaded wiki.word2vec.model


642768

In [6]:
from scipy.stats import spearmanr

# Read all lines of the SimLex-999 file up front; the context manager makes
# sure the file handle is closed instead of being left to the garbage collector.
with open("SimLex-999.txt", 'r') as simlex_file:
    f = simlex_file.readlines()

def rank(model):
    """Correlate the model's word-pair distances with the SimLex-999 scores.

    The rows come from the module-level list ``f`` (the lines of
    "SimLex-999.txt"); the first line is skipped as a header. Each row is
    split on tabs: columns 0 and 1 are the two words and column 3 is parsed
    as the reference score. A pair is used only when both words are in
    ``model``.

    Both result lists are filled in the same loop iteration, so they always
    stay aligned. Rebuilding the score list in a second pass by filtering on
    ``not_in_model`` can misalign them, because that list holds the part of
    each word before the first underscore rather than the word itself.

    Note that ``model.distance`` (not a similarity) is correlated with the
    scores, which is why the results recorded in this notebook are negative.

    Args:
        model: object supporting ``word in model`` and
            ``model.distance(word_a, word_b)``.

    Returns:
        The result of ``scipy.stats.spearmanr(simlex_scores, model_distances)``.

    Prints:
        The number of scored pairs together with the list of missing words,
        then the number of reference scores used.
    """
    not_in_model = []
    w2v_pairs = []
    simlex_pairs = []
    for line in f[1:]:
        fields = line.split('\t')
        if len(fields) < 4:
            # Blank or truncated row: there is no score column to read.
            continue
        first_word, second_word = fields[0], fields[1]
        # Check both words so that every missing one gets reported.
        missing = [word for word in (first_word, second_word) if word not in model]
        if missing:
            not_in_model.extend(word.split('_')[0] for word in missing)
            continue
        w2v_pairs.append(model.distance(first_word, second_word))
        simlex_pairs.append(float(fields[3]))
    print(len(w2v_pairs), not_in_model)
    print(len(simlex_pairs))

    return spearmanr(simlex_pairs, w2v_pairs)

In [7]:
# Score the loaded model's word vectors against the SimLex-999 pairs
rank(word2vec.wv)

999 []
999


SpearmanrResult(correlation=-0.36182557483841277, pvalue=2.9039845974509423e-32)

## А теперь построим w2v для лемматизированной Вики

Скачайте 'wiki.lem.corpus' по ссылке https://yadi.sk/d/AsaBf1j_oFBFHw — это сохранённый на диск (сериализованный) объект WikiCorpus для работы с лемматизированной (!) Википедией.


In [6]:
from gensim.corpora.wikicorpus import WikiCorpus

# Make progress messages visible: set the log line format, then lower the
# root logger threshold to INFO.
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)

# Restore the previously saved WikiCorpus object from the file 'wiki.lem.corpus'
wiki = WikiCorpus.load('wiki.lem.corpus')

2020-12-17 15:51:53,758: INFO: loading WikiCorpus object from wiki.lem.corpus
2020-12-17 15:51:55,464: INFO: loading dictionary recursively from wiki.lem.corpus.dictionary.* with mmap=None
2020-12-17 15:51:55,465: INFO: loaded wiki.lem.corpus


In [None]:
import multiprocessing
from gensim.models.word2vec import Word2Vec

class MySentences(object):
    """Re-iterable view over ``wiki.get_texts()`` with cleaned-up tokens.

    Every token of every text is decoded from UTF-8 and cut at its first
    '/' character; only the part before the slash is kept. Each ``iter()``
    call requests a new stream of texts from the module-level ``wiki``.
    """

    @staticmethod
    def _strip_suffix(token):
        # Decode the token, then keep what precedes the first slash
        # (the whole token when it contains no slash).
        return token.decode('utf-8').partition('/')[0]

    def __iter__(self):
        for article in wiki.get_texts():
            yield list(map(self._strip_suffix, article))
# Re-iterable stream of cleaned-up token lists fed to the trainer.
sentences = MySentences()
# Training settings; the worker count leaves one CPU core free but never
# drops below one.
# NOTE(review): these keyword names match the gensim 3.x API (gensim 4
# renamed `size` to `vector_size`) - confirm against the installed version.
params = dict(
    size=300,
    window=10,
    min_count=40,
    workers=max(1, multiprocessing.cpu_count() - 1),
    sample=1e-3,
)
word2vec = Word2Vec(sentences, **params)
# Persist the trained model so that it can be reloaded without retraining.
word2vec.save('wiki.lem.word2vec.model')

## Модель посчиталась, пора оценить её качество

Предпосчитанную модель можно скачать по ссылкам ниже; она состоит из трёх файлов (все три должны лежать рядом, в одной папке):

- 'wiki.lem.word2vec.model' - https://yadi.sk/d/dxyelACAqAVQ0w

- 'wiki.lem.word2vec.model.trainables.syn1neg.npy' - https://yadi.sk/d/aE9Ak2-SZrg5tQ

- 'wiki.lem.word2vec.model.wv.vectors.npy' - https://yadi.sk/d/In6x505G-JCPSg

In [7]:
from gensim.models.word2vec import Word2Vec
# Load the trained model that was saved as 'wiki.lem.word2vec.model'
word2vec = Word2Vec.load('wiki.lem.word2vec.model')
# Vocabulary size of the loaded model (displayed as the cell result)
len(word2vec.wv.vocab)

2020-12-17 15:53:38,177: INFO: loading Word2Vec object from wiki.lem.word2vec.model
2020-12-17 15:53:39,716: INFO: loading wv recursively from wiki.lem.word2vec.model.wv.* with mmap=None
2020-12-17 15:53:39,717: INFO: loading vectors from wiki.lem.word2vec.model.wv.vectors.npy with mmap=None
2020-12-17 15:53:40,507: INFO: setting ignored attribute vectors_norm to None
2020-12-17 15:53:40,508: INFO: loading vocabulary recursively from wiki.lem.word2vec.model.vocabulary.* with mmap=None
2020-12-17 15:53:40,533: INFO: loading trainables recursively from wiki.lem.word2vec.model.trainables.* with mmap=None
2020-12-17 15:53:40,534: INFO: loading syn1neg from wiki.lem.word2vec.model.trainables.syn1neg.npy with mmap=None
2020-12-17 15:53:41,320: INFO: setting ignored attribute cum_table to None
2020-12-17 15:53:41,321: INFO: loaded wiki.lem.word2vec.model


586969

In [10]:
from scipy.stats import spearmanr

# Read all lines of the SimLex-999 file up front; the context manager makes
# sure the file handle is closed instead of being left to the garbage collector.
with open("SimLex-999.txt", 'r') as simlex_file:
    f = simlex_file.readlines()

def rank(model):
    """Correlate the model's word-pair distances with the SimLex-999 scores.

    Walks the module-level list ``f`` (the lines of "SimLex-999.txt"),
    skipping its first line. Each row is split on tabs: columns 0 and 1 are
    the two words and column 3 is parsed as the reference score. Rows where
    either word is missing from ``model`` are ignored.

    Args:
        model: object supporting ``word in model`` and
            ``model.distance(word_a, word_b)``.

    Returns:
        The result of ``scipy.stats.spearmanr(reference_scores, distances)``.

    Prints:
        The number of word pairs that were scored.
    """
    distances = []
    reference_scores = []
    for row in f[1:]:
        columns = row.split('\t')
        left, right = columns[0], columns[1]
        if left not in model or right not in model:
            continue
        distances.append(model.distance(left, right))
        reference_scores.append(float(columns[3]))
    print(len(distances))
    return spearmanr(reference_scores, distances)

In [11]:
# Score the loaded model's word vectors against the SimLex-999 pairs
rank(word2vec.wv)

989


SpearmanrResult(correlation=-0.3901066811150612, pvalue=2.679880250653796e-37)