# First download the raw corpus files for all recent Wikipedia articles!
 -  **English** Wikipedia corpus, 14 GB
   - https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
 -  **Hungarian** Wikipedia corpus, 712 MB
   -  https://dumps.wikimedia.org/huwiki/latest/huwiki-latest-pages-articles.xml.bz2


In [1]:
# List the contents of corpus_data/ with human-readable sizes to confirm the dump archives are in place.
!ls -lh corpus_data/

total 15G
-rw-r--r-- 1 pataki domain users  14G Mar 22 12:05 en_wiki_corpus.xml.bz2
-rw-r--r-- 1 pataki domain users 712M Mar 21 15:13 hun_wiki_corpus.xml.bz2


In [2]:
import os

from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec
from nltk.stem import SnowballStemmer

In [3]:
# One Snowball stemmer per corpus language; both are reused by the stemming cells below.
hun_stem = SnowballStemmer('hungarian')
en_stem = SnowballStemmer('english')

### Load the bzip2-compressed XML Wikipedia dumps and parse them!

In [4]:
%%time
hun_wiki = WikiCorpus('corpus_data/hun_wiki_corpus.xml.bz2')
hun_articles = list(hun_wiki.get_texts())
len(hun_articles)

CPU times: user 11min 1s, sys: 15min 52s, total: 26min 54s
Wall time: 28min 25s


In [5]:
%%time
en_wiki = WikiCorpus('corpus_data/en_wiki_corpus.xml.bz2')
en_articles = list(en_wiki.get_texts())
len(en_articles)

CPU times: user 3h 10min 18s, sys: 5h 32min 56s, total: 8h 43min 15s
Wall time: 9h 19min 30s


#### Check a random Hungarian article
[Kártyajáték](https://hu.wikipedia.org/wiki/K%C3%A1rtyaj%C3%A1t%C3%A9k)

In [6]:
# First ten tokens of one sample article, as produced by WikiCorpus.
hun_articles[4522][:10]

['középkori',
 'bécsi',
 'udvari',
 'kártyajáték',
 'hofämterspiel',
 'egy',
 'lapja',
 'kártyajátékokat',
 'általában',
 'több']

Stemmed version of this article. Stemming of Hungarian words is far from perfect...

In [7]:
# The same ten tokens after Snowball stemming, for side-by-side comparison.
list(map(hun_stem.stem, hun_articles[4522][:10]))

['középkor',
 'bécs',
 'udvar',
 'kártyajáte',
 'hofämterspiel',
 'egy',
 'lap',
 'kártyajáték',
 'által',
 'több']

## Let's stem the articles!

In [8]:
%%time
hun_stemmed_articles = []
for i in hun_articles:
    tmp = [hun_stem.stem(j) for j in i]
    hun_stemmed_articles.append(tmp)

CPU times: user 43min 17s, sys: 2min 41s, total: 45min 59s
Wall time: 46min


In [9]:
%%time
en_stemmed_articles = []
for i in en_articles:
    tmp = [en_stem.stem(j) for j in i]
    en_stemmed_articles.append(tmp)

CPU times: user 7h 48min 6s, sys: 32min 25s, total: 8h 20min 31s
Wall time: 8h 20min 45s


## Let's train word2vec on both the stemmed and the unstemmed articles!

Hungarian

In [10]:
%%time
params = {'size': 200, 'window': 10, 'min_count': 10, 'workers':20 , 'sample': 1e-3,}
hun_word2vec = Word2Vec(hun_articles, **params)
hun_word2vec.save('models/hun_word2vec')

CPU times: user 1h 7min 28s, sys: 2min 37s, total: 1h 10min 5s
Wall time: 13min 46s


In [11]:
%%time
params = {'size': 200, 'window': 10, 'min_count': 10, 'workers':20 , 'sample': 1e-3,}
hun_word2vec_stemmed = Word2Vec(hun_stemmed_articles, **params)
hun_word2vec_stemmed.save('models/hun_word2vec_stemmed')

CPU times: user 1h 5min 12s, sys: 2min 8s, total: 1h 7min 20s
Wall time: 11min 47s


English

In [12]:
%%time
params = {'size': 200, 'window': 10, 'min_count': 10, 'workers':20 , 'sample': 1e-3,}
en_word2vec = Word2Vec(en_articles, **params)
en_word2vec.save('models/en_word2vec')

CPU times: user 20h 19min 51s, sys: 26min 59s, total: 20h 46min 51s
Wall time: 3h 39min 38s


In [13]:
%%time
params = {'size': 200, 'window': 10, 'min_count': 10, 'workers':20 , 'sample': 1e-3,}
en_word2vec_stemmed = Word2Vec(en_stemmed_articles, **params)
en_word2vec_stemmed.save('models/en_word2vec_stemmed')

CPU times: user 19h 17min 29s, sys: 18min 50s, total: 19h 36min 19s
Wall time: 3h 1min 12s
