# WAMEX reports: pre-process data and train the embedding with Word2Vec

In [1]:
# ---------------------------------------------------------------------------
# Notebook setup: imports, logging, NLTK resources, stop words and file paths.
# ---------------------------------------------------------------------------

# Standard library
import codecs
import glob
import logging
import os
import re

# Third-party
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words

# Log with timestamps so the (chatty) gensim INFO messages can be followed.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# The model used in this notebook is Word2Vec (see MODEL_FILE), so the start-up
# banner names Word2Vec rather than FastText.
logging.info('START - Word2Vec embeddings for all wamex reports')

# Fetch the NLTK resources used here; each call is a no-op when the package
# is already up to date locally.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# One shared lemmatizer instance. `wordnet` is kept as a second name for the
# same object in case other cells still refer to it.
lemmatizer = WordNetLemmatizer()
wordnet = lemmatizer

# Stop words: union of the `stop_words` package list and the NLTK English list.
# The two lists share many entries, so duplicates are dropped while keeping
# first-seen order. Membership tests give the same answers as a plain
# concatenation; exact counts depend on the installed package versions.
nltk_words = list(stopwords.words('english'))
stop_words = list(dict.fromkeys(list(get_stop_words('en')) + nltk_words))

# Locations. Both can be overridden through environment variables so the
# notebook is not tied to one machine; the defaults are the original values.
WAMEX_DATA_FOLDER = os.environ.get(
    "WAMEX_DATA_FOLDER",
    r"/Users/majiga/Documents/wamex/data/wamex_xml/",
)

# Model file that is read with gensim.models.Word2Vec.load.
MODEL_FILE = os.environ.get("WAMEX_MODEL_FILE", r"Vectors/word2vec_wamex_raw.model")

2018-11-09 13:27:17,491 : INFO : START - FastText Embeddings for all wamex reports


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/majiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/majiga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/majiga/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load a trained model

In [4]:
# Load the previously trained Word2Vec model from MODEL_FILE and print its
# one-line summary (vocabulary size, vector size, learning rate).
word2vec_model = gensim.models.Word2Vec.load(MODEL_FILE)
print(word2vec_model)

2018-11-09 13:27:38,385 : INFO : loading Word2Vec object from Vectors/word2vec_wamex_raw.model
2018-11-09 13:27:38,447 : INFO : loading wv recursively from Vectors/word2vec_wamex_raw.model.wv.* with mmap=None
2018-11-09 13:27:38,448 : INFO : setting ignored attribute vectors_norm to None
2018-11-09 13:27:38,448 : INFO : loading vocabulary recursively from Vectors/word2vec_wamex_raw.model.vocabulary.* with mmap=None
2018-11-09 13:27:38,449 : INFO : loading trainables recursively from Vectors/word2vec_wamex_raw.model.trainables.* with mmap=None
2018-11-09 13:27:38,449 : INFO : setting ignored attribute cum_table to None
2018-11-09 13:27:38,450 : INFO : loaded Vectors/word2vec_wamex_raw.model


Word2Vec(vocab=8562, size=100, alpha=0.025)


In [13]:
# Uncomment to display the whole vocabulary mapping (token -> gensim Vocab object).
# NOTE(review): the dict output shown under this cell presumably comes from a run
# in which the line was still active. For a quick look, a bounded preview such as
# list(word2vec_model.wv.vocab)[:20] avoids printing every entry.
#word2vec_model.wv.vocab

{'no': <gensim.models.keyedvectors.Vocab at 0x1a1eb74128>,
 'weed': <gensim.models.keyedvectors.Vocab at 0x1a1eb74208>,
 'specie': <gensim.models.keyedvectors.Vocab at 0x1a1eb74278>,
 'defined': <gensim.models.keyedvectors.Vocab at 0x1a1eb742e8>,
 'florabase': <gensim.models.keyedvectors.Vocab at 0x1a1eb74320>,
 'waherb': <gensim.models.keyedvectors.Vocab at 0x1a1eb74390>,
 'identified': <gensim.models.keyedvectors.Vocab at 0x1a1eb743c8>,
 'within': <gensim.models.keyedvectors.Vocab at 0x1a1eb74438>,
 'survey': <gensim.models.keyedvectors.Vocab at 0x1a1eb744a8>,
 'area': <gensim.models.keyedvectors.Vocab at 0x1a1eb74518>,
 'neither': <gensim.models.keyedvectors.Vocab at 0x1a1eb74588>,
 'document': <gensim.models.keyedvectors.Vocab at 0x1a1eb745c0>,
 'content': <gensim.models.keyedvectors.Vocab at 0x1a1eb74630>,
 'may': <gensim.models.keyedvectors.Vocab at 0x1a1eb746a0>,
 'referred': <gensim.models.keyedvectors.Vocab at 0x1a1eb746d8>,
 'quoted': <gensim.models.keyedvectors.Vocab at 0x1a

In [19]:
# Print the nearest neighbours in the embedding space for a few probe words,
# one result list per line.
for probe in ('gold', 'kalgoorlie', 'iron'):
    # topn=10 is also gensim's default, so every probe yields ten neighbours.
    print(word2vec_model.wv.most_similar(probe, topn=10))

# Disabled experiments, kept for reference.
# NOTE(review): 'iron-ore' is presumably missing from the vocabulary - confirm
# before re-enabling these lines.
#print(word2vec_model.wv.most_similar('iron-ore'))
#print(word2vec_model.wv.most_similar(positive=['kalgoorlie','iron-ore'],negative=['gold']))

[('au', 0.7145883440971375), ('copper', 0.643734872341156), ('nickel', 0.6223106980323792), ('precious', 0.6135585308074951), ('antimony', 0.5928707718849182), ('metal', 0.5911065340042114), ('arsenic', 0.589397132396698), ('tungsten', 0.5775366425514221), ('tantalum', 0.569101095199585), ('molybdenum', 0.5604355931282043)]
[('kambalda', 0.6378808617591858), ('coolgardie', 0.6370849013328552), ('leonora', 0.6257977485656738), ('norseman', 0.6004508137702942), ('kurnalpi', 0.6000410914421082), ('menzies', 0.5832206010818481), ('laverton', 0.580978274345398), ('p26', 0.5796036720275879), ('banda', 0.575484573841095), ('os', 0.5699737071990967)]
[('brockman', 0.6801323890686035), ('bif', 0.6522730588912964), ('banded', 0.6518596410751343), ('manganese', 0.6492370367050171), ('marra', 0.6461169719696045), ('hematite', 0.6449585556983948), ('bid', 0.6278976798057556), ('ore', 0.6275693774223328), ('mamba', 0.6272781491279602), ('dso', 0.627128541469574)]
