# WAMEX entities-only embedding with FastText

In [13]:
# Standard library first, then third-party.
import logging

from gensim.models import FastText

# Where the trained FastText model is saved to / loaded from
# (relative to the notebook's working directory).
MODEL_FILE_NAME = r"Vectors/FastText_wamex_terms.model"

# Timestamped INFO-level logging, so gensim's training and loading progress
# shows up in the cell outputs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
logging.info('START - FastText Embeddings for entities only file')

2018-11-08 10:21:58,889 : INFO : START - FastText Embeddings for entities only file


## Read the entities-only file and train the FastText model
Note: alternatively, skip this cell and load the previously trained model in the next cell.

In [14]:
import os

# Input corpus: one line per source file, entities separated by ", ".
# NOTE(review): absolute, machine-specific path — point it at your local copy
# of the corpus before running this cell.
FILE_TO_READ = r"/Users/majiga/Documents/wamex/WAMEX_geological_entities_allfiles.txt"

# Decode explicitly instead of relying on the platform's default encoding, so
# the cell behaves the same on every machine.
# Assumes the corpus is UTF-8 (ASCII-compatible) — TODO confirm.
with open(FILE_TO_READ, 'r', encoding='utf-8') as f:
    sentences = f.readlines()
print(len(sentences))   # 31328 files=lines of 28910989 words=tokens

# Turn every line into one training "sentence": a list of entities in which
# multi-word entities are joined with '-' so each entity is a single token
# (e.g. "iron ore" -> "iron-ore").
data = []
count_terms = 0
for s in sentences:
    words = [w.strip().replace(' ', '-') for w in s.split(', ')]
    # Blank lines and stray separators would otherwise yield empty-string
    # "words" that pollute the vocabulary and inflate the term count.
    words = [w for w in words if w]
    data.append(words)
    count_terms += len(words)  # 2,772,122 in the recorded run

print(count_terms)

logging.info("START - Train the fastText model")
# Skip-gram (sg=1) FastText model
# min_count of 100 --- min number of word occurrence
# number of negatives sampled [5]
# NOTE: `size=` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size=`.
model_fasttext = FastText(data, size=100, window=5, min_count=100, workers=4, sg=1)
logging.info('END - FastText Embeddings for entities')

# save model — create the target directory first so a missing "Vectors/"
# folder does not make save() fail after training has already run.
model_dir = os.path.dirname(MODEL_FILE_NAME)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
model_fasttext.save(MODEL_FILE_NAME)

31328


2018-11-08 10:22:01,681 : INFO : START - Train the fastText model
2018-11-08 10:22:01,682 : INFO : collecting all words and their counts
2018-11-08 10:22:01,683 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-08 10:22:01,817 : INFO : PROGRESS: at sentence #10000, processed 1009032 words, keeping 1517 word types


2772122


2018-11-08 10:22:01,936 : INFO : PROGRESS: at sentence #20000, processed 1896449 words, keeping 1723 word types
2018-11-08 10:22:02,050 : INFO : PROGRESS: at sentence #30000, processed 2689435 words, keeping 1845 word types
2018-11-08 10:22:02,063 : INFO : collected 1853 word types from a corpus of 2772122 raw words and 31328 sentences
2018-11-08 10:22:02,063 : INFO : Loading a fresh vocabulary
2018-11-08 10:22:02,066 : INFO : min_count=100 retains 838 unique words (45% of original 1853, drops 1015)
2018-11-08 10:22:02,066 : INFO : min_count=100 leaves 2747343 word corpus (99% of original 2772122, drops 24779)
2018-11-08 10:22:02,069 : INFO : deleting the raw counts dictionary of 1853 items
2018-11-08 10:22:02,069 : INFO : sample=0.001 downsamples 74 most-common words
2018-11-08 10:22:02,070 : INFO : downsampling leaves estimated 1532704 word corpus (55.8% of prior 2747343)
2018-11-08 10:22:02,163 : INFO : estimated required memory for 838 words, 16345 buckets and 100 dimensions: 79208

## Load the existing model without training again

In [15]:
# Reload the persisted model so the query cells can run without re-training.
model = FastText.load(MODEL_FILE_NAME)
# Look the vector up through the keyed vectors (`model.wv`), as the query cell
# does; indexing the model object directly is deprecated in gensim 3.x and
# removed in 4.0.
print(model.wv['gold'])

2018-11-08 10:22:48,986 : INFO : loading FastText object from Vectors/FastText_wamex_terms.model
2018-11-08 10:22:49,063 : INFO : loading wv recursively from Vectors/FastText_wamex_terms.model.wv.* with mmap=None
2018-11-08 10:22:49,064 : INFO : setting ignored attribute vectors_norm to None
2018-11-08 10:22:49,064 : INFO : setting ignored attribute vectors_vocab_norm to None
2018-11-08 10:22:49,065 : INFO : setting ignored attribute vectors_ngrams_norm to None
2018-11-08 10:22:49,065 : INFO : setting ignored attribute buckets_word to None
2018-11-08 10:22:49,066 : INFO : loading vocabulary recursively from Vectors/FastText_wamex_terms.model.vocabulary.* with mmap=None
2018-11-08 10:22:49,066 : INFO : loading trainables recursively from Vectors/FastText_wamex_terms.model.trainables.* with mmap=None
2018-11-08 10:22:49,066 : INFO : loaded Vectors/FastText_wamex_terms.model


[ 0.11394285 -0.2954462  -0.00161281 -0.13292015  0.16249023 -0.29896724
 -0.11928976 -0.09642559  0.03694298 -0.01508666  0.3389508   0.16575722
  0.35612342  0.3288069   0.3662248   0.1309997  -0.08535857  0.05184054
  0.08872069  0.04750683  0.10573867 -0.19169861  0.14181891 -0.02243489
 -0.10144629 -0.35596833  0.05233613  0.01030986  0.02784077  0.05750883
 -0.23626123 -0.14224444 -0.01390793  0.17340924 -0.03772602  0.3135555
  0.03876122  0.06076474  0.08404154 -0.12702513  0.01460998  0.06024339
 -0.30078572  0.11498134  0.15024103  0.35172573  0.06366682  0.16782545
 -0.16606937  0.37653887 -0.13072525  0.23260714 -0.05100095  0.17505772
  0.20597345 -0.09268837 -0.28350693 -0.39543292  0.38256982 -0.03765872
  0.37133476 -0.13598533  0.49424398 -0.11985529  0.12764889 -0.061955
  0.12554361 -0.04529578 -0.03425958  0.05043591  0.39497897 -0.19723797
  0.16822003  0.22117823 -0.05482369  0.17574376 -0.563747   -0.01264164
  0.4310346   0.11678924 -0.0020106   0.10945526 -0.13

  


## Query embeddings

In [12]:
# Vocabulary size: 838 words with frequency >= 100 in the recorded run.
print(len(model.wv.vocab))

# Ten nearest neighbours for individual terms.
for term in ('gold', 'iron-ore', 'iron'):
    print(model.wv.most_similar(term, topn=10))

# Analogy query: kalgoorlie + iron-ore - gold.
print(
    model.wv.most_similar(
        positive=['kalgoorlie', 'iron-ore'],
        negative=['gold'],
    )
)

# Neighbours of a place name, then the raw embedding of 'gold'.
print(model.wv.most_similar('kalgoorlie'))
print(model.wv['gold'])

2018-11-08 10:19:54,085 : INFO : precomputing L2-norms of word weight vectors
2018-11-08 10:19:54,087 : INFO : precomputing L2-norms of ngram weight vectors


838
[('surface-gold', 0.6701695919036865), ('mineralisation', 0.6299868822097778), ('gold-mineral', 0.6059813499450684), ('mineralization', 0.585066020488739), ('kalgoorlie', 0.5806409120559692), ('metal', 0.5685122609138489), ('western-australia', 0.5583513379096985), ('nickel', 0.5336951017379761), ('greenstone-belt', 0.5104318857192993), ('archaean', 0.5011576414108276)]
[('iron', 0.7401604056358337), ('hematite', 0.6724737286567688), ('west-angelas', 0.6016417145729065), ('marandoo', 0.5700218081474304), ('mount-jackson', 0.5502362847328186), ('windarling', 0.5446536540985107), ('hamersley', 0.5273263454437256), ('martite', 0.5261359810829163), ('banded-iron-formation', 0.5159679055213928), ('tallering-peak', 0.5107094645500183)]
[('iron-ore', 0.7401604056358337), ('hematite', 0.6376545429229736), ('west-angelas', 0.6171693205833435), ('hamersley', 0.5958882570266724), ('marandoo', 0.5773637294769287), ('mount-sylvia-formation', 0.5698187351226807), ('wittenoom-formation', 0.566781