# WAMEX reports: pre-process data and train the embedding with FastText

In [1]:
from gensim.models import FastText

import nltk
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, used by tokenize_and_lemmatize().
lemmatizer = WordNetLemmatizer()

from stop_words import get_stop_words
from nltk.corpus import stopwords
# Download the NLTK resources up front; presumably these back
# stopwords.words(), the sentence/word tokenizers and the WordNet lemmatizer
# used below.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Combined English stop-word list from the `stop_words` package and NLTK.
# The two lists are concatenated as-is, so a word present in both appears twice.
# NOTE(review): the counts are the original author's; 174 + 153 does not equal
# 353, so at least one of them was recorded against a different list version.
stop_words = list(get_stop_words('en'))         # 174 stopwords
nltk_words = list(stopwords.words('english'))   # 153 stopwords
stop_words.extend(nltk_words)                   # 353 in total

import re
import os, glob, codecs
import logging
# Timestamped INFO-level logging for the whole notebook run.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info('START - FastText Embeddings for all wamex reports')

# Folder that is scanned for report files when the corpus is read.
WAMEX_DATA_FOLDER = r"/Users/majiga/Documents/wamex/data/wamex_xml"

# Path handed to model_fasttext.save() and FastText.load() further down.
MODEL_FILE = r"Vectors/fastText_wamex_all_300freq.model" # bin file


def tokenize_and_lemmatize(input_text):
    """Tokenize text, filter the tokens and return their lemmas as one string.

    The text is split into sentences and then into words, so punctuation
    ends up as separate tokens. Only tokens containing at least one ASCII
    letter or a hyphen are kept; each kept token is lemmatized with the
    module-level ``lemmatizer``.

    Args:
        input_text: The text to process.

    Returns:
        The lemmas joined by single spaces.
    """
    lemmas = []
    for sentence in nltk.sent_tokenize(input_text):
        for token in nltk.word_tokenize(sentence):
            # Skip tokens with no letter and no hyphen (numbers, punctuation).
            if re.search('[a-zA-Z-]', token) is None:
                continue
            lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


"""
Read a text file and return its cleaned, lemmatized content as one space-joined string (None if the file is nearly empty)
"""
def read_clean_file(filename):
    """Read one report file and return its text cleaned and lemmatized.

    The text is lower-cased, the characters '-', ',', '\\' and '/' are
    replaced by spaces, whitespace-separated words found in the module-level
    ``stop_words`` list are removed, and the remainder is passed through
    ``tokenize_and_lemmatize``.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A single string of space-separated lemmas, or None when the file
        contains fewer than 10 characters.
    """
    with codecs.open(filename, "r", encoding='utf-8', errors='ignore') as f:
        data = f.read()
    if len(data) < 10:
        return None

    # Replace all separator characters in one pass instead of four chained
    # str.replace() calls; each of them maps to a single space.
    separators = str.maketrans('-,\\/', '    ')
    data = data.lower().translate(separators)

    # Build a set once per file: membership tests are O(1), whereas testing
    # every word against the stop_words list rescans the list each time.
    stop_set = set(stop_words)
    data_cleaned = [w for w in data.split() if w not in stop_set]

    # Lemmatize what is left after stop-word removal.
    return tokenize_and_lemmatize(' '.join(data_cleaned))

2018-11-07 16:00:21,398 : INFO : START - FastText Embeddings for all wamex reports


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/majiga/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/majiga/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/majiga/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# READ WAMEX REPORTS FROM THE DATA FOLDER
# Each file becomes one "sentence" for FastText: a list of cleaned tokens.
logging.info('START - Read cleaned wamex reports in ' + WAMEX_DATA_FOLDER)
reports_data = []
for filename in glob.glob(os.path.join(WAMEX_DATA_FOLDER, '*.json')):
    # Clean each file exactly once: read_clean_file() reads, tokenizes and
    # lemmatizes the whole file, so calling it a second time just to fetch
    # the result doubles the cost of this step.
    cleaned = read_clean_file(filename)
    if cleaned is not None:
        reports_data.append(cleaned.split())
logging.info('END - Read wamex reports')

# Counting tokens
count_tokens = sum(len(data) for data in reports_data)
print(count_tokens)
# 42650553 tokens

# Train the model
# Skip-gram (sg=1), 100-dimensional vectors, context window of 5; words seen
# fewer than 300 times are dropped from the vocabulary.
# NOTE(review): `size=` is the keyword this notebook was run with; newer
# gensim releases may expect a different name - confirm before upgrading.
logging.info("START - Train the fastText model")
model_fasttext = FastText(reports_data, size=100, window=5, min_count=300, workers=4, sg=1)

# min_count=100 => training on a 246019540 raw words (182291433 effective words) took 817.9s, 222873 effective words/s if 100+ frequency

logging.info('END - FastText Embeddings for all wamex reports')

2018-11-07 16:00:27,879 : INFO : START - Read cleaned wamex reports in /Users/majiga/Documents/wamex/data/wamex_xml
2018-11-07 19:24:52,500 : INFO : END - Read wamex reports
2018-11-07 19:24:52,510 : INFO : START - Train the fastText model
2018-11-07 19:24:52,513 : INFO : collecting all words and their counts
2018-11-07 19:24:52,514 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


42650553


2018-11-07 19:24:54,951 : INFO : PROGRESS: at sentence #10000, processed 13850438 words, keeping 218283 word types
2018-11-07 19:24:56,979 : INFO : PROGRESS: at sentence #20000, processed 27122180 words, keeping 337736 word types
2018-11-07 19:24:59,150 : INFO : PROGRESS: at sentence #30000, processed 40874779 words, keeping 440080 word types
2018-11-07 19:24:59,430 : INFO : collected 452928 word types from a corpus of 42650553 raw words and 31328 sentences
2018-11-07 19:24:59,431 : INFO : Loading a fresh vocabulary
2018-11-07 19:24:59,632 : INFO : min_count=300 retains 8562 unique words (1% of original 452928, drops 444366)
2018-11-07 19:24:59,632 : INFO : min_count=300 leaves 39711426 word corpus (93% of original 42650553, drops 2939127)
2018-11-07 19:24:59,666 : INFO : deleting the raw counts dictionary of 452928 items
2018-11-07 19:24:59,674 : INFO : sample=0.001 downsamples 35 most-common words
2018-11-07 19:24:59,675 : INFO : downsampling leaves estimated 37283155 word corpus (93

2018-11-07 19:26:04,845 : INFO : EPOCH 1 - PROGRESS: at 44.13% examples, 237776 words/s, in_qsize 8, out_qsize 1
2018-11-07 19:26:05,900 : INFO : EPOCH 1 - PROGRESS: at 44.77% examples, 237655 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:26:06,901 : INFO : EPOCH 1 - PROGRESS: at 45.50% examples, 237609 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:26:07,927 : INFO : EPOCH 1 - PROGRESS: at 46.19% examples, 237559 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:26:08,985 : INFO : EPOCH 1 - PROGRESS: at 46.89% examples, 237316 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:26:09,990 : INFO : EPOCH 1 - PROGRESS: at 47.49% examples, 237323 words/s, in_qsize 6, out_qsize 1
2018-11-07 19:26:11,043 : INFO : EPOCH 1 - PROGRESS: at 48.31% examples, 237438 words/s, in_qsize 8, out_qsize 1
2018-11-07 19:26:12,062 : INFO : EPOCH 1 - PROGRESS: at 49.08% examples, 237537 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:26:13,067 : INFO : EPOCH 1 - PROGRESS: at 49.69% examples, 237699 words/s, in_qsiz

2018-11-07 19:27:19,898 : INFO : EPOCH 1 - PROGRESS: at 96.12% examples, 237665 words/s, in_qsize 8, out_qsize 1
2018-11-07 19:27:20,919 : INFO : EPOCH 1 - PROGRESS: at 96.87% examples, 237743 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:27:22,000 : INFO : EPOCH 1 - PROGRESS: at 97.65% examples, 237753 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:27:23,014 : INFO : EPOCH 1 - PROGRESS: at 98.31% examples, 237773 words/s, in_qsize 8, out_qsize 1
2018-11-07 19:27:24,033 : INFO : EPOCH 1 - PROGRESS: at 98.99% examples, 237808 words/s, in_qsize 6, out_qsize 1
2018-11-07 19:27:25,045 : INFO : EPOCH 1 - PROGRESS: at 99.59% examples, 237757 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:27:25,560 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-07 19:27:25,562 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-07 19:27:25,604 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-07 19:27:25,633 : INFO : worker thread fi

2018-11-07 19:28:31,508 : INFO : EPOCH 2 - PROGRESS: at 46.03% examples, 239940 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:28:32,518 : INFO : EPOCH 2 - PROGRESS: at 46.69% examples, 240142 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:28:33,519 : INFO : EPOCH 2 - PROGRESS: at 47.41% examples, 240165 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:28:34,536 : INFO : EPOCH 2 - PROGRESS: at 48.18% examples, 240124 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:28:35,568 : INFO : EPOCH 2 - PROGRESS: at 48.91% examples, 240088 words/s, in_qsize 8, out_qsize 1
2018-11-07 19:28:36,596 : INFO : EPOCH 2 - PROGRESS: at 49.54% examples, 240150 words/s, in_qsize 6, out_qsize 1
2018-11-07 19:28:37,623 : INFO : EPOCH 2 - PROGRESS: at 50.27% examples, 240269 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:28:38,629 : INFO : EPOCH 2 - PROGRESS: at 51.03% examples, 240111 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:28:39,661 : INFO : EPOCH 2 - PROGRESS: at 51.78% examples, 240047 words/s, in_qsiz

2018-11-07 19:29:46,262 : INFO : EPOCH 2 - PROGRESS: at 97.97% examples, 239066 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:29:47,278 : INFO : EPOCH 2 - PROGRESS: at 98.63% examples, 239101 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:29:48,307 : INFO : EPOCH 2 - PROGRESS: at 99.31% examples, 239125 words/s, in_qsize 6, out_qsize 1
2018-11-07 19:29:49,154 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-07 19:29:49,171 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-07 19:29:49,211 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-07 19:29:49,236 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-07 19:29:49,236 : INFO : EPOCH - 2 : training on 42650553 raw words (34350792 effective words) took 143.6s, 239214 effective words/s
2018-11-07 19:29:50,280 : INFO : EPOCH 3 - PROGRESS: at 0.81% examples, 217287 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:29:51,295 : INFO : EPOCH 3 - P

2018-11-07 19:50:14,110 : INFO : EPOCH 3 - PROGRESS: at 46.29% examples, 230176 words/s, in_qsize 6, out_qsize 1
2018-11-07 19:50:15,114 : INFO : EPOCH 3 - PROGRESS: at 47.05% examples, 230549 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:50:16,138 : INFO : EPOCH 3 - PROGRESS: at 47.77% examples, 230794 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:50:17,147 : INFO : EPOCH 3 - PROGRESS: at 48.53% examples, 231094 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:50:18,161 : INFO : EPOCH 3 - PROGRESS: at 49.23% examples, 231345 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:50:19,209 : INFO : EPOCH 3 - PROGRESS: at 49.89% examples, 231484 words/s, in_qsize 8, out_qsize 0
2018-11-07 19:50:20,234 : INFO : EPOCH 3 - PROGRESS: at 50.70% examples, 231771 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:50:21,240 : INFO : EPOCH 3 - PROGRESS: at 51.48% examples, 232086 words/s, in_qsize 7, out_qsize 0
2018-11-07 19:50:22,250 : INFO : EPOCH 3 - PROGRESS: at 52.21% examples, 232281 words/s, in_qsiz

2018-11-08 09:24:31,587 : INFO : EPOCH 3 - PROGRESS: at 88.97% examples, 160597 words/s, in_qsize 6, out_qsize 1
2018-11-08 09:24:32,603 : INFO : EPOCH 3 - PROGRESS: at 89.58% examples, 161053 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:24:33,604 : INFO : EPOCH 3 - PROGRESS: at 90.22% examples, 161506 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:24:34,613 : INFO : EPOCH 3 - PROGRESS: at 90.96% examples, 161937 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:24:35,680 : INFO : EPOCH 3 - PROGRESS: at 91.61% examples, 162287 words/s, in_qsize 8, out_qsize 1
2018-11-08 09:24:36,728 : INFO : EPOCH 3 - PROGRESS: at 92.35% examples, 162710 words/s, in_qsize 6, out_qsize 1
2018-11-08 09:24:37,729 : INFO : EPOCH 3 - PROGRESS: at 93.14% examples, 163153 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:24:38,741 : INFO : EPOCH 3 - PROGRESS: at 93.90% examples, 163601 words/s, in_qsize 8, out_qsize 0
2018-11-08 09:24:39,755 : INFO : EPOCH 3 - PROGRESS: at 94.55% examples, 163995 words/s, in_qsiz

2018-11-08 09:25:43,326 : INFO : EPOCH 4 - PROGRESS: at 39.31% examples, 242415 words/s, in_qsize 5, out_qsize 2
2018-11-08 09:25:44,381 : INFO : EPOCH 4 - PROGRESS: at 39.94% examples, 242420 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:25:45,385 : INFO : EPOCH 4 - PROGRESS: at 40.71% examples, 242670 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:25:46,396 : INFO : EPOCH 4 - PROGRESS: at 41.43% examples, 242595 words/s, in_qsize 6, out_qsize 1
2018-11-08 09:25:47,446 : INFO : EPOCH 4 - PROGRESS: at 42.14% examples, 242467 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:25:48,466 : INFO : EPOCH 4 - PROGRESS: at 42.96% examples, 242663 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:25:49,559 : INFO : EPOCH 4 - PROGRESS: at 43.68% examples, 242581 words/s, in_qsize 8, out_qsize 0
2018-11-08 09:25:50,648 : INFO : EPOCH 4 - PROGRESS: at 44.34% examples, 242182 words/s, in_qsize 7, out_qsize 1
2018-11-08 09:25:51,662 : INFO : EPOCH 4 - PROGRESS: at 45.05% examples, 242248 words/s, in_qsiz

2018-11-08 09:26:58,477 : INFO : EPOCH 4 - PROGRESS: at 93.83% examples, 246129 words/s, in_qsize 8, out_qsize 0
2018-11-08 09:26:59,516 : INFO : EPOCH 4 - PROGRESS: at 94.52% examples, 246201 words/s, in_qsize 8, out_qsize 2
2018-11-08 09:27:00,563 : INFO : EPOCH 4 - PROGRESS: at 95.29% examples, 246285 words/s, in_qsize 8, out_qsize 0
2018-11-08 09:27:01,609 : INFO : EPOCH 4 - PROGRESS: at 96.08% examples, 246344 words/s, in_qsize 8, out_qsize 1
2018-11-08 09:27:02,613 : INFO : EPOCH 4 - PROGRESS: at 96.87% examples, 246447 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:27:03,655 : INFO : EPOCH 4 - PROGRESS: at 97.65% examples, 246459 words/s, in_qsize 6, out_qsize 1
2018-11-08 09:27:04,679 : INFO : EPOCH 4 - PROGRESS: at 98.36% examples, 246561 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:27:05,680 : INFO : EPOCH 4 - PROGRESS: at 99.07% examples, 246610 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:27:06,689 : INFO : EPOCH 4 - PROGRESS: at 99.74% examples, 246624 words/s, in_qsiz

2018-11-08 09:28:09,621 : INFO : EPOCH 5 - PROGRESS: at 44.92% examples, 246478 words/s, in_qsize 6, out_qsize 1
2018-11-08 09:28:10,622 : INFO : EPOCH 5 - PROGRESS: at 45.59% examples, 246363 words/s, in_qsize 8, out_qsize 2
2018-11-08 09:28:11,646 : INFO : EPOCH 5 - PROGRESS: at 46.35% examples, 246420 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:28:12,668 : INFO : EPOCH 5 - PROGRESS: at 47.07% examples, 246271 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:28:13,679 : INFO : EPOCH 5 - PROGRESS: at 47.70% examples, 245883 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:28:14,695 : INFO : EPOCH 5 - PROGRESS: at 48.41% examples, 245629 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:28:15,705 : INFO : EPOCH 5 - PROGRESS: at 49.11% examples, 245409 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:28:16,709 : INFO : EPOCH 5 - PROGRESS: at 49.69% examples, 245355 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:28:17,779 : INFO : EPOCH 5 - PROGRESS: at 50.44% examples, 245200 words/s, in_qsiz

2018-11-08 09:29:24,550 : INFO : EPOCH 5 - PROGRESS: at 98.41% examples, 245583 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:29:25,579 : INFO : EPOCH 5 - PROGRESS: at 99.10% examples, 245527 words/s, in_qsize 8, out_qsize 0
2018-11-08 09:29:26,613 : INFO : EPOCH 5 - PROGRESS: at 99.84% examples, 245619 words/s, in_qsize 7, out_qsize 0
2018-11-08 09:29:26,745 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-08 09:29:26,765 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-08 09:29:26,800 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-08 09:29:26,826 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-08 09:29:26,826 : INFO : EPOCH - 5 : training on 42650553 raw words (34350486 effective words) took 139.8s, 245702 effective words/s
2018-11-08 09:29:26,832 : INFO : training on a 213252765 raw words (171753162 effective words) took 773.0s, 222185 effective words/s
2018-11-08 09:29:27,183

In [3]:
# Query the nearest neighbours of "gold" in the freshly trained model.
# NOTE(review): the result is neither printed nor assigned, and this is not
# the last expression of the cell, so nothing is displayed for this call.
model_fasttext.wv.most_similar("gold")

# Count the words in the vocabulary
print(len(model_fasttext.wv.vocab)) # 8562

# save model
# Persist the trained model to MODEL_FILE so it can be re-loaded with
# FastText.load().
model_fasttext.save(MODEL_FILE)

2018-11-08 09:31:38,337 : INFO : precomputing L2-norms of word weight vectors
2018-11-08 09:31:38,345 : INFO : precomputing L2-norms of ngram weight vectors
2018-11-08 09:31:38,425 : INFO : saving FastText object under /Users/majiga/Documents/wamex/fastText_wamex_all_300freq.model, separately None
2018-11-08 09:31:38,426 : INFO : not storing attribute vectors_norm
2018-11-08 09:31:38,432 : INFO : not storing attribute vectors_vocab_norm
2018-11-08 09:31:38,432 : INFO : not storing attribute vectors_ngrams_norm
2018-11-08 09:31:38,432 : INFO : not storing attribute buckets_word
2018-11-08 09:31:38,848 : INFO : saved /Users/majiga/Documents/wamex/fastText_wamex_all_300freq.model


8562


In [4]:
# load model
# Re-load the model saved to MODEL_FILE and print its summary.
new_model = FastText.load(MODEL_FILE)
print(new_model)

# Query the re-loaded model (not the in-memory model_fasttext) so this cell
# actually checks that the save/load round trip produced a usable model.
new_model.wv.most_similar('gold')

2018-11-08 09:32:21,990 : INFO : loading FastText object from /Users/majiga/Documents/wamex/fastText_wamex_all_300freq.model
2018-11-08 09:32:22,293 : INFO : loading wv recursively from /Users/majiga/Documents/wamex/fastText_wamex_all_300freq.model.wv.* with mmap=None
2018-11-08 09:32:22,294 : INFO : setting ignored attribute vectors_norm to None
2018-11-08 09:32:22,294 : INFO : setting ignored attribute vectors_vocab_norm to None
2018-11-08 09:32:22,294 : INFO : setting ignored attribute vectors_ngrams_norm to None
2018-11-08 09:32:22,294 : INFO : setting ignored attribute buckets_word to None
2018-11-08 09:32:22,295 : INFO : loading vocabulary recursively from /Users/majiga/Documents/wamex/fastText_wamex_all_300freq.model.vocabulary.* with mmap=None
2018-11-08 09:32:22,295 : INFO : loading trainables recursively from /Users/majiga/Documents/wamex/fastText_wamex_all_300freq.model.trainables.* with mmap=None
2018-11-08 09:32:22,295 : INFO : loaded /Users/majiga/Documents/wamex/fastText

FastText(vocab=8562, size=100, alpha=0.025)


[('au', 0.7181254625320435),
 ('copper', 0.6624714732170105),
 ('nickel', 0.6557887196540833),
 ('metal', 0.6335586905479431),
 ('precious', 0.6275296807289124),
 ('arsenic', 0.6186712384223938),
 ('antimony', 0.6167358756065369),
 ('tungsten', 0.6000398993492126),
 ('historically', 0.5887951850891113),
 ('resolute', 0.5784502029418945)]

In [5]:
# Ad-hoc similarity queries against the trained embeddings.
# model_fasttext.wv.most_similar('kalgoorlie')

# 'iron-ore' cannot occur as a training token: read_clean_file() replaces
# '-' with a space before tokenizing. Presumably FastText still answers by
# composing a vector from character n-grams - confirm.
print(model_fasttext.wv.most_similar('iron-ore'))

# Ten nearest neighbours of 'iron'.
print(model_fasttext.wv.most_similar('iron', topn=10))


# Analogy-style query: kalgoorlie + iron-ore - gold.
print(model_fasttext.wv.most_similar(positive=['kalgoorlie','iron-ore'],negative=['gold']))

# NOTE(review): this line reads from the re-loaded new_model while the
# queries above use model_fasttext.
print(new_model.wv['gold'])  # numpy vector of a word

[('iron', 0.7587395310401917), ('ironcap', 0.6327366828918457), ('marra', 0.5913161039352417), ('brockman', 0.5766406655311584), ('bif', 0.5754503607749939), ('sinosteel', 0.5720453262329102), ('nammuldi', 0.5704472661018372), ('manganese', 0.5699800252914429), ('banded', 0.5622855424880981), ('finnerty', 0.5618066787719727)]
[('marra', 0.6866266131401062), ('brockman', 0.6819794774055481), ('hematite', 0.6760455369949341), ('banded', 0.6720453500747681), ('detritals', 0.6715987920761108), ('bid', 0.6652620434761047), ('bif', 0.6647725701332092), ('ore', 0.6554528474807739), ('manganese', 0.6512892842292786), ('dso', 0.6481667160987854)]
[('esperance', 0.47153985500335693), ('bunbury', 0.45931476354599), ('pannawonica', 0.45913761854171753), ('karratha', 0.44624924659729004), ('geraldton', 0.4430530369281769), ('coolgardie', 0.42476966977119446), ('balladonia', 0.42342591285705566), ('vetters', 0.41675683856010437), ('hyden', 0.4077901244163513), ('bulong', 0.404255747795105)]
[ 0.0562

In [6]:
# Dump every word kept in the model vocabulary (8562 entries according to
# the count printed when the model was saved).
words = list(model_fasttext.wv.vocab)
print(words)

