From c2b9aed00349875c21ec1f390e9d3706d4800b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 20 Dec 2014 18:12:37 +0100 Subject: [PATCH] update docs --- README.md | 4 ++-- run_all.sh | 1 - run_embed.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 96cf100..dbe29c1 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@ Evaluation of word embeddings ============================= Code for the blog post evaluating word2vec, GloVe, SPPMI and SPPMI-SVD methods: -[Making sense of word2vec](http://radimrehurek.com/2014/12/making-sense-of-word2vec/) + +[Making sense of word2vec](http://radimrehurek.com/2014/12/making-sense-of-word2vec/). Run `run_all.sh` to run all experiments. Logs with results will be stored in the data directory. To replicate my results from the blog article, download and preprocess Wikipedia using [this code](https://github.com/piskvorky/sim-shootout). - You can use your own corpus though (the corpus path is a parameter to `run_all.sh`). diff --git a/run_all.sh b/run_all.sh index 0e83608..e5c6385 100755 --- a/run_all.sh +++ b/run_all.sh @@ -13,7 +13,6 @@ fi input_corpus=$1 questions=$2 outdir=$3 -shift 3 mkdir -p $outdir 2> /dev/null diff --git a/run_embed.py b/run_embed.py index 277a00d..42edae2 100644 --- a/run_embed.py +++ b/run_embed.py @@ -181,7 +181,7 @@ def raw2ppmi(cooccur, word2id, k_shift=1.0): cooccur /= marginal_word[:, None] # #(w, c) / #w cooccur /= marginal_context # #(w, c) / (#w * #c) cooccur *= marginal_word.sum() # #(w, c) * D / (#w * #c) - numpy.log(cooccur, out=cooccur) # log(#(w, c) * D / (#w * #c)) + numpy.log(cooccur, out=cooccur) # PMI = log(#(w, c) * D / (#w * #c)) logger.info("shifting PMI scores by log(k) with k=%s" % (k_shift, )) cooccur -= numpy.log(k_shift) # shifted PMI = log(#(w, c) * D / (#w * #c)) - log(k) @@ -278,7 +278,7 @@ def __init__(self, corpus, id2word, s_exponent=0.0): cooccur = utils.unpickle(outf('glove_corpus')) else: logger.info("glove corpus matrix not found, creating") - cooccur = glove.Corpus() + cooccur = glove.Corpus(dictionary=word2id) cooccur.fit(corpus(), window=WINDOW) utils.pickle(cooccur, outf('glove_corpus')) model = glove.Glove(no_components=DIM, learning_rate=0.05)