From 44372091d6a2ca0a88ba262eecbe8bcaccbc52d6 Mon Sep 17 00:00:00 2001
From: mebrunet <marcetiennebrunet@gmail.com>
Date: Mon, 5 Nov 2018 15:17:12 -0500
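Subject: [PATCH] Tweak helpers: vocab arg, sparse WEAT methods, looser fileinfo

* scripts/analogy.sh: take the vocab file as a second argument and pass it
  to GloVe.load_model instead of printing M.vocab_path; stop prefixing the
  embedding path with $PWD; run `python` from PATH rather than ../venv.
* src/Bias.jl: load SparseArrays; widen normalize_rows to AbstractArray and
  add SparseMatrixCSC methods for normalize_rows and effect_size that work
  on column slices.
* src/utils.jl: fileinfo patterns no longer require the surrounding `-`/`.`
  delimiters around each field.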

---
 scripts/analogy.sh | 10 +++++-----
 src/Bias.jl        | 21 +++++++++++++++++++--
 src/utils.jl       | 14 +++++++-------
 3 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/scripts/analogy.sh b/scripts/analogy.sh
index c2b34ea..4721ac0 100755
--- a/scripts/analogy.sh
+++ b/scripts/analogy.sh
@@ -2,18 +2,18 @@
 
 BASE_DIR="$(dirname $0)/.."
 
-EMBED_BIN=$PWD/$1 # Get embedding's abs path
+EMBED_BIN=$1
+VOCAB=$2
 EMBED_TXT=${EMBED_BIN%.*}.txt.tmp # Create name for txt vectors
 
 cd $BASE_DIR/GloVe # change directory to GloVe
 
-VOCAB=`julia --project -e "
+`julia --project -e "
 include(\"../src/GloVe.jl\");
-M=GloVe.load_model(\"$EMBED_BIN\");
+M=GloVe.load_model(\"$EMBED_BIN\", \"$VOCAB\");
 GloVe.save_text_vectors(\"$EMBED_TXT\", M.W, M.ivocab);
-print(M.vocab_path)
 "`
 
-../venv/bin/python eval/python/evaluate.py --vocab_file $VOCAB --vectors_file $EMBED_TXT
+python eval/python/evaluate.py --vocab_file $VOCAB --vectors_file $EMBED_TXT
 
 rm $EMBED_TXT
diff --git a/src/Bias.jl b/src/Bias.jl
index 883f2a8..d3d234e 100644
--- a/src/Bias.jl
+++ b/src/Bias.jl
@@ -1,6 +1,9 @@
 module Bias
 
-using LinearAlgebra, Statistics, Random
+using LinearAlgebra
+using Statistics
+using Random
+using SparseArrays
 
 include("word_sets.jl")
 
@@ -14,11 +17,16 @@ function get_weat_idx_set(word_set::NamedTuple, vocab::Dict)
 end
 
 
-function normalize_rows(X::Array{Float64,2})
+function normalize_rows(X::AbstractArray)
     return mapslices(normalize, X, dims=2)
 end
 
 
+function normalize_rows(X::SparseMatrixCSC)
+    return adjoint(mapslices(normalize, X; dims=1))  # unit-norm columns of X become rows
+end
+
+
 function effect_size(S::AbstractArray, T::AbstractArray, A::AbstractArray,
         B::AbstractArray)
     Ŝ = normalize_rows(S)
@@ -46,6 +54,15 @@ function effect_size(W, weat_idx_set::NamedTuple)
     return effect_size(S, T, A, B)
 end
 
+function effect_size(X::SparseMatrixCSC, weat_idx_set::NamedTuple)
+    # Each index set in weat_idx_set (S, T, A, B) selects a column slice of X;
+    # the four slices are then handed to the four-matrix method.
+    cols(key) = X[:, getfield(weat_idx_set, key)]
+
+    return effect_size(cols(:S), cols(:T), cols(:A), cols(:B))
+end
+
+
 
 # Helper to compute effect size after changes to the embedding
 function effect_size(W, weat_idx_set::NamedTuple, deltas::Dict)
diff --git a/src/utils.jl b/src/utils.jl
index d6942b3..fd46884 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -46,13 +46,13 @@ end
 
 # Get file info from naming convention
 function fileinfo(filepath)
-    corpus = extract(filepath, r"-C[0-9]+-", trim=(2,1), cast=Int64)
-    min_vocab = extract(filepath, r"-V[0-9]+-", trim=(2,1), cast=Int64)
-    window = extract(filepath, r"-W[0-9]+-", trim=(2,1), cast=Int64)
-    dimension = extract(filepath, r"-D[0-9]+-", trim=(2,1), cast=Int64)
-    eta = extract(filepath, r"-R[0-9]+.[0-9]+-", trim=(2,1), cast=Float64)
-    max_iters = extract(filepath, r"-E[0-9]+-", trim=(2,1), cast=Int64)
-    seed = extract(filepath, r"-S[0-9]+.", trim=(2,1), cast=Int64)
+    corpus = extract(filepath, r"C[0-9]+", trim=(1,0), cast=Int64)
+    min_vocab = extract(filepath, r"V[0-9]+", trim=(1,0), cast=Int64)
+    window = extract(filepath, r"W[0-9]+", trim=(1,0), cast=Int64)
+    dimension = extract(filepath, r"D[0-9]+", trim=(1,0), cast=Int64)
+    eta = extract(filepath, r"R[0-9]+.[0-9]+", trim=(1,0), cast=Float64)
+    max_iters = extract(filepath, r"E[0-9]+", trim=(1,0), cast=Int64)
+    seed = extract(filepath, r"S[0-9]+", trim=(1,0), cast=Int64)
     tmp = extract(filepath, r".[0-9]{3}.bin$", trim=(1, 4), cast=Int64)
     iters = tmp == nothing ? max_iters : tmp
     return (corpus=corpus, min_vocab=min_vocab, window=window, dimension=dimension,