From 44372091d6a2ca0a88ba262eecbe8bcaccbc52d6 Mon Sep 17 00:00:00 2001 From: mebrunet Date: Mon, 5 Nov 2018 15:17:12 -0500 Subject: [PATCH] Tweaks to helpers. --- scripts/analogy.sh | 10 +++++----- src/Bias.jl | 21 +++++++++++++++++++-- src/utils.jl | 14 +++++++------- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/scripts/analogy.sh b/scripts/analogy.sh index c2b34ea..4721ac0 100755 --- a/scripts/analogy.sh +++ b/scripts/analogy.sh @@ -2,18 +2,18 @@ BASE_DIR="$(dirname $0)/.." -EMBED_BIN=$PWD/$1 # Get embedding's abs path +EMBED_BIN=$1 +VOCAB=$2 EMBED_TXT=${EMBED_BIN%.*}.txt.tmp # Create name for txt vectors cd $BASE_DIR/GloVe # change directory to GloVe -VOCAB=`julia --project -e " +`julia --project -e " include(\"../src/GloVe.jl\"); -M=GloVe.load_model(\"$EMBED_BIN\"); +M=GloVe.load_model(\"$EMBED_BIN\", \"$VOCAB\"); GloVe.save_text_vectors(\"$EMBED_TXT\", M.W, M.ivocab); -print(M.vocab_path) "` -../venv/bin/python eval/python/evaluate.py --vocab_file $VOCAB --vectors_file $EMBED_TXT +python eval/python/evaluate.py --vocab_file $VOCAB --vectors_file $EMBED_TXT rm $EMBED_TXT diff --git a/src/Bias.jl b/src/Bias.jl index 883f2a8..d3d234e 100644 --- a/src/Bias.jl +++ b/src/Bias.jl @@ -1,6 +1,9 @@ module Bias -using LinearAlgebra, Statistics, Random +using LinearAlgebra +using Statistics +using Random +using SparseArrays include("word_sets.jl") @@ -14,11 +17,16 @@ function get_weat_idx_set(word_set::NamedTuple, vocab::Dict) end -function normalize_rows(X::Array{Float64,2}) +function normalize_rows(X::AbstractArray) return mapslices(normalize, X, dims=2) end +function normalize_rows(X::SparseMatrixCSC) + return mapslices(normalize, X, dims=1)' +end + + function effect_size(S::AbstractArray, T::AbstractArray, A::AbstractArray, B::AbstractArray) Ŝ = normalize_rows(S) @@ -46,6 +54,15 @@ function effect_size(W, weat_idx_set::NamedTuple) return effect_size(S, T, A, B) end +function effect_size(X::SparseMatrixCSC, weat_idx_set::NamedTuple) + S = X[:, weat_idx_set.S] + T = X[:, weat_idx_set.T] + A = X[:, weat_idx_set.A] + B = X[:, weat_idx_set.B] + return effect_size(S, T, A, B) +end + + # Helper to compute effect size after changes to the embedding function effect_size(W, weat_idx_set::NamedTuple, deltas::Dict) diff --git a/src/utils.jl b/src/utils.jl index d6942b3..fd46884 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -46,13 +46,13 @@ end # Get file info from naming convention function fileinfo(filepath) - corpus = extract(filepath, r"-C[0-9]+-", trim=(2,1), cast=Int64) - min_vocab = extract(filepath, r"-V[0-9]+-", trim=(2,1), cast=Int64) - window = extract(filepath, r"-W[0-9]+-", trim=(2,1), cast=Int64) - dimension = extract(filepath, r"-D[0-9]+-", trim=(2,1), cast=Int64) - eta = extract(filepath, r"-R[0-9]+.[0-9]+-", trim=(2,1), cast=Float64) - max_iters = extract(filepath, r"-E[0-9]+-", trim=(2,1), cast=Int64) - seed = extract(filepath, r"-S[0-9]+.", trim=(2,1), cast=Int64) + corpus = extract(filepath, r"C[0-9]+", trim=(1,0), cast=Int64) + min_vocab = extract(filepath, r"V[0-9]+", trim=(1,0), cast=Int64) + window = extract(filepath, r"W[0-9]+", trim=(1,0), cast=Int64) + dimension = extract(filepath, r"D[0-9]+", trim=(1,0), cast=Int64) + eta = extract(filepath, r"R[0-9]+.[0-9]+", trim=(1,0), cast=Float64) + max_iters = extract(filepath, r"E[0-9]+", trim=(1,0), cast=Int64) + seed = extract(filepath, r"S[0-9]+", trim=(1,0), cast=Int64) tmp = extract(filepath, r".[0-9]{3}.bin$", trim=(1, 4), cast=Int64) iters = tmp == nothing ? max_iters : tmp return (corpus=corpus, min_vocab=min_vocab, window=window, dimension=dimension,