Skip to content
Permalink
Browse files

word2vec: Update to 20170716

  • Loading branch information
ryandesign committed Jun 29, 2020
1 parent d05db1d commit 6c99b08220ed2d2054a64393fa124d00c84a59a7
@@ -3,15 +3,12 @@
PortSystem 1.0
PortGroup github 1.0

github.setup tmikolov word2vec d83ccfba4dd08f113d5e75d67c9f6cd30f6532c4
version 20150131
github.setup tmikolov word2vec 20c129af10659f7c50e86e3be406df663beff438
version 20170716
revision 0
checksums rmd160 07cc774a7b4141f86dab449fef83554cfbc1d6f7 \
sha256 48ef7b400f4c1040a314f23901cccb864055d985ac208d5ea5ca22b7f80ad9c8 \
size 104573

# Newer versions do not build on macOS:
# https://github.com/tmikolov/word2vec/issues/48
checksums rmd160 de98886c52303242566eacd5a3eaf4459026bd71 \
sha256 e546d2c1213ca55fd2291a638139ef2730c3f0b737b2aa42dd98e25936a226c0 \
size 104875

categories textproc
maintainers nomaintainer
@@ -30,13 +27,15 @@ github.tarball_from archive

depends_run port:wget

patchfiles patch-malloc.diff \
patchfiles fgetc_unlocked-fputc_unlocked.patch \
prevent-early-exit.patch \
patch-compute-accuracy.c.diff \
patch-demo.diff

use_configure no
variant universal {}

# This is what the makefile uses.
configure.optflags -O3

build.args CC="${configure.cc}" \
@@ -55,10 +54,7 @@ destroot {
xinstall -m 0644 -W ${worksrcpath} \
questions-phrases.txt questions-words.txt \
${destroot}${exdir}
# fix demo scripts.
foreach f [glob ${destroot}${execdir}/demo-*.sh] {
reinplace "s|@EXECDIR@|${execdir}|g" ${f}
}
reinplace "s|@EXECDIR@|${execdir}|g" {*}[glob ${destroot}${execdir}/demo-*.sh]
set docdir ${prefix}/share/doc/${name}
xinstall -d ${destroot}${docdir}
xinstall -m 0644 -W ${worksrcpath} \
@@ -0,0 +1,52 @@
Fix build on Darwin by replacing GNU-specific fgetc_unlocked with
getc_unlocked and fputc_unlocked with putc_unlocked.

https://github.com/tmikolov/word2vec/issues/48
https://github.com/tmikolov/word2vec/pull/40
--- word2phrase.c.orig
+++ word2phrase.c
@@ -42,7 +42,7 @@ unsigned long long next_random = 1;
void ReadWord(char *word, FILE *fin, char *eof) {
int a = 0, ch;
while (1) {
- ch = fgetc_unlocked(fin);
+ ch = getc_unlocked(fin);
if (ch == EOF) {
*eof = 1;
break;
@@ -246,7 +246,7 @@ void TrainModel() {
if (eof) break;
if (word[0] == '\n') {
//fprintf(fo, "\n");
- fputc_unlocked('\n', fo);
+ putc_unlocked('\n', fo);
continue;
}
cn++;
@@ -286,12 +286,12 @@ void TrainModel() {
next_random = next_random * (unsigned long long)25214903917 + 11;
//if (next_random & 0x10000) score = 0;
if (score > threshold) {
- fputc_unlocked('_', fo);
+ putc_unlocked('_', fo);
pb = 0;
- } else fputc_unlocked(' ', fo);
+ } else putc_unlocked(' ', fo);
a = 0;
while (word[a]) {
- fputc_unlocked(word[a], fo);
+ putc_unlocked(word[a], fo);
a++;
}
pa = pb;
--- word2vec.c.orig
+++ word2vec.c
@@ -71,7 +71,7 @@ void InitUnigramTable() {
void ReadWord(char *word, FILE *fin, char *eof) {
int a = 0, ch;
while (1) {
- ch = fgetc_unlocked(fin);
+ ch = getc_unlocked(fin);
if (ch == EOF) {
*eof = 1;
break;
@@ -1,8 +1,8 @@
--- compute-accuracy.c.orig 2014-02-23 20:29:23.000000000 +0900
+++ compute-accuracy.c 2014-02-23 20:30:44.000000000 +0900
--- compute-accuracy.c.orig 2017-07-16 17:46:08.000000000 -0500
+++ compute-accuracy.c 2020-06-29 10:24:43.000000000 -0500
@@ -28,7 +28,8 @@
FILE *f;
char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size];
float dist, len, bestd[N], vec[max_size];
- long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
+ long long words, size, b, c, d, b1, b2, b3, threshold = 0;
@@ -1,3 +1,4 @@
Adjust demo paths so they work regardless of the current working directory.
--- demo-analogy.sh.orig 2014-09-07 01:54:27.000000000 +0900
+++ demo-analogy.sh 2014-12-24 22:55:24.000000000 +0900
@@ -7,5 +7,5 @@

This file was deleted.

@@ -0,0 +1,14 @@
Prevent the last thread from exiting early when running multiple iterations.

https://github.com/tmikolov/word2vec/issues/47
https://github.com/tmikolov/word2vec/pull/39
--- word2vec.c.orig
+++ word2vec.c
@@ -424,6 +424,7 @@ void *TrainModelThread(void *id) {
last_word_count = 0;
sentence_length = 0;
fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+ eof = 0;
continue;
}
word = sen[sentence_position];

0 comments on commit 6c99b08

Please sign in to comment.
You can’t perform that action at this time.