-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
doc2vec_inner.pxd
95 lines (72 loc) · 3.54 KB
/
doc2vec_inner.pxd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env cython
# distutils: language = c++
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
#
# shared type definitions for doc2vec_inner
# used from doc2vec_corpusfile
#
# Copyright (C) 2018 Dmitry Persiyanov <dmitry.persiyanov@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
import numpy as np
cimport numpy as np
from word2vec_inner cimport REAL_t
# Compile-time constant: the element count of every fixed-size array member of
# the Doc2VecConfig struct declared in this file.
# NOTE(review): newer Cython releases reportedly deprecate `DEF` -- confirm before upgrading the toolchain.
DEF MAX_DOCUMENT_LEN = 10000
# C-level struct gathering the integer options, raw buffer pointers and
# fixed-capacity per-document arrays declared for doc2vec_inner (and, per the
# file header, used from doc2vec_corpusfile). Member order is significant for
# the generated C struct and is kept exactly as originally declared.
cdef struct Doc2VecConfig:
    # Integer options / switches.
    int hs
    int negative
    int sample
    int learn_doctags
    int learn_words
    int learn_hidden
    int train_words
    int cbow_mean

    # Integer sizes, counts and indexes.
    int document_len
    int doctag_len
    int window
    int expected_doctag_len
    int null_word_index
    int workers
    int docvecs_count

    # Pointers into REAL_t buffers; each *_lockf pointer is followed by an
    # unsigned 32-bit companion (presumably that buffer's length -- confirm in
    # the implementation).
    REAL_t *word_vectors
    REAL_t *doctag_vectors
    REAL_t *words_lockf
    np.uint32_t words_lockf_len
    REAL_t *doctags_lockf
    np.uint32_t doctags_lockf_len
    REAL_t *work
    REAL_t *neu1
    REAL_t alpha
    int layer1_size
    int vector_size

    # Fixed-capacity arrays, MAX_DOCUMENT_LEN slots each.
    int codelens[MAX_DOCUMENT_LEN]
    np.uint32_t indexes[MAX_DOCUMENT_LEN]
    np.uint32_t doctag_indexes[MAX_DOCUMENT_LEN]
    np.uint32_t window_indexes[MAX_DOCUMENT_LEN]
    np.uint32_t reduced_windows[MAX_DOCUMENT_LEN]

    # Members used for hierarchical softmax.
    REAL_t *syn1
    np.uint32_t *points[MAX_DOCUMENT_LEN]
    np.uint8_t *codes[MAX_DOCUMENT_LEN]

    # Members used for negative sampling.
    REAL_t *syn1neg
    np.uint32_t *cum_table
    unsigned long long cum_table_len
    unsigned long long next_random
# Prototype only -- the body is not part of this declaration file.
# Returns nothing and is declared `nogil`, so it can be called without the GIL.
# NOTE(review): the name suggests the DBOW / hierarchical-softmax training
# step; confirm against the implementation.
cdef void fast_document_dbow_hs(
        const np.uint32_t *word_point,
        const np.uint8_t *word_code,
        const int codelen,
        REAL_t *context_vectors,
        REAL_t *syn1,
        const int size,
        const np.uint32_t context_index,
        const REAL_t alpha,
        REAL_t *work,
        int learn_context,
        int learn_hidden,
        REAL_t *contexts_lockf,
        const np.uint32_t contexts_lockf_len
) nogil
# Prototype only -- the body is not part of this declaration file.
# Declared `nogil`; returns an unsigned long long (presumably the advanced
# `next_random` state -- confirm against the implementation).
# NOTE(review): the name suggests the DBOW / negative-sampling training step.
cdef unsigned long long fast_document_dbow_neg(
        const int negative,
        np.uint32_t *cum_table,
        unsigned long long cum_table_len,
        REAL_t *context_vectors,
        REAL_t *syn1neg,
        const int size,
        const np.uint32_t word_index,
        const np.uint32_t context_index,
        const REAL_t alpha,
        REAL_t *work,
        unsigned long long next_random,
        int learn_context,
        int learn_hidden,
        REAL_t *contexts_lockf,
        const np.uint32_t contexts_lockf_len
) nogil
# Prototype only -- the body is not part of this declaration file.
# Returns nothing and is declared `nogil`, so it can be called without the GIL.
# NOTE(review): the name suggests the DM / hierarchical-softmax training step;
# confirm against the implementation.
cdef void fast_document_dm_hs(
        const np.uint32_t *word_point,
        const np.uint8_t *word_code,
        int word_code_len,
        REAL_t *neu1,
        REAL_t *syn1,
        const REAL_t alpha,
        REAL_t *work,
        const int size,
        int learn_hidden
) nogil
# Prototype only -- the body is not part of this declaration file.
# Declared `nogil`; returns an unsigned long long (presumably the advanced
# `next_random` state -- confirm against the implementation).
# NOTE(review): the name suggests the DM / negative-sampling training step.
cdef unsigned long long fast_document_dm_neg(
        const int negative,
        np.uint32_t *cum_table,
        unsigned long long cum_table_len,
        unsigned long long next_random,
        REAL_t *neu1,
        REAL_t *syn1neg,
        const int predict_word_index,
        const REAL_t alpha,
        REAL_t *work,
        const int size,
        int learn_hidden
) nogil
# Prototype only -- the body is not part of this declaration file.
# Returns nothing and is declared `nogil`, so it can be called without the GIL.
# Unlike fast_document_dm_hs, it takes separate `layer1_size` and `vector_size`.
# NOTE(review): the name suggests the DM-concatenation / hierarchical-softmax
# training step; confirm against the implementation.
cdef void fast_document_dmc_hs(
        const np.uint32_t *word_point,
        const np.uint8_t *word_code,
        int word_code_len,
        REAL_t *neu1,
        REAL_t *syn1,
        const REAL_t alpha,
        REAL_t *work,
        const int layer1_size,
        const int vector_size,
        int learn_hidden
) nogil
# Prototype only -- the body is not part of this declaration file.
# Declared `nogil`; returns an unsigned long long (presumably the advanced
# `next_random` state -- confirm against the implementation).
# Unlike fast_document_dm_neg, it takes separate `layer1_size` and `vector_size`.
# NOTE(review): the name suggests the DM-concatenation / negative-sampling step.
cdef unsigned long long fast_document_dmc_neg(
        const int negative,
        np.uint32_t *cum_table,
        unsigned long long cum_table_len,
        unsigned long long next_random,
        REAL_t *neu1,
        REAL_t *syn1neg,
        const int predict_word_index,
        const REAL_t alpha,
        REAL_t *work,
        const int layer1_size,
        const int vector_size,
        int learn_hidden
) nogil
# Prototype only -- the body is not part of this declaration file.
# Takes a pointer to a Doc2VecConfig plus untyped (Python object) arguments.
# No return type is declared, so the function returns a Python object; it is
# not declared `nogil`.
# Parameters written `name=*` are optional: their default values are given by
# the implementation, not here.
cdef init_d2v_config(
        Doc2VecConfig *c,
        model,
        alpha,
        learn_doctags,
        learn_words,
        learn_hidden,
        train_words=*,
        work=*,
        neu1=*,
        word_vectors=*,
        words_lockf=*,
        doctag_vectors=*,
        doctags_lockf=*,
        docvecs_count=*
)