#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
#
# shared type definitions for fasttext_inner
# used from fasttext_corpusfile
#
import numpy as np
cimport numpy as np

from word2vec_inner cimport REAL_t

DEF MAX_SENTENCE_LEN = 10000

cdef struct FastTextConfig:
    #
    # Contains model parameters and indices required for training.
    #
    # This struct performs two main roles. First, it offers a lower-level
    # abstraction over the gensim.models.fasttext.FastText model class,
    # keeping some of its attributes as C types.
    #
    # The second role is to index batches of the corpus in a way that is
    # convenient for FastText training. More specifically, this index is
    # flat: it arranges all tokens in a conceptually one-dimensional array,
    # skipping OOV terms and empty sentences.
    #
    # Once this struct is fully initialized, it is sufficient for training.
    # Because it consists entirely of C-level data types, it can exist
    # without the GIL, enabling faster processing and parallelization.
    #
    # Example usage (a hedged sketch appears at the end of this file):
    #
    #     1) init_ft_config: initialize the struct, allocate working memory
    #     2) populate_ft_config: populate the indices
    #     3) fasttext_train_any: perform the actual training
    #
    #
    # Model parameters. These get copied as-is from the Python model.
    #
    int sg, hs, negative, sample, size, window, cbow_mean, workers
    REAL_t alpha
    #
    # The syn0_vocab and syn0_ngrams arrays store vectors for vocabulary
    # terms and ngrams, respectively, as 1D arrays in scanline order. For
    # example, syn0_vocab[i * size : (i + 1) * size] contains the elements
    # for the ith vocab term.
    #
    REAL_t *syn0_vocab
    REAL_t *syn0_ngrams
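    #
    # Illustrative sketch, not part of the original file: assuming c is a
    # FastTextConfig pointer, the flat layout lets nogil code address the
    # ith vocab vector with plain pointer arithmetic.
    #
    #     cdef REAL_t *vec = &c.syn0_vocab[i * c.size]
    #     # vec[0] .. vec[c.size - 1] hold the ith term's elements
    #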
    #
    # EXPERIMENTAL
    # The arrays below selectively enable/disable training for specific
    # vocab terms and ngrams. If vocab_lockf[i] is 0.0, training of the ith
    # term is disabled; if it is 1.0, it trains normally. Other values
    # scale the updates up or down. If an array is shorter than the number
    # of vocab terms/ngrams, it is indexed as (index % array_length), so a
    # minimal single-element lockf can apply to all slots.
    #
    REAL_t *vocab_lockf
    np.uint32_t vocab_lockf_len
    REAL_t *ngrams_lockf
    np.uint32_t ngrams_lockf_len
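    #
    # Hypothetical sketch of the modulo indexing described above: the
    # update for vocab row i is scaled by its lock factor, and a
    # single-element array makes every row share slot 0.
    #
    #     scale = c.vocab_lockf[i % c.vocab_lockf_len]
    #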
    #
    # Working memory. These are typically large enough to hold a single
    # vector each.
    #
    REAL_t *work
    REAL_t *neu1
    #
    # Most of the arrays below are indexed by the ordinal number of a word
    # (also known as a term or token) within the batch. For example:
    #
    # - indexes[N]: the index of the Nth token within the vocabulary
    # - reduced_windows[N]: a random integer by which to shrink the window
    #   around the Nth token
    #
    np.uint32_t indexes[MAX_SENTENCE_LEN]
    np.uint32_t reduced_windows[MAX_SENTENCE_LEN]
    #
    # We keep track of sentence boundaries here. The tokens of the Xth
    # sentence will be between [sentence_idx[X], sentence_idx[X + 1]).
    #
    int sentence_idx[MAX_SENTENCE_LEN + 1]
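    #
    # Illustrative sketch, assuming c is a populated FastTextConfig
    # pointer: visiting every token of the Xth sentence through the
    # boundaries stored above.
    #
    #     for t in range(c.sentence_idx[X], c.sentence_idx[X + 1]):
    #         word_index = c.indexes[t]  # vocabulary index of token t
    #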
    # For hierarchical softmax
    REAL_t *syn1
    np.uint32_t *points[MAX_SENTENCE_LEN]
    #
    # Each vocabulary term has a binary code, with frequent terms having
    # shorter codes. The codes get assigned by the _assign_binary_codes
    # function in the gensim.models.word2vec module. Since the lengths of
    # the codes vary, and this is C, we need to keep the length of each
    # code as well as the codes themselves.
    #
    np.uint8_t *codes[MAX_SENTENCE_LEN]
    int codelens[MAX_SENTENCE_LEN]
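    #
    # Illustrative sketch, not part of the original file: walking the
    # binary code of the Nth token. Each step pairs an output-node row
    # (points) with the expected bit (codes).
    #
    #     for b in range(c.codelens[N]):
    #         node_row = c.points[N][b]   # row into syn1
    #         expected = c.codes[N][b]    # code bit, 0 or 1
    #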
    # For negative sampling
    REAL_t *syn1neg
    np.uint32_t *cum_table
    unsigned long long cum_table_len
    # for sampling (negative and frequent-word downsampling)
    unsigned long long next_random
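    #
    # Hedged sketch of the usual cumulative-table draw (the authoritative
    # version lives in fasttext_inner.pyx): a random value is located in
    # cum_table via binary search, then next_random is advanced with a
    # linear congruential step; the constants below are assumptions taken
    # from the word2vec lineage of this code.
    #
    #     r = (c.next_random >> 16) % c.cum_table[c.cum_table_len - 1]
    #     # bisect cum_table for the first entry greater than r; that
    #     # index is the sampled (negative) word
    #     c.next_random = (c.next_random * 25214903917ULL + 11) & 281474976710655ULL
    #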
    #
    # For passing subword information around as C objects for nogil code.
    # More specifically, subwords_idx[i] is an array containing the bucket
    # indices of the ngrams (subwords) of the ith token. Since this is C,
    # we also need to store the length of that array separately: that's
    # what subwords_idx_len is for.
    #
    int subwords_idx_len[MAX_SENTENCE_LEN]
    np.uint32_t *subwords_idx[MAX_SENTENCE_LEN]
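    #
    # Illustrative sketch, assuming c is a populated FastTextConfig
    # pointer: iterating over the ngram buckets of the Nth token.
    #
    #     for j in range(c.subwords_idx_len[N]):
    #         bucket = c.subwords_idx[N][j]
    #         # its vector starts at &c.syn0_ngrams[bucket * c.size]
    #
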
#
# See fasttext_inner.pyx for documentation on the functions below.
#
cdef void init_ft_config(FastTextConfig *c, model, alpha, _work, _neu1)
cdef object populate_ft_config(FastTextConfig *c, vocab, buckets_word, sentences)
cdef void fasttext_fast_sentence_sg_neg(FastTextConfig *c, int i, int j) nogil
cdef void fasttext_fast_sentence_sg_hs(FastTextConfig *c, int i, int j) nogil
cdef void fasttext_fast_sentence_cbow_neg(FastTextConfig *c, int i, int j, int k) nogil
cdef void fasttext_fast_sentence_cbow_hs(FastTextConfig *c, int i, int j, int k) nogil
cdef void fasttext_train_any(FastTextConfig *c, int num_sentences) nogil
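#
# Hedged usage sketch, not part of the original header: roughly how a .pyx
# module might drive the three-step flow documented on the struct. The
# names model, alpha, _work, _neu1, vocab, buckets_word, sentences and
# num_sentences stand in for objects the caller prepares; see
# fasttext_inner.pyx for the real call sites and return values.
#
#     cdef FastTextConfig c
#     init_ft_config(&c, model, alpha, _work, _neu1)      # 1) copy params, set buffers
#     result = populate_ft_config(&c, vocab, buckets_word, sentences)  # 2) build the flat index
#     # ... derive num_sentences from result (see fasttext_inner.pyx) ...
#     with nogil:
#         fasttext_train_any(&c, num_sentences)           # 3) train without the GIL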