-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
word2vec_corpusfile.pxd
69 lines (53 loc) · 2.12 KB
/
word2vec_corpusfile.pxd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# distutils: language = c++
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
#
# shared type definitions for word2vec_corpusfile
# also used from fasttext_corpusfile and doc2vec_corpusfile
#
# Copyright (C) 2018 Dmitry Persiyanov <dmitry.persiyanov@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
from libcpp.string cimport string
from libcpp.vector cimport vector
from libcpp.unordered_map cimport unordered_map
from libcpp cimport bool as bool_t
cimport numpy as np
# Single-precision float alias (NumPy float32). It is the type of the alpha
# arguments and return values of the functions declared at the end of this file.
ctypedef np.float32_t REAL_t
# Binding for the C++ class FastLineSentence declared in "fast_line_sentence.h".
# Members marked `except +` have C++ exceptions translated into Python
# exceptions; members marked `nogil` may be called without holding the GIL.
cdef extern from "fast_line_sentence.h":
    cdef cppclass FastLineSentence:
        # Constructors: default, and (string, size_t).
        # NOTE(review): the two arguments are presumably the corpus path and a
        # starting offset -- confirm against the header.
        FastLineSentence() except +
        FastLineSentence(string&, size_t) except +

        # Returns one sentence as a vector of C++ strings.
        vector[string] ReadSentence() nogil except +

        # End-of-input query and rewind; neither declares an exception clause.
        bool_t IsEof() nogil
        void Reset() nogil
# Extension type that holds a pointer to a C++ FastLineSentence.
# Only the attribute layout and method signatures are declared here; the
# method bodies are defined in the accompanying .pyx.
cdef class CythonLineSentence:
    # --- C-level state -----------------------------------------------------
    # Pointer to the C++ FastLineSentence object.
    cdef FastLineSentence* _thisptr
    # Buffer of sentences (each a vector of C++ strings).
    # NOTE(review): presumably holds chunks not yet returned -- confirm in .pyx.
    cdef vector[vector[string]] buf_data

    # --- Attributes exposed to Python (`public`) ---------------------------
    cdef public bytes source
    cdef public size_t max_sentence_length
    cdef public size_t max_words_in_batch
    cdef public size_t offset

    # --- Methods -----------------------------------------------------------
    # All methods are `cpdef` (callable from both Python and C) and `nogil`.
    # Those marked `except *` are checked for a raised exception after every
    # call; `is_eof`, `_chunk_sentence` and `reset` declare no exception clause.
    cpdef bool_t is_eof(self) nogil
    cpdef vector[string] read_sentence(self) nogil except *
    cpdef vector[vector[string]] _read_chunked_sentence(self) nogil except *
    cpdef vector[vector[string]] _chunk_sentence(self, vector[string] sent) nogil
    cpdef void reset(self) nogil
    cpdef vector[vector[string]] next_batch(self) nogil except *
# Plain C struct describing one vocabulary entry; it is the value type of the
# cvocab_t map declared below.
# NOTE(review): the field meanings noted here are inferred from the names --
# confirm against the code that fills the struct.
cdef struct VocabItem:
    # presumably the down-sampling threshold for the word
    long long sample_int
    # presumably the word's row index in the embedding matrices
    np.uint32_t index

    # `code` / `point` are raw pointers with a shared length `code_len`
    # (presumably the hierarchical-softmax code and tree path -- TODO confirm).
    np.uint8_t* code
    int code_len
    np.uint32_t* point

    # for FastText
    int subword_idx_len
    np.uint32_t* subword_idx
# C++ hash map keyed by a C++ string, with a VocabItem struct as the value.
ctypedef unordered_map[string, VocabItem] cvocab_t
# Extension type that owns a cvocab_t map (C++ string -> VocabItem).
# Only the layout and the C-level accessor are declared here; the
# implementation is in the accompanying .pyx.
cdef class CythonVocab:
    # The C++ map itself, stored by value inside the object.
    cdef cvocab_t vocab

    # Generic Python object attribute (no C type).
    # NOTE(review): presumably keeps the arrays behind VocabItem.subword_idx
    # alive -- confirm in the .pyx.
    cdef object subword_arrays

    # Returns a pointer to a cvocab_t; callable without the GIL, and checked
    # for a raised exception after every call (`except *`).
    cdef cvocab_t* get_vocab_ptr(self) nogil except *
# C-level helper returning a REAL_t computed from a start value, an end value
# and the current/total epoch counts. Declared `nogil`, so it may be called
# without holding the GIL. Declaration only; the body is in the .pyx.
cdef REAL_t get_alpha(
    REAL_t alpha,
    REAL_t end_alpha,
    int cur_epoch,
    int num_epochs
) nogil
# C-level helper returning a REAL_t computed from the start/end alpha values,
# the running and expected example/word counts, and the current/total epoch
# counts. Declared `nogil`, so it may be called without holding the GIL.
# Declaration only; the body is in the .pyx.
cdef REAL_t get_next_alpha(
    REAL_t start_alpha,
    REAL_t end_alpha,
    long long total_examples,
    long long total_words,
    long long expected_examples,
    long long expected_words,
    int cur_epoch,
    int num_epochs
) nogil