/
model.R
253 lines (243 loc) · 10.2 KB
/
model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#' Represents n-gram models
#'
#' @description
#' The Model class represents n-gram models. An instance of the class is a
#' single n-gram model. The attributes of this class are used to store n-gram
#' model information. The class provides methods for loading the model data,
#' reading its configuration and measuring its size.
#'
#' @details
#' The attributes of this class are used to store n-gram model information such
#' as model name, model description, model file name, n-gram size, transition
#' probabilities data, default probability for words, data cleaning and
#' tokenization options, word list, model path, data directory path and
#' performance stats. The model is saved to a single file as an R object.
#'
#' A model file contains all the information required by the model. The model
#' object is used as input by classes that perform operations on the model such
#' as evaluation of model performance, text predictions and comparison of model
#' performance.
Model <- R6::R6Class(
  "Model",
  inherit = Base,
  public = list(
    #' @field pstats The performance stats for the model.
    pstats = list(),
    #' @field name The model name.
    name = NULL,
    #' @field desc The model description.
    desc = NULL,
    #' @description
    #' It initializes the current object. It validates the given model file
    #' name, model directory and input data file, and then stores the model
    #' name, description, maximum n-gram number, sample size, directory,
    #' input file name, model file name, data cleaner options and token
    #' generator options.
    #' @param name The model name.
    #' @param desc The model description.
    #' @param fn The model file name.
    #' @param df The name of the file used to generate the model.
    #' @param n The maximum n-gram number supported by the model.
    #' @param ssize The sample size as a proportion of the input file.
    #' @param dir The directory containing the model files.
    #' @param dc_opts The data cleaner options.
    #' @param tg_opts The token generator options.
    #' @param ve The level of detail in the information messages.
    #' @export
    initialize = function(name = NULL,
                          desc = NULL,
                          fn = NULL,
                          df = NULL,
                          n = 4,
                          ssize = 0.3,
                          dir = ".",
                          dc_opts = list(),
                          tg_opts = list(),
                          ve = 0) {
      # The base class is initialized
      super$initialize(NULL, NULL, ve)
      # If the output file name is not given
      if (is.null(fn)) {
        # Error message is shown
        private$dm("Output file name was not given", md = -1, ty = "e")
      }
      # The directory is validated before the data file. A file inside a
      # missing directory can never exist, so checking the file first would
      # hide the more precise "directory does not exist" error.
      if (!dir.exists(dir)) {
        private$dm(
          "The dir: ", dir, " does not exist !",
          md = -1, ty = "e"
        )
      }
      # The path to the data file
      dfp <- paste0(dir, "/", df)
      # If the data file does not exist, then an error is thrown
      if (!file.exists(dfp)) {
        # Error message is shown
        private$dm("Invalid input file: ", dfp, md = -1, ty = "e")
      }
      # An object of class EnvManager is created
      em <- EnvManager$new(ve)
      # The dict words file is resolved. "dict-no-bad.txt" is passed as the
      # second argument to get_data_fn.
      # NOTE(review): presumably the fallback file name used when no
      # dict_file option was given; confirm against EnvManager.
      dc_opts[["dict_file"]] <- em$get_data_fn(
        dc_opts[["dict_file"]], "dict-no-bad.txt"
      )
      # The model name is set
      self$name <- name
      # The model description is set
      self$desc <- desc
      # The n-gram number is set
      private$n <- n
      # The sample size is set
      private$ssize <- ssize
      # The directory name is set
      private$dir <- dir
      # The input file name is set
      private$df <- df
      # The word list file name is set
      private$wlf <- paste0(dir, "/words.RDS")
      # The model file name is set
      private$fn <- fn
      # If the dc_opts are given
      if (length(dc_opts) > 0) {
        # The custom dc_opts are merged with the default dc_opts
        private$dc_opts <- modifyList(private$dc_opts, dc_opts)
      }
      # If the tg_opts are given
      if (length(tg_opts) > 0) {
        # The custom tg_opts are merged with the default tg_opts
        private$tg_opts <- modifyList(private$tg_opts, tg_opts)
      }
    },
    #' @description
    #' It loads the model data. The transition probabilities are read from
    #' the file "model-<n>.RDS" in the model directory and the word list is
    #' read from the word list file. The default probability is then
    #' calculated as 1 / (N + V), where N is the number of space separated
    #' tokens in the input data file and V is the number of entries read
    #' from the dictionary file.
    load_model = function() {
      # The tp file name. NOTE(review): the file name is derived from the
      # directory and n-gram number; private$fn is not used here.
      fn <- paste0(private$dir, "/model-", private$n, ".RDS")
      # The tp file is read
      private$tp <- private$read_obj(fn)
      # The wl file is read
      private$wl <- private$read_obj(private$wlf)
      # The dictionary file name
      fn <- private$dc_opts[["dict_file"]]
      # The file contents
      dict <- private$read_file(fn, FALSE)
      # The information message is shown
      private$dh("Calculating default probability", "-", md = 1)
      # The number of words in the dictionary file. It is used to
      # calculate Perplexity.
      vc <- length(dict)
      # The path to the input data file
      dfp <- paste0(private$dir, "/", private$df)
      # The data file is read
      data <- private$read_file(dfp, FALSE)
      # The words are split on " " and flattened to a single vector
      w <- unlist(strsplit(data, " "))
      # The number of words
      n <- length(w)
      # The default probability is set
      private$dp <- 1 / (n + vc)
      # The information message is shown
      private$dh("DONE", "=", md = 1)
    },
    #' @description
    #' It returns the given configuration data. The value is looked up by
    #' name among the private members of the object.
    #' @param cn The name of the required configuration.
    #' @return The configuration value.
    get_config = function(cn) {
      # The required configuration value
      private[[cn]]
    },
    #' @description
    #' It returns the size of the current object. The object
    #' size is calculated as the sum of sizes of the transition
    #' probabilities, word list, data cleaner options, token generator
    #' options and performance stats.
    #' @return The size of the object in bytes.
    get_size = function() {
      # The attributes that contribute to the object size
      parts <- list(
        private$tp,
        private$wl,
        private$dc_opts,
        private$tg_opts,
        self$pstats
      )
      # The size of each attribute in bytes
      sizes <- vapply(
        parts,
        function(p) as.numeric(object.size(p)),
        numeric(1)
      )
      # The required object size
      sum(sizes)
    }
  ),
  private = list(
    # @field fn The path to the model file.
    fn = NULL,
    # @field wlf The path to the word list file.
    wlf = NULL,
    # @field df The short name of the input file.
    df = NULL,
    # @field tp The transition probabilities data frame.
    tp = NULL,
    # @field wl The list of unique words.
    wl = NULL,
    # @field dp The default probability is equal to 1/(N+V), where N is the
    #   number of space separated tokens in the input data file and V is the
    #   number of entries read from the dictionary file.
    dp = NULL,
    # @field n The maximum number of n-grams supported by the model.
    n = 4,
    # @field dc_opts The options for the data cleaner object.
    # * **min_words**. The minimum number of words per sentence.
    # * **line_count**. The number of lines to read and clean at a time.
    # * **sw_file**. The stop words file path.
    # * **dict_file**. The dictionary file path.
    # * **bad_file**. The bad words file path.
    # * **to_lower**. If the words should be converted to lower case.
    # * **remove_stop**. If stop words should be removed.
    # * **remove_punc**. If punctuation symbols should be removed.
    #   NOTE(review): this key was previously documented as "remove_punct";
    #   confirm which spelling the data cleaner actually reads.
    # * **remove_non_dict**. If non dictionary words should be removed.
    # * **remove_non_alpha**. If non alphabet symbols should be removed.
    # * **remove_extra_space**. If leading, trailing and double spaces
    #     should be removed.
    # * **remove_bad**. If bad words should be removed
    dc_opts = list(
      "min_words" = 2,
      "line_count" = 1000,
      "sw_file" = NULL,
      "dict_file" = NULL,
      "bad_file" = NULL,
      "to_lower" = TRUE,
      "remove_stop" = FALSE,
      "remove_punc" = TRUE,
      "remove_non_dict" = TRUE,
      "remove_non_alpha" = TRUE,
      "remove_extra_space" = TRUE,
      "remove_bad" = FALSE
    ),
    # @field tg_opts The options for the token generator obj.
    # * **n**. The n-gram size.
    # * **save_ngrams**. If the n-gram data should be saved.
    # * **min_freq**. All n-grams with frequency less than min_freq are
    #     ignored.
    # * **line_count**. The number of lines to process at a time.
    # * **stem_words**. If words should be converted to their stem.
    # * **dir**. The dir where the output file should be saved.
    # * **format**. The format for the output. There are two options.
    #     ** **plain**. The data is stored in plain text.
    #     ** **obj**. The data is stored as a R obj.
    # Each key appears exactly once: name based access and modifyList only
    # update the first match, so a duplicated key would leave a stale entry.
    tg_opts = list(
      "n" = 1,
      "save_ngrams" = TRUE,
      "min_freq" = -1,
      "line_count" = 5000,
      "stem_words" = FALSE,
      "dir" = "./data/models",
      "format" = "obj"
    ),
    # @field ssize The sample size as a proportion of the input file.
    ssize = 0.3,
    # @field dir The folder containing the model related files.
    dir = "./data"
  )
)