-
Notifications
You must be signed in to change notification settings - Fork 1
/
3_k_on_trial.R
272 lines (215 loc) · 12.1 KB
/
3_k_on_trial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#"Someone must have slandered K, for one morning, without having done anything wrong, he was arrested."
#clean slate...:
rm(list = ls())
#library(rstudioapi)
#setwd(dirname(rstudioapi::callFun("getActiveDocumentContext")$path))
#The only thing we need really, for both lda tuning and perplexity calculations is our dtm, "dtmKW".
dtmKW <- readRDS("dtmKW.rds")
#The single most important parameter of the Gibbs sampler is K, the number of topics.
#Previously, we've used K=24, informed by the no of predefined categories in the samples.
#Another straight forward approach is to rely on our knowledge interest:
#A quick overview of a corpus requires perhaps 10 topics, something more refined 25,
#and a really detailed topical inspection perhaps 100 topics.
#Though robust, these approaches are still somewhat unsatisfying.
#Let's try to be a bit data driven.
#The general approach is to fixate the sampler params EXCEPT K,
#(alpha, beta, etc.) and then manipulate K to get the best model.
#We'll look into 3 methods to do it, and since doing this is computationally expensive,
#we'll do it in parallel.
#N0, not in the cloud. But the necessary code for parallellization is included, nonetheless.
#Once you've started the iterations on a fairly large dataset (>50')
#to find K for a good model, it's a good time have a cup of coffee.
#If that coffee is served in a harbor café in
#Murmansk, and you take your rowing boat to get there.
###############################################################################
###############################################################################
#Method 1. Density-based method
#From:
#Cao Juan, Xia Tian, Li Jintao, Zhang Yongdong, and Tang Sheng. 2009. A density-based method for adaptive lDA model selection.
#Neurocomputing - 16th European Symposium on Artificial Neural Networks 2008 72, 7-9: 1775-1781.
#"Clustering-like approach: ..."that the similarity will be as large as possible in the intra-
#cluster, but as small as possible between inter-clusters."
#Transfer this idea to topics, maximizing (density-based) similarity intra-topics
#and maximizing difference inter-topics.
###############################################################################
#Metod 2. Harmonic-mean of the log likelihood
#From (among others):
#Thomas L. Griffiths and Mark Steyvers. 2004. Finding scientific topics.
#Proceedings of the National Academy of Sciences 101, suppl 1: 5228-5235.
#Maximizing the likelihood of the observed data by changing the number of topics.
#(I.e. maximizing the probability that the model gives to the observed data,
#i.e. the likelihood P(words|no. of topics))
#This (log-)likelihood is estimated with harmonic means, using the Gibbs sampler.
###############################################################################
#From the package 'ldatuning', we get the result of method 1 and 2 normalized to a 0-1 scale,
#where we will just pick the model with a minimum from method 1, and maximum from method 2.
###############################################################################
###############################################################################
candidate_k <- c(2, 3, 2:60 * 2, 7:10 * 20) # a proper sampling of models with different no of K.
candidate_k
candidate_k <- c(2, 24, 60, 120) #for the purpose of this lab. Higher numbers add A LOT OF TIME.
controlGibbs <- list(alpha = 50/candidate_k, #Here we need alpha to be dynamic and vary with the size of K
seed = 5683, #hrm?
burnin = 200,
iter = 500,
delta = 0.1) #with a more dense dtm (resulting from our linguistic pre-processing),
# topics will be very specific,
# almost document-specific (especially for a small corpus).
# We can counter this effect by increasing alpha and/or delta (i.e. beta) slightly.
# Are we then rigging the below tests? In some sense, yes.
# i.e. what we "objectively" find as the appropriate K (no of topics)
# is partly the result of how we tweak delta.
#library(parallel) #If you run this locally, un-comment this and make the suggetsted changes below at "mc.cores"
library(ldatuning)
result <- FindTopicsNumber(
dtmKW,
topics = candidate_k,
metrics = c("Griffiths2004", "CaoJuan2009"),
method = "Gibbs",
control = controlGibbs,
mc.cores = 1L, #since this is in the cloud we cannot not use what we would otherwise have preferred: "detectCores()-1"
verbose = TRUE
)
FindTopicsNumber_plot(result)
#If you use to many cores on large models (say, many models and >300 topics) you can run out of RAM and then R/RStudio
#will abort your session. Solution: use fewer cores but more time. Or get more RAM.
#"But I'm not guilty," said K. "there's been a mistake.
#How is it even possible for someone to be guilty?
#We're all human beings here, one like the other."
#"That is true" said the priest "but that is how the guilty speak."
###############################################################################
###############################################################################
#Metod 3. Perplexity
#From (among others):
#Martin Ponweiser. 2012. Latent dirichlet allocation in r.
#Divide the corpus, build models with different number of topics on a training-set,
#see how well the models predict the held-out set. If the models does not predict well,
#it gets "perplexed" (who wouldn't be?) in terms of high entropy/low redundancy:
#the held-out dataset contains information we can't predict well with our model.
#Hence, high perplexity -> poor model. Low perplexity -> good model.
#The levels of perplexity are essentially
#corpus- but not model-specific.
#Find the model with the number of topics
#that have the lowest perplexity.
###############################################################################
###############################################################################
#Below script is ever so slightly adjusted from..:
#https://www.r-bloggers.com/cross-validation-of-topic-modelling/.
#Hats of to the r-blogger "Peter's stats stuff - R".
library(chinese.misc)
dtmKW <- m3m(dtmKW, "dtm") #The below function needs a 2d dtm...
#Ignore the warnings
library(doParallel)
#ForEach is a different way to parallelize, since it does not split the data, but the processes.
alpha = 50/candidate_k #Here we need alpha to be dynamic and vary with the size of K
burnin = 200
iter = 500
keep = 50
delta = 0.1 #We need these stated individually, to call them inside the below function...
n <- nrow(dtmKW) #We need this for splitting the sample
#cluster <- makeCluster(detectCores(logical = TRUE) - 1) #This is what you want
#if you run this on a machine where you are in control over the CPU:s. For now, use:
cluster <- makeCluster(1L)
registerDoParallel(cluster)
clusterEvalQ(cluster, {
library(topicmodels)
})
folds <- 3 #For the overachievers: =>5.
splitfolds <- sample(1:5, n, replace = TRUE) #1:5 will result in (roughly) a 80/20 split.
#to inspect train and validation data-set
#for(i in 1:folds){
# train_set <- dtmKW[splitfolds != i , ]
# valid_set <- dtmKW[splitfolds == i, ]} #i.e., you do not really need these two objects.
clusterExport(cluster, c("dtmKW", "burnin", "iter", "keep", "delta", "splitfolds", "folds", "candidate_k"))
#These are 'uploaded' to the clusters to be included in the calculations
#Below, parallelization below by the different number of topics: a processor is allocated a value
#of k, and does the cross-validation serially. Why (and why not over folds)?
#Because it is assumed there are more candidate values of k than there are cross-validation folds (k>folds),
system.time({
results <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
k <- candidate_k[j]
results_1k <- matrix(0, nrow = folds, ncol = 2)
colnames(results_1k) <- c("k", "perplexity")
for(i in 1:folds){
train_set <- dtmKW[splitfolds != i , ]
valid_set <- dtmKW[splitfolds == i, ]
fitted <- LDA(train_set, k = k, method = "Gibbs",
control = list(alpha = k/candidate_k, burnin = burnin, iter = iter, keep = keep, delta = delta) )
results_1k[i,] <- c(k, perplexity(fitted, newdata = valid_set))
}
return(results_1k)
}
})
stopCluster(cluster)
results_perplexity <- as.data.frame(results)
library(ggplot2)
library(scales)
p <- ggplot(results_perplexity, aes(x = k, y = perplexity)) +
geom_point(pch = 21, size = 2, fill = I("orange")) +
# geom_line(color=c("#753633"),size=0.5) +
ggtitle("3-fold cross-validation of LDA-model with Gibbs sampler on the 'jokes' dataset",
"Perplexity when fitting the trained model to the hold-out set.") +
labs(x = "Candidate number of topics", y = "Perplexity when fitting the trained model to the hold-out set")
p
#make it interactive..:
#library(plotly)
#ggplotly(p)
###############################################################################
###############################################################################
#The above methods are global (valid for the model as a whole).
#You could/should also have a look at coherence measures that are both global
#and local (valid for single topics),
#readily applicable using the package 'textminer'.
###############################################################################
###############################################################################
#"No," said the priest, "you don't need to accept everything as true, you only have to accept it as necessary."
#"Depressing view," said K. "The lie made into the rule of the world."
#Let's find K for the whole 10K data set of jokes, and for many more models..
result_k <- readRDS("result_k.rds") #pre-tested...
FindTopicsNumber_plot(result_k)
# The x axis tick marks (also called "breaks") in the plot above are not readable.
# Let us adjust the plotting function that we used. In order to get the code
# of a function from a package that we have loaded, one can write and execute
# ?function_name. In our case ?FindTopicsNumber_plot. The only bit we adjust is:
# scale_x_continuous(breaks = seq(min(values$topics), max(values$topics), 4))
FindTopicsNumber_plot <- function(values) {
if ("LDA_model" %in% names(values)) {
values <- values[!names(values) %in% c("LDA_model")]
}
columns <- base::subset(values, select = 2:ncol(values))
values <- base::data.frame(values["topics"], base::apply(columns,
2, function(column) {
scales::rescale(column, to = c(0, 1), from = range(column))
}))
values <- reshape2::melt(values, id.vars = "topics", na.rm = TRUE)
values$group <- values$variable %in% c("Griffiths2004", "Deveaud2014")
values$group <- base::factor(values$group, levels = c(FALSE,
TRUE), labels = c("minimize", "maximize"))
values$topics <- as.numeric(values$topics)
p <- ggplot(values, aes_string(x = "topics", y = "value",
group = "variable"))
p <- p + geom_line()
p <- p + geom_point(aes_string(shape = "variable"), size = 3)
p <- p + guides(size = FALSE, shape = guide_legend(title = "metrics:"))
p <- p + scale_x_continuous(breaks = seq(min(values$topics), max(values$topics), 4))
p <- p + labs(x = "number of topics", y = NULL)
p <- p + facet_grid(group ~ .)
p <- p + theme_bw() %+replace% theme(panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(), panel.grid.major.x = element_line(colour = "grey70"),
panel.grid.minor.x = element_blank(), legend.key = element_blank(),
strip.text.y = element_text(angle = 90))
g <- ggplotGrob(p)
g$layout[g$layout$name == "strip-right", c("l", "r")] <- 3
grid::grid.newpage()
grid::grid.draw(g)
}
FindTopicsNumber_plot(result_k)
# compute ranks
result_k$rankG <- rank(-result_k$Griffiths2004, ties.method="min")
result_k$rankCJ <- rank(result_k$CaoJuan2009,ties.method="min")
# subset on the rank
subset(result_k,rankG==1)
subset(result_k,rankCJ==1)
#NB! Indexations is off in LDAVis app. Set lambda to 1 and compare words to match it the other app...
GMY <- "MYA"
GMY