-
Notifications
You must be signed in to change notification settings - Fork 1
/
LDA-Code.txt
85 lines (67 loc) · 3.01 KB
/
LDA-Code.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
% LDA code
%
% Load the training data (usually 80% of the corpus) so that it appears as a
% table in the MATLAB workspace. The text must be in a column named 'text'.
head(Automotive2) % preview the data; change the variable name as required

% Preprocessing function.
% The listing below must be saved in its own file, preprocessed.m, on the
% MATLAB path. It is kept here inside a real block comment (%{ ... %}) for
% reference only: a leading '%' comments out a single line, so without the
% block comment the function body would execute as part of this script and
% the trailing 'end' would be a parse error.
%{
function [X]=preprocessed(textdata)
cleandata=erasePunctuation(textdata);        % strip punctuation
cleandata=lower(cleandata);                  % convert to lowercase
cleandocs=tokenizedDocument(cleandata);      % split the text into tokens
removwords=removeWords(cleandocs,stopWords); % drop stop words
removwords=removeShortWords(removwords,2);   % drop words of 2 characters or fewer
removwords=removeLongWords(removwords,15);   % drop words of 15 characters or more
normal=normalizeWords(removwords);           % reduce words to a root form
X=normal;
end
%}

% Extract the raw text column and preprocess it into tokenized documents.
textdata=Automotive2.text;
documents=preprocessed(textdata);
% Hold out 10% of the preprocessed documents for validation and train on the
% remaining 90%. This is a single hold-out split (not k-fold cross-validation);
% change the hold-out fraction as required.
numDocuments = numel(documents);
cvp = cvpartition(numDocuments,'HoldOut',0.1);
documentsTrain = documents(cvp.training);
documentsValidation = documents(cvp.test);

% fitlda takes a bag-of-words model (per-document word counts) as input.
bag=bagOfWords(documentsTrain);
bag=removeEmptyDocuments(bag);

% Optional vocabulary pruning (delete these two statements if not required):
% drop every word whose total count is at or below the average count per
% vocabulary word. bag.Counts is sparse, so full() yields an ordinary double.
% Word counts are integers, so flooring the average gives the same cutoff
% while passing the integer threshold that removeInfrequentWords documents.
aof=full(sum(bag.Counts(:)))/bag.NumWords;
bag=removeInfrequentWords(bag,floor(aof));

% Candidate numbers of topics to compare; change as required.
numTopicsRange = [5 10 15 20 40];

% Initialise the result vectors to the size of the sweep. Without this, values
% left in the workspace by an earlier, longer sweep would survive and make the
% plot calls below fail with mismatched vector lengths.
validationPerplexity = zeros(size(numTopicsRange));
timeElapsed = zeros(size(numTopicsRange));

for i = 1:numel(numTopicsRange)
    numTopics = numTopicsRange(i);

    % Fit one LDA model per candidate topic count.
    mdl = fitlda(bag,numTopics, ...
        'Solver','savb', ...
        'DataPassLimit',10, ...
        'Verbose',0);

    % Record the perplexity on the held-out documents and the fit time.
    [~,validationPerplexity(i)] = logp(mdl,documentsValidation);
    timeElapsed(i) = mdl.FitInfo.History.TimeSinceStart(end);
end

% Plot validation perplexity (left axis) and fit time (right axis) against the
% number of topics.
figure
yyaxis left
plot(numTopicsRange,validationPerplexity,'+-')
ylabel("Validation Perplexity")
yyaxis right
plot(numTopicsRange,timeElapsed,'o-')
ylabel("Time Elapsed (s)")
legend(["Validation Perplexity" "Time Elapsed (s)"],'Location','southeast')
xlabel("Number of Topics")
% Choose the number of topics as required, then fit the final model.
%
% Load the test data as a table with a 'text' column, put its preprocessed
% text in the documents variable, and build a bag-of-words model on it again.
% No trailing semicolon on head: a semicolon would suppress the preview.
head(Automotive1) % preview the data; change the variable name as required

% Preprocessing function.
% The listing below must be saved in its own file, preprocessed.m, on the
% MATLAB path. It is kept here inside a real block comment (%{ ... %}) for
% reference only: a leading '%' comments out a single line, so without the
% block comment the function body would execute as part of this script and
% the trailing 'end' would be a parse error.
%{
function [X]=preprocessed(textdata)
cleandata=erasePunctuation(textdata);        % strip punctuation
cleandata=lower(cleandata);                  % convert to lowercase
cleandocs=tokenizedDocument(cleandata);      % split the text into tokens
removwords=removeWords(cleandocs,stopWords); % drop stop words
removwords=removeShortWords(removwords,2);   % drop words of 2 characters or fewer
removwords=removeLongWords(removwords,15);   % drop words of 15 characters or more
normal=normalizeWords(removwords);           % reduce words to a root form
X=normal;
end
%}

% Extract the raw text column and preprocess it into tokenized documents.
textdata=Automotive1.text;
documents=preprocessed(textdata);

% Build the bag-of-words model. Documents left with no words after
% preprocessing are dropped, matching how the training bag is prepared.
bag=bagOfWords(documents);
bag=removeEmptyDocuments(bag);

% Fit the LDA model with the chosen number of topics.
numTopics = 10; % change as required
mdl = fitlda(bag,numTopics);

% Visualise one topic as a word cloud, with words sized by their probability
% within that topic. The index is capped at numTopics so that lowering
% numTopics below 10 does not request a topic that does not exist.
topicIdx = min(10,numTopics); % topic to display; change as required
figure
wordcloud(mdl,topicIdx)