-
Notifications
You must be signed in to change notification settings - Fork 0
/
NGparser.m
58 lines (51 loc) · 1.47 KB
/
NGparser.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
function graphemes = NGparser(words,weighting)
%% Naive Grapheme Parser
% Author: Oscar Woolnough (owoolnough.github.io)
% Version 1.0 (28 July 2023)
%
% Splits each word into graphemes by greedy left-to-right matching against
% the list of multi-letter graphemes stored in GG_prob.mat. At every
% position the longest listed grapheme that the remaining letters start
% with is taken (equal lengths are ranked by the higher weighting score);
% when no listed grapheme matches, the next single letter is emitted.
%
% Inputs:   words     - nx1 string array of words (e.g. "DICTIONARY").
%                       Words are upper-cased before matching.
%           weighting - GP weighting to use {'none' (default) 'freq'} (optional)
%                       An empty value selects the default.
%
% Outputs:  graphemes - nx1 cell array. graphemes{k} is a 1xm cell array of
%                       char vectors holding the graphemes of words(k), in
%                       order (a 1x0 cell for an empty word).
%
% Errors:   NGparser:invalidWeighting if weighting is not 'none' or 'freq'.
%
% Required files: GG_prob.mat (variables dubgraphlist, GGprobS, GGfreqS)

%% Check inputs
assert(isstring(words),'Inputs must be string arrays')
if nargin < 2 || isempty(weighting)
    weighting = 'none';
end

%% Load Grapheme-Phoneme Correspondence Table
% Pick the score variable that matches the requested weighting.
switch weighting
    case 'none'
        scoreName = 'GGprobS';
    case 'freq'
        scoreName = 'GGfreqS';
    otherwise
        error('NGparser:invalidWeighting', ...
            'weighting must be ''none'' or ''freq''');
end
% Load into a struct so no variables appear implicitly in this workspace.
tbl = load('GG_prob.mat','dubgraphlist',scoreName);
dubgraphlist = tbl.dubgraphlist;
GG = tbl.(scoreName);

% Discard graphemes whose score is below 0.5.
lowScore = GG < 0.5;
dubgraphlist(lowScore) = [];
GG(lowScore) = [];

% Order candidates longest first, then by descending score, so that the
% first matching entry is always the preferred one. (:) makes the sort key
% independent of whether the stored variables are rows or columns.
[~,order] = sortrows([strlength(dubgraphlist(:)), GG(:)],'descend');
dubgraphlist = dubgraphlist(order);

% Zero-length entries can never consume input, so they are discarded.
dubgraphlist(strlength(dubgraphlist) == 0) = [];

%% Parse each word
nWords = numel(words);
nCandidates = numel(dubgraphlist);
graphemes = cell(nWords,1);
for ii = 1:nWords
    word = upper(words{ii});
    parts = cell(1,0);
    while ~isempty(word)
        % Fall back to the single next letter unless a listed grapheme
        % matches the start of what is left of the word.
        piece = word(1);
        for jj = 1:nCandidates
            if startsWith(word,dubgraphlist{jj})
                piece = dubgraphlist{jj};
                break   % list is priority-ordered: first hit wins
            end
        end
        parts{end+1} = piece; %#ok<AGROW>
        word(1:numel(piece)) = [];
    end
    graphemes{ii} = parts;
end