-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
5-split.js
executable file
·106 lines (94 loc) · 2.74 KB
/
5-split.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/local/bin/node
const fs = require('fs');
const path = require('path');
// Both an input path and an output path must follow the script name;
// otherwise print the usage hint and exit with a failure status.
if (process.argv.slice(2).length < 2) {
  console.log('Needs 2 parameters: [input path] [output path]');
  process.exit(1);
}
/**
 * Adds up every value produced by iterating `array`, starting from 0.
 *
 * @param {Iterable<number>} array - Values to add together.
 * @returns {number} The accumulated total (0 when there are no values).
 */
function sum(array) {
  // Spreading walks the iterable the same way a for...of loop would.
  return [...array].reduce((total, value) => total + value, 0);
}
/**
 * Counts the entries of `array` for which `predicate` returns a truthy value.
 *
 * @param {Array} array - Entries to inspect.
 * @param {function(*): *} predicate - Called with each entry (and nothing else).
 * @returns {number} Number of entries the predicate accepted.
 */
function count(array, predicate) {
  // Turn each entry into 1 (accepted) or 0 (rejected), then add the flags up.
  const asFlag = (value) => {
    if (predicate(value)) {
      return 1;
    }
    return 0;
  };
  const flags = array.map(asFlag);
  return sum(flags);
}
/**
 * Turns a path given on the command line into an absolute path.
 *
 * Relative paths are interpreted relative to the directory that contains
 * this script (`__dirname`), as before. Absolute paths are now returned
 * unchanged (normalized) instead of being nested under the script directory,
 * which is what joining them onto `__dirname` used to do.
 *
 * @param {string} relativePath - Relative or absolute file path.
 * @returns {string} The resolved, normalized path.
 */
function getPath(relativePath) {
  // path.resolve restarts from the right-most absolute segment, so an
  // absolute argument wins over __dirname; path.join would concatenate them.
  return path.resolve(__dirname, relativePath);
}
// A very hacky way to find the event separators.
// The marker strings below are specific to Tesseract 4.0 with the default
// ENG training data.
function isSeparator(line) {
  // Markers that flag a separator when they occur anywhere in the line.
  // The trailing comment is the numeric tag each marker was filed under
  // (presumably the scan it was observed in).
  const containedMarkers = [
    'ISOS IO IO', // 0063
    'SUNECREERE EE', // 0063
    'REECE EERE', // 0067
    'TIKI HARRIE', // 0116
    'HERRERA KR', // 0137
    'REIRAIIAA', // 0156
    'HARKER', // 0157
    'KEKE KHER', // 0168
    'MRE MERE', // 0180
    'AISIAR ATI', // 0183
    'KAKA RIKER', // 0183
  ];
  // Markers that only count when the line begins with them.
  const leadingMarkers = [
    'THAKUR', // 0084
    'HREM', // 0117
    'KEARAUKKIUK', // 0138
  ];
  // Markers that must make up the entire line.
  const exactMarkers = [
    'RAKES', // 0160
    'RUKIA', // 0175
  ];

  if (containedMarkers.some((marker) => line.includes(marker))) {
    return true;
  }
  if (leadingMarkers.some((marker) => line.startsWith(marker))) {
    return true;
  }
  if (exactMarkers.some((marker) => line === marker)) {
    return true;
  }

  // Fallback: count the K, E and R characters in the line.
  const kerMatches = line.match(/[KER]/g);
  const kerCount = kerMatches === null ? 0 : kerMatches.length;
  if (kerCount > 15) {
    return true;
  }
  // Lines of 8 characters or fewer never qualify through the ratio rule.
  return line.length > 8 && kerCount / line.length > 0.6;
}
/**
 * Cuts `array` into consecutive groups at every entry accepted by
 * `isSeparator`. Separator entries themselves are dropped.
 *
 * The result always holds one more group than there were separators, so
 * leading, trailing or adjacent separators produce empty groups.
 *
 * @param {Iterable} array - Entries to partition.
 * @param {function(*): *} isSeparator - Returns truthy for entries that end a group.
 * @returns {Array<Array>} The groups, in their original order.
 */
function splitOn(array, isSeparator) {
  const groups = [];
  let openGroup = [];
  for (const entry of array) {
    if (!isSeparator(entry)) {
      openGroup.push(entry);
      continue;
    }
    // A separator closes the group collected so far and starts a fresh one.
    groups.push(openGroup);
    openGroup = [];
  }
  // Whatever followed the last separator (possibly nothing) is the final group.
  groups.push(openGroup);
  return groups;
}
/**
 * Inserts `name` between a file path's base name and its extension,
 * e.g. appendName('out/result.txt', '-1') gives 'out/result-1.txt'.
 *
 * The extension is taken from the last path component only, so a dot inside
 * a directory name (as in '/data/run.v2/out') is never mistaken for the start
 * of the extension. When the file has no extension, which per path.extname
 * includes dotfiles such as '.hidden', `name` is appended at the end.
 *
 * @param {string} filePath - Path of the file to rename.
 * @param {string} name - Text to insert before the extension.
 * @returns {string} The path with `name` inserted.
 */
function appendName(filePath, name) {
  const extension = path.extname(filePath);
  // The extension, if any, is the last occurrence of that text in the path;
  // only trailing path separators can follow it.
  const index = extension === ''
    ? filePath.length
    : filePath.lastIndexOf(extension);
  return filePath.slice(0, index) + name + filePath.slice(index);
}
// The two command-line arguments: the text file to read and the path the
// numbered output files are derived from.
const [inputPath, outputPath] = process.argv.slice(2);
const content = fs.readFileSync(getPath(inputPath), { encoding: 'utf8' });
// Split on LF or CRLF. With a plain '\n' split, CRLF input leaves a trailing
// '\r' on every line, which stops the exact-match separator checks in
// isSeparator from ever matching.
const lines = content.split(/\r?\n/);

// Log every detected separator as "<input path>:<zero-based line index> <line>"
// so the split points can be checked by eye.
lines.forEach((line, index) => {
  if (isSeparator(line)) {
    console.log(`${inputPath}:${index} ${line}`);
  }
});

// Write each run of lines between separators to its own file, numbered from 1,
// with blank and whitespace-only lines removed.
splitOn(lines, isSeparator).forEach((chunk, chunkIndex) => {
  const fileContent = chunk
    .filter((line) => !/^\s*$/.test(line))
    .join('\n');
  const newPath = appendName(getPath(outputPath), `-${chunkIndex + 1}`);
  fs.writeFileSync(newPath, fileContent);
});