-
Notifications
You must be signed in to change notification settings - Fork 34
/
index.js
143 lines (125 loc) · 4.75 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import formatSeconds from './compose-subtitles/util/format-seconds.js';
import textSegmentation from './presegment-text/text-segmentation/index.js';
import addLineBreakBetweenSentences from './presegment-text/line-break-between-sentences/index.js';
import foldWords from './presegment-text/fold/index.js';
import divideIntoTwoLines from './presegment-text/divide-into-two-lines/index.js';
import preSegmentText from './presegment-text/index.js';
import { getTextFromWordsList } from './presegment-text/index.js';
import ttmlGeneratorPremiere from './compose-subtitles/premiere.js';
import ittGenerator from './compose-subtitles/itt.js';
import ttmlGenerator from './compose-subtitles/ttml.js';
import srtGenerator from './compose-subtitles/srt.js';
import vttGenerator from './compose-subtitles/vtt.js';
import csvGenerator from './compose-subtitles/csv.js';
import countWords from '../../count-words';
/**
 * Splits pre-segmented text into an array of segments.
 *
 * Segments are delimited by a blank line (two consecutive newlines); each
 * resulting segment has its surrounding whitespace removed.
 *
 * @param {string} text - Segmented text using '\n\n' between segments.
 * @returns {string[]} The trimmed segments, in their original order.
 */
function segmentedTextToList(text) {
  return text.split('\n\n').map((segment) => segment.trim());
}
/**
 * Attaches start/end timecodes and a speaker label to each segmented line.
 *
 * Words are consumed from `wordsList` in order: every line takes as many words
 * as `countWords` reports for it. The line starts at the `start` of its first
 * word and ends at the `end` of its last word. Words whose text is empty are
 * ignored, and so are blank lines (they hold no words, so they must neither
 * produce a cue nor advance the word counters).
 *
 * @param {Array<{text: string, start: number, end: number}>} wordsList - Timed words.
 * @param {Array<{start: number, end: number, speaker: string}>} paragraphs - Timed
 *   paragraphs used to look up the speaker of each line.
 * @param {string[]} lines - Segmented lines of text, in transcript order.
 * @returns {Array<{text: string, start: number, end: number, speaker: string}>}
 *   One entry per non-blank line; `speaker` is 'UNKNOWN' when no paragraph
 *   contains the start of the line.
 */
function addTimecodesToLines(wordsList, paragraphs, lines) {
  // Work on a filtered copy rather than reassigning the caller's parameter.
  const timedWords = wordsList.filter((w) => w.text.length > 0);
  let startWordCounter = 0;
  let endWordCounter = 0;
  const results = [];

  for (const line of lines) {
    const text = line.trim();
    // A blank line (e.g. produced by a leading or doubled separator) has no
    // words to align; counting it would shift or break every lookup below.
    if (text.length === 0) {
      continue;
    }

    endWordCounter += countWords(line);
    const start = timedWords[startWordCounter].start;
    // TODO: there's an issue here and `vtt_speakers_paragraphs` export is broken
    const end = timedWords[endWordCounter - 1].end;
    const totalDuration = end - start;

    // Among the paragraphs containing the start of the line, prefer the one
    // that covers the largest share of the line's duration.
    const [bestParagraph] = paragraphs
      .filter((p) => start >= p.start && start < p.end)
      .map((p) => ({
        ...p,
        pctInParagraph: (Math.min(end, p.end) - start) / totalDuration,
      }))
      // Sort by share in paragraph descending, then by start time ascending.
      .sort((a, b) => b.pctInParagraph - a.pctInParagraph || a.start - b.start);

    results.push({
      text,
      start,
      end,
      speaker: bestParagraph ? bestParagraph.speaker : 'UNKNOWN',
    });
    startWordCounter = endWordCounter;
  }

  return results;
}
/**
 * Joins words into a single string, inserting a blank-line separator ('\n\n')
 * wherever the paragraph containing the current word changes.
 *
 * A word belongs to the earliest-starting paragraph whose [start, end] range
 * contains the word's start time. Words that fall outside every paragraph stay
 * in the segment that is currently open instead of raising an error. Words with
 * empty text are skipped so that the word count of the output matches the
 * non-empty words of the list.
 *
 * @param {Array<{text: string, start: number}>} wordList - Timed words in order.
 * @param {Array<{id: *, start: number, end: number}>} paragraphs - Timed paragraphs;
 *   this array is not modified.
 * @returns {string} Words joined by single spaces, with '\n\n' between paragraphs
 *   and never before the first word.
 */
function segmentTextByParagraph(wordList, paragraphs) {
  // Sort a copy: Array.prototype.sort works in place and would reorder the
  // caller's paragraphs.
  const sortedParagraphs = [...paragraphs].sort((a, b) => a.start - b.start);
  const parts = [];
  let currentId;
  let seenParagraph = false;

  for (const { text, start } of wordList) {
    if (text.length === 0) {
      continue;
    }

    const foundParagraph = sortedParagraphs.find((p) => p.start <= start && p.end >= start);
    const isNewParagraph =
      foundParagraph !== undefined && (!seenParagraph || foundParagraph.id !== currentId);

    if (isNewParagraph) {
      // Only separate from text that already exists; a separator before the
      // first word would create an empty leading segment.
      if (parts.length > 0) {
        parts.push('\n\n');
      }
      currentId = foundParagraph.id;
      seenParagraph = true;
    }

    parts.push(text);
  }

  return parts.join(' ');
}
/**
 * Segments a list of timed words into lines and returns those lines with
 * timecodes and speakers attached.
 *
 * @param {Array} wordsList - Timed words to segment.
 * @param {Array} paragraphs - Timed paragraphs, used for paragraph segmentation
 *   and for the speaker lookup.
 * @param {number} numberOfCharPerLine - Line length passed to `preSegmentText`;
 *   unused in paragraph mode.
 * @param {boolean} [paragraphMode=false] - When truthy, break the text at
 *   paragraph boundaries instead of using `preSegmentText`.
 * @returns {Array} The result of `addTimecodesToLines` for the segmented lines.
 */
function preSegmentTextJson(wordsList, paragraphs, numberOfCharPerLine, paragraphMode = false) {
  const segmentedText = paragraphMode
    ? segmentTextByParagraph(wordsList, paragraphs)
    : preSegmentText(wordsList, numberOfCharPerLine);

  return addTimecodesToLines(wordsList, paragraphs, segmentedTextToList(segmentedText));
}
/**
 * Composes subtitles (or segmented text) in the requested format.
 *
 * When `words` is a plain string there are no timecodes to work with, so the
 * text is only segmented and returned, whatever `type` is. This check must run
 * before any timecode alignment, which requires a list of timed words.
 *
 * @param {Object} options
 * @param {Array|string} options.words - Timed words, or plain text.
 * @param {Array} options.paragraphs - Timed paragraphs used for speaker lookup.
 * @param {string} options.type - One of 'premiereTTML', 'ttml', 'itt', 'srt',
 *   'vtt', 'vtt_speakers', 'vtt_speakers_paragraphs', 'json', 'csv',
 *   'pre-segment-txt' or 'txt'.
 * @param {number} options.numberOfCharPerLine - Line length used for segmentation.
 * @returns {string|Array} The output of the matching generator; for 'json' the
 *   list of lines with formatted timestamps; or the string
 *   'Could not find the subtitle format' for an unknown `type`.
 */
function subtitlesComposer({ words, paragraphs, type, numberOfCharPerLine }) {
  if (typeof words === 'string') {
    return preSegmentText(words, numberOfCharPerLine);
  }

  // Built lazily: text-only and unknown formats never need timed lines, so they
  // should not pay for (or fail on) the timecode alignment.
  const buildSubtitlesJson = () =>
    preSegmentTextJson(words, paragraphs, numberOfCharPerLine, type === 'vtt_speakers_paragraphs');

  switch (type) {
    case 'premiereTTML':
      return ttmlGeneratorPremiere(buildSubtitlesJson());
    case 'ttml':
      return ttmlGenerator(buildSubtitlesJson());
    case 'itt':
      return ittGenerator(buildSubtitlesJson());
    case 'srt':
      return srtGenerator(buildSubtitlesJson());
    case 'vtt':
      return vttGenerator(buildSubtitlesJson());
    case 'vtt_speakers':
    case 'vtt_speakers_paragraphs':
      return vttGenerator(buildSubtitlesJson(), true);
    case 'json':
      // Convert timecodes to caption timestamps, with ',' as decimal separator.
      return buildSubtitlesJson().map((line) => {
        line.start = formatSeconds(parseFloat(line.start)).replace('.', ',');
        line.end = formatSeconds(parseFloat(line.end)).replace('.', ',');
        return line;
      });
    case 'csv':
      return csvGenerator(buildSubtitlesJson());
    case 'pre-segment-txt':
    case 'txt':
      return preSegmentText(words, numberOfCharPerLine);
    default:
      return 'Could not find the subtitle format';
  }
}
export {
textSegmentation,
addLineBreakBetweenSentences,
foldWords,
divideIntoTwoLines,
getTextFromWordsList,
preSegmentText,
ttmlGeneratorPremiere,
ttmlGenerator,
ittGenerator,
srtGenerator,
vttGenerator,
};
export default subtitlesComposer;