Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Subtitle Downloading #1117

Merged
merged 18 commits into the base branch
Jun 15, 2020
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3,817 changes: 3,324 additions & 493 deletions package-lock.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
"semver": "^5.6.0",
"service-mobileapp-node": "git+https://github.com/wikimedia/mediawiki-services-mobileapps.git#d244439",
"service-runner": "^2.7.2",
"sharp": "^0.25.2",
"sharp": "^0.25.3",
"swig-templates": "^2.0.2",
"tslint": "^5.20.1",
"typescript": "^3.8.3",
Expand Down
1 change: 0 additions & 1 deletion src/S3.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ class S3 {
return true;
}
} catch (err) {
logger.log(err);
throw new Error(`Unable to connect to S3, either S3 login credentials are wrong or bucket cannot be found
Bucket used: ${this.bucketName}
End point used: ${s3UrlBase.href}
Expand Down
18 changes: 17 additions & 1 deletion src/util/misc.ts
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,23 @@ export function deDup<T>(_arr: T[], getter: (o: T) => any) {
});
}

export function getRelativeFilePath(parentArticleId: string, fileBase: string, resourceNamespace: 'I' | 'A' | 'M') {
/**
 * Rate-limits `fn`: the returned wrapper invokes `fn` at most once every
 * `wait` milliseconds and silently drops calls made during the cooldown
 * window. The very first call always goes through (timestamp starts at 0).
 *
 * Note: the wrapper discards `fn`'s return value.
 *
 * @param fn   function to rate-limit
 * @param wait minimum number of milliseconds between two accepted calls
 * @returns a throttled wrapper around `fn`
 */
export function throttle(fn: (...args: any[]) => any, wait: number) {
    // Per-wrapper timestamp of the last accepted call. This must live inside
    // the factory (not at module scope): a shared module-level timestamp
    // would make unrelated throttled functions suppress each other's calls.
    let lastCalled = 0;
    return function (...args: any[]) {
        const canCall = (Date.now() - lastCalled) >= wait;
        if (canCall) {
            // Call first, then stamp — mirrors the original ordering so a
            // throwing `fn` does not consume the cooldown window.
            fn(...args);
            lastCalled = Date.now();
        }
    };
}

// Logs a heartbeat line at most once every 9 minutes (throttled above).
// NOTE(review): presumably this periodic output keeps log-watching
// supervisors from treating a long, quiet scrape as stalled — confirm
// against the callers of keepAlive().
export const keepAlive = throttle(function keepAlive() {
    logger.log(`Heartbeat - OK`);
}, 1000 * 60 * 9);

export function getRelativeFilePath(parentArticleId: string, fileBase: string, resourceNamespace: 'I' | 'A' | 'M'| '-') {
const slashesInUrl = parentArticleId.split('/').length - 1;
const upStr = '../'.repeat(slashesInUrl + 1);
const newUrl = `${upStr}${resourceNamespace}/` + fileBase;
Expand Down
69 changes: 53 additions & 16 deletions src/util/saveArticles.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
import logger from '../Logger';
import Downloader from '../Downloader';
import MediaWiki from '../MediaWiki';
import {ZimArticle, ZimCreator} from '@openzim/libzim';
import { ZimArticle, ZimCreator } from '@openzim/libzim';
import htmlMinifier from 'html-minifier';
import * as urlParser from 'url';
import * as QueryStringParser from 'querystring';

import DU from '../DOMUtils';
import * as domino from 'domino';
import {Dump} from '../Dump';
import {mapLimit} from 'promiso';
import {contains, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getFullUrl, getMediaBase, jsPath} from '.';
import {config} from '../config';
import {footerTemplate, htmlTemplateCode} from '../Templates';
import {articleDetailXId, filesToDownloadXPath, filesToRetryXPath} from '../stores';
import {getRelativeFilePath, getSizeFromUrl, encodeArticleIdForZimHtmlUrl, interpolateTranslationString} from './misc';
import {RedisKvs} from './RedisKvs';
import {rewriteUrl} from './rewriteUrls';
import {CONCURRENCY_LIMIT} from './const';
import { Dump } from '../Dump';
import { mapLimit } from 'promiso';
import { contains, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getFullUrl, getMediaBase, jsPath } from '.';
import { config } from '../config';
import { footerTemplate, htmlTemplateCode } from '../Templates';
import { articleDetailXId, filesToDownloadXPath, filesToRetryXPath } from '../stores';
import { getRelativeFilePath, getSizeFromUrl, encodeArticleIdForZimHtmlUrl, interpolateTranslationString } from './misc';
import { RedisKvs } from './RedisKvs';
import { rewriteUrl } from './rewriteUrls';
import { CONCURRENCY_LIMIT } from './const';

const genericJsModules = config.output.mw.js;
const genericCssModules = config.output.mw.css;
Expand Down Expand Up @@ -111,6 +112,7 @@ async function downloadBulk(listOfArguments: any[], downloader: Downloader): Pro
resp.namespace = arg.val.namespace;
resp.mult = arg.val.mult;
resp.width = arg.val.width;

return downloader.downloadContent(arg.val.url).then((r) => {
resp.result = r;
return resp;
Expand Down Expand Up @@ -148,7 +150,7 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade
continue;
}

const { articleDoc: _articleDoc, mediaDependencies } = await processArticleHtml(articleHtml, downloader, mw, dump, articleId);
const { articleDoc: _articleDoc, mediaDependencies, subtitles } = await processArticleHtml(articleHtml, downloader, mw, dump, articleId);
let articleDoc = _articleDoc;

if (dump.customProcessor?.shouldKeepArticle) {
Expand All @@ -172,6 +174,10 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade
}
}

for (const subtitle of subtitles) {
await filesToDownloadXPath.set(subtitle.path, { url: subtitle.url, namespace: '-' });
}

const _moduleDependencies = await getModuleDependencies(nonPaginatedArticleId, mw, downloader);

for (const dep of _moduleDependencies.jsDependenciesList) {
Expand Down Expand Up @@ -293,9 +299,11 @@ async function getModuleDependencies(articleId: string, mw: MediaWiki, downloade

async function processArticleHtml(html: string, downloader: Downloader, mw: MediaWiki, dump: Dump, articleId: string) {
let mediaDependencies: Array<{ url: string, path: string }> = [];
let subtitles: Array<{ url: string, path: string }> = [];
let doc = domino.createDocument(html);
const tmRet = await treatMedias(doc, mw, dump, articleId);
doc = tmRet.doc;

mediaDependencies = mediaDependencies.concat(
tmRet.mediaDependencies
.filter((a) => a)
Expand All @@ -305,6 +313,15 @@ async function processArticleHtml(html: string, downloader: Downloader, mw: Medi
}),
);

subtitles = subtitles.concat(
tmRet.subtitles
.filter((a) => a)
.map((url) => {
const { title, lang } = QueryStringParser.parse(url) as { title: string, lang: string };
const path = `${title}-${lang}.vtt`;
return { url, path };
}),
);
const ruRet = await rewriteUrls(doc, articleId, downloader, mw, dump);
doc = ruRet.doc;
mediaDependencies = mediaDependencies.concat(
Expand All @@ -319,6 +336,7 @@ async function processArticleHtml(html: string, downloader: Downloader, mw: Medi
return {
articleDoc: doc,
mediaDependencies,
subtitles
};
}

Expand All @@ -336,10 +354,11 @@ function widthXHeightSorter(a: DominoElement, b: DominoElement) {
return aVal > bVal ? 1 : -1;
}

async function treatVideo(mw: MediaWiki, dump: Dump, srcCache: KVS<boolean>, articleId: string, videoEl: DominoElement): Promise<{ mediaDependencies: string[] }> {
export async function treatVideo(mw: MediaWiki, dump: Dump, srcCache: KVS<boolean>, articleId: string, videoEl: DominoElement): Promise<{ mediaDependencies: string[], subtitles: string[] }> {
// This function handles audio tags as well as video tags
const webUrlHost = urlParser.parse(mw.webUrl).host;
const mediaDependencies: string[] = [];
const subtitles: string[] = [];
// Worth noting:
// Video tags are used for audio files too (as opposed to the audio tag)
// When it's only audio, there will be a single OGG file
Expand All @@ -358,7 +377,7 @@ async function treatVideo(mw: MediaWiki, dump: Dump, srcCache: KVS<boolean>, art

if (dump.nopic || dump.novid || dump.nodet) {
DU.deleteNode(videoEl);
return { mediaDependencies };
return { mediaDependencies, subtitles };
}

videoSources = videoSources.sort(widthXHeightSorter);
Expand Down Expand Up @@ -402,7 +421,23 @@ async function treatVideo(mw: MediaWiki, dump: Dump, srcCache: KVS<boolean>, art
}

sourceEl.setAttribute('src', newUrl);
return { mediaDependencies };

/* Scrape subtitle */
for (const track of Array.from(videoEl.querySelectorAll('track'))) {
subtitles.push(await treatSubtitle(track, webUrlHost, mw, articleId));
}

return { mediaDependencies, subtitles };
}

export async function treatSubtitle(trackEle: DominoElement, webUrlHost: string, mw: MediaWiki, articleId: string): Promise<string> {
const subtitleSourceUrl = getFullUrl(webUrlHost, trackEle.getAttribute('src'), mw.base);
const { title, lang } = QueryStringParser.parse(subtitleSourceUrl) as { title: string, lang: string };
// The source URL we get from Mediawiki article is in srt format, so we replace it to vtt which is standard subtitle trackformat for <track> src attribute.
const vttFormatUrl = new URL(subtitleSourceUrl);
bakshiutkarsha marked this conversation as resolved.
Show resolved Hide resolved
vttFormatUrl.searchParams.set('trackformat', 'vtt');
trackEle.setAttribute('src', `${getRelativeFilePath(articleId, title, '-')}-${lang}.vtt`);
return vttFormatUrl.href;
}

function shouldKeepImage(dump: Dump, img: DominoElement) {
Expand Down Expand Up @@ -538,6 +573,7 @@ function treatImageFrames(mw: MediaWiki, dump: Dump, parsoidDoc: DominoElement,

export async function treatMedias(parsoidDoc: DominoElement, mw: MediaWiki, dump: Dump, articleId: string) {
let mediaDependencies: string[] = [];
let subtitles: string[] = [];
/* Clean/rewrite image tags */
const imgs = Array.from(parsoidDoc.getElementsByTagName('img'));
const videos: DominoElement = Array.from(parsoidDoc.querySelectorAll('video, audio'));
Expand All @@ -546,6 +582,7 @@ export async function treatMedias(parsoidDoc: DominoElement, mw: MediaWiki, dump
for (const videoEl of videos) { // <video /> and <audio />
const ret = await treatVideo(mw, dump, srcCache, articleId, videoEl);
mediaDependencies = mediaDependencies.concat(ret.mediaDependencies);
subtitles = subtitles.concat(ret.subtitles);
}

for (const imgEl of imgs) {
Expand All @@ -561,7 +598,7 @@ export async function treatMedias(parsoidDoc: DominoElement, mw: MediaWiki, dump
treatImageFrames(mw, dump, parsoidDoc, imageNode);
}

return { doc: parsoidDoc, mediaDependencies };
return { doc: parsoidDoc, mediaDependencies, subtitles };
}

async function rewriteUrls(parsoidDoc: DominoElement, articleId: string, downloader: Downloader, mw: MediaWiki, dump: Dump) {
Expand Down
37 changes: 35 additions & 2 deletions test/unit/saveArticles.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ import './bootstrap.test';
import test from 'blue-tape';
import domino from 'domino';

import { setupScrapeClasses } from 'test/util';
import { setupScrapeClasses, convertWikicodeToHtml, testHtmlRewritingE2e } from 'test/util';
import { articleDetailXId } from 'src/stores';
import { saveArticles, treatMedias, applyOtherTreatments } from '../../src/util/saveArticles';
import { saveArticles, treatMedias, applyOtherTreatments, treatSubtitle, treatVideo } from 'src/util/saveArticles';
import { ZimArticle } from '@openzim/libzim';
import { Dump } from 'src/Dump';
import { mwRetToArticleDetail } from 'src/util';
Expand Down Expand Up @@ -238,3 +238,36 @@ test('--customFlavour', async (t) => {
t.ok(ParisDocument.querySelector('#PRE_PROCESSOR'), `Paris was correctly pre-processed`);
t.ok(PragueDocument.querySelector('#POST_PROCESSOR'), `Prague was correctly post-processed`);
});

// Verifies that treatSubtitle() produces the expected online VTT URL for a
// video that carries a single subtitle track.
test('treat one subtitle', async (t) => {
    const { mw, dump } = await setupScrapeClasses({ format: '' }); // `downloader` was unused here

    // Wikicode is taken from article "Mechanical energy" which has a video with subtitle
    const wikicode = `[[File:Physicsworks.ogv|thumb|200px|alt="Lecture demonstrating conservation of mechanical energy"|MIT professor [[Walter Lewin]] demonstrating conservation of mechanical energy]]`;
    const htmlStr = await convertWikicodeToHtml(wikicode, dump.mwMetaData.base);

    const htmlDoc = domino.createDocument(htmlStr.data);
    const contentRes = await treatSubtitle(htmlDoc.querySelector('track'), 'en.wikipedia.org', mw, 'Mechanical energy');
    // testHtmlRewritingE2e is async — await it so its assertion runs while
    // this test is still active instead of as a floating promise.
    await testHtmlRewritingE2e(t, wikicode, htmlStr.data, 'Converted wikicode to HTML for one subtitle');
    t.equals(contentRes, 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3APhysicsworks.ogv&lang=en&trackformat=vtt&origin=*', 'Video subtitle rewriting matches');
});

test('treat multiple subtitles in one video', async(t) => {
const { downloader, mw, dump } = await setupScrapeClasses({ format: '' });

// Wikicode is taken from article "User:Charliechlorine/sandbox" which has multiple(4) subtitles in this video
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why you don't use testHtmlRewritingE2e here (and in other places)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you indicating to replace convertWikicodeToHtml to testHtmlRewritingE2e. It is bit unclear to me what exactly needs to be done here?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Replace 'treat one subtitle' and 'treat multiple subtitles in one video' ... or add a new test.... which is only based on testHtmlRewritingE2e and test that the HTML for a video with multiple subtitles is OK.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My last comment is confusing:

  • 'treat one subtitle' should test exactly which URL
  • 'treat multiple subtitles in one video' should do the same
  • I would add an global test, testing the whole HTML testHtmlRewritingE2e

const wikicode = `[[File:Videoonwikipedia.ogv|thumb|thumbtime=0:58|left|320px|Video about kola nuts ]]`;
const htmlStr = await convertWikicodeToHtml(wikicode, dump.mwMetaData.base);

const htmlDoc = domino.createDocument(htmlStr.data);
const contentRes = await treatVideo(mw, dump, {}, 'User:Charliechlorine/sandbox', htmlDoc.querySelector('video'));
testHtmlRewritingE2e(t, wikicode, htmlStr.data, 'Converted wikicode to HTML for multiple subtitle');
t.deepEqual(
contentRes.subtitles,
[ 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AVideoonwikipedia.ogv&lang=en&trackformat=vtt&origin=*',
'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AVideoonwikipedia.ogv&lang=eu&trackformat=vtt&origin=*',
'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AVideoonwikipedia.ogv&lang=fr&trackformat=vtt&origin=*',
'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AVideoonwikipedia.ogv&lang=sv&trackformat=vtt&origin=*' ],
'Video multiple subtitles rewriting matches');
t.equals(contentRes.subtitles.length, 4, 'All subtitles are found for this video');
});
9 changes: 9 additions & 0 deletions test/unit/util.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import './bootstrap.test';
import test from 'blue-tape';
import { encodeArticleIdForZimHtmlUrl, interpolateTranslationString } from 'src/util';
import { testHtmlRewritingE2e } from 'test/util';


test('util -> interpolateTranslationString', async (t) => {
Expand Down Expand Up @@ -33,3 +34,11 @@ test('Encoding ArticleId for Zim HTML Url', async(t) => {
t.equal(encoded, encodeArticleIdForZimHtmlUrl(unencoded), `encodeArticleIdForZimHtmlUrl() encoding`);
}
});

test('wikitext comparison', async(t) => {
kelson42 marked this conversation as resolved.
Show resolved Hide resolved
testHtmlRewritingE2e(
t,
`An [[isolated system]] remains the system is free.`,
`<p id="mwAQ">An <a rel="mw:WikiLink" href="./Isolated_system" title="Isolated system" id="mwAg">isolated system</a> remains the system is free.</p>`,
'HTML and Wikitext match')
})
19 changes: 19 additions & 0 deletions test/util.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import MediaWiki from '../src/MediaWiki';
import Downloader from '../src/Downloader';
import { Dump } from '../src/Dump';
import axios from 'axios';
import execa = require('execa');
import logger from '../src/Logger';

export function leftPad(_num: number, length: number) {
const num = `${_num}`;
Expand Down Expand Up @@ -64,3 +66,20 @@ export async function zimcheckAvailable() {
export async function zimcheck(filePath: string) {
return execa.command(`${zimcheckPath} ${filePath}`);
}

/**
 * Converts wikitext to HTML via the Mediawiki REST transform endpoint.
 *
 * @param wikicode wikitext markup to convert
 * @param baseUrl  base URL of the wiki, ending with a slash (e.g. `https://en.wikipedia.org/`)
 * @returns the axios response; the rendered HTML is in its `data` property
 * @throws rethrows any HTTP/network failure after logging it
 */
export async function convertWikicodeToHtml(wikicode: string, baseUrl: string): Promise<any> {
    try {
        return await axios.post(`${baseUrl}api/rest_v1/transform/wikitext/to/html`, {
            wikitext: wikicode,
            body_only: true,
        });
    } catch (err) {
        // Log for context, then propagate. The previous revision *returned*
        // the error object, so callers read `.data` off an Error and failed
        // later with a misleading `undefined` comparison.
        logger.log(`Got error during conversion of wikicode to HTML due to ${err}`);
        throw err;
    }
}

/**
 * End-to-end check that the given wikitext renders to the expected HTML.
 *
 * @param t        blue-tape test harness
 * @param wikicode wikitext to render
 * @param html     expected HTML output
 * @param comment  assertion message
 * @param baseUrl  wiki to render against; defaults to English Wikipedia so
 *                 existing call sites are unaffected
 */
export async function testHtmlRewritingE2e(t: any, wikicode: string, html: string, comment: string, baseUrl: string = 'https://en.wikipedia.org/') {
    const resultHtml = await convertWikicodeToHtml(wikicode, baseUrl);
    // tape's signature is t.equal(actual, expected, msg); pass the freshly
    // rendered HTML as "actual" so failure output labels both sides correctly.
    t.equal(resultHtml.data, html, comment);
}