From b5873e523122d1625188aa852c55b35335db218a Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Thu, 29 Oct 2020 09:51:16 -0400 Subject: [PATCH 1/9] Use git log know a documents last modified date Fixes #706 --- .github/workflows/production-build.yml | 5 ++++ build/git-history.js | 33 ++++++++++++++++++++++++++ build/index.js | 2 ++ package.json | 2 +- tool/cli.js | 15 ++++++++++++ 5 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 build/git-history.js diff --git a/.github/workflows/production-build.yml b/.github/workflows/production-build.yml index c8892b227d21..49927bd631fb 100644 --- a/.github/workflows/production-build.yml +++ b/.github/workflows/production-build.yml @@ -47,6 +47,11 @@ jobs: with: repository: mdn/content path: mdn/content + # Yes, this means fetch EVERY COMMIT EVER. + # It's probably not sustainable in the far future (e.g. past 2021) + # but for now it's good enough. We'll need all the history + # so we can figure out each document's last-modified date. + fetch-depth: 0 - uses: actions/checkout@v2 if: "contains(github.event.inputs.archived_content, 'true')" diff --git a/build/git-history.js b/build/git-history.js new file mode 100644 index 000000000000..4c30290cb593 --- /dev/null +++ b/build/git-history.js @@ -0,0 +1,33 @@ +const childProcess = require("child_process"); +const fs = require("fs"); +const path = require("path"); + +const { execGit } = require("../content"); + +function getFromGit() { + const output = execGit([ + "log", + "--name-only", + "--no-decorate", + '--format="←→ %ci"', + "--date-order", + "--reverse", + ]); + const map = new Map(); + output + .split("←→") + .slice(1) + .forEach((commit) => { + const [date, , ...files] = commit.trim().split("\n"); + const iso = new Date(date).toISOString(); + for (const file of files) { + map.set(file, iso); + } + }); + return map; +} +function gather(outputfile, previousfile = null) {} + +module.exports = { + gather, +}; diff --git a/build/index.js b/build/index.js index 5940df5660dc..147013bd13b8 100644 --- a/build/index.js +++ b/build/index.js @@ -23,6 +23,7 @@ const { getPageTitle } = require("./page-title"); const { syntaxHighlight } = require("./syntax-highlight"); const cheerio = require("./monkeypatched-cheerio"); const buildOptions = require("./build-options"); +const { gather: gatherGitHistory } = require("./git-history"); const { renderCache: renderKumascriptCache } = require("../kumascript"); const DEFAULT_BRANCH_NAME = "main"; // That's what we use for github.com/mdn/content @@ -380,4 +381,5 @@ module.exports = { SearchIndex, options: buildOptions, + gatherGitHistory, }; diff --git a/package.json b/package.json index c785bc4ce2d9..59483cb25dc6 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,7 @@ "test": "yarn prettier-check && yarn test:client && yarn test:kumascript && yarn test:testing", "build:client": "cd client && cross-env INLINE_RUNTIME_CHUNK=false react-scripts build", "build:ssr": "cd ssr && webpack --config webpack.config.js --mode=production", - "prepare-build": "yarn build:client && yarn build:ssr", + "prepare-build": "yarn build:client && yarn build:ssr && yarn tool gather-git-history", "build": "cd build && cross-env NODE_ENV=production node cli.js", "start": "(test -f client/build/index.html || yarn build:client) && (test -f ssr/dist/main.js || yarn build:ssr) && nf -j Procfile.start start", "dev": "yarn build:client && yarn build:ssr && nf -j Procfile.dev start", diff --git a/tool/cli.js b/tool/cli.js index b94257edfa4c..84f0cb781887 100644 --- a/tool/cli.js +++ b/tool/cli.js @@ -9,6 +9,7 @@ const path = require("path"); const { DEFAULT_LOCALE, VALID_LOCALES } = require("../libs/constants"); const { Redirect, Document, buildURL } = require("../content"); +const { gatherGitHistory } = require("../build"); const PORT = parseInt(process.env.SERVER_PORT || "5000"); @@ -219,6 +220,20 @@ program const url = `http://localhost:${PORT}${buildURL(locale, slug)}`; await open(url); }) + ) + + .command( + "gather-git-history", + "Extract all last-modified dates from the git logs" + ) + .argument("", "Where to dump the JSON file with all the data") + .argument("[previousfile]", "Optional old file to extend") + .action( + tryOrExit(async ({ args }) => { + const { outputfile, previousfile } = args; + const data = gatherGitHistory(outputfile, previousfile); + console.log(chalk.green(data)); + }) ); program.run(); From f2e110c35600985e4df6dd2590458c2e12d3761e Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Thu, 29 Oct 2020 12:08:08 -0400 Subject: [PATCH 2/9] progress --- build/git-history.js | 62 +++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/build/git-history.js b/build/git-history.js index 4c30290cb593..8c8d606c546f 100644 --- a/build/git-history.js +++ b/build/git-history.js @@ -1,32 +1,52 @@ -const childProcess = require("child_process"); const fs = require("fs"); const path = require("path"); -const { execGit } = require("../content"); +const { CONTENT_ROOT, execGit } = require("../content"); function getFromGit() { - const output = execGit([ - "log", - "--name-only", - "--no-decorate", - '--format="←→ %ci"', - "--date-order", - "--reverse", - ]); + const repoRoot = execGit(["rev-parse", "--show-toplevel"], { + cwd: CONTENT_ROOT, + }); + + const MARKER = "COMMIT:"; + const output = execGit( + [ + "log", + "--name-only", + "--no-decorate", + `--format=${MARKER}%cI`, + "--date-order", + "--reverse", + ], + { + cwd: repoRoot, + } + ); + const map = new Map(); - output - .split("←→") - .slice(1) - .forEach((commit) => { - const [date, , ...files] = commit.trim().split("\n"); - const iso = new Date(date).toISOString(); - for (const file of files) { - map.set(file, iso); - } - }); + let date = null; + for (let line of output.split("\n")) { + // Happens to file paths that contain non-ascii or control charaters. + // E.g. "files/en-us/glossary/b\303\251zier_curve/index.html" + if (line.startsWith('"') && line.endsWith('"')) { + line = line.slice(1, -1); + } + if (line.startsWith(MARKER)) { + date = new Date(line.replace(MARKER, "")); + } else if (line) { + map.set(line, date); + } + } return map; } -function gather(outputfile, previousfile = null) {} +function gather(outputfile, previousfile = null) { + // Every key in this map + const map = getFromGit(); + for (const [key, date] of map) { + if (!key.includes("files/en-us/")) console.log({ key, date }); + // console.log({ key, date }); + } +} module.exports = { gather, From 0d24109bf275f2e28a42bc1410a51f6c066fccf0 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Thu, 29 Oct 2020 17:37:16 -0400 Subject: [PATCH 3/9] Use git log know a documents last modified date Part of #706 --- build/git-history.js | 36 +++++++++++++++----- content/document.js | 11 ++++++- content/githistories.js | 26 +++++++++++++++ tool/cli.js | 73 ++++++++++++++++++++++++++++++++++++----- 4 files changed, 127 insertions(+), 19 deletions(-) create mode 100644 content/githistories.js diff --git a/build/git-history.js b/build/git-history.js index 8c8d606c546f..b8687e2c1e44 100644 --- a/build/git-history.js +++ b/build/git-history.js @@ -3,9 +3,9 @@ const path = require("path"); const { CONTENT_ROOT, execGit } = require("../content"); -function getFromGit() { +function getFromGit(contentRoot = CONTENT_ROOT) { const repoRoot = execGit(["rev-parse", "--show-toplevel"], { - cwd: CONTENT_ROOT, + cwd: contentRoot, }); const MARKER = "COMMIT:"; @@ -34,18 +34,36 @@ function getFromGit() { if (line.startsWith(MARKER)) { date = new Date(line.replace(MARKER, "")); } else if (line) { - map.set(line, date); + const relPath = path.relative(contentRoot, path.join(repoRoot, line)); + map.set(relPath, date); } } return map; } -function gather(outputfile, previousfile = null) { - // Every key in this map - const map = getFromGit(); - for (const [key, date] of map) { - if (!key.includes("files/en-us/")) console.log({ key, date }); - // console.log({ key, date }); +function gather(contentRoot, previousFile = null) { + const map = new Map(); + if (previousFile) { + const previous = JSON.parse(fs.readFileSync(previousFile, "utf-8")); + for (const [key, value] of Object.entries(previous)) { + map.set(key, value); + } } + // Every key in this map is a path, relative to CONTENT_ROOT. + for (const [key, value] of getFromGit(contentRoot)) { + // Because CONTENT_ROOT isn't necessarily the same as the path relative to + // the git root. For example "../README.md" and since those aren't documents + // exclude them. + // We also only care about documents. + if ( + !key.startsWith(".") && + (key.endsWith("index.html") || key.endsWith("index.md")) + ) { + map.set(key, { + modified: value, + }); + } + } + return map; } module.exports = { diff --git a/content/document.js b/content/document.js index bea86ac22437..404b2d010d07 100644 --- a/content/document.js +++ b/content/document.js @@ -14,6 +14,7 @@ const { } = require("./constants"); const { getPopularities } = require("./popularities"); const { getWikiHistories } = require("./wikihistories"); +const { getGitHistories } = require("./githistories"); const { buildURL, memoize, slugToFolder, execGit } = require("./utils"); const Redirect = require("./redirect"); @@ -169,13 +170,21 @@ const read = memoize((folder) => { const locale = extractLocale(folder); const url = `/${locale}/docs/${metadata.slug}`; + + // The last-modified is always coming from the git logs. Independent of + // which root it is. + const gitHistory = getGitHistories(root, locale).get( + path.relative(root, filePath) + ); + const modified = (gitHistory && gitHistory.modified) || null; + // Use the wiki histories for a list of legacy contributors. const wikiHistory = getWikiHistories(root, locale).get(url); const fullMetadata = { metadata: { ...metadata, locale, popularity: getPopularities().get(url) || 0.0, - modified: wikiHistory ? wikiHistory.modified : null, + modified, contributors: wikiHistory ? wikiHistory.contributors : [], }, url, diff --git a/content/githistories.js b/content/githistories.js new file mode 100644 index 000000000000..f12a494edb94 --- /dev/null +++ b/content/githistories.js @@ -0,0 +1,26 @@ +const fs = require("fs"); +const path = require("path"); + +// Module-level cache +const gitHistoryMaps = new Map(); + +function getGitHistories(root, locale) { + const localeLC = locale.toLowerCase(); + const folder = path.join(root, localeLC); + if (!gitHistoryMaps.has(folder)) { + const historyFilePath = path.join(folder, "_githistory.json"); + const history = fs.existsSync(historyFilePath) + ? new Map( + Object.entries(JSON.parse(fs.readFileSync(historyFilePath))).map( + ([filePath, history]) => { + return [filePath, history]; + } + ) + ) + : new Map(); + gitHistoryMaps.set(folder, history); + } + return gitHistoryMaps.get(folder); +} + +module.exports = { getGitHistories }; diff --git a/tool/cli.js b/tool/cli.js index 84f0cb781887..8b2f148ff06c 100644 --- a/tool/cli.js +++ b/tool/cli.js @@ -1,14 +1,16 @@ #!/usr/bin/env node +const fs = require("fs"); +const path = require("path"); +const os = require("os"); + const program = require("@caporal/core").default; const chalk = require("chalk"); const prompts = require("prompts"); const openEditor = require("open-editor"); const open = require("open"); -const fs = require("fs"); -const path = require("path"); const { DEFAULT_LOCALE, VALID_LOCALES } = require("../libs/constants"); -const { Redirect, Document, buildURL } = require("../content"); +const { CONTENT_ROOT, Redirect, Document, buildURL } = require("../content"); const { gatherGitHistory } = require("../build"); const PORT = parseInt(process.env.SERVER_PORT || "5000"); @@ -226,13 +228,66 @@ program "gather-git-history", "Extract all last-modified dates from the git logs" ) - .argument("", "Where to dump the JSON file with all the data") - .argument("[previousfile]", "Optional old file to extend") + .option("--root ", "Which content root", { + default: CONTENT_ROOT, + }) + .option("--save-history ", `File to save all previous history`, { + default: path.join(os.tmpdir(), "yari-git-history.json"), + }) + .option( + "--load-history ", + `Optional file to load all previous history`, + { + default: path.join(os.tmpdir(), "yari-git-history.json"), + } + ) .action( - tryOrExit(async ({ args }) => { - const { outputfile, previousfile } = args; - const data = gatherGitHistory(outputfile, previousfile); - console.log(chalk.green(data)); + tryOrExit(async ({ options }) => { + const { root, saveHistory, loadHistory } = options; + if (fs.existsSync(loadHistory)) { + console.log( + chalk.yellow(`Reusing exising history from ${loadHistory}`) + ); + } + const map = gatherGitHistory( + root, + fs.existsSync(loadHistory) ? loadHistory : null + ); + const historyPerLocale = {}; + + // Someplace to put the map into an object so it can be saved into `saveHistory` + const allHistory = {}; + for (const [relPath, value] of map) { + allHistory[relPath] = value; + const locale = relPath.split("/")[0]; + if (!historyPerLocale[locale]) { + historyPerLocale[locale] = {}; + } + historyPerLocale[locale][relPath] = value; + } + for (const [locale, history] of Object.entries(historyPerLocale)) { + const outputFile = path.join(root, locale, "_githistory.json"); + fs.writeFileSync(outputFile, JSON.stringify(history, null, 2), "utf-8"); + console.log( + chalk.green( + `Wrote '${locale}' ${Object.keys( + history + ).length.toLocaleString()} paths into ${outputFile}` + ) + ); + } + fs.writeFileSync( + saveHistory, + JSON.stringify(allHistory, null, 2), + "utf-8" + ); + console.log( + chalk.green( + `Wrote ${Object.keys( + allHistory + ).length.toLocaleString()} paths into ${saveHistory}` + ) + ); }) ); From 4e621f2c6acf9a84e548505e7eb3cc694caae826 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Mon, 9 Nov 2020 16:48:31 -0500 Subject: [PATCH 4/9] wording --- tool/cli.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tool/cli.js b/tool/cli.js index a91d2daf9740..5818ced9851c 100644 --- a/tool/cli.js +++ b/tool/cli.js @@ -302,7 +302,7 @@ program ); console.log( chalk.green( - `Wrote ${Object.keys( + `Saved ${Object.keys( allHistory ).length.toLocaleString()} paths into ${saveHistory}` ) From e4fadd7e40145f101b0a9d26a950521941a6c318 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Mon, 9 Nov 2020 18:41:49 -0500 Subject: [PATCH 5/9] getting tests to work --- .gitignore | 2 ++ build/git-history.js | 3 ++- content/utils.js | 9 ++++++++- package.json | 4 ++-- testing/.env | 4 ++-- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index c84f3ea7e4ac..b612dd1fed59 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,5 @@ yarn-error.log* fake-v1-api/ mdn-yari-*.tgz function.zip + +testing/content/files/en-us/_githistory.json diff --git a/build/git-history.js b/build/git-history.js index b8687e2c1e44..152b5e705a8f 100644 --- a/build/git-history.js +++ b/build/git-history.js @@ -20,7 +20,8 @@ function getFromGit(contentRoot = CONTENT_ROOT) { ], { cwd: repoRoot, - } + }, + repoRoot ); const map = new Map(); diff --git a/content/utils.js b/content/utils.js index 1767228f652a..9b6925381791 100644 --- a/content/utils.js +++ b/content/utils.js @@ -84,10 +84,17 @@ function execGit(args, opts = {}, root = null) { args, { cwd: gitRoot, + maxBuffer: 1024 * 1024 * 100, // 100MB } ); if (error || status !== 0) { - throw new Error(`git command failed:\n${stderr.toString()}`); + if (stderr) { + console.error(stderr); + } + if (error) { + throw error; + } + throw new Error(`git command failed: ${error.toString()}`); } return stdout.toString().trim(); } diff --git a/package.json b/package.json index b5a0e8d760cf..1a8f460719bc 100644 --- a/package.json +++ b/package.json @@ -13,8 +13,8 @@ "test": "yarn prettier-check && yarn test:client && yarn test:kumascript && yarn test:testing", "build:client": "cd client && cross-env INLINE_RUNTIME_CHUNK=false react-scripts build", "build:ssr": "cd ssr && webpack --config webpack.config.js --mode=production", - "prepare-build": "yarn build:client && yarn build:ssr && yarn tool gather-git-history", - "build": "cd build && cross-env NODE_ENV=production node cli.js", + "prepare-build": "yarn build:client && yarn build:ssr && yarn tool gather-git-history --verbose", + "build": "cross-env NODE_ENV=production node build/cli.js", "start": "(test -f client/build/index.html || yarn build:client) && (test -f ssr/dist/main.js || yarn build:ssr) && nf -j Procfile.start start", "dev": "yarn build:client && yarn build:ssr && nf -j Procfile.dev start", "start:client": "cd client && cross-env BROWSER=none PORT=3000 react-scripts start", diff --git a/testing/.env b/testing/.env index cb0ba74d109e..4c31075fc376 100644 --- a/testing/.env +++ b/testing/.env @@ -5,8 +5,8 @@ # The relative path trick is necessary because `yarn build` will # be running from the `./build/` directory. -CONTENT_ROOT=../testing/content/files -CONTENT_ARCHIVED_ROOT=../testing/archived-content/files +CONTENT_ROOT=testing/content/files +CONTENT_ARCHIVED_ROOT=testing/archived-content/files # Because you never benefit from the progressbar when doing mass builds in # testing environment. And in CI, it's always disabled anyway. From 68f6f1d350a1745c59802a208ce3dc6832149a1f Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Mon, 9 Nov 2020 20:22:10 -0500 Subject: [PATCH 6/9] comment --- build/git-history.js | 1 + content/utils.js | 2 ++ 2 files changed, 3 insertions(+) diff --git a/build/git-history.js b/build/git-history.js index 152b5e705a8f..f737bb87e1d7 100644 --- a/build/git-history.js +++ b/build/git-history.js @@ -41,6 +41,7 @@ function getFromGit(contentRoot = CONTENT_ROOT) { } return map; } + function gather(contentRoot, previousFile = null) { const map = new Map(); if (previousFile) { diff --git a/content/utils.js b/content/utils.js index 9b6925381791..2eecbb6fee1f 100644 --- a/content/utils.js +++ b/content/utils.js @@ -84,6 +84,8 @@ function execGit(args, opts = {}, root = null) { args, { cwd: gitRoot, + // Default is 1MB + // That's rarely big enough for what we're using Yari for. maxBuffer: 1024 * 1024 * 100, // 100MB } ); From 1648bbf8f21d0be1febecdb1e05943745d99820c Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Mon, 9 Nov 2020 20:26:18 -0500 Subject: [PATCH 7/9] use the -z trick --- build/git-history.js | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/build/git-history.js b/build/git-history.js index f737bb87e1d7..4236246d3748 100644 --- a/build/git-history.js +++ b/build/git-history.js @@ -17,6 +17,9 @@ function getFromGit(contentRoot = CONTENT_ROOT) { `--format=${MARKER}%cI`, "--date-order", "--reverse", + // "Separate the commits with NULs instead of with new newlines." + // So each line isn't, possibly, wrapped in "quotation marks". + "-z", ], { cwd: repoRoot, @@ -26,12 +29,7 @@ function getFromGit(contentRoot = CONTENT_ROOT) { const map = new Map(); let date = null; - for (let line of output.split("\n")) { - // Happens to file paths that contain non-ascii or control charaters. - // E.g. "files/en-us/glossary/b\303\251zier_curve/index.html" - if (line.startsWith('"') && line.endsWith('"')) { - line = line.slice(1, -1); - } + for (let line of output.split("\0")) { if (line.startsWith(MARKER)) { date = new Date(line.replace(MARKER, "")); } else if (line) { From f2768d26b0495df5651301369017a26d003e5847 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Mon, 9 Nov 2020 20:34:14 -0500 Subject: [PATCH 8/9] update comment --- build/git-history.js | 1 + 1 file changed, 1 insertion(+) diff --git a/build/git-history.js b/build/git-history.js index 4236246d3748..de0f276b7973 100644 --- a/build/git-history.js +++ b/build/git-history.js @@ -19,6 +19,7 @@ function getFromGit(contentRoot = CONTENT_ROOT) { "--reverse", // "Separate the commits with NULs instead of with new newlines." // So each line isn't, possibly, wrapped in "quotation marks". + // Now we just need to split the output, as a string, by \0. "-z", ], { From be65c0b6b746de7adc112ca7cc60990f5e1a76ce Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Mon, 9 Nov 2020 20:51:11 -0500 Subject: [PATCH 9/9] better testing --- content/utils.js | 1 + testing/tests/headless.test.js | 1 + testing/tests/index.test.js | 1 + 3 files changed, 3 insertions(+) diff --git a/content/utils.js b/content/utils.js index 2eecbb6fee1f..d07a1ab9274e 100644 --- a/content/utils.js +++ b/content/utils.js @@ -91,6 +91,7 @@ function execGit(args, opts = {}, root = null) { ); if (error || status !== 0) { if (stderr) { + console.log(`Error running git ${args}`); console.error(stderr); } if (error) { diff --git a/testing/tests/headless.test.js b/testing/tests/headless.test.js index c5fe5af1a351..85a99c0eee16 100644 --- a/testing/tests/headless.test.js +++ b/testing/tests/headless.test.js @@ -20,6 +20,7 @@ describe("Basic viewing of functional pages", () => { it("open the /en-US/docs/Web/Foo page", async () => { await page.goto(testURL("/en-US/docs/Web/Foo")); await expect(page).toMatch(": A test tag"); + await expect(page).toMatchElement(".document-meta time", { visible: true }); }); it("open the /en-US/docs/Web/InteractiveExample page", async () => { diff --git a/testing/tests/index.test.js b/testing/tests/index.test.js index 238cebb72c21..a6ea54cbbfc6 100644 --- a/testing/tests/index.test.js +++ b/testing/tests/index.test.js @@ -20,6 +20,7 @@ test("content built foo page", () => { expect(doc.pageTitle).toBe(`${doc.title} | MDN`); expect(doc.summary).toBe("This becomes the summary."); expect(doc.mdn_url).toBe("/en-US/docs/Web/Foo"); + expect(new Date(doc.modified)).toBeTruthy(); expect(doc.source).toBeTruthy(); expect(doc.flaws.macros.length).toBe(6);