lowlighter · lowlighter · Oct 16, 2022 · Sep 28, 2022 · Oct 4, 2022 · Oct 4, 2022
diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt
@@ -1,6 +1,10 @@
+gpgarmor
 github
 https
 leetcode
 pgn
+scm
+shas
 ssh
 ubuntu
+yargsparser
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -68,7 +68,8 @@
     "twemoji-parser": "^14.0.0",
     "vue": "^2.7.1",
     "vue-prism-component": "^1.2.0",
-    "xml-formatter": "^2.6.1"
+    "xml-formatter": "^2.6.1",
+    "yargs-parser": "^21.1.1"
   },
   "devDependencies": {
     "eslint": "^8.25.0",

diff --git a/source/app/metrics/utils.mjs b/source/app/metrics/utils.mjs
@@ -225,17 +225,19 @@ export async function language({filename, patch}) {
 }
 
 /**Run command (use this to execute commands and process whole output at once, may not be suitable for large outputs) */
-export async function run(command, options, {prefixed = true, log = true} = {}) {
+export async function run(command, options, {prefixed = true, log = true, debug = true} = {}) {
   const prefix = {win32: "wsl"}[process.platform] ?? ""
   command = `${prefixed ? prefix : ""} ${command}`.trim()
   return new Promise((solve, reject) => {
-    console.debug(`metrics/command/run > ${command}`)
+    if (debug)
+      console.debug(`metrics/command/run > ${command}`)
     const child = processes.exec(command, options)
     let [stdout, stderr] = ["", ""]
     child.stdout.on("data", data => stdout += data)
     child.stderr.on("data", data => stderr += data)
     child.on("close", code => {
-      console.debug(`metrics/command/run > ${command} > exited with code ${code}`)
+      if (debug)
+        console.debug(`metrics/command/run > ${command} > exited with code ${code}`)
       if (log) {
         console.debug(stdout)
         console.debug(stderr)
@@ -246,7 +248,7 @@ export async function run(command, options, {prefixed = true, log = true} = {})
 }
 
 /**Spawn command (use this to execute commands and process output on the fly) */
-export async function spawn(command, args = [], options = {}, {prefixed = true, timeout = 300 * 1000, stdout} = {}) { //eslint-disable-line max-params
+export async function spawn(command, args = [], options = {}, {prefixed = true, timeout = 300 * 1000, stdout, debug = true} = {}) { //eslint-disable-line max-params
   const prefix = {win32: "wsl"}[process.platform] ?? ""
   if ((prefixed) && (prefix)) {
     args.unshift(command)
@@ -255,15 +257,18 @@ export async function spawn(command, args = [], options = {}, {prefixed = true,
   if (!stdout)
     throw new Error("`stdout` argument was not provided, use run() instead of spawn() if processing output is not needed")
   return new Promise((solve, reject) => {
-    console.debug(`metrics/command/spawn > ${command} with ${args.join(" ")}`)
+    if (debug)
+      console.debug(`metrics/command/spawn > ${command} with ${args.join(" ")}`)
     const child = processes.spawn(command, args, {...options, shell: true, timeout})
     const reader = readline.createInterface({input: child.stdout})
     reader.on("line", stdout)
     const closed = new Promise(close => reader.on("close", close))
     child.on("close", async code => {
-      console.debug(`metrics/command/spawn > ${command} with ${args.join(" ")} > exited with code ${code}`)
+      if (debug)
+        console.debug(`metrics/command/spawn > ${command} with ${args.join(" ")} > exited with code ${code}`)
       await closed
-      console.debug(`metrics/command/spawn > ${command} with ${args.join(" ")} > reader closed`)
+      if (debug)
+        console.debug(`metrics/command/spawn > ${command} with ${args.join(" ")} > reader closed`)
       return code === 0 ? solve() : reject()
     })
   })
@@ -372,7 +377,7 @@ export const filters = {
     return result
   },
   /**Repository filter*/
-  repo(repository, patterns) {
+  repo(repository, patterns, {debug = true} = {}) {
     //Disable filtering when no pattern is provided
     if (!patterns.length)
       return true
@@ -390,11 +395,12 @@ export const filters = {
 
     //Basic pattern matching
     const include = (!patterns.includes(repo)) && (!patterns.includes(`${user}/${repo}`))
-    console.debug(`metrics/filters/repo > filter ${repo} (${include ? "included" : "excluded"})`)
+    if (debug)
+      console.debug(`metrics/filters/repo > filter ${repo} (${include ? "included" : "excluded"})`)
     return include
   },
   /**Text filter*/
-  text(text, patterns) {
+  text(text, patterns, {debug = true} = {}) {
     //Disable filtering when no pattern is provided
     if (!patterns.length)
       return true
@@ -404,7 +410,8 @@ export const filters = {
 
     //Basic pattern matching
     const include = !patterns.includes(text)
-    console.debug(`metrics/filters/text > filter ${text} (${include ? "included" : "excluded"})`)
+    if (debug)
+      console.debug(`metrics/filters/text > filter ${text} (${include ? "included" : "excluded"})`)
     return include
   },
 }

diff --git a/source/plugins/languages/README.md b/source/plugins/languages/README.md
@@ -236,8 +236,7 @@ It will be automatically hidden if empty.</p>
 
 ## 🔎 `indepth` mode
 
-The default algorithm use the top languages provided of each repository you contributed to.
-When working in collaborative projects with a lot of people, these numbers may be less representative of your actual work.
+The default algorithm uses the top languages of each repository you contributed to, as reported by the GitHub GraphQL API (this is similar to the languages bar displayed on github.com). When working on collaborative projects with a lot of people, these numbers may be less representative of your actual work.
 
 The `plugin_languages_indepth` option lets you use a more advanced algorithm for more accurate statistics.
 Under the hood, it will clone your repositories, run [linguist-js](https://github.com/Nixinova/Linguist) (a JavaScript port of [GitHub linguist](https://github.com/github/linguist)) and iterate over patches matching your `commits_authoring` setting.
@@ -257,12 +256,52 @@ Since git lets you use any email and username for commits, *metrics* may not be
 
 > ⚠️ This feature significantly increase workflow time
 
-> ⚠️ Since this mode iterates over **each commit of each repository**, it is not suited for large code base, especially those with a large amount of commits and the ones containing binaries. While `plugin_languages_analysis_timeout` can be used to increase the default timeout for analysis, please be responsible and keep this feature disabled if it cannot work on your account to save GitHub resources and our planet 🌏
+> ⚠️ Since this mode iterates over **each matching commit of each repository**, it is not suited for large codebases, especially those with a large number of commits or those containing binaries. While `plugin_languages_analysis_timeout` and `plugin_languages_analysis_timeout_repositories` can be used to increase the default timeout for analysis, please be responsible and keep this feature disabled if it cannot work on your account to save GitHub resources and our planet 🌏
 
 > ⚠️ Although *metrics* does not send any code to external sources, repositories are temporarily cloned on the GitHub Action runner. It is advised to keep this option disabled when working with sensitive data or company code. Use at your own risk, *metrics* and its authors **cannot** be held responsible for any resulting code leaks. Source code is available for auditing at [analyzers.mjs](/source/plugins/languages/analyzers.mjs).
 
 > 🌐 Web instances must enable this feature in `settings.json`
 
+Below is a summary of the process used to compute indepth statistics:
+
+## Most used mode
+
+1. Fetch GPG keys linked to your GitHub account
+  - automatically add attached emails to `commits_authoring`
+  - *web-flow* (GitHub's public key for changes made through web-ui) is also fetched
+2. Import GPG keys so they can be used to verify commits later
+3. Iterate through repositories
+  - early break if `plugin_languages_analysis_timeout` is reached
+  - skip repository if it matches `plugin_languages_skipped`
+  - include repositories from `plugin_languages_indepth_custom`
+    - a specific branch and commit range can be used
+    - a source other than github.com can be used
+4. Clone repository
+  - target branch is checked out
+5. List of authored commits is computed
+  - using `git log --author` and `commits_authoring` to search in commit headers
+  - using `git log --grep` and `commits_authoring` to search in commit body
+  - ensure these are within the range specified by `plugin_languages_indepth_custom` (if applicable)
+6. Process authored commits
+  - early break if `plugin_languages_analysis_timeout_repositories` is reached
+  - using `git verify-commit` to check authenticity against imported GPG keys
+  - using `git log --patch` to extract added/deleted lines/bytes from each file
+  - using [GitHub linguist](https://github.com/github/linguist) ([linguist-js](https://github.com/Nixinova/LinguistJS)) to detect language for each file
+    - respect `plugin_languages_categories` option
+    - if a file has since been deleted or moved, check out the last commit in which the file was present and run linguist again
+7. Aggregate results
+
+## Recently used mode
+
+1. Fetch push events linked to your account (or target repository)
+  - matching `plugin_languages_recent_load` and `plugin_languages_recent_days` options
+  - matching committer emails from `commits_authoring`
+2. Process authored commits
+  - using [GitHub linguist](https://github.com/github/linguist) ([linguist-js](https://github.com/Nixinova/LinguistJS)) to detect language for each file
+    - respect `plugin_languages_recent_categories` option
+    - directly pass file content rather than performing I/O and simulating a git repository
+3. Aggregate results
+
 ## 📅 Recently used languages
 
 This feature uses a similar algorithm as `indepth` mode, but uses patches from your events feed instead.

diff --git a/source/plugins/languages/analyzer/analyzer.mjs b/source/plugins/languages/analyzer/analyzer.mjs
@@ -0,0 +1,223 @@
+//Imports
+import fs from "fs/promises"
+import os from "os"
+import paths from "path"
+import git from "simple-git"
+import {filters} from "../../../app/metrics/utils.mjs"
+
+/**
+ * Analyzer
+ * Holds shared state and helpers (repository parsing, cloning, cleaning, filtering and results aggregation) for languages analysis
+ * Note that `analyze()` delegates per-commit processing to `this.linguist()`, which is not defined in this class
+ */
+export class Analyzer {
+
+  /**
+   * Constructor
+   * `timeout.global` and `timeout.repositories` are expressed in minutes (`NaN` disables the matching timeout)
+   */
+  constructor(login, {account = "bypass", authoring = [], uid = Math.random(), shell, rest = null, context = {mode:"user"}, skipped = [], categories = ["programming", "markup"], timeout = {global:NaN, repositories:NaN}} = {}) {
+    //User informations
+    this.login = login
+    this.account = account
+    this.authoring = authoring
+    this.uid = uid
+    this.gpg = []
+
+    //Utilities
+    this.shell = shell
+    this.rest = rest
+    this.context = context
+    this.markers = {
+      hash:/\b[0-9a-f]{40}\b/,
+      file:/^[+]{3}\sb[/](?<file>[\s\S]+)$/,
+      line:/^(?<op>[-+])\s*(?<content>[\s\S]+)$/,
+    }
+    //Repository handle pattern: "login/name", optionally followed by "@branch" and then ":ref"
+    this.parser = /^(?<login>[\s\S]+?)\/(?<name>[\s\S]+?)(?:@(?<branch>[\s\S]+?)(?::(?<ref>[\s\S]+))?)?$/
+    this.consumed = false
+
+    //Options
+    this.skipped = skipped
+    this.categories = categories
+    this.timeout = timeout
+
+    //Results
+    this.results = {partial: {global:false, repositories:false}, total: 0, lines: {}, stats: {}, colors: {}, commits: 0, files: 0, missed: {lines: 0, bytes: 0, commits: 0}, elapsed:0}
+    this.debug(`instantiated a new ${this.constructor.name}`)
+  }
+
+  /**
+   * Run analyzer
+   * Resolves with results once `runner` completes or once `timeout.global` is reached, whichever comes first
+   * On timeout `runner` is not interrupted: results collected so far are returned and flagged as partial
+   * An analyzer can only be run once, and errors raised by `runner` are propagated to the caller
+   */
+  async run(runner) {
+    if (this.consumed)
+      throw new Error("This analyzer has already been consumed, another instance needs to be created to perform a new analysis")
+    this.consumed = true
+    let timer = null
+    //Deferring the call ensures both synchronous and asynchronous runner errors end up as a rejection
+    const pending = [Promise.resolve().then(() => runner()).then(() => this.results)]
+    if (Number.isFinite(this.timeout.global)) {
+      this.debug(`timeout set to ${this.timeout.global}m`)
+      pending.push(new Promise(solve => {
+        timer = setTimeout(() => {
+          this.debug(`reached maximum execution time of ${this.timeout.global}m for analysis`)
+          this.results.partial.global = true
+          solve(this.results)
+        }, this.timeout.global * 60 * 1000)
+      }))
+    }
+    let results = null
+    try {
+      results = await Promise.race(pending)
+    }
+    finally {
+      //Clear timer so it does not keep the event loop alive once analysis has settled
+      clearTimeout(timer)
+    }
+    //Flatten partial flags into a single boolean
+    results.partial = (results.partial.global)||(results.partial.repositories)
+    return results
+  }
+
+  /**
+   * Parse repository
+   * Accepts either a repository object (`{owner:{login}, name}`) or a "login/name", "login/name@branch" or "login/name@branch:ref" string
+   * Returns repository handle, local path (within OS temporary directory), branch and ref (both `null` when unspecified)
+   */
+  parse(repository) {
+    let branch = null, ref = null
+    if (typeof repository === "string") {
+      const groups = repository.match(this.parser)?.groups
+      if (!groups)
+        throw new TypeError(`"${repository}" pattern is not supported`)
+      repository = {owner:{login:groups.login}, name:groups.name}
+      branch = groups.branch ?? null
+      ref = groups.ref ?? null
+    }
+    const repo = `${repository.owner.login}/${repository.name}`
+    const path = paths.join(os.tmpdir(), `${this.uid}-${repo.replace(/[^\w]/g, "_")}`)
+    return {repo, path, branch, ref}
+  }
+
+  /**
+   * Clone a repository
+   * Returns `true` on success, or `false` on failure (in which case the local path is cleaned)
+   */
+  async clone(repository) {
+    const {repo, branch, path} = this.parse(repository)
+    const url = /^https?:\/\//.test(repo) ? repo : `https://github.com/${repo}`
+    try {
+      this.debug(`cloning ${url} to ${path}`)
+      //Start from an empty directory
+      await fs.rm(path, {recursive: true, force: true})
+      await fs.mkdir(path, {recursive: true})
+      await git(path).clone(url, ".", ["--single-branch"]).status()
+      this.debug(`cloned ${url} to ${path}`)
+      if (branch) {
+        this.debug(`switching to branch ${branch} for ${repo}`)
+        //NOTE(review): confirm that simple-git branch() actually switches branch when given a bare name, checkout() may be what is intended here
+        await git(path).branch(branch)
+      }
+      return true
+    }
+    catch (error) {
+      this.debug(`failed to clone ${url} (${error})`)
+      //Wait for cleanup so that no partial clone is left behind once this method returns
+      await this.clean(path)
+      return false
+    }
+  }
+
+  /**
+   * Analyze a repository
+   * Process each commit through `this.linguist()` and aggregate outputs into `this.results`
+   * Processing stops early (and results are flagged as partial) once `timeout.repositories` minutes have elapsed
+   * Commits that could not be processed are skipped and counted in `results.missed.commits`
+   */
+  async analyze(path, {commits = []} = {}) {
+    const cache = {files:{}, languages:{}}
+    const start = Date.now()
+    let elapsed = 0, processed = 0
+    if (this.timeout.repositories)
+      this.debug(`timeout for repository analysis set to ${this.timeout.repositories}m`)
+    for (const commit of commits) {
+      elapsed = (Date.now() - start)/1000/60
+      if ((this.timeout.repositories)&&(elapsed > this.timeout.repositories)) {
+        this.results.partial.repositories = true
+        this.debug(`reached maximum execution time of ${this.timeout.repositories}m for repository analysis (${elapsed}m elapsed)`)
+        break
+      }
+      try {
+        const {total, files, missed, lines, stats} = await this.linguist(path, {commit, cache})
+        this.results.commits++
+        this.results.total += total
+        this.results.files += files
+        this.results.missed.lines += missed.lines
+        this.results.missed.bytes += missed.bytes
+        //Only languages from selected categories are kept
+        for (const language in lines) {
+          if (this.categories.includes(cache.languages[language]?.type))
+            this.results.lines[language] = (this.results.lines[language] ?? 0) + lines[language]
+        }
+        for (const language in stats) {
+          if (this.categories.includes(cache.languages[language]?.type))
+            this.results.stats[language] = (this.results.stats[language] ?? 0) + stats[language]
+        }
+      }
+      catch (error) {
+        this.debug(`skipping commit ${commit.sha} (${error})`)
+        this.results.missed.commits++
+      }
+      finally {
+        processed++
+        if ((processed%50 === 0)||(processed === commits.length))
+          this.debug(`at commit ${processed}/${commits.length} (${(100*processed/commits.length).toFixed(2)}%, ${elapsed.toFixed(2)}m elapsed)`)
+      }
+    }
+    //Add time spent once per call (adding the running total after each commit would count the same minutes multiple times)
+    this.results.elapsed += (Date.now() - start)/1000/60
+    //Merge colors so languages detected in previously analyzed repositories keep theirs
+    this.results.colors = {...this.results.colors, ...Object.fromEntries(Object.entries(cache.languages).map(([lang, {color}]) => [lang, color]))}
+  }
+
+  /**
+   * Clean a path
+   * Returns `true` when path was removed, or `false` on failure (errors are logged rather than thrown)
+   */
+  async clean(path) {
+    try {
+      this.debug(`cleaning ${path}`)
+      await fs.rm(path, {recursive: true, force: true})
+      this.debug(`cleaned ${path}`)
+      return true
+    }
+    catch (error) {
+      this.debug(`failed to clean (${error})`)
+      return false
+    }
+  }
+
+  /**
+   * Whether to skip a repository or not
+   * Returns `true` when repository is rejected by the `skipped` patterns filter
+   */
+  ignore(repository) {
+    const ignored = !filters.repo(repository, this.skipped)
+    if (ignored)
+      this.debug(`skipping ${typeof repository === "string" ? repository : `${repository?.owner?.login}/${repository?.name}`} as it matches skipped repositories`)
+    return ignored
+  }
+
+  /**
+   * Debug log
+   * Messages are prefixed with login and lower-cased class name (a space is inserted at its first camel case boundary)
+   */
+  debug(message) {
+    return console.debug(`metrics/compute/${this.login}/plugins > languages > ${this.constructor.name.replace(/([a-z])([A-Z])/, (_, a, b) => `${a} ${b.toLocaleLowerCase()}`).toLocaleLowerCase()} > ${message}`)
+  }
+
+}