Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions .github/scripts/extract-changed-markdown-lines.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use strict;
use warnings;
use File::Basename qw(dirname);
use File::Path qw(make_path);

# Read a unified diff (e.g. `git diff --unified=0 ...`) on STDIN, collect the
# added ("+") lines per file, and write each file's added lines to
# OUT_ROOT/<path> when at least one added line looks like it may contain a
# link.  LIST_PATH receives one output-file path per line.

my ($out_root, $list_path) = @ARGV;
die "usage: $0 OUT_ROOT LIST_PATH\n" unless defined $out_root && defined $list_path;

my %added_lines_by_file;   # file path => array ref of added-line contents
my %has_link_candidate;    # file path => true if any added line may hold a link
my $file;                  # file whose hunks are currently being read

while (my $line = <STDIN>) {
    chomp $line;

    # A "+++ b/<path>" header introduces the post-image of the next file.
    if ($line =~ m{^\+\+\+ b/(.+)$}) {
        $file = $1;
        next;
    }

    # Any other "+++ " header (notably "+++ /dev/null" for a deleted file)
    # must clear the current file, otherwise subsequent "+" lines would be
    # wrongly attributed to the previous file.
    if ($line =~ m{^\+\+\+ }) {
        undef $file;
        next;
    }

    next unless defined $file;
    # Added content lines start with a single "+"; the lookahead excludes
    # the "+++" file header itself.
    next unless $line =~ /^\+(?!\+\+)(.*)$/;

    my $content = $1;
    push @{$added_lines_by_file{$file}}, $content;
    # Heuristic filter: only files whose added lines may contain a URL or
    # an href attribute are worth passing to the link checker.
    $has_link_candidate{$file} = 1 if $content =~ m{https?://}i || $content =~ /\bhref\s*=/i;
}

make_path($out_root);
open my $list_fh, ">", $list_path or die "cannot write $list_path: $!";

for my $file (sort keys %added_lines_by_file) {
    next unless $has_link_candidate{$file};
    # Refuse any path containing a ".." component so output cannot escape
    # OUT_ROOT.
    next if $file =~ m{(?:^|/)\.\.(?:/|$)};

    my $out_path = "$out_root/$file";
    make_path(dirname($out_path));
    open my $out_fh, ">", $out_path or die "cannot write $out_path: $!";
    for my $line (@{$added_lines_by_file{$file}}) {
        print {$out_fh} "$line\n";
    }
    # Check close on write handles: buffered write errors surface here.
    close $out_fh or die "cannot close $out_path: $!";
    print {$list_fh} "$out_path\n";
}

close $list_fh or die "cannot close $list_path: $!";
58 changes: 58 additions & 0 deletions .github/scripts/extract-site-hrefs.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
use strict;
use warnings;
use File::Basename qw(dirname);
use File::Path qw(make_path);

# Read a NUL-separated list of file paths on STDIN, scan each file for
# site-relative link targets -- both HTML `href="..."` attributes and
# standard Markdown inline links `[text](url)` -- normalize them against
# DOCS_SITE_BASE_URL, and write the resulting absolute URLs (one per line,
# wrapped in <>) to OUT_ROOT/<path>.  LIST_PATH receives one output-file
# path per line.

my ($out_root, $list_path) = @ARGV;
die "usage: $0 OUT_ROOT LIST_PATH\n" unless defined $out_root && defined $list_path;

my $site_base_url = $ENV{DOCS_SITE_BASE_URL};
die "DOCS_SITE_BASE_URL is not set\n" unless defined $site_base_url && $site_base_url ne "";
$site_base_url =~ s{/+\z}{};   # normalize: no trailing slash

make_path($out_root);
open my $list_fh, ">", $list_path or die "cannot write $list_path: $!";

{
    local $/ = "\0";    # input records are NUL-terminated paths
    while (my $file = <STDIN>) {
        chomp $file;
        # Refuse paths with ".." components so output cannot escape OUT_ROOT.
        next if $file =~ m{(?:^|/)\.\.(?:/|$)};
        next unless -f $file;

        open my $in_fh, "<", $file or die "cannot read $file: $!";
        my $content = do { local $/; <$in_fh> };
        close $in_fh;
        next unless defined $content;

        # Collect raw link targets.  Most links in the docs use Markdown
        # syntax, so scanning only href attributes would miss the majority
        # of relative links.
        my @targets;
        while ($content =~ /\bhref\s*=\s*(["'])(.*?)\1/gi) {
            push @targets, $2;
        }
        # Markdown inline link: [text](url), optionally <url>, optionally
        # followed by a title before the closing paren.
        while ($content =~ /\[[^\]]*\]\(\s*<?([^)<>\s]+)>?[^)]*\)/g) {
            push @targets, $1;
        }

        my %seen;
        for my $href (@targets) {
            $href =~ s/^\s+|\s+$//g;
            next if $href eq "";
            next if $href =~ m{^https?://}i;                  # already absolute
            next if $href =~ m{^(?:#|[a-z][a-z0-9+.-]*:)}i;   # fragment or other scheme

            my $url;
            if ($href =~ m{^//}) {
                $url = "https:$href";            # protocol-relative
            } elsif ($href =~ m{^/}) {
                $url = "$site_base_url$href";    # site-relative
            } else {
                next;                            # plain relative path: skip
            }
            $seen{$url} = 1;
        }

        next unless %seen;
        my $out_path = "$out_root/$file";
        make_path(dirname($out_path));
        open my $out_fh, ">", $out_path or die "cannot write $out_path: $!";
        for my $url (sort keys %seen) {
            print {$out_fh} "<$url>\n";
        }
        # Check close on write handles: buffered write errors surface here.
        close $out_fh or die "cannot close $out_path: $!";
        print {$list_fh} "$out_path\n";
    }
}
close $list_fh or die "cannot close $list_path: $!";
47 changes: 38 additions & 9 deletions .github/workflows/link-fail-fast.yaml
Original file line number Diff line number Diff line change
@@ -1,27 +1,56 @@
name: Links (Fail Fast)
name: ci / external-links-in-changed-lines (pull_request)

on:
pull_request:

env:
DOCS_SITE_BASE_URL: "https://docs.pingcap.com"

permissions:
contents: read

jobs:
linkChecker:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
with:
fetch-depth: 2

- name: 'Get a list of changed markdown files to process'
id: changed-files
- name: Collect changed markdown lines with links
id: changed-lines
run: |
CHANGED_FILES=$(git diff-tree --name-only --diff-filter 'AM' -r HEAD^1 HEAD -- "*.md" | sed -z "s/\n$//;s/\n/' '/g")
echo "all_changed_files=${CHANGED_FILES}" >> $GITHUB_OUTPUT
git -c core.quotePath=false diff --unified=0 --diff-filter=AM --no-ext-diff --no-color HEAD^1 HEAD -- '*.md' |
perl .github/scripts/extract-changed-markdown-lines.pl .lychee-pr-changed-lines .lychee-pr-inputs.txt

count=$(wc -l < .lychee-pr-inputs.txt | tr -d ' ')
echo "count=${count}" >> "$GITHUB_OUTPUT"

if [ "$count" -gt 0 ]; then
echo "has_inputs=true" >> "$GITHUB_OUTPUT"
sed 's/^/- /' .lychee-pr-inputs.txt
else
echo "has_inputs=false" >> "$GITHUB_OUTPUT"
fi

- name: Collect doc site href URLs
if: ${{ steps.changed-lines.outputs.has_inputs == 'true' }}
run: |
tr '\n' '\0' < .lychee-pr-inputs.txt |
perl .github/scripts/extract-site-hrefs.pl .lychee-site-hrefs .lychee-site-href-files.txt

count=$(wc -l < .lychee-site-href-files.txt | tr -d ' ')
if [ "$count" -gt 0 ]; then
cat .lychee-site-href-files.txt >> .lychee-pr-inputs.txt
sed 's/^/- /' .lychee-site-href-files.txt
fi

- name: Link Checker
if: ${{ steps.changed-files.outputs.all_changed_files }}
uses: lycheeverse/lychee-action@v2.3.0
if: ${{ steps.changed-lines.outputs.has_inputs == 'true' }}
uses: lycheeverse/lychee-action@v2
with:
fail: true
args: --root-dir $(pwd) -E -i -n -t 45 -- '${{ steps.changed-files.outputs.all_changed_files }}'
failIfEmpty: false
args: --root-dir $(pwd) --exclude '^file://' -E -i -n -t 45 --files-from .lychee-pr-inputs.txt
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
80 changes: 72 additions & 8 deletions .github/workflows/link.yaml
Original file line number Diff line number Diff line change
@@ -1,35 +1,99 @@
name: Links
name: Check external URLs in all files

on:
repository_dispatch:
workflow_dispatch:
schedule:
- cron: "0 0 * * 1"

env:
DOCS_SITE_BASE_URL: "https://docs.pingcap.com"

permissions:
contents: read
issues: write

jobs:
linkChecker:
if: github.repository == 'pingcap/docs'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6

- name: Download Exclude Path
run: |
curl https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore
curl -fsSL https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore

- name: Restore lychee cache
uses: actions/cache@v4
with:
path: .lycheecache
key: cache-lychee-${{ github.sha }}
restore-keys: cache-lychee-

- name: Check Links
uses: lycheeverse/lychee-action@v1.6.1
uses: lycheeverse/lychee-action@v2
with:
# Don't fail as we want the workflow to continue and run 'Create Issue From File'
fail: false
failIfEmpty: false
args: --root-dir $(pwd) --cache --max-cache-age 8d --cache-exclude-status '..200,300..' --exclude '^file://' -E -i -n -t 45 --exclude-path '^\./releases/' --exclude-path '^\./tidb-cloud/releases/' --exclude-path '^\./resources/' .
output: out-external.md
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}

- name: Collect doc site href URLs
id: site-hrefs
run: |
git ls-files -z -- \
'*.md' '*.mdx' '*.markdown' '*.mkd' '*.mdown' '*.mdwn' '*.mkdn' '*.mkdown' \
'*.html' '*.htm' '*.css' '*.txt' |
perl -0ne 'print unless m{^(?:releases|tidb-cloud/releases|resources)/}' |
perl .github/scripts/extract-site-hrefs.pl .lychee-site-hrefs .lychee-site-href-files.txt

count=$(wc -l < .lychee-site-href-files.txt | tr -d ' ')
echo "count=${count}" >> "$GITHUB_OUTPUT"

if [ "$count" -gt 0 ]; then
echo "has_hrefs=true" >> "$GITHUB_OUTPUT"
sed 's/^/- /' .lychee-site-href-files.txt
else
echo "has_hrefs=false" >> "$GITHUB_OUTPUT"
fi

- name: Check site href URLs
if: ${{ steps.site-hrefs.outputs.has_hrefs == 'true' }}
uses: lycheeverse/lychee-action@v2
with:
# Don't fail as we want the workflow to continue and run 'Create Issue From File'
# Excluding releases paths as historic releases may have outdated links.
fail: false
failIfEmpty: false
args: --root-dir $(pwd) --cache --max-cache-age 8d -E -i -n -t 45 --exclude-path '^./releases/' --exclude-path '^./tidb-cloud/releases/' --exclude-path '^./resources/' .
output: out.md
args: --cache --max-cache-age 8d --cache-exclude-status '..200,300..' -E -i -n -t 45 --files-from .lychee-site-href-files.txt
output: out-site-hrefs.md
env:
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}

- name: Combine Link Reports
run: |
{
echo "# External URL Check"
echo
if [ -f out-external.md ]; then
cat out-external.md
else
echo "*(external link check did not produce output)*"
fi

if [ -f out-site-hrefs.md ]; then
echo
echo "# Site href URL Check"
echo
cat out-site-hrefs.md
fi
} > out.md

- name: Create Issue From File
uses: peter-evans/create-issue-from-file@v4
uses: peter-evans/create-issue-from-file@v6
with:
title: Broken Link Detected
content-filepath: out.md
Expand Down
6 changes: 4 additions & 2 deletions .lycheeignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,16 @@ https://platform\.openai\.com/api-keys
https://openai\.com/.*
https://jwt\.io/
https://typeorm\.io/.*
https://dl\.acm\.org/doi/10\.1145/(1988842\.1988850|2588555\.2610507)
https://developer\.salesforce\.com/.*
https?://(www\.)?npmjs\.com/package/.*
https://dash\.cloudflare\.com/.*
https://centminmod\.com/mydumper\.html
https://docs\.pingcap\.com/tidb/v6\.6/system-variables#tidb_pessimistic_txn_aggressive_locking-new-in-v660
https://docs\.pingcap\.com/tidb/v7\.6/system-variables#tidb_ddl_version-new-in-v760
https://developers\.redhat\.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level
https://portal\.azure\.com/.*
https://.*github.*/%7B%7B%7B%20.tidb_operator_version%20%7D%7D%7D
https://.*github.*/%7B%7B%7B.tidb-operator-version%7D%7D%7D
https://console\.cloud\.google\.com/.*
https://portal\.azure\.com/.*
https://azuremarketplace\.microsoft\.com/.*
https://one\.newrelic\.com/.*
Expand Down
Loading