diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d4a44091..69ade427 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,7 +23,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install PHP uses: shivammathur/setup-php@v2 @@ -31,7 +31,7 @@ jobs: php-version: ${{ matrix.php }} - name: Cache PHP dependencies - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: vendor key: ${{ runner.os }}-php-${{ matrix.php }}-composer-${{ hashFiles('**/composer.json') }} diff --git a/src/Document.php b/src/Document.php index 716b218a..5ee56c5c 100644 --- a/src/Document.php +++ b/src/Document.php @@ -31,31 +31,56 @@ public function __construct(Extractor $extractor) preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $contentType, $match); if (!empty($match[1])) { $encoding = trim($match[1], ','); - try { - $ret = mb_encoding_aliases($encoding ?? ''); - if ($ret === false) { - $encoding = null; - } - } catch (\ValueError $exception) { - $encoding = null; - } + $encoding = $this->getValidEncoding($encoding); } if (is_null($encoding) && !empty($html)) { preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $html, $match); if (!empty($match[1])) { $encoding = trim($match[1], ','); + $encoding = $this->getValidEncoding($encoding); } + } + $this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument(); + $this->initXPath(); + } + + /** + * Get valid encoding name if it exists, otherwise return null + * + * Uses mb_encoding_aliases() to verify the encoding is valid. + * + * TODO: When dropping PHP 7.4 support, remove the PHP_VERSION_ID < 80000 branch. + * PHP version differences: + * - PHP 7.4: mb_encoding_aliases() returns false for invalid encoding and throws Warning for empty string + * - PHP 8.0+: mb_encoding_aliases() throws ValueError for invalid/empty encoding + * + * @see https://www.php.net/manual/en/function.mb-encoding-aliases.php + */ + private function getValidEncoding(?string $encoding): ?string + { + if (PHP_VERSION_ID < 80000) { + // PHP 7.4: Check return value (false = invalid encoding) + // Need to check empty() first to avoid Warning + // TODO: Remove this entire branch when PHP 7.4 support is dropped + if (empty($encoding)) { + return null; + } + $ret = mb_encoding_aliases($encoding); + if ($ret === false) { + return null; + } else { + return $encoding; + } + } else { + // PHP 8.0+: ValueError exception is thrown for invalid/empty encoding try { - $ret = mb_encoding_aliases($encoding ?? ''); - if ($ret === false) { - $encoding = null; - } + $aliases = mb_encoding_aliases($encoding ?? ''); + // Check if aliases array is not empty (valid encoding should have at least one alias) + return !empty($aliases) ? $encoding : null; } catch (\ValueError $exception) { - $encoding = null; + return null; } } - $this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument(); - $this->initXPath(); } private function initXPath() diff --git a/tests/DocumentTest.php b/tests/DocumentTest.php index 8978f5cc..aa516acd 100644 --- a/tests/DocumentTest.php +++ b/tests/DocumentTest.php @@ -28,7 +28,7 @@ public function testSelectors() $extractor = self::getEmbed()->get('http://www.wired.com/?p=2064839'); $document = $extractor->getDocument(); - $expected = 23; + $expected = 3; $this->assertCount($expected, $document->select('.//p')->nodes()); $this->assertCount($expected, $document->selectCss('p')->nodes()); diff --git a/tests/cache/www.wired.com..1202600986b37d2c6a30336f82c671f8.php b/tests/cache/www.wired.com..1202600986b37d2c6a30336f82c671f8.php index e2bda995..7feb7e9e 100644 --- a/tests/cache/www.wired.com..1202600986b37d2c6a30336f82c671f8.php +++ b/tests/cache/www.wired.com..1202600986b37d2c6a30336f82c671f8.php @@ -3,1154 +3,1935 @@ return [ 'headers' => [ - 'connection' => [ - 'close' - ], - 'content-length' => [ - '0' - ], 'server' => [ - 'Varnish' - ], - 'retry-after' => [ - '0' - ], - 'location' => [ - 'https://www.wired.com/?p=2064839' - ], - 'accept-ranges' => [ - 'bytes', - 'none' + 'CloudFront', + 'CloudFront' ], 'date' => [ - 'Mon, 10 Jun 2024 15:59:03 GMT', - 'Mon, 10 Jun 2024 15:59:04 GMT' + 'Sat, 04 Oct 2025 10:19:08 GMT', + 'Sat, 04 Oct 2025 10:07:16 GMT' ], - 'via' => [ - '1.1 varnish', - '1.1 varnish, 1.1 varnish' + 'content-type' => [ + 'text/html', + 'text/html; charset=utf-8' ], - 'set-cookie' => [ - 'CN_xid=40f10590-911c-4553-8a23-0517f2a85fe9; Expires=Sat, 07 Dec 2024 15:59:03 GMT; Domain=.wired.com; path=/; Secure; SameSite=None;', - 'CN_xid_refresh=40f10590-911c-4553-8a23-0517f2a85fe9; Expires=Tue, 10 Jun 2025 15:59:03 GMT; Domain=.wired.com; path=/; Secure; httponly; SameSite=None;', - 'xid1=1; Expires=Mon, 10 Jun 2024 15:59:18 GMT; Domain=.wired.com; path=/;', - 'CN_segments=co.w2424; Expires=Sat, 07 Dec 2024 15:59:03 GMT; Domain=.wired.com; path=/;', - 'CN_geo_country_code=ES; Expires=Sat, 07 Dec 2024 15:59:03 GMT; Path=/; Domain=wired.com; Samesite=None; Secure', - 'CN_xid=9934a602-d41d-4eb6-a86c-73151ac6715e; Expires=Sat, 07 Dec 2024 15:59:04 GMT; Domain=.wired.com; path=/; Secure; SameSite=None;', - 'CN_xid_refresh=9934a602-d41d-4eb6-a86c-73151ac6715e; Expires=Tue, 10 Jun 2025 15:59:04 GMT; Domain=.wired.com; path=/; Secure; httponly; SameSite=None;', - 'xid1=1; Expires=Mon, 10 Jun 2024 15:59:19 GMT; Domain=.wired.com; path=/;', - 'CN_segments=co.w2424; Expires=Sat, 07 Dec 2024 15:59:04 GMT; Domain=.wired.com; path=/;', - 'verso_bucket=565; Expires=Tue, 10 Jun 2025 15:59:04 GMT; path=/;', - 'CN_geo_country_code=ES; Expires=Sat, 07 Dec 2024 15:59:04 GMT; Path=/; Domain=wired.com; Samesite=None; Secure' + 'content-length' => [ + '167' ], - 'content-security-policy' => [ - 'default-src https: data: \'unsafe-inline\' \'unsafe-eval\'; child-src https: data: blob:; connect-src https: data: blob: wss://*.hotjar.com wss://*.conde.digital; font-src https: data:; img-src https: blob: data: android-webview-video-poster:; media-src blob: data: https:; object-src https:; script-src https: data: blob: \'unsafe-inline\' \'unsafe-eval\'; style-src https: \'unsafe-inline\'; block-all-mixed-content; upgrade-insecure-requests;', - 'default-src https: data: \'unsafe-inline\' \'unsafe-eval\'; child-src https: data: blob:; connect-src https: data: blob: wss://*.hotjar.com wss://*.conde.digital; font-src https: data:; img-src https: blob: data: android-webview-video-poster:; media-src blob: data: https:; object-src https:; script-src https: data: blob: \'unsafe-inline\' \'unsafe-eval\'; style-src https: \'unsafe-inline\'; block-all-mixed-content; upgrade-insecure-requests;' + 'connection' => [ + 'keep-alive' ], - 'x-served-by' => [ - 'cache-mad22074-MAD', - 'cache-iad-kjyo7100080-IAD, cache-mad2200147-MAD' + 'location' => [ + 'https://www.wired.com/?p=2064839' ], 'x-cache' => [ - 'HIT', - 'MISS, HIT' + 'Redirect from cloudfront', + 'Hit from cloudfront' ], - 'x-cache-hits' => [ - '0', - '0, 0' + 'via' => [ + '1.1 06dea94a9acccc89bf073f5b6e5408ea.cloudfront.net (CloudFront)', + '1.1 e8ccc8fdd24646b17e2edb99277c5024.cloudfront.net (CloudFront), 1.1 b3db53b8c0d360b6f708a44987d1b5ea.cloudfront.net (CloudFront)' ], - 'x-timer' => [ - 'S1718035144.955581,VS0,VE0', - 'S1718035144.034389,VS0,VE2' + 'x-amz-cf-pop' => [ + 'NRT57-P2', + 'NRT20-P5', + 'NRT57-P2' ], - 'vary' => [ - '', - 'accept-encoding, cn-experiments, X-UA-Device, high-ad-cadence, Verso' + 'alt-svc' => [ + 'h3=":443"; ma=86400', + 'h3=":443"; ma=86400' ], - 'apple-news-services-host' => [ - 'www.wired.com HTTP/2 200', - 'www.wired.com' + 'x-amz-cf-id' => [ + 'S2owJ59zeRbG9Biy4BXGPAWt0Fxtf6qHw7WPcfgEwERw3WGkUX7RdA== HTTP/2 200', + 'lInWg8AYlFb0iT7fqyMh8p3TRzd1LHfurW8i8Bbb3l0Zb7pJK4n29Q==' ], - 'content-type' => [ - 'text/html; charset=utf-8' + 'modified-at' => [ + '1759504312' ], 'cache-control' => [ - 'no-cache' + 'stale-while-revalidate=60, stale-if-error=86400, s-maxage=900' ], - 'fastly-debug-state' => [ - 'MISS-CLUSTER' + 'back-lae-origin-response-start' => [ + '1759572436271' ], - 'x-esi' => [ - 'on' + 'x-organization-slug' => [ + 'wired' ], - 'verso' => [ - 'true' - ], - 'age' => [ - '85' - ], - 'strict-transport-security' => [ - 'max-age=31536000; preload' + 'x-content-type-options' => [ + 'nosniff' ], 'x-ua-device' => [ 'desktop' ], - 'apple-news-services-request-url' => [ - '/?p=2064839' + 'content-encoding' => [ + 'gzip' ], - 'apple-news-services-parsed-url' => [ - '/?p=2064839' + 'vary' => [ + 'accept-encoding' ], - 'apple-news-services-handled' => [ - 'false' + 'age' => [ + '712' ], - 'content-encoding' => [ - 'gzip' + 'set-cookie' => [ + 'CN_geo_country_code=JP; Expires=Thu, 02 Apr 2026 10:19:08 GMT; path=/; Domain=.wired.com; Secure; SameSite=None;', + 'CN_segments=co.w2540; Expires=Thu, 02 Apr 2026 10:19:08 GMT; Domain=.wired.com; path=/;', + 'CN_xid=ef802d5d-833b-45f0-ad09-1aeff194ebc0; Expires=Thu, 02 Apr 2026 10:19:08 GMT; Domain=.wired.com; path=/; Secure; SameSite=None;', + 'CN_xid_refresh=ef802d5d-833b-45f0-ad09-1aeff194ebc0; Expires=Sun, 04 Oct 2026 10:19:08 GMT; Domain=.wired.com; path=/; Secure; httponly; SameSite=None;', + 'xid1=1; Expires=Sat, 04 Oct 2025 10:19:23 GMT; Domain=.wired.com; path=/;' ], 'Content-Location' => [ 'https://www.wired.com/?p=2064839' ], 'X-Request-Time' => [ - '0.167 ms' + '0.134 ms' ] ], 'statusCode' => 200, 'reasonPhrase' => 'OK', - 'body' => '