Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@ jobs:

steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v4

- name: Install PHP
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php }}

- name: Cache PHP dependencies
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: vendor
key: ${{ runner.os }}-php-${{ matrix.php }}-composer-${{ hashFiles('**/composer.json') }}
Expand Down
55 changes: 40 additions & 15 deletions src/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,31 +31,56 @@ public function __construct(Extractor $extractor)
preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $contentType, $match);
if (!empty($match[1])) {
$encoding = trim($match[1], ',');
try {
$ret = mb_encoding_aliases($encoding ?? '');
if ($ret === false) {
$encoding = null;
}
} catch (\ValueError $exception) {
$encoding = null;
}
$encoding = $this->getValidEncoding($encoding);
}
if (is_null($encoding) && !empty($html)) {
preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $html, $match);
if (!empty($match[1])) {
$encoding = trim($match[1], ',');
$encoding = $this->getValidEncoding($encoding);
}
}
$this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument();
$this->initXPath();
}

/**
 * Return the given encoding name if mbstring recognises it, otherwise null.
 *
 * mb_encoding_aliases() is used purely as a validity probe; the alias list it
 * returns is not inspected.
 *
 * PHP version differences:
 * - PHP 7.4: mb_encoding_aliases() returns false for invalid encoding and throws Warning for empty string
 * - PHP 8.0+: mb_encoding_aliases() throws ValueError for invalid/empty encoding
 *
 * TODO: When dropping PHP 7.4 support, remove the PHP_VERSION_ID < 80000 branch.
 *
 * @see https://www.php.net/manual/en/function.mb-encoding-aliases.php
 *
 * @param string|null $encoding Candidate encoding name to validate
 *
 * @return string|null The unchanged name when it is valid; null when it is null, empty or unknown
 */
private function getValidEncoding(?string $encoding): ?string
{
    // A null/empty name is never valid. Rejecting it up front avoids the
    // PHP 7.4 Warning and the PHP 8 ValueError for empty input.
    if (empty($encoding)) {
        return null;
    }

    if (PHP_VERSION_ID < 80000) {
        // PHP 7.4: an unknown encoding is reported by a false return value.
        // TODO: Remove this entire branch when PHP 7.4 support is dropped
        return mb_encoding_aliases($encoding) === false ? null : $encoding;
    }

    try {
        // PHP 8.0+: an unknown encoding throws ValueError, so a normal return
        // already proves the name is valid. An empty alias list is deliberately
        // not treated as invalid, which keeps this branch consistent with the
        // PHP 7.4 branch (it only rejects a false return).
        mb_encoding_aliases($encoding);
    } catch (\ValueError $exception) {
        return null;
    }

    return $encoding;
}

private function initXPath()
Expand Down
2 changes: 1 addition & 1 deletion tests/DocumentTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public function testSelectors()
$extractor = self::getEmbed()->get('http://www.wired.com/?p=2064839');
$document = $extractor->getDocument();

$expected = 23;
$expected = 3;

$this->assertCount($expected, $document->select('.//p')->nodes());
$this->assertCount($expected, $document->selectCss('p')->nodes());
Expand Down
3,832 changes: 1,999 additions & 1,833 deletions tests/cache/www.wired.com..1202600986b37d2c6a30336f82c671f8.php

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
'feeds' => [],
'icon' => 'https://4pda.to/s/as6ywymaTWM6wnea1mxojxCz0Yet7IeumfOBnaxb.png',
'image' => 'https://i.4pda.ws/s/as6yueQrUwnKt0LgJ5m26uBjbZsccTet21FqwJkADfGw.jpg?v=1669981373',
'keywords' => ['состоялся релиз clown of duty — пародии на call of duty'],
'keywords' => [
'состоялся релиз clown of duty — пародии на call of duty'
],
'language' => 'ru-RU',
'languages' => [],
'license' => null,
Expand All @@ -23,5 +25,5 @@
'url' => 'https://4pda.to/2022/12/04/406834/sostoyalsya_reliz_clown_of_duty_parodii_na_call_of_duty/',
'linkedData' => [],
'oEmbed' => [],
'allLinkedData' => [],
'allLinkedData' => []
];
73 changes: 39 additions & 34 deletions tests/fixtures/animoto.com.play-gjsj1gu0wdrfr4pgw12xzq.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,55 @@
declare(strict_types = 1);

return [
'authorName' => null,
'authorUrl' => null,
'authorName' => '@animoto',
'authorUrl' => 'https://twitter.com/animoto',
'cms' => null,
'code' => [
'html' => '<iframe id="vp1GjsJ1" title="Video Player" width="640" height="360" frameborder="0" src="https://s3.amazonaws.com/embed.animoto.com/play.html?w=swf/production/vp1&e=1617549702&f=GjsJ1gu0WDRfr4pGw12xZQ&d=0&m=p&r=360p&i=m&asset_domain=s3-p.animoto.com&animoto_domain=animoto.com&options=start_hq" allowfullscreen></iframe>',
'width' => 640,
'height' => 360,
'ratio' => 56.25
'code' => null,
'description' => 'Create, edit, and share videos with our free video maker. Combine your photos, video clips, and music to make quality videos in minutes. Get started free!',
'favicon' => 'https://animoto.com/favicon-32x32.png?v=f7bad0df2a4af8688773dca5ee0b1ed6',
'feeds' => [
'https://animoto.com/rss.xml'
],
'description' => 'Animoto makes video creation easy! Animoto\'s video maker turns your photos and video clips into professional videos in minutes. Fast and shockingly simple!',
'favicon' => 'https://d14pr3cu5atb0x.cloudfront.net/images/icons/favicon-fbb19e53d0.ico',
'feeds' => [],
'icon' => 'https://d14pr3cu5atb0x.cloudfront.net/images/icons/touchicon-144-4a42d97241.png',
'image' => 'https://d2m23yiuv18ohn.cloudfront.net/Video/GjsJ1gu0WDRfr4pGw12xZQ/cover_648x360.jpg',
'icon' => 'https://animoto.com/icons/icon-48x48.png?v=f7bad0df2a4af8688773dca5ee0b1ed6',
'image' => null,
'keywords' => [],
'language' => null,
'languages' => [],
'license' => null,
'providerName' => 'Animoto',
'providerUrl' => 'https://animoto.com/',
'providerUrl' => 'https://animoto.com',
'publishedTime' => null,
'redirect' => null,
'title' => 'taco bell',
'title' => 'Free Video Maker | Create & Edit Your Videos Easily',
'url' => 'https://animoto.com/play/GjsJ1gu0WDRfr4pGw12xZQ',
'linkedData' => [],
'linkedData' => [
'@context' => 'http://schema.org',
'@type' => 'WebSite',
'url' => 'https://animoto.com',
'name' => 'Animoto video maker - Stand out on social media. Easily.',
'alternateName' => ''
],
'oEmbed' => [
'version' => 1.0,
'provider_name' => 'Animoto',
'provider_url' => 'https://animoto.com/',
'type' => 'video',
'author_name' => null,
'title' => 'taco bell',
'description' => '',
'thumbnail_url' => 'https://d2m23yiuv18ohn.cloudfront.net/Video/GjsJ1gu0WDRfr4pGw12xZQ/cover_648x360.jpg',
'thumbnail_height' => 360,
'thumbnail_width' => 648,
'icon_url' => 'https://d2m23yiuv18ohn.cloudfront.net/Video/GjsJ1gu0WDRfr4pGw12xZQ/cover_224x126.jpg',
'icon_height' => 54,
'icon_width' => 54,
'width' => 640,
'height' => 360,
'cache_age' => 604800,
'video_url' => 'https://d150hyw1dtprld.cloudfront.net/swf/w.swf?w=swf/production/vp1&e=1617549702&f=GjsJ1gu0WDRfr4pGw12xZQ&d=0&m=p&r=360p&i=m&asset_domain=s3-p.animoto.com&animoto_domain=animoto.com&options=start_hq',
'html' => '<iframe id="vp1GjsJ1" title="Video Player" width="640" height="360" frameborder="0" src="https://s3.amazonaws.com/embed.animoto.com/play.html?w=swf/production/vp1&e=1617549702&f=GjsJ1gu0WDRfr4pGw12xZQ&d=0&m=p&r=360p&i=m&asset_domain=s3-p.animoto.com&animoto_domain=animoto.com&options=start_hq" allowfullscreen></iframe>'
'error' => 'URL not supported or not found',
'url' => 'https://animoto.com/play/GjsJ1gu0WDRfr4pGw12xZQ'
],
'allLinkedData' => []
'allLinkedData' => [
[
'@context' => 'http://schema.org',
'@type' => 'WebSite',
'url' => 'https://animoto.com',
'name' => 'Animoto video maker - Stand out on social media. Easily.',
'alternateName' => ''
],
[
'@context' => 'http://schema.org',
'@type' => 'VideoObject',
'name' => 'Animoto: Free Online Video Maker',
'contentUrl' => 'https://d2of6bhnpl91ni.cloudfront.net/cms/animoto-free-online-video-maker-e8d6870030.mp4',
'description' => 'With Animoto, you\'ll have everything you need to create your own professional videos in minutes. No experience required. All it takes is an idea. ',
'thumbnailUrl' => '//images.ctfassets.net/00i767ygo3tc/010bacg5wwIhMMx6xYS3qj/18e39c5d16f5614a3b477a284faea1a2/free-online-video-maker.webp',
'transcript' => 'We all have a story to tell, and video is the best way to make yours stand out. With Animoto, you\'ll have everything you need to create your own professional videos in minutes. No experience required. All it takes is an idea. We\'ve made it easy to get started with customizable templates for everything from saying Happy Birthday to selling your product. From there, it\'s as simple as dragging and dropping your photos and video clips, choosing from our millions of Getty stock images or recording with our built-in screen and webcam recorder. Customize your videos with transitions, music, voiceovers, and more. Then bring your videos on brand with a single click. Make sure your story is heard with Animoto. Everything you need to create your own videos is right at your fingertips. Start creating for free.',
'uploadDate' => '2020-09-15T21:58:55.636Z'
]
]
];
15 changes: 8 additions & 7 deletions tests/fixtures/archive.org.details-dn2015-0220_vid.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@
],
'oEmbed' => [],
'api' => [
'server' => 'ia802600.us.archive.org',
'dir' => '/20/items/dn2015-0220_vid',
'server' => 'ia801600.us.archive.org',
'dir' => '/34/items/dn2015-0220_vid',
'metadata' => [
'identifier' => [
'dn2015-0220_vid'
Expand Down Expand Up @@ -786,11 +786,12 @@
'/dn2015-0220_vid_files.xml' => [
'source' => 'original',
'format' => 'Metadata',
'md5' => '745ea2f6dde93e4b70b1c0b238d4c0e2'
'md5' => 'c8085d21bd5d528af0697f7d1cfff599',
'summation' => 'md5'
],
'/dn2015-0220_vid_meta.xml' => [
'source' => 'original',
'mtime' => '1542757137',
'mtime' => '1675274129',
'size' => '1973',
'format' => 'Metadata',
'md5' => '6a144c80a58ab5f08c0ecffdb580954a',
Expand All @@ -799,12 +800,12 @@
]
],
'misc' => [
'image' => 'https://ia802600.us.archive.org/20/items/dn2015-0220_vid/dn2015-0220.gif',
'image' => 'https://ia801600.us.archive.org/34/items/dn2015-0220_vid/dn2015-0220.gif',
'collection-title' => 'Democracy Now!'
],
'item' => [
'downloads' => 132,
'month' => 2,
'downloads' => 156,
'month' => 0,
'item_size' => 3667677269,
'files_count' => 68,
'item_count' => null,
Expand Down
12 changes: 6 additions & 6 deletions tests/fixtures/codepen.io.zhouzi-pen-jorazp.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

return [
'authorName' => 'Gabin Aureche',
'authorUrl' => 'https://codepen.io/Zhouzi/',
'authorUrl' => 'https://codepen.io/Zhouzi',
'cms' => null,
'code' => [
'html' => '<iframe id="cp_embed_JoRazP" src="https://codepen.io/Zhouzi/embed/preview/JoRazP?height=300&amp;slug-hash=JoRazP&amp;default-tabs=js,result&amp;host=https://codepen.io" title="TheaterJS" scrolling="no" frameborder="0" height="300" allowtransparency="true" class="cp_embed_iframe" style="width: 100%; overflow: hidden;"></iframe>',
'html' => '<iframe id="cp_embed_JoRazP" src="https://codepen.io/Zhouzi/embed/preview/JoRazP?default-tabs=js%2Cresult&amp;height=300&amp;host=https%3A%2F%2Fcodepen.io&amp;slug-hash=JoRazP" title="TheaterJS" scrolling="no" frameborder="0" height="300" allowtransparency="true" class="cp_embed_iframe" style="width: 100%; overflow: hidden;"></iframe>',
'width' => 800,
'height' => 300,
'ratio' => 37.5
Expand All @@ -15,7 +15,7 @@
'favicon' => 'https://codepen.io/favicon.ico',
'feeds' => [],
'icon' => null,
'image' => 'https://assets.codepen.io/99102/internal/screenshots/pens/JoRazP.default.png?fit=cover&format=auto&ha=true&height=360&quality=75&v=2&version=1467971314&width=640',
'image' => 'https://shots.codepen.io/username/pen/JoRazP-512.jpg?version=1467971314',
'keywords' => [],
'language' => 'en-US',
'languages' => [],
Expand All @@ -35,13 +35,13 @@
'provider_url' => 'https://codepen.io',
'title' => 'TheaterJS',
'author_name' => 'Gabin Aureche',
'author_url' => 'https://codepen.io/Zhouzi/',
'author_url' => 'https://codepen.io/Zhouzi',
'height' => '300',
'width' => '800',
'thumbnail_width' => '384',
'thumbnail_height' => '225',
'thumbnail_url' => 'https://assets.codepen.io/99102/internal/screenshots/pens/JoRazP.default.png?fit=cover&amp;format=auto&amp;ha=true&amp;height=360&amp;quality=75&amp;v=2&amp;version=1467971314&amp;width=640',
'html' => '<iframe id="cp_embed_JoRazP" src="https://codepen.io/Zhouzi/embed/preview/JoRazP?height=300&amp;slug-hash=JoRazP&amp;default-tabs=js,result&amp;host=https://codepen.io" title="TheaterJS" scrolling="no" frameborder="0" height="300" allowtransparency="true" class="cp_embed_iframe" style="width: 100%; overflow: hidden;"></iframe>'
'thumbnail_url' => 'https://shots.codepen.io/username/pen/JoRazP-512.jpg?version=1467971314',
'html' => '<iframe id="cp_embed_JoRazP" src="https://codepen.io/Zhouzi/embed/preview/JoRazP?default-tabs=js%2Cresult&amp;height=300&amp;host=https%3A%2F%2Fcodepen.io&amp;slug-hash=JoRazP" title="TheaterJS" scrolling="no" frameborder="0" height="300" allowtransparency="true" class="cp_embed_iframe" style="width: 100%; overflow: hidden;"></iframe>'
],
'allLinkedData' => []
];
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@
'cms' => null,
'code' => null,
'description' => null,
'favicon' => 'https://ssl.gstatic.com/images/branding/product/1x/drive_2020q4_32dp.png',
'favicon' => 'https://drive.google.com/favicon.ico',
'feeds' => [],
'icon' => null,
'image' => null,
'keywords' => [],
'language' => null,
'languages' => [],
'license' => null,
'providerName' => 'Google Docs',
'providerName' => 'Google',
'providerUrl' => 'https://drive.google.com',
'publishedTime' => null,
'redirect' => null,
'title' => 'Entrevista_Rianxo_RadioFusion_150724.mp3',
'title' => null,
'url' => 'https://drive.google.com/file/d/0B2rwN8wAbVSWbmFJdUdnV2VSTTg/view',
'linkedData' => [],
'oEmbed' => [],
Expand Down
Loading