Skip to content

Commit eb8778c

Browse files
authored
Better twitter video extraction (#104)
1 parent 840ff6f commit eb8778c

File tree

6 files changed

+98
-39
lines changed

6 files changed

+98
-39
lines changed

packages/metascraper-video-provider/package.json

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"description": "Get video property from HTML markup",
44
"homepage": "https://metascraper.js.org",
55
"version": "3.12.1",
6-
"main": "index.js",
6+
"main": "src/index",
77
"author": {
88
"email": "ian@ianstormtaylor.com",
99
"name": "Ian Storm Taylor"
@@ -17,13 +17,12 @@
1717
},
1818
"dependencies": {
1919
"@metascraper/helpers": "^3.12.1",
20+
"got": "~9.0.0",
2021
"json-future": "~2.1.2",
2122
"lodash": "~4.17.10",
22-
"twdown": "~1.0.3",
2323
"youtube-dl": "~1.12.2"
2424
},
2525
"devDependencies": {
26-
"browserless": "latest",
2726
"mocha": "latest",
2827
"nyc": "latest",
2928
"puppeteer": "latest",
@@ -35,8 +34,7 @@
3534
"node": ">= 8"
3635
},
3736
"files": [
38-
"index.js",
39-
"scripts"
37+
"src"
4038
],
4139
"scripts": {
4240
"test": "NODE_PATH=.. TZ=UTC NODE_ENV=test nyc mocha test"
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
'use strict'
2+
3+
const {isTwitterUrl, getTwitterVideoInfo} = require('./twitter-video-info')
4+
const getVideoInfo = require('./video-info')
5+
const { chain } = require('lodash')
6+
7+
const getInfo = async url => {
8+
if (!isTwitterUrl(url)) return getVideoInfo(url)
9+
10+
const [videoInfo, twitterVideos] = await Promise.all([
11+
getVideoInfo(url),
12+
getTwitterVideoInfo(url)
13+
])
14+
15+
const formats = chain(videoInfo.formats)
16+
.reduce((acc, format, index) => {
17+
const { url } = twitterVideos[index]
18+
const item = {...format, url}
19+
return [...acc, item]
20+
}, [])
21+
.value()
22+
23+
return {...videoInfo, formats}
24+
}
25+
26+
// Local cache for successive calls
27+
let cachedVideoInfoUrl
28+
let cachedVideoInfo
29+
30+
module.exports = async url => {
31+
if (url === cachedVideoInfoUrl) return cachedVideoInfo
32+
cachedVideoInfoUrl = url
33+
34+
try {
35+
cachedVideoInfo = await getInfo(url)
36+
} catch (err) {
37+
cachedVideoInfo = {}
38+
}
39+
return cachedVideoInfo
40+
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
'use strict'
2+
3+
const { chain } = require('lodash')
4+
const { URL } = require('url')
5+
const got = require('got')
6+
7+
// twitter guest web token
8+
// https://github.com/soimort/you-get/blob/da8c982608c9308765e0960e08fc28cccb74b215/src/you_get/extractors/twitter.py#L72
9+
const TWITTER_BEARER_TOKEN = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
10+
11+
const TWITTER_HOSTNAMES = ['twitter.com', 'mobile.twitter.com']

/**
 * Tell whether `url` points at one of the known Twitter hostnames.
 *
 * @param {string} url - absolute URL to check
 * @returns {boolean} `true` when the hostname is listed in
 *   `TWITTER_HOSTNAMES`; `false` otherwise, including when `url` cannot be
 *   parsed as an absolute URL
 */
const isTwitterUrl = url => {
  let hostname
  try {
    hostname = new URL(url).hostname
  } catch (err) {
    // `new URL` throws on non-parsable input; such input is not a Twitter URL.
    return false
  }
  return TWITTER_HOSTNAMES.includes(hostname)
}
14+
15+
/**
 * Extract the tweet id from a tweet URL.
 *
 * The id is taken from the `/status/<id>` (or `/statuses/<id>`) part of the
 * path, so query strings, hashes, trailing slashes and suffixes such as
 * `/photo/1` do not leak into the result. When no such part exists, the last
 * non-empty path segment is returned.
 *
 * @param {string} url - tweet URL
 * @returns {string} the tweet id, or `''` when the path has no segments
 */
const getTweetId = url => {
  let pathname
  try {
    pathname = new URL(url).pathname
  } catch (err) {
    // Not an absolute URL: drop the query string / hash by hand.
    pathname = String(url).split(/[?#]/)[0]
  }

  const match = /\/status(?:es)?\/(\d+)/.exec(pathname)
  if (match) return match[1]

  const segments = pathname.split('/').filter(Boolean)
  return segments.length > 0 ? segments[segments.length - 1] : ''
}
16+
17+
/**
 * Activate a guest session against the Twitter API.
 *
 * Sends a POST to the guest activation endpoint, authorized with
 * `TWITTER_BEARER_TOKEN`, and reads `guest_token` from the JSON response body.
 *
 * @returns {Promise<*>} the `guest_token` field of the response body
 */
const getGuestToken = async () => {
  const requestOptions = {
    headers: { authorization: TWITTER_BEARER_TOKEN },
    json: true
  }
  const response = await got.post(
    'https://api.twitter.com/1.1/guest/activate.json',
    requestOptions
  )
  return response.body.guest_token
}
24+
25+
/**
 * Fetch the video variants attached to a tweet.
 *
 * Requests the conversation timeline for the tweet id derived from `url`,
 * authorized with the bearer token plus a freshly requested guest token, and
 * reads the `video_info.variants` of the tweet's first media entry.
 *
 * @param {string} url - tweet URL
 * @returns {Promise<Array>} the variants ordered by ascending `bitrate`
 */
const getTwitterVideoInfo = async url => {
  const tweetId = getTweetId(url)
  const apiUrl = `https://api.twitter.com/2/timeline/conversation/${tweetId}.json?tweet_mode=extended`

  const guestToken = await getGuestToken()
  const response = await got(apiUrl, {
    json: true,
    headers: {
      authorization: TWITTER_BEARER_TOKEN,
      'x-guest-token': guestToken
    }
  })

  const variantsPath = `globalObjects.tweets.${tweetId}.extended_entities.media.0.video_info.variants`
  return chain(response.body)
    .get(variantsPath)
    .orderBy('bitrate', 'asc')
    .value()
}
41+
42+
module.exports = { getTwitterVideoInfo, isTwitterUrl }
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
'use strict'
2+
3+
const youtubedl = require('youtube-dl')
4+
const { promisify } = require('util')
5+
6+
module.exports = promisify(youtubedl.getInfo)

packages/metascraper-video-provider/index.js renamed to packages/metascraper-video-provider/src/index.js

Lines changed: 6 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,9 @@
22

33
const { overEvery, isEmpty, eq, has, round, size, get, chain, find, isString } = require('lodash')
44
const { isUrl, titleize } = require('@metascraper/helpers')
5-
const youtubedl = require('youtube-dl')
6-
const { promisify } = require('util')
7-
const twdown = require('twdown')
8-
const { URL } = require('url')
95
const path = require('path')
106

11-
const getInfo = promisify(youtubedl.getInfo)
12-
13-
const TWITTER_HOSTNAMES = ['twitter.com', 'mobile.twitter.com']
14-
15-
const isTwitterUrl = url => TWITTER_HOSTNAMES.includes(new URL(url).hostname)
16-
17-
// Local cache for successive calls
18-
let cachedVideoInfoUrl
19-
let cachedVideoInfo
20-
21-
const getVideoInfo = async url => {
22-
if (url === cachedVideoInfoUrl) return cachedVideoInfo
23-
cachedVideoInfoUrl = url
24-
25-
try {
26-
cachedVideoInfo = await getInfo(url)
27-
} catch (err) {
28-
cachedVideoInfo = {}
29-
}
30-
return cachedVideoInfo
31-
}
7+
const getVideoInfo = require('./get-video-info')
328

339
/**
 * Tell whether a video format is an MP4.
 *
 * A format qualifies when its `ext` is `'mp4'`, or when the extension of its
 * `url` starts with `.mp4` (which also covers URLs carrying a query string).
 * A format with no string `url` simply does not match, rather than throwing
 * from `path.extname`.
 *
 * @param {object} video - a format entry
 * @returns {boolean}
 */
const isMp4 = video => {
  if (eq(get(video, 'ext'), 'mp4')) return true
  const url = get(video, 'url')
  return isString(url) && path.extname(url).startsWith('.mp4')
}
@@ -52,12 +28,10 @@ const getVideoUrl = (videos, filters = []) => {
5228
}
5329

5430
/**
55-
* Get a URL-like video source.
31+
* Get a URL-like video source
5632
*/
57-
const getVideoProvider = getBrowserless => async ({ url }) => {
58-
const formats = !isTwitterUrl(url)
59-
? (await getVideoInfo(url)).formats
60-
: await twdown({ url, browserless: await getBrowserless() })
33+
const getVideoProvider = async ({ url }) => {
34+
const { formats } = await getVideoInfo(url)
6135

6236
const videoUrl =
6337
getVideoUrl(formats, [isMp4, isHttps, hasAudio]) ||
@@ -97,9 +71,9 @@ const getVideoDate = async ({ url }) => {
9771
return timestamp && new Date(timestamp * 1000).toISOString()
9872
}
9973

100-
module.exports = ({ getBrowserless }) => {
74+
module.exports = () => {
10175
return {
102-
video: getVideoProvider(getBrowserless),
76+
video: getVideoProvider,
10377
author: getVideoAuthor,
10478
publisher: getVideoPublisher,
10579
title: getVideoTitle,

packages/metascraper-video-provider/test/index.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
'use strict'
22

33
const { isUrl } = require('@metascraper/helpers')
4-
const browserless = require('browserless')()
54
const { isString } = require('lodash')
65
const snapshot = require('snap-shot')
76
const { promisify } = require('util')
@@ -11,7 +10,7 @@ const should = require('should')
1110
const fs = require('fs')
1211

1312
const metascraper = require('metascraper').load([
14-
require('metascraper-video-provider')({ getBrowserless: () => browserless }),
13+
require('metascraper-video-provider')(),
1514
require('metascraper-author')(),
1615
require('metascraper-date')(),
1716
require('metascraper-description')(),

0 commit comments

Comments
 (0)