Skip to content

Commit 87b1342

Browse files
authored
fix(instagram-embed): use facebookexternalhit UA & don't cache empty shells (#806)
1 parent da57b6e commit 87b1342

3 files changed

Lines changed: 92 additions & 10 deletions

File tree

packages/script/src/runtime/server/instagram-embed.ts

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import { createError, defineEventHandler, getQuery, setHeader } from 'h3'
2-
import { useRuntimeConfig } from 'nitropack/runtime'
2+
import { defineCachedFunction, useRuntimeConfig } from 'nitropack/runtime'
3+
import { $fetch } from 'ofetch'
34
import { ELEMENT_NODE, parse, renderSync, TEXT_NODE, walkSync } from 'ultrahtml'
45
import { createCachedJsonFetch } from './utils/cached-upstream'
5-
import { proxyAssetUrl, rewriteUrl, rewriteUrlsInText, RSRC_RE, scopeCss } from './utils/instagram-embed'
6+
import { isEmbedShell, proxyAssetUrl, rewriteUrl, rewriteUrlsInText, RSRC_RE, scopeCss } from './utils/instagram-embed'
67
import { withSigning } from './utils/withSigning'
78

89
export { proxyAssetUrl, proxyImageUrl, rewriteUrl, rewriteUrlsInText, scopeCss } from './utils/instagram-embed'
@@ -12,10 +13,35 @@ const SRCSET_SPLIT_RE = /\s+/
1213

1314
// Instagram embed HTML is semi-fresh (likes, captions may update); 10min
1415
// matches the outbound Cache-Control header and dedupes per post+captions.
15-
const cachedEmbedFetch = createCachedJsonFetch<string>(
16-
'nuxt-scripts-instagram-embed',
17-
600,
18-
url => url,
16+
// Throws on shell responses so nitro doesn't cache them.
17+
const cachedEmbedFetch = defineCachedFunction(
  /**
   * Downloads the embed HTML for `url` using the supplied request headers.
   * Rejects with a 502 when the body is only Instagram's JS shell, so the
   * shell is surfaced as an error instead of being returned as a result.
   */
  async (url: string, headers: Record<string, string>): Promise<string> => {
    const body = await $fetch<string>(url, { headers, timeout: 10000 })
    if (!isEmbedShell(body))
      return body

    throw createError({
      statusCode: 502,
      statusMessage: 'Instagram returned an empty embed shell (post unavailable or upstream rate-limiting)',
    })
  },
  {
    // The `-v2` suffix abandons entries written under the previous name,
    // which could hold the empty JS shell from before shell detection and
    // the UA change were introduced.
    name: 'nuxt-scripts-instagram-embed-v2',
    maxAge: 600,
    swr: true,
    staleMaxAge: 600,
    // The response depends on the request headers (notably the UA), so the
    // key is the URL followed by every header as `name=value`, ordered by
    // header name to keep the key stable regardless of insertion order.
    getKey(url: string, headers: Record<string, string>) {
      const headerParts = Object.entries(headers)
        .sort(([left], [right]) => left.localeCompare(right))
        .map(([name, value]) => `${name}=${value}`)
      return [url, ...headerParts].join('|')
    },
  },
)
2046

2147
// Static CSS from Instagram's CDN is versioned; 24h cache is safe because the
@@ -79,10 +105,12 @@ export default withSigning(defineEventHandler(async (event) => {
79105
const embedUrl = `${cleanUrl}embed/${captions ? 'captioned/' : ''}`
80106

81107
const html = await cachedEmbedFetch(embedUrl, {
82-
headers: {
83-
'Accept': 'text/html',
84-
'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
85-
},
108+
'Accept': 'text/html',
109+
// Meta's own crawler UA. Googlebot's UA is also accepted by Instagram
110+
// but is IP-verified, so it fails from hosts outside Google's ranges
111+
// (e.g. Cloudflare/Vercel) and Instagram serves the JS shell instead
112+
// of the SSR'd post.
113+
'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
86114
}).catch((error: any) => {
87115
throw createError({
88116
statusCode: error.statusCode || 500,

packages/script/src/runtime/server/utils/instagram-embed.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,19 @@
11
import { buildProxyUrl } from './proxy-url'
22

33
export const RSRC_RE = /url\(\/rsrc\.php([^)]+)\)/g
4+
5+
// Instagram serves a JS-only shell (splash-screen + comet sentinel, no SSR'd
// post markup) when it can't or won't render server-side — e.g. for bot UAs
// it can't verify, or for removed/private posts.
const SHELL_BODY_RE = /id=["'](?:splash-screen|has-finished-comet-page)["']/
// Match Embed / EmbeddedMedia / EmbeddedMediaImage as whole tokens inside any
// class attribute (single- or double-quoted, multi-class lists). Tokens must
// be delimited by whitespace or the attribute quotes: a plain `\b` also fires
// next to `-`, so a class such as `Embed-placeholder` would otherwise be
// mistaken for real post markup.
const HAS_POST_CONTENT_RE = /\bclass=(["'])(?:[^"']*\s)?(?:Embed|EmbeddedMedia|EmbeddedMediaImage)(?:\s[^"']*)?\1/i

/**
 * Tells whether an Instagram embed response is the JS-only shell rather than
 * a server-rendered post.
 *
 * @param html - Raw HTML body returned by the embed endpoint.
 * @returns `true` only when a shell sentinel id is present AND no post
 * class token (`Embed`, `EmbeddedMedia`, `EmbeddedMediaImage`) is found;
 * HTML without any sentinel is never reported as a shell.
 */
export function isEmbedShell(html: string): boolean {
  return SHELL_BODY_RE.test(html) && !HAS_POST_CONTENT_RE.test(html)
}
16+
417
export const AMP_RE = /&amp;/g
518
export const SCONTENT_RE = /https:\/\/scontent[^"'\s),]+\.cdninstagram\.com[^"'\s),]+/g
619
export const STATIC_CDN_RE = /https:\/\/static\.cdninstagram\.com[^"'\s),]+/g

test/unit/instagram-embed.test.ts

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { ELEMENT_NODE, parse, renderSync, TEXT_NODE, walkSync } from 'ultrahtml'
22
import { describe, expect, it } from 'vitest'
33
import {
4+
isEmbedShell,
45
proxyImageUrl,
56
rewriteUrl,
67
rewriteUrlsInText,
@@ -302,3 +303,43 @@ html, body { margin: 0; padding: 0; }
302303
expect(result).toContain(`${scope} [data-x="a,b"] .Embed`)
303304
})
304305
})
306+
307+
// Covers isEmbedShell: shell detection requires a sentinel id AND the absence
// of any post class token (Embed / EmbeddedMedia / EmbeddedMediaImage).
describe('instagram-embed: isEmbedShell', () => {
  it('detects JS-only shell with splash-screen and no post markup', () => {
    const html = '<body><div id="splash-screen"></div><div id="has-finished-comet-page"></div></body>'
    expect(isEmbedShell(html)).toBe(true)
  })

  it('accepts real SSR\'d post even when shell sentinels appear elsewhere', () => {
    // Some Instagram responses include the comet sentinel alongside real content.
    const html = '<body><a class="EmbeddedMedia"><img class="EmbeddedMediaImage" /></a><div id="has-finished-comet-page"></div></body>'
    expect(isEmbedShell(html)).toBe(false)
  })

  it('accepts real post with Embed wrapper', () => {
    const html = '<div class="Embed" data-media-id="123"><div>content</div></div>'
    expect(isEmbedShell(html)).toBe(false)
  })

  it('returns false on unrelated HTML (no shell sentinels)', () => {
    expect(isEmbedShell('<html><body>nothing here</body></html>')).toBe(false)
  })

  it('detects post content inside multi-class lists', () => {
    // Real Instagram responses include splash-screen alongside the SSR'd post;
    // multi-class lists like `class="post EmbeddedMedia foo"` must count.
    const html = '<div id="splash-screen"></div><div class="post EmbeddedMedia foo"></div>'
    expect(isEmbedShell(html)).toBe(false)
  })

  it('detects post content with single-quoted class attribute', () => {
    const html = `<div id='splash-screen'></div><a class='EmbeddedMedia'></a>`
    expect(isEmbedShell(html)).toBe(false)
  })

  it('does not match Embed inside an unrelated class token (word boundary)', () => {
    // `Embed` embedded in a longer identifier must not count as post content,
    // whether the extra characters come before it (`NotAnEmbedThing`) or only
    // after it (`EmbedSomething`).
    const surrounded = '<div id="splash-screen"></div><div class="NotAnEmbedThing"></div>'
    expect(isEmbedShell(surrounded)).toBe(true)

    const suffixed = '<div id="splash-screen"></div><div class="EmbedSomething"></div>'
    expect(isEmbedShell(suffixed)).toBe(true)
  })
})

0 commit comments

Comments
 (0)