Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
swscale/x86/swscale: Remove obsolete and harmful MMX(EXT) functions
x64 always has MMX, MMXEXT, SSE and SSE2 and this means
that some functions for MMX, MMXEXT, SSE and 3dnow are always
overridden by other functions (unless one e.g. explicitly
disables SSE2). So given that the only systems that
benefit from these functions are truly ancient 32-bit x86s,
they are removed.

Moreover, some of the removed code was buggy/not bitexact
and led to failures involving the f32le and f32be versions of
gray, gbrp and gbrap on x86-32 when SSE2 was not disabled.
See e.g.
https://fate.ffmpeg.org/report.cgi?time=20220609221253&slot=x86_32-debian-kfreebsd-gcc-4.4-cpuflags-mmx

Notice that yuv2yuvX_mmx is not removed, because it is used
by SSE3 and AVX2 as fallback in case of unaligned data and
also for tail processing. I don't know why yuv2yuvX_mmxext
isn't being used for this; an earlier version [1] of
554c2bc used it, but
the version that was eventually applied does not.

[1]: https://ffmpeg.org/pipermail/ffmpeg-devel/2020-November/272124.html

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
  • Loading branch information
mkver committed Jun 11, 2022
1 parent c4befc3 commit c5513ad
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 133 deletions.
20 changes: 0 additions & 20 deletions libswscale/x86/input.asm
Expand Up @@ -342,11 +342,6 @@ RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
RGB24_FUNCS 0, 0
%endif

INIT_XMM sse2
RGB24_FUNCS 10, 12

Expand Down Expand Up @@ -535,11 +530,6 @@ RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
RGB32_FUNCS 0, 0
%endif

INIT_XMM sse2
RGB32_FUNCS 8, 12

Expand Down Expand Up @@ -735,16 +725,6 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%endif ; mmsize == 8/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN 0, yuyv
YUYV_TO_Y_FN 0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif

INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy
Expand Down
14 changes: 1 addition & 13 deletions libswscale/x86/output.asm
Expand Up @@ -312,11 +312,9 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
yuv2planeX_fn 9, 0, 5
yuv2planeX_fn 10, 0, 5
%endif

INIT_XMM sse2
Expand Down Expand Up @@ -451,16 +449,6 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn 8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn 9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
Expand Down
4 changes: 0 additions & 4 deletions libswscale/x86/scale.asm
Expand Up @@ -411,10 +411,6 @@ SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
Expand Down
83 changes: 9 additions & 74 deletions libswscale/x86/swscale.c
Expand Up @@ -54,14 +54,6 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;


//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "swscale_template.c"
#endif

// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
Expand Down Expand Up @@ -269,9 +261,6 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SCALE_FUNCS(X4, opt); \
SCALE_FUNCS(X8, opt)

#if ARCH_X86_32
SCALE_FUNCS_MMX(mmx);
#endif
SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);
Expand All @@ -288,9 +277,7 @@ void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
VSCALEX_FUNC(9, opt); \
VSCALEX_FUNC(10, opt)

#if ARCH_X86_32
VSCALEX_FUNCS(mmxext);
#endif
VSCALEX_FUNC(8, mmxext);
VSCALEX_FUNCS(sse2);
VSCALEX_FUNCS(sse4);
VSCALEX_FUNC(16, sse4);
Expand All @@ -305,9 +292,6 @@ void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int ds
VSCALE_FUNC(10, opt2); \
VSCALE_FUNC(16, opt1)

#if ARCH_X86_32
VSCALE_FUNCS(mmx, mmxext);
#endif
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
Expand Down Expand Up @@ -337,9 +321,6 @@ void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)

#if ARCH_X86_32
INPUT_FUNCS(mmx);
#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
Expand Down Expand Up @@ -470,19 +451,11 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();

#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags))
sws_init_swscale_mmx(c);
#endif
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
sws_init_swscale_mmxext(c);
#endif
if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
#if HAVE_MMX_EXTERNAL
if (EXTERNAL_MMX(cpu_flags))
c->yuv2planeX = yuv2yuvX_mmx;
#endif
#if HAVE_MMXEXT_EXTERNAL
if (EXTERNAL_MMXEXT(cpu_flags))
c->yuv2planeX = yuv2yuvX_mmxext;
Expand All @@ -496,6 +469,14 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
c->yuv2planeX = yuv2yuvX_avx2;
#endif
}
#if ARCH_X86_32 && !HAVE_ALIGNED_STACK
// The better yuv2planeX_8 functions need aligned stack on x86-32,
// so we use MMXEXT in this case if they are not available.
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (c->dstBpc == 8 && !c->use_mmx_vfilter)
c->yuv2planeX = ff_yuv2planeX_8_mmxext;
}
#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */

#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
Expand All @@ -519,12 +500,6 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
}
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
switch(c->dstBpc){ \
case 16: do_16_case; break; \
Expand All @@ -546,46 +521,6 @@ switch(c->dstBpc){ \
if (!c->chrSrcHSubSample) \
c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
break
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);

switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
c->lumToYV12 = ff_yuyvToY_mmx;
if (c->needAlpha)
c->alpToYV12 = ff_uyvyToY_mmx;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_mmx;
c->chrToYV12 = ff_yuyvToUV_mmx;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_mmx;
c->chrToYV12 = ff_uyvyToUV_mmx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_mmx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_mmx;
break;
case_rgb(rgb24, RGB24, mmx);
case_rgb(bgr24, BGR24, mmx);
case_rgb(bgra, BGRA, mmx);
case_rgb(rgba, RGBA, mmx);
case_rgb(abgr, ABGR, mmx);
case_rgb(argb, ARGB, mmx);
default:
break;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
}
#endif /* ARCH_X86_32 */
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
Expand Down
30 changes: 8 additions & 22 deletions libswscale/x86/swscale_template.c
Expand Up @@ -29,13 +29,8 @@
#undef PREFETCH


#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#define YSCALEYUV2PACKEDX_UV \
Expand Down Expand Up @@ -600,13 +595,8 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
"cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"

#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

#if HAVE_6REGS
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
Expand Down Expand Up @@ -1478,17 +1468,13 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}

if (c->srcBpc == 8 && c->dstBpc <= 14) {
// Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
c->hyscale_fast = ff_hyscale_fast_mmxext;
c->hcscale_fast = ff_hcscale_fast_mmxext;
} else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
c->hyscale_fast = NULL;
c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT
}
#endif /* COMPILE_TEMPLATE_MMXEXT */
// Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
c->hyscale_fast = ff_hyscale_fast_mmxext;
c->hcscale_fast = ff_hcscale_fast_mmxext;
} else {
c->hyscale_fast = NULL;
c->hcscale_fast = NULL;
}
}
}

0 comments on commit c5513ad

Please sign in to comment.