diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index dc1ff72add491..10d3007e5839b 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1072,10 +1072,20 @@ def : ProcessorModel<"pentium-m", GenericPostRAModel,
                       FeatureCMOV, FeatureInsertVZEROUPPER]>;
 
 foreach P = ["pentium4", "pentium4m"] in {
+//  def : ProcessorModel<P, GenericPostRAModel,
+//                       [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+//                        FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
+//                        FeatureCMOV, FeatureInsertVZEROUPPER]>;
+
+  // 'pentium4' is the default 32-bit CPU on Linux and Windows, so give it more
+  // modern tunings than the commented-out original above (no SlowUAMem16).
+  // FIXME: This wouldn't be needed if we supported -mtune.
   def : ProcessorModel<P, GenericPostRAModel,
-                       [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
+                       [FeatureX87, FeatureCMPXCHG8B,
                         FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
-                        FeatureCMOV, FeatureInsertVZEROUPPER]>;
+                        FeatureCMOV, FeatureInsertVZEROUPPER,
+                        FeatureSlow3OpsLEA, FeatureSlowDivide64,
+                        FeatureSlowIncDec, FeatureMacroFusion]>;
 }
 
 // Intel Quark.
diff --git a/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll b/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll
index 25e3691913c8c..380c18fbf5c5b 100644
--- a/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll
+++ b/llvm/test/CodeGen/X86/cfguard-x86-vectorcall.ll
@@ -16,19 +16,19 @@ entry:
   ; X32-LABEL: func_cf_vector_x86
   ; X32: 	     movl 12(%ebp), %eax
   ; X32: 	     movl 8(%ebp), %ecx
-  ; X32: 	     movsd 24(%eax), %xmm4         # xmm4 = mem[0],zero
-  ; X32: 	     movsd %xmm4, 24(%esp)
-  ; X32: 	     movsd 16(%eax), %xmm5         # xmm5 = mem[0],zero
-  ; X32: 	     movsd %xmm5, 16(%esp)
-  ; X32: 	     movsd (%eax), %xmm6           # xmm6 = mem[0],zero
-  ; X32: 	     movsd 8(%eax), %xmm7          # xmm7 = mem[0],zero
-  ; X32: 	     movsd %xmm7, 8(%esp)
-  ; X32: 	     movsd %xmm6, (%esp)
+  ; X32: 	     movups	(%eax), %xmm0
+  ; X32: 	     movups	16(%eax), %xmm1
+  ; X32: 	     movaps	%xmm0, (%esp)
+  ; X32: 	     movaps	%xmm1, 16(%esp)
+  ; X32: 	     movsd	(%esp), %xmm4
+  ; X32: 	     movsd	8(%esp), %xmm5
+  ; X32: 	     movsd	16(%esp), %xmm6
+  ; X32: 	     movsd	24(%esp), %xmm7
   ; X32: 	     calll *___guard_check_icall_fptr
-  ; X32: 	     movaps %xmm6, %xmm0
-  ; X32: 	     movaps %xmm7, %xmm1
-  ; X32: 	     movaps %xmm5, %xmm2
-  ; X32: 	     movaps %xmm4, %xmm3
+  ; X32: 	     movaps %xmm4, %xmm0
+  ; X32: 	     movaps %xmm5, %xmm1
+  ; X32: 	     movaps %xmm6, %xmm2
+  ; X32: 	     movaps %xmm7, %xmm3
   ; X32: 	     calll  *%ecx
 }
 attributes #0 = { "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" }
diff --git a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
index f2c7c2fa4a564..295fdfb5a2617 100644
--- a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -3,8 +3,6 @@
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3      2>&1 | FileCheck %s --check-prefix=SLOW
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m     2>&1 | FileCheck %s --check-prefix=SLOW
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m     2>&1 | FileCheck %s --check-prefix=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4      2>&1 | FileCheck %s --check-prefix=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m     2>&1 | FileCheck %s --check-prefix=SLOW
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah         2>&1 | FileCheck %s --check-prefix=SLOW
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott      2>&1 | FileCheck %s --check-prefix=SLOW
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona        2>&1 | FileCheck %s --check-prefix=SLOW
@@ -14,6 +12,10 @@
 
 ; Intel chips with fast unaligned memory accesses
 
+; pentium4/pentium4m are marked fast because pentium4 is the default 32-bit mode CPU in clang.
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4      2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m     2>&1 | FileCheck %s --check-prefix=FAST
+
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont     2>&1 | FileCheck %s --check-prefix=FAST
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem        2>&1 | FileCheck %s --check-prefix=FAST
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere       2>&1 | FileCheck %s --check-prefix=FAST
diff --git a/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll b/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll
index d42dcf0dd1742..23a1f56fdf409 100644
--- a/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll
+++ b/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll
@@ -40,7 +40,7 @@
 ; OBJ: SubSectionType: FrameData (0xF5)
 ; OBJ:    FrameData {
 ; OBJ:      RvaStart: 0x0
-; OBJ:      CodeSize: 0x34
+; OBJ:      CodeSize: 0x36
 ; OBJ:      PrologSize: 0x9
 ; OBJ:      FrameFunc [
 ; OBJ-NEXT:   $T0 .raSearch =
@@ -50,7 +50,7 @@
 ; OBJ:    }
 ; OBJ:    FrameData {
 ; OBJ:      RvaStart: 0x7
-; OBJ:      CodeSize: 0x2D
+; OBJ:      CodeSize: 0x2F
 ; OBJ:      PrologSize: 0x2
 ; OBJ:      FrameFunc [
 ; OBJ-NEXT:   $T0 .raSearch =
@@ -61,7 +61,7 @@
 ; OBJ:    }
 ; OBJ:    FrameData {
 ; OBJ:      RvaStart: 0x8
-; OBJ:      CodeSize: 0x2C
+; OBJ:      CodeSize: 0x2E
 ; OBJ:      PrologSize: 0x1
 ; OBJ:      FrameFunc [
 ; OBJ-NEXT:   $T0 .raSearch =
@@ -73,7 +73,7 @@
 ; OBJ:    }
 ; OBJ:    FrameData {
 ; OBJ:      RvaStart: 0x9
-; OBJ:      CodeSize: 0x2B
+; OBJ:      CodeSize: 0x2D
 ; OBJ:      PrologSize: 0x0
 ; OBJ:      FrameFunc [
 ; OBJ-NEXT:   $T0 .raSearch =
diff --git a/llvm/test/DebugInfo/COFF/types-array.ll b/llvm/test/DebugInfo/COFF/types-array.ll
index 2962f970aca14..19ddcf9ffe2c9 100644
--- a/llvm/test/DebugInfo/COFF/types-array.ll
+++ b/llvm/test/DebugInfo/COFF/types-array.ll
@@ -51,7 +51,7 @@
 ; CHECK:       PtrParent: 0x0
 ; CHECK:       PtrEnd: 0x0
 ; CHECK:       PtrNext: 0x0
-; CHECK:       CodeSize: 0x39
+; CHECK:       CodeSize: 0x2A
 ; CHECK:       DbgStart: 0x0
 ; CHECK:       DbgEnd: 0x0
 ; CHECK:       FunctionType: f (0x1002)
@@ -73,7 +73,7 @@
 ; CHECK:       LocalVariableAddrRange {
 ; CHECK:         OffsetStart: .text+0x6
 ; CHECK:         ISectStart: 0x0
-; CHECK:         Range: 0x33
+; CHECK:         Range: 0x24
 ; CHECK:       }
 ; CHECK:     }
 ; CHECK:     ProcEnd {