From 14f71c3d2fe38c90cbd11d2132f98280abd9d7eb Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Jan 2024 11:18:44 -0800 Subject: [PATCH 01/23] x86/cpu: Add model number for Intel Clearwater Forest processor commit 090e3bec01763e415bccae445f5bfe3d0c61b629 upstream. Server product based on the Atom Darkmont core. Intel-SIG: commit 090e3bec0176 x86/cpu: Add model number for Intel Clearwater Forest processor. BACKPORTING NEW CPU IFM Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240117191844.56180-1-tony.luck@intel.com [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- arch/x86/include/asm/intel-family.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 197316121f04e..b65e9c46b9221 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -162,6 +162,8 @@ #define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ #define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ +#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ + /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ From f18fbd387e7e2025cda0157faebe019b0bba6324 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 22 Mar 2024 09:17:25 -0700 Subject: [PATCH 02/23] x86/cpu: Add model number for another Intel Arrow Lake mobile processor commit 8a8a9c9047d1089598bdb010ec44d7f14b4f9203 upstream. This one is the regular laptop CPU. Intel-SIG: commit 8a8a9c9047d1 x86/cpu: Add model number for another Intel Arrow Lake mobile processor. BACKPORTING NEW CPU IFM Signed-off-by: Tony Luck Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240322161725.195614-1-tony.luck@intel.com [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- arch/x86/include/asm/intel-family.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index b65e9c46b9221..d0941f4c27249 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -127,6 +127,7 @@ #define INTEL_FAM6_ARROWLAKE_H 0xC5 #define INTEL_FAM6_ARROWLAKE 0xC6 +#define INTEL_FAM6_ARROWLAKE_U 0xB5 #define INTEL_FAM6_LUNARLAKE_M 0xBD From 8bc909c7d8d459355efbc83e1a6b597b53d12206 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 16 Apr 2024 14:19:03 -0700 Subject: [PATCH 03/23] x86/cpu/vfm: Add/initialize x86_vfm field to struct cpuinfo_x86 commit a9d0adce69075192961f3be466c4810a21b7bc9e upstream. Refactor struct cpuinfo_x86 so that the vendor, family, and model fields are overlaid in a union with a 32-bit field that combines all three (together with a one byte reserved field in the upper byte). This will make it easy, cheap, and reliable to check all three values at once. See https://lore.kernel.org/r/Zgr6kT8oULbnmEXx@agluck-desk3 for why the ordering is (low-to-high bits): (vendor, family, model) [ bp: Move comments over the line, add the backstory about the particular order of the fields. ] Intel-SIG: commit a9d0adce6907 x86/cpu/vfm: Add/initialize x86_vfm field to struct cpuinfo_x86. BACKPORTING NEW CPU IFM Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240416211941.9369-2-tony.luck@intel.com [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- arch/x86/include/asm/processor.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 67ad64efa9263..4a32f4b772884 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -81,9 +81,23 @@ extern u16 __read_mostly tlb_lld_1g[NR_INFO]; */ struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; + union { + /* + * The particular ordering (low-to-high) of (vendor, + * family, model) is done in case range of models, like + * it is usually done on AMD, need to be compared. + */ + struct { + __u8 x86_model; + /* CPU family */ + __u8 x86; + /* CPU vendor */ + __u8 x86_vendor; + __u8 x86_reserved; + }; + /* combined vendor, family, model */ + __u32 x86_vfm; + }; __u8 x86_stepping; #ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ From 51b40c6566b10a4e1687eb6a4eb0a1b7cb2c051a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 16 Apr 2024 14:19:05 -0700 Subject: [PATCH 04/23] x86/cpu/vfm: Update arch/x86/include/asm/intel-family.h commit f055b6260eb3ef20a6e310d1e555a5d5a0a28ca0 upstream. New CPU #defines encode vendor and family as well as model. Update the example usage comment in arch/x86/kernel/cpu/match.c Intel-SIG: commit f055b6260eb3 x86/cpu/vfm: Update arch/x86/include/asm/intel-family.h. BACKPORTING NEW CPU IFM Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240416211941.9369-4-tony.luck@intel.com [ Quanxian Wang: amend commit log ] Signed-off-by: Quanxian Wang --- arch/x86/include/asm/intel-family.h | 84 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/match.c | 3 +- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index d0941f4c27249..f81a851c46dca 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -40,137 +40,221 @@ * their own names :-( */ +#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model) + /* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ #define INTEL_FAM6_ANY X86_MODEL_ANY +/* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */ +#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) #define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_CORE_YONAH IFM(6, 0x0E) #define INTEL_FAM6_CORE2_MEROM 0x0F +#define INTEL_CORE2_MEROM IFM(6, 0x0F) #define INTEL_FAM6_CORE2_MEROM_L 0x16 +#define INTEL_CORE2_MEROM_L IFM(6, 0x16) #define INTEL_FAM6_CORE2_PENRYN 0x17 +#define INTEL_CORE2_PENRYN IFM(6, 0x17) #define INTEL_FAM6_CORE2_DUNNINGTON 0x1D +#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D) #define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_NEHALEM IFM(6, 0x1E) #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ +#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */ #define INTEL_FAM6_NEHALEM_EP 0x1A +#define INTEL_NEHALEM_EP IFM(6, 0x1A) #define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_NEHALEM_EX IFM(6, 0x2E) #define INTEL_FAM6_WESTMERE 0x25 +#define INTEL_WESTMERE IFM(6, 0x25) #define INTEL_FAM6_WESTMERE_EP 0x2C +#define INTEL_WESTMERE_EP IFM(6, 0x2C) #define INTEL_FAM6_WESTMERE_EX 0x2F +#define INTEL_WESTMERE_EX IFM(6, 0x2F) #define INTEL_FAM6_SANDYBRIDGE 0x2A +#define INTEL_SANDYBRIDGE IFM(6, 0x2A) #define INTEL_FAM6_SANDYBRIDGE_X 0x2D +#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D) #define INTEL_FAM6_IVYBRIDGE 0x3A +#define INTEL_IVYBRIDGE IFM(6, 0x3A) #define INTEL_FAM6_IVYBRIDGE_X 0x3E +#define INTEL_IVYBRIDGE_X IFM(6, 0x3E) #define INTEL_FAM6_HASWELL 0x3C +#define INTEL_HASWELL IFM(6, 0x3C) #define INTEL_FAM6_HASWELL_X 0x3F +#define INTEL_HASWELL_X IFM(6, 0x3F) #define INTEL_FAM6_HASWELL_L 0x45 +#define INTEL_HASWELL_L IFM(6, 0x45) #define INTEL_FAM6_HASWELL_G 0x46 +#define INTEL_HASWELL_G IFM(6, 0x46) #define INTEL_FAM6_BROADWELL 0x3D +#define INTEL_BROADWELL IFM(6, 0x3D) #define INTEL_FAM6_BROADWELL_G 0x47 +#define INTEL_BROADWELL_G IFM(6, 0x47) #define INTEL_FAM6_BROADWELL_X 0x4F +#define INTEL_BROADWELL_X IFM(6, 0x4F) #define INTEL_FAM6_BROADWELL_D 0x56 +#define INTEL_BROADWELL_D IFM(6, 0x56) #define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */ +#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */ #define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */ +#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */ #define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */ +#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */ /* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */ /* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */ #define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */ +#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */ /* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */ /* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */ /* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */ #define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */ +#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */ /* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */ #define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */ +#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */ #define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */ +#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */ #define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */ +#define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */ #define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */ +#define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */ #define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */ +#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */ #define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */ +#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */ #define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ +#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */ #define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ +#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */ #define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ +#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */ #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ +#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */ #define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ +#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */ #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ +#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ #define INTEL_FAM6_EMERALDRAPIDS_X 0xCF +#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) #define INTEL_FAM6_GRANITERAPIDS_X 0xAD +#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) #define INTEL_FAM6_GRANITERAPIDS_D 0xAE +#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) /* "Hybrid" Processors (P-Core/E-Core) */ #define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ +#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */ #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ +#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ +#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */ #define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ +#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */ #define INTEL_FAM6_RAPTORLAKE_P 0xBA +#define INTEL_RAPTORLAKE_P IFM(6, 0xBA) #define INTEL_FAM6_RAPTORLAKE_S 0xBF +#define INTEL_RAPTORLAKE_S IFM(6, 0xBF) #define INTEL_FAM6_METEORLAKE 0xAC +#define INTEL_METEORLAKE IFM(6, 0xAC) #define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_METEORLAKE_L IFM(6, 0xAA) #define INTEL_FAM6_ARROWLAKE_H 0xC5 +#define INTEL_ARROWLAKE_H IFM(6, 0xC5) #define INTEL_FAM6_ARROWLAKE 0xC6 +#define INTEL_ARROWLAKE IFM(6, 0xC6) #define INTEL_FAM6_ARROWLAKE_U 0xB5 +#define INTEL_ARROWLAKE_U IFM(6, 0xB5) #define INTEL_FAM6_LUNARLAKE_M 0xBD +#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* "Small Core" Processors (Atom/E-Core) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ +#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */ #define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ +#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */ #define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ +#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */ #define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ +#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */ #define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ +#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */ #define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ +#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ #define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */ +#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ #define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ +#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ +#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ #define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ +#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ #define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */ +#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ +#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ #define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */ +#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */ /* Note: the micro-architecture is "Goldmont Plus" */ #define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ +#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */ #define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ +#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */ #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ +#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */ #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ +#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */ #define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ +#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */ #define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ +#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */ #define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ +#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */ #define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ +#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */ /* Xeon Phi */ #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ +#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */ #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ +#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */ /* Family 5 */ #define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ +#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index ae71b8ef909c9..978a3094e8ff7 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -17,8 +17,7 @@ * * A typical table entry would be to match a specific CPU * - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, - * X86_FEATURE_ANY, NULL); + * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) From 17ef7ed9c74723143360523b1f27c1c21fead2d6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Fri, 14 Jun 2024 11:05:43 +0300 Subject: [PATCH 05/23] memblock: use numa_valid_node() helper to check for invalid node ID commit 8043832e2a123fd9372007a29192f2f3ba328cd6 upstream. Introduce numa_valid_node(nid) that verifies that nid is a valid node ID and use that instead of comparing nid parameter with either NUMA_NO_NODE or MAX_NUMNODES. This makes the checks for valid node IDs consistent and more robust and allows to get rid of multiple WARNings. Intel-SIG: commit 8043832e2a12 memblock: use numa_valid_node() helper to check for invalid node ID Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Suggested-by: Linus Torvalds Signed-off-by: Mike Rapoport (IBM) [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- include/linux/numa.h | 5 +++++ mm/memblock.c | 22 ++++++---------------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/include/linux/numa.h b/include/linux/numa.h index 1d43371fafd2f..eb19503604fe3 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -15,6 +15,11 @@ #define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) +static inline bool numa_valid_node(int nid) +{ + return nid >= 0 && nid < MAX_NUMNODES; +} + /* optionally keep NUMA memory info available post init */ #ifdef CONFIG_NUMA_KEEP_MEMINFO #define __initdata_or_meminfo diff --git a/mm/memblock.c b/mm/memblock.c index d630f5c2bdb90..9dbce558bc81c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1009,7 +1009,7 @@ static bool should_skip_region(struct memblock_type *type, return false; /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) + if (numa_valid_node(nid) && nid != m_nid) return true; /* skip hotpluggable memory regions if needed */ @@ -1066,10 +1066,6 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - for (; idx_a < type_a->cnt; idx_a++) { struct memblock_region *m = &type_a->regions[idx_a]; @@ -1163,9 +1159,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (*idx == (u64)ULLONG_MAX) { idx_a = type_a->cnt - 1; if (type_b != NULL) @@ -1251,7 +1244,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r_nid) + if (!numa_valid_node(nid) || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1396,9 +1389,6 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); @@ -1411,7 +1401,7 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, if (found && !memblock_reserve(found, size)) goto done; - if (nid != NUMA_NO_NODE && !exact_nid) { + if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); @@ -1931,7 +1921,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_NUMA - if (memblock_get_region_node(rgn) != MAX_NUMNODES) + if (numa_valid_node(memblock_get_region_node(rgn))) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif @@ -2120,7 +2110,7 @@ static void __init memmap_init_reserved_pages(void) start = region->base; end = start + region->size; - if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES) + if (!numa_valid_node(nid)) nid = early_pfn_to_nid(PFN_DOWN(start)); reserve_bootmem_region(start, end, nid); @@ -2209,7 +2199,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) seq_printf(m, "%4d: ", i); seq_printf(m, "%pa..%pa ", ®->base, &end); - if (nid != MAX_NUMNODES) + if (numa_valid_node(nid)) seq_printf(m, "%4d ", nid); else seq_printf(m, "%4c ", 'x'); From 5c05eae0a224e0b43845efb30055d31d0b1f1a84 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 24 Jun 2024 01:54:32 +0000 Subject: [PATCH 06/23] memblock tests: fix implicit declaration of function 'numa_valid_node' commit 9364a7e40d54e6858479f0a96e1a04aa1204be16 upstream. commit 8043832e2a12 ("memblock: use numa_valid_node() helper to check for invalid node ID") introduce a new helper numa_valid_node(), which is not defined in memblock tests. Let's add it in the corresponding header file. Intel-SIG: commit 9364a7e40d54 memblock tests: fix implicit declaration of function 'numa_valid_node' Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Wei Yang CC: Mike Rapoport (IBM) Link: https://lore.kernel.org/r/20240624015432.31134-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- tools/include/linux/numa.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h index 110b0e5d0fb00..c8b9369335e00 100644 --- a/tools/include/linux/numa.h +++ b/tools/include/linux/numa.h @@ -13,4 +13,9 @@ #define NUMA_NO_NODE (-1) +static inline bool numa_valid_node(int nid) +{ + return nid >= 0 && nid < MAX_NUMNODES; +} + #endif /* _LINUX_NUMA_H */ From e7f6efeb692296bedb1fd96bb1302202bc4e15bd Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 15 Oct 2024 15:22:35 +0800 Subject: [PATCH 07/23] EDAC/skx_common: Differentiate memory error sources commit 2397f795735219caa9c2fe61e7bcdd0652e670d3 upstream. The current skx_common determines whether the memory error source is the near memory of the 2LM system and then retrieves the decoded error results from the ADXL components (near-memory vs. far-memory) accordingly. However, some memory controllers may have limitations in correctly reporting the memory error source, leading to the retrieval of incorrect decoded parts from the ADXL. To address these limitations, instead of simply determining whether the memory error is from the near memory of the 2LM system, it is necessary to distinguish the memory error source details as follows: Memory error from the near memory of the 2LM system. Memory error from the far memory of the 2LM system. Memory error from the 1LM system. Not a memory error. This will enable the i10nm_edac driver to take appropriate actions for those memory controllers that have limitations in reporting the memory error source. Intel-SIG: commit 2397f7957352 EDAC/skx_common: Differentiate memory error sources Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Fixes: ba987eaaabf9 ("EDAC/i10nm: Add Intel Granite Rapids server support") Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Diego Garcia Rodriguez Link: https://lore.kernel.org/r/20241015072236.24543-2-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/skx_common.c | 34 ++++++++++++++++------------------ drivers/edac/skx_common.h | 7 +++++++ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 8d18099fd528c..42266120ef427 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -119,7 +119,7 @@ void skx_adxl_put(void) } EXPORT_SYMBOL_GPL(skx_adxl_put); -static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem) +static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) { struct skx_dev *d; int i, len = 0; @@ -136,7 +136,7 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me } res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; - if (error_in_1st_level_mem) { + if (err_src == ERR_SRC_2LM_NM) { res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1; res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ? @@ -620,31 +620,27 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, optype, skx_msg); } -static bool skx_error_in_1st_level_mem(const struct mce *m) +static enum error_source skx_error_source(const struct mce *m) { - u32 errcode; + u32 errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; - if (!skx_mem_cfg_2lm) - return false; - - errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; - - return errcode == MCACOD_EXT_MEM_ERR; -} + if (errcode != MCACOD_MEM_CTL_ERR && errcode != MCACOD_EXT_MEM_ERR) + return ERR_SRC_NOT_MEMORY; -static bool skx_error_in_mem(const struct mce *m) -{ - u32 errcode; + if (!skx_mem_cfg_2lm) + return ERR_SRC_1LM; - errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK; + if (errcode == MCACOD_EXT_MEM_ERR) + return ERR_SRC_2LM_NM; - return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR); + return ERR_SRC_2LM_FM; } int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void *data) { struct mce *mce = (struct mce *)data; + enum error_source err_src; struct decoded_addr res; struct mem_ctl_info *mci; char *type; @@ -652,8 +648,10 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, if (mce->kflags & MCE_HANDLED_CEC) return NOTIFY_DONE; + err_src = skx_error_source(mce); + /* Ignore unless this is memory related with an address */ - if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV)) + if (err_src == ERR_SRC_NOT_MEMORY || !(mce->status & MCI_STATUS_ADDRV)) return NOTIFY_DONE; memset(&res, 0, sizeof(res)); @@ -667,7 +665,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, /* Try driver decoder first */ if (!(driver_decode && driver_decode(&res))) { /* Then try firmware decoder (ACPI DSM methods) */ - if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce)))) + if (!(adxl_component_count && skx_adxl_decode(&res, err_src))) return NOTIFY_DONE; } diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 11faf1db4fa48..30a795d8b8d36 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -147,6 +147,13 @@ enum { INDEX_MAX }; +enum error_source { + ERR_SRC_1LM, + ERR_SRC_2LM_NM, + ERR_SRC_2LM_FM, + ERR_SRC_NOT_MEMORY, +}; + #define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL) #define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL) #define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM) From 295436ee0020be7891c8fa8c7d88ff3614491172 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 15 Oct 2024 15:22:36 +0800 Subject: [PATCH 08/23] EDAC/{skx_common,i10nm}: Fix incorrect far-memory error source indicator commit a36667037a0c0e36c59407f8ae636295390239a5 upstream. The Granite Rapids CPUs with Flat2LM memory configurations may mistakenly report near-memory errors as far-memory errors, resulting in the invalid decoded ADXL results: EDAC skx: Bad imc -1 Fix this incorrect far-memory error source indicator by prefetching the decoded far-memory controller ID, and adjust the error source indicator to near-memory if the far-memory controller ID is invalid. Intel-SIG: commit a36667037a0c EDAC/{skx_common,i10nm}: Fix incorrect far-memory error source indicator Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Fixes: ba987eaaabf9 ("EDAC/i10nm: Add Intel Granite Rapids server support") Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Diego Garcia Rodriguez Link: https://lore.kernel.org/r/20241015072236.24543-3-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 1 + drivers/edac/skx_common.c | 23 +++++++++++++++++++++++ drivers/edac/skx_common.h | 1 + 3 files changed, 25 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 2b83d6de9352b..535f058b48eef 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -1088,6 +1088,7 @@ static int __init i10nm_init(void) return -ENODEV; cfg = (struct res_config *)id->driver_data; + skx_set_res_cfg(cfg); res_cfg = cfg; rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 42266120ef427..0b8aaf5f77d9f 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -47,6 +47,7 @@ static skx_show_retry_log_f skx_show_retry_rd_err_log; static u64 skx_tolm, skx_tohm; static LIST_HEAD(dev_edac_list); static bool skx_mem_cfg_2lm; +static struct res_config *skx_res_cfg; int skx_adxl_get(void) { @@ -135,6 +136,22 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) return false; } + /* + * GNR with a Flat2LM memory configuration may mistakenly classify + * a near-memory error(DDR5) as a far-memory error(CXL), resulting + * in the incorrect selection of decoded ADXL components. + * To address this, prefetch the decoded far-memory controller ID + * and adjust the error source to near-memory if the far-memory + * controller ID is invalid. + */ + if (skx_res_cfg && skx_res_cfg->type == GNR && err_src == ERR_SRC_2LM_FM) { + res->imc = (int)adxl_values[component_indices[INDEX_MEMCTRL]]; + if (res->imc == -1) { + err_src = ERR_SRC_2LM_NM; + edac_dbg(0, "Adjust the error source to near-memory.\n"); + } + } + res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]]; if (err_src == ERR_SRC_2LM_NM) { res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ? @@ -191,6 +208,12 @@ void skx_set_mem_cfg(bool mem_cfg_2lm) } EXPORT_SYMBOL_GPL(skx_set_mem_cfg); +void skx_set_res_cfg(struct res_config *cfg) +{ + skx_res_cfg = cfg; +} +EXPORT_SYMBOL_GPL(skx_set_res_cfg); + void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) { driver_decode = decode; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 30a795d8b8d36..e7f18ada16681 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -242,6 +242,7 @@ int skx_adxl_get(void); void skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); void skx_set_mem_cfg(bool mem_cfg_2lm); +void skx_set_res_cfg(struct res_config *cfg); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); From 3b84aa6e62e814df093ed7bfe7c1e19c7c154dfc Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 29 Jan 2024 14:20:40 +0800 Subject: [PATCH 09/23] EDAC/i10nm: Add Intel Grand Ridge micro-server support commit e77086c3750834553cf6fd2255c5f3ee04843ed8 upstream. The Grand Ridge CPU model uses similar memory controller registers with Granite Rapids server. Add Grand Ridge CPU model ID for EDAC support. Intel-SIG: commit e77086c37508 EDAC/i10nm: Add Intel Grand Ridge micro-server support Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Tested-by: Ricardo Neri Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20240129062040.60809-3-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 535f058b48eef..a3043b6ea79fb 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -951,6 +951,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); From e1951ae7142060ef258256566d3704ddf74bd4f3 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 14 Feb 2025 08:27:28 +0800 Subject: [PATCH 10/23] EDAC/{skx_common,i10nm}: Fix some missing error reports on Emerald Rapids commit d9207cf7760f5f5599e9ff7eb0fedf56821a1d59 upstream. When doing error injection to some memory DIMMs on certain Intel Emerald Rapids servers, the i10nm_edac missed error reports for some memory DIMMs. Certain BIOS configurations may hide some memory controllers, and the i10nm_edac doesn't enumerate these hidden memory controllers. However, the ADXL decodes memory errors using memory controller physical indices even if there are hidden memory controllers. Therefore, the memory controller physical indices reported by the ADXL may mismatch the logical indices enumerated by the i10nm_edac, resulting in missed error reports for some memory DIMMs. Fix this issue by creating a mapping table from memory controller physical indices (used by the ADXL) to logical indices (used by the i10nm_edac) and using it to convert the physical indices to the logical indices during the error handling process. Intel-SIG: commit d9207cf7760f EDAC/{skx_common,i10nm}: Fix some missing error reports on Emerald Rapids Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Fixes: c545f5e41225 ("EDAC/i10nm: Skip the absent memory controllers") Reported-by: Kevin Chang Tested-by: Kevin Chang Reported-by: Thomas Chen Tested-by: Thomas Chen Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250214002728.6287-1-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 2 ++ drivers/edac/skx_common.c | 33 +++++++++++++++++++++++++++++++++ drivers/edac/skx_common.h | 11 +++++++++++ 3 files changed, 46 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index a3043b6ea79fb..2c928069cf092 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -755,6 +755,8 @@ static int i10nm_get_ddr_munits(void) continue; } else { d->imc[lmc].mdev = mdev; + if (res_cfg->type == SPR) + skx_set_mc_mapping(d, i, lmc); lmc++; } } diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 0b8aaf5f77d9f..d47f0055217e4 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -120,6 +120,35 @@ void skx_adxl_put(void) } EXPORT_SYMBOL_GPL(skx_adxl_put); +static void skx_init_mc_mapping(struct skx_dev *d) +{ + /* + * By default, the BIOS presents all memory controllers within each + * socket to the EDAC driver. The physical indices are the same as + * the logical indices of the memory controllers enumerated by the + * EDAC driver. + */ + for (int i = 0; i < NUM_IMC; i++) + d->mc_mapping[i] = i; +} + +void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) +{ + edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n", + pmc, lmc); + + d->mc_mapping[pmc] = lmc; +} +EXPORT_SYMBOL_GPL(skx_set_mc_mapping); + +static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc) +{ + edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", + pmc, d->mc_mapping[pmc]); + + return d->mc_mapping[pmc]; +} + static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) { struct skx_dev *d; @@ -187,6 +216,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) return false; } + res->imc = skx_get_mc_mapping(d, res->imc); + for (i = 0; i < adxl_component_count; i++) { if (adxl_values[i] == ~0x0ull) continue; @@ -307,6 +338,8 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) d->bus[0], d->bus[1], d->bus[2], d->bus[3]); list_add_tail(&d->list, &dev_edac_list); prev = pdev; + + skx_init_mc_mapping(d); } if (list) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index e7f18ada16681..5acfef8fd3d36 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -94,6 +94,16 @@ struct skx_dev { struct pci_dev *uracu; /* for i10nm CPU */ struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; + /* + * Some server BIOS may hide certain memory controllers, and the + * EDAC driver skips those hidden memory controllers. However, the + * ADXL still decodes memory error address using physical memory + * controller indices. The mapping table is used to convert the + * physical indices (reported by ADXL) to the logical indices + * (used the EDAC driver) of present memory controllers during the + * error handling process. + */ + u8 mc_mapping[NUM_IMC]; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ @@ -243,6 +253,7 @@ void skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); void skx_set_mem_cfg(bool mem_cfg_2lm); void skx_set_res_cfg(struct res_config *cfg); +void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_node_id(struct skx_dev *d, u8 *id); From eff4058aebd2090fd378dbdc73143d569f4cfdd6 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 17 Apr 2025 23:07:19 +0800 Subject: [PATCH 11/23] EDAC/{skx_common,i10nm}: Fix the loss of saved RRL for HBM pseudo channel 0 commit eeed3e03f4261e5e381a72ae099ff00ccafbb437 upstream. When enabling the retry_rd_err_log (RRL) feature during the loading of the i10nm_edac driver with the module parameter retry_rd_err_log=2 (Linux RRL control mode), the default values of the control bits of RRL are saved so that they can be restored during the unloading of the driver. In the current code, the RRL of pseudo channel 1 of HBM overwrites pseudo channel 0 during the loading of the driver, resulting in the loss of saved RRL for pseudo channel 0. This causes the RRL of pseudo channel 0 of HBM to be wrongly restored with the values from pseudo channel 1 when unloading the driver. Fix this issue by creating two separate groups of RRL control registers per channel to save default RRL settings of two {sub-,pseudo-}channels. Intel-SIG: commit eeed3e03f426 EDAC/{skx_common,i10nm}: Fix the loss of saved RRL for HBM pseudo channel 0 Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Fixes: acd4cf68fefe ("EDAC/i10nm: Retrieve and print retry_rd_err_log registers for HBM") Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Feng Xu Link: https://lore.kernel.org/r/20250417150724.1170168-3-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 35 +++++++++++++++++++---------------- drivers/edac/skx_common.h | 11 ++++++++--- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 2c928069cf092..11f800cb22900 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -99,7 +99,7 @@ static u32 offsets_demand2_spr[] = {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}; static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}; -static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable, +static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable, u32 *rrl_ctl, u32 *offsets_scrub, u32 *offsets_demand, u32 *offsets_demand2) { @@ -112,10 +112,10 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable if (enable) { /* Save default configurations */ - imc->chan[chan].retry_rd_err_log_s = s; - imc->chan[chan].retry_rd_err_log_d = d; + rrl_ctl[0] = s; + rrl_ctl[1] = d; if (offsets_demand2) - imc->chan[chan].retry_rd_err_log_d2 = d2; + rrl_ctl[2] = d2; s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; s |= RETRY_RD_ERR_LOG_EN; @@ -129,25 +129,25 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable } } else { /* Restore default configurations */ - if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC) + if (rrl_ctl[0] & RETRY_RD_ERR_LOG_UC) s |= RETRY_RD_ERR_LOG_UC; - if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER) + if (rrl_ctl[0] & RETRY_RD_ERR_LOG_NOOVER) s |= RETRY_RD_ERR_LOG_NOOVER; - if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN)) + if (!(rrl_ctl[0] & RETRY_RD_ERR_LOG_EN)) s &= ~RETRY_RD_ERR_LOG_EN; - if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC) + if (rrl_ctl[1] & RETRY_RD_ERR_LOG_UC) d |= RETRY_RD_ERR_LOG_UC; - if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER) + if (rrl_ctl[1] & RETRY_RD_ERR_LOG_NOOVER) d |= RETRY_RD_ERR_LOG_NOOVER; - if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN)) + if (!(rrl_ctl[1] & RETRY_RD_ERR_LOG_EN)) d &= ~RETRY_RD_ERR_LOG_EN; if (offsets_demand2) { - if (imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_UC) + if (rrl_ctl[2] & RETRY_RD_ERR_LOG_UC) d2 |= RETRY_RD_ERR_LOG_UC; - if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_NOOVER)) + if (!(rrl_ctl[2] & RETRY_RD_ERR_LOG_NOOVER)) d2 &= ~RETRY_RD_ERR_LOG_NOOVER; - if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_EN)) + if (!(rrl_ctl[2] & RETRY_RD_ERR_LOG_EN)) d2 &= ~RETRY_RD_ERR_LOG_EN; } } @@ -161,6 +161,7 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable static void enable_retry_rd_err_log(bool enable) { int i, j, imc_num, chan_num; + struct skx_channel *chan; struct skx_imc *imc; struct skx_dev *d; @@ -175,8 +176,9 @@ static void enable_retry_rd_err_log(bool enable) if (!imc->mbase) continue; + chan = d->imc[i].chan; for (j = 0; j < chan_num; j++) - __enable_retry_rd_err_log(imc, j, enable, + __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[0], res_cfg->offsets_scrub, res_cfg->offsets_demand, res_cfg->offsets_demand2); @@ -190,12 +192,13 @@ static void enable_retry_rd_err_log(bool enable) if (!imc->mbase || !imc->hbm_mc) continue; + chan = d->imc[i].chan; for (j = 0; j < chan_num; j++) { - __enable_retry_rd_err_log(imc, j, enable, + __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[0], res_cfg->offsets_scrub_hbm0, res_cfg->offsets_demand_hbm0, NULL); - __enable_retry_rd_err_log(imc, j, enable, + __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[1], res_cfg->offsets_scrub_hbm1, res_cfg->offsets_demand_hbm1, NULL); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 5acfef8fd3d36..2ea4d1d1fbef2 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -80,6 +80,9 @@ */ #define MCACOD_EXT_MEM_ERR 0x280 +/* Max RRL register sets per {,sub-,pseudo-}channel. */ +#define NUM_RRL_SET 3 + /* * Each cpu socket contains some pci devices that provide global * information, and also some that are local to each of the two @@ -118,9 +121,11 @@ struct skx_dev { struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; - u32 retry_rd_err_log_s; - u32 retry_rd_err_log_d; - u32 retry_rd_err_log_d2; + /* + * Two groups of RRL control registers per channel to save default RRL + * settings of two {sub-,pseudo-}channels in Linux RRL control mode. + */ + u32 rrl_ctl[2][NUM_RRL_SET]; struct skx_dimm { u8 close_pg; u8 bank_xor_enable; From a304d0daf28c6213449ca81e2a3dfb676eea4ec4 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 29 Aug 2024 13:51:01 +0800 Subject: [PATCH 12/23] EDAC/{skx_common,skx,i10nm}: Move the common debug code to skx_common commit 8b935823530d6fd0bea0e90a747f3ebd8cfefb0d upstream. Commit afdb82fd763c ("EDAC, i10nm: make skx_common.o a separate module") made skx_common.o a separate module. With skx_common.o now a separate module, move the common debug code setup_{skx,i10nm}_debug() and teardown_{skx,i10nm}_debug() in {skx,i10nm}_base.c to skx_common.c to reduce code duplication. Additionally, prefix these function names with 'skx' to maintain consistency with other names in the file. Intel-SIG: commit 8b935823530d EDAC/{skx_common,skx,i10nm}: Move the common debug code to skx_common Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20240829055101.56245-1-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 52 ++------------------------------------- drivers/edac/skx_base.c | 52 ++------------------------------------- drivers/edac/skx_common.c | 47 +++++++++++++++++++++++++++++++++++ drivers/edac/skx_common.h | 8 ++++++ 4 files changed, 59 insertions(+), 100 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 11f800cb22900..b0ea71a4e30ce 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -1018,54 +1018,6 @@ static struct notifier_block i10nm_mce_dec = { .priority = MCE_PRIO_EDAC, }; -#ifdef CONFIG_EDAC_DEBUG -/* - * Debug feature. - * Exercise the address decode logic by writing an address to - * /sys/kernel/debug/edac/i10nm_test/addr. - */ -static struct dentry *i10nm_test; - -static int debugfs_u64_set(void *data, u64 val) -{ - struct mce m; - - pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val); - - memset(&m, 0, sizeof(m)); - /* ADDRV + MemRd + Unknown channel */ - m.status = MCI_STATUS_ADDRV + 0x90; - /* One corrected error */ - m.status |= BIT_ULL(MCI_STATUS_CEC_SHIFT); - m.addr = val; - skx_mce_check_error(NULL, 0, &m); - - return 0; -} -DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); - -static void setup_i10nm_debug(void) -{ - i10nm_test = edac_debugfs_create_dir("i10nm_test"); - if (!i10nm_test) - return; - - if (!edac_debugfs_create_file("addr", 0200, i10nm_test, - NULL, &fops_u64_wo)) { - debugfs_remove(i10nm_test); - i10nm_test = NULL; - } -} - -static void teardown_i10nm_debug(void) -{ - debugfs_remove_recursive(i10nm_test); -} -#else -static inline void setup_i10nm_debug(void) {} -static inline void teardown_i10nm_debug(void) {} -#endif /*CONFIG_EDAC_DEBUG*/ - static int __init i10nm_init(void) { u8 mc = 0, src_id = 0, node_id = 0; @@ -1165,7 +1117,7 @@ static int __init i10nm_init(void) opstate_init(); mce_register_decode_chain(&i10nm_mce_dec); - setup_i10nm_debug(); + skx_setup_debug("i10nm_test"); if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { skx_set_decode(i10nm_mc_decode, show_retry_rd_err_log); @@ -1193,7 +1145,7 @@ static void __exit i10nm_exit(void) enable_retry_rd_err_log(false); } - teardown_i10nm_debug(); + skx_teardown_debug(); mce_unregister_decode_chain(&i10nm_mce_dec); skx_adxl_put(); skx_remove(); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 0a862336a7ce8..51b2cc9472efa 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -587,54 +587,6 @@ static struct notifier_block skx_mce_dec = { .priority = MCE_PRIO_EDAC, }; -#ifdef CONFIG_EDAC_DEBUG -/* - * Debug feature. - * Exercise the address decode logic by writing an address to - * /sys/kernel/debug/edac/skx_test/addr. - */ -static struct dentry *skx_test; - -static int debugfs_u64_set(void *data, u64 val) -{ - struct mce m; - - pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val); - - memset(&m, 0, sizeof(m)); - /* ADDRV + MemRd + Unknown channel */ - m.status = MCI_STATUS_ADDRV + 0x90; - /* One corrected error */ - m.status |= BIT_ULL(MCI_STATUS_CEC_SHIFT); - m.addr = val; - skx_mce_check_error(NULL, 0, &m); - - return 0; -} -DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); - -static void setup_skx_debug(void) -{ - skx_test = edac_debugfs_create_dir("skx_test"); - if (!skx_test) - return; - - if (!edac_debugfs_create_file("addr", 0200, skx_test, - NULL, &fops_u64_wo)) { - debugfs_remove(skx_test); - skx_test = NULL; - } -} - -static void teardown_skx_debug(void) -{ - debugfs_remove_recursive(skx_test); -} -#else -static inline void setup_skx_debug(void) {} -static inline void teardown_skx_debug(void) {} -#endif /*CONFIG_EDAC_DEBUG*/ - /* * skx_init: * make sure we are running on the correct cpu model @@ -728,7 +680,7 @@ static int __init skx_init(void) /* Ensure that the OPSTATE is set correctly for POLL or NMI */ opstate_init(); - setup_skx_debug(); + skx_setup_debug("skx_test"); mce_register_decode_chain(&skx_mce_dec); @@ -742,7 +694,7 @@ static void __exit skx_exit(void) { edac_dbg(2, "\n"); mce_unregister_decode_chain(&skx_mce_dec); - teardown_skx_debug(); + skx_teardown_debug(); if (nvdimm_count) skx_adxl_put(); skx_remove(); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index d47f0055217e4..ec0d58ffbfc97 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -793,6 +793,53 @@ void skx_remove(void) } EXPORT_SYMBOL_GPL(skx_remove); +#ifdef CONFIG_EDAC_DEBUG +/* + * Debug feature. + * Exercise the address decode logic by writing an address to + * /sys/kernel/debug/edac/{skx,i10nm}_test/addr. + */ +static struct dentry *skx_test; + +static int debugfs_u64_set(void *data, u64 val) +{ + struct mce m; + + pr_warn_once("Fake error to 0x%llx injected via debugfs\n", val); + + memset(&m, 0, sizeof(m)); + /* ADDRV + MemRd + Unknown channel */ + m.status = MCI_STATUS_ADDRV + 0x90; + /* One corrected error */ + m.status |= BIT_ULL(MCI_STATUS_CEC_SHIFT); + m.addr = val; + skx_mce_check_error(NULL, 0, &m); + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); + +void skx_setup_debug(const char *name) +{ + skx_test = edac_debugfs_create_dir(name); + if (!skx_test) + return; + + if (!edac_debugfs_create_file("addr", 0200, skx_test, + NULL, &fops_u64_wo)) { + debugfs_remove(skx_test); + skx_test = NULL; + } +} +EXPORT_SYMBOL_GPL(skx_setup_debug); + +void skx_teardown_debug(void) +{ + debugfs_remove_recursive(skx_test); +} +EXPORT_SYMBOL_GPL(skx_teardown_debug); +#endif /*CONFIG_EDAC_DEBUG*/ + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Tony Luck"); MODULE_DESCRIPTION("MC Driver for Intel server processors"); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 2ea4d1d1fbef2..b89fc9bd10ec6 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -284,4 +284,12 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, void skx_remove(void); +#ifdef CONFIG_EDAC_DEBUG +void skx_setup_debug(const char *name); +void skx_teardown_debug(void); +#else +static inline void skx_setup_debug(const char *name) {} +static inline void skx_teardown_debug(void) {} +#endif + #endif /* _SKX_COMM_EDAC_H */ From 8c290036edf4faa2d442fed8d087bc85d7af5053 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 29 Aug 2024 14:13:09 +0800 Subject: [PATCH 13/23] EDAC/{skx_common,i10nm}: Remove the AMAP register for determing DDR5 commit 7a33c144c28ebb27c59963007992fed15f3953b3 upstream. The configuration flag 'res_config->support_ddr5 = true' sufficiently indicates DDR5 memory support for Sapphire Rapids and Granite Rapids. Additionally, the i10nm_edac driver doesn't need to use the AMAP register for setting the 'fine_grain_bank' of each DIMM. Therefore, remove the AMAP register for determining DDR5. Intel-SIG: commit 7a33c144c28e EDAC/{skx_common,i10nm}: Remove the AMAP register for determing DDR5 Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20240829061309.57738-1-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 9 ++------- drivers/edac/skx_common.c | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index b0ea71a4e30ce..7e3cef1936472 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -47,10 +47,6 @@ readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : \ (res_cfg->type == GNR ? 0xaf8 : 0x20ef8)) + \ (i) * (m)->chan_mmio_sz) -#define I10NM_GET_AMAP(m, i) \ - readl((m)->mbase + ((m)->hbm_mc ? 0x814 : \ - (res_cfg->type == GNR ? 0xc14 : 0x20814)) + \ - (i) * (m)->chan_mmio_sz) #define I10NM_GET_REG32(m, i, offset) \ readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset)) #define I10NM_GET_REG64(m, i, offset) \ @@ -976,7 +972,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, { struct skx_pvt *pvt = mci->pvt_info; struct skx_imc *imc = pvt->imc; - u32 mtr, amap, mcddrtcfg = 0; + u32 mtr, mcddrtcfg = 0; struct dimm_info *dimm; int i, j, ndimms; @@ -985,7 +981,6 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, continue; ndimms = 0; - amap = I10NM_GET_AMAP(imc, i); if (res_cfg->type != GNR) mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i); @@ -997,7 +992,7 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, mtr, mcddrtcfg, imc->mc, i, j); if (IS_DIMM_PRESENT(mtr)) - ndimms += skx_get_dimm_info(mtr, 0, amap, dimm, + ndimms += skx_get_dimm_info(mtr, 0, 0, dimm, imc, i, j, cfg); else if (IS_NVDIMM_PRESENT(mcddrtcfg, j)) ndimms += skx_get_nvdimm_info(dimm, imc, i, j, diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index ec0d58ffbfc97..85ec3196664d3 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -419,7 +419,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, if (imc->hbm_mc) { banks = 32; mtype = MEM_HBM2; - } else if (cfg->support_ddr5 && (amap & 0x8)) { + } else if (cfg->support_ddr5) { banks = 32; mtype = MEM_DDR5; } else { From a6bb496e16a6cd4d053edc7669315fcb948a6d73 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Tue, 3 Dec 2024 10:20:38 +0800 Subject: [PATCH 14/23] EDAC/i10nm: Add Intel Clearwater Forest server support commit 2e55bb9b71e179c37d05deff37daa0dd8d04b59d upstream. Clearwater Forest is the successor to Sierra Forest. Add Clearwater Forest CPU model ID for EDAC support. Intel-SIG: commit 2e55bb9b71e1 EDAC/i10nm: Add Intel Clearwater Forest server support Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Yi Lai Link: https://lore.kernel.org/r/20241203022038.72873-1-qiuxu.zhuo@intel.com [ Zhang Rui: resolve conflict (use old X86 Macro) and amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7e3cef1936472..8b337cba8c74d 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -953,6 +953,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_DARKMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); From c88fba6fcfd8feea76c77f878ea1c0e3ae13a6f2 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Thu, 12 Dec 2024 19:25:49 -0600 Subject: [PATCH 15/23] EDAC/{i10nm,skx,skx_common}: Support UV systems commit 584e09743d2f44905290b0dbf3215064d2a1888c upstream. The 3-bit source IDs in PCI configuration space registers, used to map devices to sockets, are limited to 8 unique IDs, and each ID is local to a UPI/QPI domain. Source IDs cannot be used to map devices to sockets on UV systems because they can exceed 8 sockets and have multiple UPI/QPI domains with identical, repeating source IDs. Use NUMA information to get package IDs instead of source IDs on UV systems, and use package/source IDs to name IMC information structures. Intel-SIG: commit 584e09743d2f EDAC/{i10nm,skx,skx_common}: Support UV systems Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Kyle Meyer Signed-off-by: Tony Luck Tested-by: Qiuxu Zhuo Reviewed-by: Qiuxu Zhuo Link: https://lore.kernel.org/all/20241213012549.43099-1-kyle.meyer@hpe.com/ [ Zhang Rui: resolve conflict (use topology_physical_package_id) and amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 11 +++------ drivers/edac/skx_base.c | 9 +++----- drivers/edac/skx_common.c | 47 +++++++++++++++++++++++++++------------ drivers/edac/skx_common.h | 3 +-- 4 files changed, 40 insertions(+), 30 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 8b337cba8c74d..7629444587a09 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -1016,7 +1016,7 @@ static struct notifier_block i10nm_mce_dec = { static int __init i10nm_init(void) { - u8 mc = 0, src_id = 0, node_id = 0; + u8 mc = 0, src_id = 0; const struct x86_cpu_id *id; struct res_config *cfg; const char *owner; @@ -1076,19 +1076,14 @@ static int __init i10nm_init(void) if (rc < 0) goto fail; - rc = skx_get_node_id(d, &node_id); - if (rc < 0) - goto fail; - - edac_dbg(2, "src_id = %d node_id = %d\n", src_id, node_id); + edac_dbg(2, "src_id = %d\n", src_id); for (i = 0; i < imc_num; i++) { if (!d->imc[i].mdev) continue; d->imc[i].mc = mc++; d->imc[i].lmc = i; - d->imc[i].src_id = src_id; - d->imc[i].node_id = node_id; + d->imc[i].src_id = src_id; if (d->imc[i].hbm_mc) { d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz; d->imc[i].num_channels = cfg->hbm_chan_num; diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 51b2cc9472efa..67634d3af4617 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -600,7 +600,7 @@ static int __init skx_init(void) const struct munit *m; const char *owner; int rc = 0, i, off[3] = {0xd0, 0xd4, 0xd8}; - u8 mc = 0, src_id, node_id; + u8 mc = 0, src_id; struct skx_dev *d; edac_dbg(2, "\n"); @@ -650,15 +650,12 @@ static int __init skx_init(void) rc = skx_get_src_id(d, 0xf0, &src_id); if (rc < 0) goto fail; - rc = skx_get_node_id(d, &node_id); - if (rc < 0) - goto fail; - edac_dbg(2, "src_id=%d node_id=%d\n", src_id, node_id); + + edac_dbg(2, "src_id = %d\n", src_id); for (i = 0; i < SKX_NUM_IMC; i++) { d->imc[i].mc = mc++; d->imc[i].lmc = i; d->imc[i].src_id = src_id; - d->imc[i].node_id = node_id; rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, skx_get_dimm_config, cfg); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 85ec3196664d3..9e4719d75aec4 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "edac_module.h" #include "skx_common.h" @@ -252,33 +253,51 @@ void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log) } EXPORT_SYMBOL_GPL(skx_set_decode); -int skx_get_src_id(struct skx_dev *d, int off, u8 *id) +static int skx_get_pkg_id(struct skx_dev *d, u8 *id) { - u32 reg; + int node; + int cpu; - if (pci_read_config_dword(d->util_all, off, ®)) { - skx_printk(KERN_ERR, "Failed to read src id\n"); - return -ENODEV; + node = pcibus_to_node(d->util_all->bus); + if (numa_valid_node(node)) { + for_each_cpu(cpu, cpumask_of_pcibus(d->util_all->bus)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) { + *id = topology_physical_package_id(cpu); + return 0; + } + } } - *id = GET_BITFIELD(reg, 12, 14); - return 0; + skx_printk(KERN_ERR, "Failed to get package ID from NUMA information\n"); + return -ENODEV; } -EXPORT_SYMBOL_GPL(skx_get_src_id); -int skx_get_node_id(struct skx_dev *d, u8 *id) +int skx_get_src_id(struct skx_dev *d, int off, u8 *id) { u32 reg; - if (pci_read_config_dword(d->util_all, 0xf4, ®)) { - skx_printk(KERN_ERR, "Failed to read node id\n"); + /* + * The 3-bit source IDs in PCI configuration space registers are limited + * to 8 unique IDs, and each ID is local to a UPI/QPI domain. + * + * Source IDs cannot be used to map devices to sockets on UV systems + * because they can exceed 8 sockets and have multiple UPI/QPI domains + * with identical, repeating source IDs. + */ + if (is_uv_system()) + return skx_get_pkg_id(d, id); + + if (pci_read_config_dword(d->util_all, off, ®)) { + skx_printk(KERN_ERR, "Failed to read src id\n"); return -ENODEV; } - *id = GET_BITFIELD(reg, 0, 2); + *id = GET_BITFIELD(reg, 12, 14); return 0; } -EXPORT_SYMBOL_GPL(skx_get_node_id); +EXPORT_SYMBOL_GPL(skx_get_src_id); static int get_width(u32 mtr) { @@ -540,7 +559,7 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, pvt->imc = imc; mci->ctl_name = kasprintf(GFP_KERNEL, "%s#%d IMC#%d", ctl_name, - imc->node_id, imc->lmc); + imc->src_id, imc->lmc); if (!mci->ctl_name) { rc = -ENOMEM; goto fail0; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index b89fc9bd10ec6..bd2d9eaeb4483 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -117,7 +117,7 @@ struct skx_dev { bool hbm_mc; u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ - u8 src_id, node_id; + u8 src_id; struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; @@ -261,7 +261,6 @@ void skx_set_res_cfg(struct res_config *cfg); void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); -int skx_get_node_id(struct skx_dev *d, u8 *id); int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list); From d508b9b601b599f92a98784a8b0113b64a3f43a6 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 17 Apr 2025 23:07:20 +0800 Subject: [PATCH 16/23] EDAC/i10nm: Explicitly set the modes of the RRL register sets commit 4878e1e90056230cefd580136d0e6d5689a7b770 upstream. The i10nm_edac driver uses the default modes (either patrol scrub read or on-demand read) of the RRL register sets configured by the BIOS. Explicitly set the modes during the loading of the i10nm_edac driver with the module parameter retry_rd_err_log=2. Intel-SIG: commit 4878e1e90056 EDAC/i10nm: Explicitly set the modes of the RRL register sets Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Feng Xu Link: https://lore.kernel.org/r/20250417150724.1170168-4-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7629444587a09..48688b54ba627 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -73,6 +73,7 @@ #define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) #define RETRY_RD_ERR_LOG_UC BIT(1) +#define RETRY_RD_ERR_LOG_EN_PATSPR BIT(13) #define RETRY_RD_ERR_LOG_NOOVER BIT(14) #define RETRY_RD_ERR_LOG_EN BIT(15) #define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) @@ -114,12 +115,15 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable rrl_ctl[2] = d2; s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + s |= RETRY_RD_ERR_LOG_EN_PATSPR; s |= RETRY_RD_ERR_LOG_EN; d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; + d &= ~RETRY_RD_ERR_LOG_EN_PATSPR; d |= RETRY_RD_ERR_LOG_EN; if (offsets_demand2) { d2 &= ~RETRY_RD_ERR_LOG_UC; + d2 &= ~RETRY_RD_ERR_LOG_EN_PATSPR; d2 |= RETRY_RD_ERR_LOG_NOOVER; d2 |= RETRY_RD_ERR_LOG_EN; } @@ -129,18 +133,24 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable s |= RETRY_RD_ERR_LOG_UC; if (rrl_ctl[0] & RETRY_RD_ERR_LOG_NOOVER) s |= RETRY_RD_ERR_LOG_NOOVER; + if (!(rrl_ctl[0] & RETRY_RD_ERR_LOG_EN_PATSPR)) + s &= ~RETRY_RD_ERR_LOG_EN_PATSPR; if (!(rrl_ctl[0] & RETRY_RD_ERR_LOG_EN)) s &= ~RETRY_RD_ERR_LOG_EN; if (rrl_ctl[1] & RETRY_RD_ERR_LOG_UC) d |= RETRY_RD_ERR_LOG_UC; if (rrl_ctl[1] & RETRY_RD_ERR_LOG_NOOVER) d |= RETRY_RD_ERR_LOG_NOOVER; + if (rrl_ctl[1] & RETRY_RD_ERR_LOG_EN_PATSPR) + d |= RETRY_RD_ERR_LOG_EN_PATSPR; if (!(rrl_ctl[1] & RETRY_RD_ERR_LOG_EN)) d &= ~RETRY_RD_ERR_LOG_EN; if (offsets_demand2) { if (rrl_ctl[2] & RETRY_RD_ERR_LOG_UC) d2 |= RETRY_RD_ERR_LOG_UC; + if (rrl_ctl[2] & RETRY_RD_ERR_LOG_EN_PATSPR) + d2 |= RETRY_RD_ERR_LOG_EN_PATSPR; if (!(rrl_ctl[2] & RETRY_RD_ERR_LOG_NOOVER)) d2 &= ~RETRY_RD_ERR_LOG_NOOVER; if (!(rrl_ctl[2] & RETRY_RD_ERR_LOG_EN)) From 5ac440a06abcb4b107427f37021c53ab94d3b2f7 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 17 Apr 2025 23:07:21 +0800 Subject: [PATCH 17/23] EDAC/{skx_common,i10nm}: Structure the per-channel RRL registers commit 1a8a6af663a7f16c9b2779cf728187775735047b upstream. As the number of RRL (retry_rd_err_log) registers per memory channel increases, the positions of the RRL control bits and the widths of the RRL registers vary across different CPU generations. Adding RRL support for a new CPU requires handling these differences throughout the RRL-related code. Structure the offsets, widths, control bit positions, set numbers, modes, etc., of the per-channel RRL registers and make them configurable to facilitate easier RRL support for new CPUs. No functional changes are intended. Intel-SIG: commit 1a8a6af663a7 EDAC/{skx_common,i10nm}: Structure the per-channel RRL registers Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Feng Xu Link: https://lore.kernel.org/r/20250417150724.1170168-5-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 92 ++++++++++++++++++++++++--------------- drivers/edac/skx_common.h | 21 +++++---- 2 files changed, 69 insertions(+), 44 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 48688b54ba627..2ac3fd8c12a3a 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -86,15 +86,38 @@ static int retry_rd_err_log; static int decoding_via_mca; static bool mem_cfg_2lm; -static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}; -static u32 offsets_scrub_spr[] = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}; -static u32 offsets_scrub_spr_hbm0[] = {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8}; -static u32 offsets_scrub_spr_hbm1[] = {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8}; -static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}; -static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}; -static u32 offsets_demand2_spr[] = {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10}; -static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}; -static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}; +static struct reg_rrl icx_reg_rrl_ddr = { + .set_num = 2, + .offsets = { + {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}, + {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}, + }, +}; + +static struct reg_rrl spr_reg_rrl_ddr = { + .set_num = 3, + .offsets = { + {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}, + {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}, + {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10}, + }, +}; + +static struct reg_rrl spr_reg_rrl_hbm_pch0 = { + .set_num = 2, + .offsets = { + {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8}, + {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}, + }, +}; + +static struct reg_rrl spr_reg_rrl_hbm_pch1 = { + .set_num = 2, + .offsets = { + {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8}, + {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}, + }, +}; static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable, u32 *rrl_ctl, u32 *offsets_scrub, u32 *offsets_demand, @@ -185,9 +208,11 @@ static void enable_retry_rd_err_log(bool enable) chan = d->imc[i].chan; for (j = 0; j < chan_num; j++) __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[0], - res_cfg->offsets_scrub, - res_cfg->offsets_demand, - res_cfg->offsets_demand2); + res_cfg->reg_rrl_ddr->offsets[0], + res_cfg->reg_rrl_ddr->offsets[1], + res_cfg->reg_rrl_ddr->set_num > 2 ? + res_cfg->reg_rrl_ddr->offsets[2] : NULL); + } imc_num += res_cfg->hbm_imc_num; @@ -201,12 +226,12 @@ static void enable_retry_rd_err_log(bool enable) chan = d->imc[i].chan; for (j = 0; j < chan_num; j++) { __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[0], - res_cfg->offsets_scrub_hbm0, - res_cfg->offsets_demand_hbm0, + res_cfg->reg_rrl_hbm[0]->offsets[0], + res_cfg->reg_rrl_hbm[0]->offsets[1], NULL); __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[1], - res_cfg->offsets_scrub_hbm1, - res_cfg->offsets_demand_hbm1, + res_cfg->reg_rrl_hbm[1]->offsets[0], + res_cfg->reg_rrl_hbm[1]->offsets[1], NULL); } } @@ -233,17 +258,18 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, pch = res->cs & 1; if (pch) - offsets = scrub_err ? res_cfg->offsets_scrub_hbm1 : - res_cfg->offsets_demand_hbm1; + offsets = scrub_err ? res_cfg->reg_rrl_hbm[1]->offsets[0] : + res_cfg->reg_rrl_hbm[1]->offsets[1]; else - offsets = scrub_err ? res_cfg->offsets_scrub_hbm0 : - res_cfg->offsets_demand_hbm0; + offsets = scrub_err ? res_cfg->reg_rrl_hbm[0]->offsets[0] : + res_cfg->reg_rrl_hbm[0]->offsets[1]; } else { if (scrub_err) { - offsets = res_cfg->offsets_scrub; + offsets = res_cfg->reg_rrl_ddr->offsets[0]; } else { - offsets = res_cfg->offsets_demand; - xffsets = res_cfg->offsets_demand2; + offsets = res_cfg->reg_rrl_ddr->offsets[1]; + if (res_cfg->reg_rrl_ddr->set_num > 2) + xffsets = res_cfg->reg_rrl_ddr->offsets[2]; } } @@ -883,8 +909,7 @@ static struct res_config i10nm_cfg0 = { .ddr_mdev_bdf = {0, 12, 0}, .hbm_mdev_bdf = {0, 12, 1}, .sad_all_offset = 0x108, - .offsets_scrub = offsets_scrub_icx, - .offsets_demand = offsets_demand_icx, + .reg_rrl_ddr = &icx_reg_rrl_ddr, }; static struct res_config i10nm_cfg1 = { @@ -902,8 +927,7 @@ static struct res_config i10nm_cfg1 = { .ddr_mdev_bdf = {0, 12, 0}, .hbm_mdev_bdf = {0, 12, 1}, .sad_all_offset = 0x108, - .offsets_scrub = offsets_scrub_icx, - .offsets_demand = offsets_demand_icx, + .reg_rrl_ddr = &icx_reg_rrl_ddr, }; static struct res_config spr_cfg = { @@ -926,13 +950,9 @@ static struct res_config spr_cfg = { .ddr_mdev_bdf = {0, 12, 0}, .hbm_mdev_bdf = {0, 12, 1}, .sad_all_offset = 0x300, - .offsets_scrub = offsets_scrub_spr, - .offsets_scrub_hbm0 = offsets_scrub_spr_hbm0, - .offsets_scrub_hbm1 = offsets_scrub_spr_hbm1, - .offsets_demand = offsets_demand_spr, - .offsets_demand2 = offsets_demand2_spr, - .offsets_demand_hbm0 = offsets_demand_spr_hbm0, - .offsets_demand_hbm1 = offsets_demand_spr_hbm1, + .reg_rrl_ddr = &spr_reg_rrl_ddr, + .reg_rrl_hbm[0] = &spr_reg_rrl_hbm_pch0, + .reg_rrl_hbm[1] = &spr_reg_rrl_hbm_pch1, }; static struct res_config gnr_cfg = { @@ -1120,7 +1140,7 @@ static int __init i10nm_init(void) mce_register_decode_chain(&i10nm_mce_dec); skx_setup_debug("i10nm_test"); - if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + if (retry_rd_err_log && res_cfg->reg_rrl_ddr) { skx_set_decode(i10nm_mc_decode, show_retry_rd_err_log); if (retry_rd_err_log == 2) enable_retry_rd_err_log(true); @@ -1140,7 +1160,7 @@ static void __exit i10nm_exit(void) { edac_dbg(2, "\n"); - if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) { + if (retry_rd_err_log && res_cfg->reg_rrl_ddr) { skx_set_decode(NULL, NULL); if (retry_rd_err_log == 2) enable_retry_rd_err_log(false); diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index bd2d9eaeb4483..cda7a16b8bee4 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -82,6 +82,15 @@ /* Max RRL register sets per {,sub-,pseudo-}channel. */ #define NUM_RRL_SET 3 +/* Max RRL registers per set. */ +#define NUM_RRL_REG 6 + +/* RRL registers per {,sub-,pseudo-}channel. */ +struct reg_rrl { + /* RRL register parts. */ + int set_num; + u32 offsets[NUM_RRL_SET][NUM_RRL_REG]; +}; /* * Each cpu socket contains some pci devices that provide global @@ -238,14 +247,10 @@ struct res_config { /* HBM mdev device BDF */ struct pci_bdf hbm_mdev_bdf; int sad_all_offset; - /* Offsets of retry_rd_err_log registers */ - u32 *offsets_scrub; - u32 *offsets_scrub_hbm0; - u32 *offsets_scrub_hbm1; - u32 *offsets_demand; - u32 *offsets_demand2; - u32 *offsets_demand_hbm0; - u32 *offsets_demand_hbm1; + /* RRL register sets per DDR channel */ + struct reg_rrl *reg_rrl_ddr; + /* RRL register sets per HBM channel */ + struct reg_rrl *reg_rrl_hbm[2]; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, From bc45d52c704f4356bc90f704ef912c65906e95f3 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 17 Apr 2025 23:07:22 +0800 Subject: [PATCH 18/23] EDAC/{skx_common,i10nm}: Refactor enable_retry_rd_err_log() commit ba3985c1faf5eb72084ddc31204b076c2a450263 upstream. Refactor enable_retry_rd_err_log() using helper functions for both DDR and HBM, making the RRL control bits configurable instead of hard-coded. Additionally, explicitly define the four RRL modes for better readability. No functional changes intended. Intel-SIG: commit ba3985c1faf5 EDAC/{skx_common,i10nm}: Refactor enable_retry_rd_err_log() Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Feng Xu Link: https://lore.kernel.org/r/20250417150724.1170168-6-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 237 ++++++++++++++++++++++---------------- drivers/edac/skx_common.h | 20 ++++ 2 files changed, 156 insertions(+), 101 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 2ac3fd8c12a3a..0fef84ea81281 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -72,11 +72,6 @@ #define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) #define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) -#define RETRY_RD_ERR_LOG_UC BIT(1) -#define RETRY_RD_ERR_LOG_EN_PATSPR BIT(13) -#define RETRY_RD_ERR_LOG_NOOVER BIT(14) -#define RETRY_RD_ERR_LOG_EN BIT(15) -#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1)) #define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) static struct list_head *i10nm_edac_list; @@ -88,153 +83,193 @@ static bool mem_cfg_2lm; static struct reg_rrl icx_reg_rrl_ddr = { .set_num = 2, + .modes = {LRE_SCRUB, LRE_DEMAND}, .offsets = { {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}, {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}, }, + .widths = {4, 4, 4, 4, 4, 8}, + .uc_mask = BIT(1), + .en_patspr_mask = BIT(13), + .noover_mask = BIT(14), + .en_mask = BIT(15), }; static struct reg_rrl spr_reg_rrl_ddr = { .set_num = 3, + .modes = {LRE_SCRUB, LRE_DEMAND, FRE_DEMAND}, .offsets = { {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}, {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0}, {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10}, }, + .widths = {4, 4, 8, 4, 4, 8}, + .uc_mask = BIT(1), + .en_patspr_mask = BIT(13), + .noover_mask = BIT(14), + .en_mask = BIT(15), }; static struct reg_rrl spr_reg_rrl_hbm_pch0 = { .set_num = 2, + .modes = {LRE_SCRUB, LRE_DEMAND}, .offsets = { {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8}, {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}, }, + .widths = {4, 4, 8, 4, 4, 8}, + .uc_mask = BIT(1), + .en_patspr_mask = BIT(13), + .noover_mask = BIT(14), + .en_mask = BIT(15), }; static struct reg_rrl spr_reg_rrl_hbm_pch1 = { .set_num = 2, + .modes = {LRE_SCRUB, LRE_DEMAND}, .offsets = { {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8}, {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}, }, + .widths = {4, 4, 8, 4, 4, 8}, + .uc_mask = BIT(1), + .en_patspr_mask = BIT(13), + .noover_mask = BIT(14), + .en_mask = BIT(15), }; -static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable, u32 *rrl_ctl, - u32 *offsets_scrub, u32 *offsets_demand, - u32 *offsets_demand2) +static u64 read_imc_reg(struct skx_imc *imc, int chan, u32 offset, u8 width) { - u32 s, d, d2; + switch (width) { + case 4: + return I10NM_GET_REG32(imc, chan, offset); + case 8: + return I10NM_GET_REG64(imc, chan, offset); + default: + i10nm_printk(KERN_ERR, "Invalid readd RRL 0x%x width %d\n", offset, width); + return 0; + } +} - s = I10NM_GET_REG32(imc, chan, offsets_scrub[0]); - d = I10NM_GET_REG32(imc, chan, offsets_demand[0]); - if (offsets_demand2) - d2 = I10NM_GET_REG32(imc, chan, offsets_demand2[0]); +static void write_imc_reg(struct skx_imc *imc, int chan, u32 offset, u8 width, u64 val) +{ + switch (width) { + case 4: + return I10NM_SET_REG32(imc, chan, offset, (u32)val); + default: + i10nm_printk(KERN_ERR, "Invalid write RRL 0x%x width %d\n", offset, width); + } +} + +static void enable_rrl(struct skx_imc *imc, int chan, struct reg_rrl *rrl, + int rrl_set, bool enable, u32 *rrl_ctl) +{ + enum rrl_mode mode = rrl->modes[rrl_set]; + u32 offset = rrl->offsets[rrl_set][0], v; + u8 width = rrl->widths[0]; + bool first, scrub; + + /* First or last read error. */ + first = (mode == FRE_SCRUB || mode == FRE_DEMAND); + /* Patrol scrub or on-demand read error. */ + scrub = (mode == FRE_SCRUB || mode == LRE_SCRUB); + + v = read_imc_reg(imc, chan, offset, width); if (enable) { - /* Save default configurations */ - rrl_ctl[0] = s; - rrl_ctl[1] = d; - if (offsets_demand2) - rrl_ctl[2] = d2; - - s &= ~RETRY_RD_ERR_LOG_NOOVER_UC; - s |= RETRY_RD_ERR_LOG_EN_PATSPR; - s |= RETRY_RD_ERR_LOG_EN; - d &= ~RETRY_RD_ERR_LOG_NOOVER_UC; - d &= ~RETRY_RD_ERR_LOG_EN_PATSPR; - d |= RETRY_RD_ERR_LOG_EN; - - if (offsets_demand2) { - d2 &= ~RETRY_RD_ERR_LOG_UC; - d2 &= ~RETRY_RD_ERR_LOG_EN_PATSPR; - d2 |= RETRY_RD_ERR_LOG_NOOVER; - d2 |= RETRY_RD_ERR_LOG_EN; - } + /* Save default configurations. */ + *rrl_ctl = v; + v &= ~rrl->uc_mask; + + if (first) + v |= rrl->noover_mask; + else + v &= ~rrl->noover_mask; + + if (scrub) + v |= rrl->en_patspr_mask; + else + v &= ~rrl->en_patspr_mask; + + v |= rrl->en_mask; } else { - /* Restore default configurations */ - if (rrl_ctl[0] & RETRY_RD_ERR_LOG_UC) - s |= RETRY_RD_ERR_LOG_UC; - if (rrl_ctl[0] & RETRY_RD_ERR_LOG_NOOVER) - s |= RETRY_RD_ERR_LOG_NOOVER; - if (!(rrl_ctl[0] & RETRY_RD_ERR_LOG_EN_PATSPR)) - s &= ~RETRY_RD_ERR_LOG_EN_PATSPR; - if (!(rrl_ctl[0] & RETRY_RD_ERR_LOG_EN)) - s &= ~RETRY_RD_ERR_LOG_EN; - if (rrl_ctl[1] & RETRY_RD_ERR_LOG_UC) - d |= RETRY_RD_ERR_LOG_UC; - if (rrl_ctl[1] & RETRY_RD_ERR_LOG_NOOVER) - d |= RETRY_RD_ERR_LOG_NOOVER; - if (rrl_ctl[1] & RETRY_RD_ERR_LOG_EN_PATSPR) - d |= RETRY_RD_ERR_LOG_EN_PATSPR; - if (!(rrl_ctl[1] & RETRY_RD_ERR_LOG_EN)) - d &= ~RETRY_RD_ERR_LOG_EN; - - if (offsets_demand2) { - if (rrl_ctl[2] & RETRY_RD_ERR_LOG_UC) - d2 |= RETRY_RD_ERR_LOG_UC; - if (rrl_ctl[2] & RETRY_RD_ERR_LOG_EN_PATSPR) - d2 |= RETRY_RD_ERR_LOG_EN_PATSPR; - if (!(rrl_ctl[2] & RETRY_RD_ERR_LOG_NOOVER)) - d2 &= ~RETRY_RD_ERR_LOG_NOOVER; - if (!(rrl_ctl[2] & RETRY_RD_ERR_LOG_EN)) - d2 &= ~RETRY_RD_ERR_LOG_EN; + /* Restore default configurations. */ + if (*rrl_ctl & rrl->uc_mask) + v |= rrl->uc_mask; + + if (first) { + if (!(*rrl_ctl & rrl->noover_mask)) + v &= ~rrl->noover_mask; + } else { + if (*rrl_ctl & rrl->noover_mask) + v |= rrl->noover_mask; } + + if (scrub) { + if (!(*rrl_ctl & rrl->en_patspr_mask)) + v &= ~rrl->en_patspr_mask; + } else { + if (*rrl_ctl & rrl->en_patspr_mask) + v |= rrl->en_patspr_mask; + } + + if (!(*rrl_ctl & rrl->en_mask)) + v &= ~rrl->en_mask; } - I10NM_SET_REG32(imc, chan, offsets_scrub[0], s); - I10NM_SET_REG32(imc, chan, offsets_demand[0], d); - if (offsets_demand2) - I10NM_SET_REG32(imc, chan, offsets_demand2[0], d2); + write_imc_reg(imc, chan, offset, width, v); +} + +static void enable_rrls(struct skx_imc *imc, int chan, struct reg_rrl *rrl, + bool enable, u32 *rrl_ctl) +{ + for (int i = 0; i < rrl->set_num; i++) + enable_rrl(imc, chan, rrl, i, enable, rrl_ctl + i); +} + +static void enable_rrls_ddr(struct skx_imc *imc, bool enable) +{ + struct reg_rrl *rrl_ddr = res_cfg->reg_rrl_ddr; + int i, chan_num = res_cfg->ddr_chan_num; + struct skx_channel *chan = imc->chan; + + if (!imc->mbase) + return; + + for (i = 0; i < chan_num; i++) + enable_rrls(imc, i, rrl_ddr, enable, chan[i].rrl_ctl[0]); +} + +static void enable_rrls_hbm(struct skx_imc *imc, bool enable) +{ + struct reg_rrl **rrl_hbm = res_cfg->reg_rrl_hbm; + int i, chan_num = res_cfg->hbm_chan_num; + struct skx_channel *chan = imc->chan; + + if (!imc->mbase || !imc->hbm_mc || !rrl_hbm[0] || !rrl_hbm[1]) + return; + + for (i = 0; i < chan_num; i++) { + enable_rrls(imc, i, rrl_hbm[0], enable, chan[i].rrl_ctl[0]); + enable_rrls(imc, i, rrl_hbm[1], enable, chan[i].rrl_ctl[1]); + } } static void enable_retry_rd_err_log(bool enable) { - int i, j, imc_num, chan_num; - struct skx_channel *chan; - struct skx_imc *imc; struct skx_dev *d; + int i, imc_num; edac_dbg(2, "\n"); list_for_each_entry(d, i10nm_edac_list, list) { imc_num = res_cfg->ddr_imc_num; - chan_num = res_cfg->ddr_chan_num; - - for (i = 0; i < imc_num; i++) { - imc = &d->imc[i]; - if (!imc->mbase) - continue; - - chan = d->imc[i].chan; - for (j = 0; j < chan_num; j++) - __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[0], - res_cfg->reg_rrl_ddr->offsets[0], - res_cfg->reg_rrl_ddr->offsets[1], - res_cfg->reg_rrl_ddr->set_num > 2 ? - res_cfg->reg_rrl_ddr->offsets[2] : NULL); - - } + for (i = 0; i < imc_num; i++) + enable_rrls_ddr(&d->imc[i], enable); imc_num += res_cfg->hbm_imc_num; - chan_num = res_cfg->hbm_chan_num; - - for (; i < imc_num; i++) { - imc = &d->imc[i]; - if (!imc->mbase || !imc->hbm_mc) - continue; - - chan = d->imc[i].chan; - for (j = 0; j < chan_num; j++) { - __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[0], - res_cfg->reg_rrl_hbm[0]->offsets[0], - res_cfg->reg_rrl_hbm[0]->offsets[1], - NULL); - __enable_retry_rd_err_log(imc, j, enable, chan[j].rrl_ctl[1], - res_cfg->reg_rrl_hbm[1]->offsets[0], - res_cfg->reg_rrl_hbm[1]->offsets[1], - NULL); - } - } + for (; i < imc_num; i++) + enable_rrls_hbm(&d->imc[i], enable); } } diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index cda7a16b8bee4..7fc0c5af43e7a 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -85,11 +85,31 @@ /* Max RRL registers per set. */ #define NUM_RRL_REG 6 +/* Modes of RRL register set. */ +enum rrl_mode { + /* Last read error from patrol scrub. */ + LRE_SCRUB, + /* Last read error from demand. */ + LRE_DEMAND, + /* First read error from patrol scrub. */ + FRE_SCRUB, + /* First read error from demand. */ + FRE_DEMAND, +}; + /* RRL registers per {,sub-,pseudo-}channel. */ struct reg_rrl { /* RRL register parts. */ int set_num; + enum rrl_mode modes[NUM_RRL_SET]; u32 offsets[NUM_RRL_SET][NUM_RRL_REG]; + /* RRL register widths in byte per set. */ + u8 widths[NUM_RRL_REG]; + /* RRL control bits of the first register per set. */ + u32 uc_mask; + u32 en_patspr_mask; + u32 noover_mask; + u32 en_mask; }; /* From e0b267e7c25bd1f07c26f3d759e77993ba05725b Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 17 Apr 2025 23:07:23 +0800 Subject: [PATCH 19/23] EDAC/{skx_common,i10nm}: Refactor show_retry_rd_err_log() commit 126168fa2c3e16113ea75a656fff5156a54a5726 upstream. Make the {valid bit, overwritten status, number} of RRL registers and the {number, offsets, widths} of per-channel CORRERRCNT registers configurable. Refactor show_retry_rd_err_log() to use the configurable fields of struct reg_rrl, making the code more scalable and simpler. No functional changes intended. Intel-SIG: commit 126168fa2c3e EDAC/{skx_common,i10nm}: Refactor show_retry_rd_err_log() Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Feng Xu Link: https://lore.kernel.org/r/20250417150724.1170168-7-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 162 +++++++++++++++++--------------------- drivers/edac/skx_common.h | 11 ++- 2 files changed, 81 insertions(+), 92 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 0fef84ea81281..7249dbac35aec 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -72,8 +72,6 @@ #define I10NM_SAD_ENABLE(reg) GET_BITFIELD(reg, 0, 0) #define I10NM_SAD_NM_CACHEABLE(reg) GET_BITFIELD(reg, 5, 5) -#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0)) - static struct list_head *i10nm_edac_list; static struct res_config *res_cfg; @@ -83,20 +81,28 @@ static bool mem_cfg_2lm; static struct reg_rrl icx_reg_rrl_ddr = { .set_num = 2, + .reg_num = 6, .modes = {LRE_SCRUB, LRE_DEMAND}, .offsets = { {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8}, {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0}, }, .widths = {4, 4, 4, 4, 4, 8}, + .v_mask = BIT(0), .uc_mask = BIT(1), + .over_mask = BIT(2), .en_patspr_mask = BIT(13), .noover_mask = BIT(14), .en_mask = BIT(15), + + .cecnt_num = 4, + .cecnt_offsets = {0x22c18, 0x22c1c, 0x22c20, 0x22c24}, + .cecnt_widths = {4, 4, 4, 4}, }; static struct reg_rrl spr_reg_rrl_ddr = { .set_num = 3, + .reg_num = 6, .modes = {LRE_SCRUB, LRE_DEMAND, FRE_DEMAND}, .offsets = { {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8}, @@ -104,38 +110,58 @@ static struct reg_rrl spr_reg_rrl_ddr = { {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10}, }, .widths = {4, 4, 8, 4, 4, 8}, + .v_mask = BIT(0), .uc_mask = BIT(1), + .over_mask = BIT(2), .en_patspr_mask = BIT(13), .noover_mask = BIT(14), .en_mask = BIT(15), + + .cecnt_num = 4, + .cecnt_offsets = {0x22c18, 0x22c1c, 0x22c20, 0x22c24}, + .cecnt_widths = {4, 4, 4, 4}, }; static struct reg_rrl spr_reg_rrl_hbm_pch0 = { .set_num = 2, + .reg_num = 6, .modes = {LRE_SCRUB, LRE_DEMAND}, .offsets = { {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8}, {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0}, }, .widths = {4, 4, 8, 4, 4, 8}, + .v_mask = BIT(0), .uc_mask = BIT(1), + .over_mask = BIT(2), .en_patspr_mask = BIT(13), .noover_mask = BIT(14), .en_mask = BIT(15), + + .cecnt_num = 4, + .cecnt_offsets = {0x2818, 0x281c, 0x2820, 0x2824}, + .cecnt_widths = {4, 4, 4, 4}, }; static struct reg_rrl spr_reg_rrl_hbm_pch1 = { .set_num = 2, + .reg_num = 6, .modes = {LRE_SCRUB, LRE_DEMAND}, .offsets = { {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8}, {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0}, }, .widths = {4, 4, 8, 4, 4, 8}, + .v_mask = BIT(0), .uc_mask = BIT(1), + .over_mask = BIT(2), .en_patspr_mask = BIT(13), .noover_mask = BIT(14), .en_mask = BIT(15), + + .cecnt_num = 4, + .cecnt_offsets = {0x2c18, 0x2c1c, 0x2c20, 0x2c24}, + .cecnt_widths = {4, 4, 4, 4}, }; static u64 read_imc_reg(struct skx_imc *imc, int chan, u32 offset, u8 width) @@ -276,110 +302,64 @@ static void enable_retry_rd_err_log(bool enable) static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, int len, bool scrub_err) { + int i, j, n, ch = res->channel, pch = res->cs & 1; struct skx_imc *imc = &res->dev->imc[res->imc]; - u32 log0, log1, log2, log3, log4; - u32 corr0, corr1, corr2, corr3; - u32 lxg0, lxg1, lxg3, lxg4; - u32 *xffsets = NULL; - u64 log2a, log5; - u64 lxg2a, lxg5; - u32 *offsets; - int n, pch; + u32 offset, status_mask; + struct reg_rrl *rrl; + u64 log, corr; + bool scrub; + u8 width; if (!imc->mbase) return; - if (imc->hbm_mc) { - pch = res->cs & 1; + rrl = imc->hbm_mc ? res_cfg->reg_rrl_hbm[pch] : res_cfg->reg_rrl_ddr; - if (pch) - offsets = scrub_err ? res_cfg->reg_rrl_hbm[1]->offsets[0] : - res_cfg->reg_rrl_hbm[1]->offsets[1]; - else - offsets = scrub_err ? res_cfg->reg_rrl_hbm[0]->offsets[0] : - res_cfg->reg_rrl_hbm[0]->offsets[1]; - } else { - if (scrub_err) { - offsets = res_cfg->reg_rrl_ddr->offsets[0]; - } else { - offsets = res_cfg->reg_rrl_ddr->offsets[1]; - if (res_cfg->reg_rrl_ddr->set_num > 2) - xffsets = res_cfg->reg_rrl_ddr->offsets[2]; - } - } + if (!rrl) + return; - log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]); - log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]); - log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]); - log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]); - log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]); - - if (xffsets) { - lxg0 = I10NM_GET_REG32(imc, res->channel, xffsets[0]); - lxg1 = I10NM_GET_REG32(imc, res->channel, xffsets[1]); - lxg3 = I10NM_GET_REG32(imc, res->channel, xffsets[3]); - lxg4 = I10NM_GET_REG32(imc, res->channel, xffsets[4]); - lxg5 = I10NM_GET_REG64(imc, res->channel, xffsets[5]); - } + status_mask = rrl->over_mask | rrl->uc_mask | rrl->v_mask; - if (res_cfg->type == SPR) { - log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]); - n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx", - log0, log1, log2a, log3, log4, log5); + n = snprintf(msg, len, " retry_rd_err_log["); + for (i = 0; i < rrl->set_num; i++) { + scrub = (rrl->modes[i] == FRE_SCRUB || rrl->modes[i] == LRE_SCRUB); + if (scrub_err != scrub) + continue; - if (len - n > 0) { - if (xffsets) { - lxg2a = I10NM_GET_REG64(imc, res->channel, xffsets[2]); - n += snprintf(msg + n, len - n, " %.8x %.8x %.16llx %.8x %.8x %.16llx]", - lxg0, lxg1, lxg2a, lxg3, lxg4, lxg5); - } else { - n += snprintf(msg + n, len - n, "]"); - } - } - } else { - log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]); - n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]", - log0, log1, log2, log3, log4, log5); - } + for (j = 0; j < rrl->reg_num && len - n > 0; j++) { + offset = rrl->offsets[i][j]; + width = rrl->widths[j]; + log = read_imc_reg(imc, ch, offset, width); - if (imc->hbm_mc) { - if (pch) { - corr0 = I10NM_GET_REG32(imc, res->channel, 0x2c18); - corr1 = I10NM_GET_REG32(imc, res->channel, 0x2c1c); - corr2 = I10NM_GET_REG32(imc, res->channel, 0x2c20); - corr3 = I10NM_GET_REG32(imc, res->channel, 0x2c24); - } else { - corr0 = I10NM_GET_REG32(imc, res->channel, 0x2818); - corr1 = I10NM_GET_REG32(imc, res->channel, 0x281c); - corr2 = I10NM_GET_REG32(imc, res->channel, 0x2820); - corr3 = I10NM_GET_REG32(imc, res->channel, 0x2824); + if (width == 4) + n += snprintf(msg + n, len - n, "%.8llx ", log); + else + n += snprintf(msg + n, len - n, "%.16llx ", log); + + /* Clear RRL status if RRL in Linux control mode. */ + if (retry_rd_err_log == 2 && !j && (log & status_mask)) + write_imc_reg(imc, ch, offset, width, log & ~status_mask); } - } else { - corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18); - corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c); - corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20); - corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24); } - if (len - n > 0) - snprintf(msg + n, len - n, - " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]", - corr0 & 0xffff, corr0 >> 16, - corr1 & 0xffff, corr1 >> 16, - corr2 & 0xffff, corr2 >> 16, - corr3 & 0xffff, corr3 >> 16); - - /* Clear status bits */ - if (retry_rd_err_log == 2) { - if (log0 & RETRY_RD_ERR_LOG_OVER_UC_V) { - log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; - I10NM_SET_REG32(imc, res->channel, offsets[0], log0); - } + /* Move back one space. */ + n--; + n += snprintf(msg + n, len - n, "]"); + + if (len - n > 0) { + n += snprintf(msg + n, len - n, " correrrcnt["); + for (i = 0; i < rrl->cecnt_num && len - n > 0; i++) { + offset = rrl->cecnt_offsets[i]; + width = rrl->cecnt_widths[i]; + corr = read_imc_reg(imc, ch, offset, width); - if (xffsets && (lxg0 & RETRY_RD_ERR_LOG_OVER_UC_V)) { - lxg0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V; - I10NM_SET_REG32(imc, res->channel, xffsets[0], lxg0); + n += snprintf(msg + n, len - n, "%.4llx %.4llx ", + corr & 0xffff, corr >> 16); } + + /* Move back one space. */ + n--; + n += snprintf(msg + n, len - n, "]"); } } diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 7fc0c5af43e7a..b04f4d835a9cb 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -84,6 +84,8 @@ #define NUM_RRL_SET 3 /* Max RRL registers per set. */ #define NUM_RRL_REG 6 +/* Max correctable error count registers. */ +#define NUM_CECNT_REG 4 /* Modes of RRL register set. */ enum rrl_mode { @@ -100,16 +102,23 @@ enum rrl_mode { /* RRL registers per {,sub-,pseudo-}channel. */ struct reg_rrl { /* RRL register parts. */ - int set_num; + int set_num, reg_num; enum rrl_mode modes[NUM_RRL_SET]; u32 offsets[NUM_RRL_SET][NUM_RRL_REG]; /* RRL register widths in byte per set. */ u8 widths[NUM_RRL_REG]; /* RRL control bits of the first register per set. */ + u32 v_mask; u32 uc_mask; + u32 over_mask; u32 en_patspr_mask; u32 noover_mask; u32 en_mask; + + /* CORRERRCNT register parts. */ + int cecnt_num; + u32 cecnt_offsets[NUM_CECNT_REG]; + u8 cecnt_widths[NUM_CECNT_REG]; }; /* From cd011369f5e7ab9e83c9481c58322f4993d0372f Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 17 Apr 2025 23:07:24 +0800 Subject: [PATCH 20/23] EDAC/{skx_common,i10nm}: Add RRL support for Intel Granite Rapids server commit 5904dc561ef21e69f0b9dca39d1a66e34b7ea764 upstream. Compared to previous generations, Granite Rapids defines the RRL control bits {en_patspr, noover, en} in different positions, adds an extra RRL set for the new mode of the first patrol-scrub read error, and extends the number of CORRERRCNT registers from 4 to 8, encoding one counter per CORRERRCNT register. Add a Granite Rapids reg_rrl configuration table and adjust the code to accommodate the differences mentioned above for RRL support. Intel-SIG: commit 5904dc561ef2 EDAC/{skx_common,i10nm}: Add RRL support for Intel Granite Rapids server Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: Feng Xu Link: https://lore.kernel.org/r/20250417150724.1170168-8-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 37 +++++++++++++++++++++++++++++++++++-- drivers/edac/skx_common.h | 4 ++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7249dbac35aec..7d69530107c95 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -164,6 +164,29 @@ static struct reg_rrl spr_reg_rrl_hbm_pch1 = { .cecnt_widths = {4, 4, 4, 4}, }; +static struct reg_rrl gnr_reg_rrl_ddr = { + .set_num = 4, + .reg_num = 6, + .modes = {FRE_SCRUB, FRE_DEMAND, LRE_SCRUB, LRE_DEMAND}, + .offsets = { + {0x2f10, 0x2f20, 0x2f30, 0x2f50, 0x2f60, 0xba0}, + {0x2f14, 0x2f24, 0x2f38, 0x2f54, 0x2f64, 0xba8}, + {0x2f18, 0x2f28, 0x2f40, 0x2f58, 0x2f68, 0xbb0}, + {0x2f1c, 0x2f2c, 0x2f48, 0x2f5c, 0x2f6c, 0xbb8}, + }, + .widths = {4, 4, 8, 4, 4, 8}, + .v_mask = BIT(0), + .uc_mask = BIT(1), + .over_mask = BIT(2), + .en_patspr_mask = BIT(14), + .noover_mask = BIT(15), + .en_mask = BIT(12), + + .cecnt_num = 8, + .cecnt_offsets = {0x2c10, 0x2c14, 0x2c18, 0x2c1c, 0x2c20, 0x2c24, 0x2c28, 0x2c2c}, + .cecnt_widths = {4, 4, 4, 4, 4, 4, 4, 4}, +}; + static u64 read_imc_reg(struct skx_imc *imc, int chan, u32 offset, u8 width) { switch (width) { @@ -353,8 +376,17 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, width = rrl->cecnt_widths[i]; corr = read_imc_reg(imc, ch, offset, width); - n += snprintf(msg + n, len - n, "%.4llx %.4llx ", - corr & 0xffff, corr >> 16); + /* CPUs {ICX,SPR} encode two counters per 4-byte CORRERRCNT register. */ + if (res_cfg->type <= SPR) { + n += snprintf(msg + n, len - n, "%.4llx %.4llx ", + corr & 0xffff, corr >> 16); + } else { + /* CPUs {GNR} encode one counter per CORRERRCNT register. */ + if (width == 4) + n += snprintf(msg + n, len - n, "%.8llx ", corr); + else + n += snprintf(msg + n, len - n, "%.16llx ", corr); + } } /* Move back one space. */ @@ -985,6 +1017,7 @@ static struct res_config gnr_cfg = { .uracu_bdf = {0, 0, 1}, .ddr_mdev_bdf = {0, 5, 1}, .sad_all_offset = 0x300, + .reg_rrl_ddr = &gnr_reg_rrl_ddr, }; static const struct x86_cpu_id i10nm_cpuids[] = { diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index b04f4d835a9cb..15cb774d88ece 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -81,11 +81,11 @@ #define MCACOD_EXT_MEM_ERR 0x280 /* Max RRL register sets per {,sub-,pseudo-}channel. */ -#define NUM_RRL_SET 3 +#define NUM_RRL_SET 4 /* Max RRL registers per set. */ #define NUM_RRL_REG 6 /* Max correctable error count registers. */ -#define NUM_CECNT_REG 4 +#define NUM_CECNT_REG 8 /* Modes of RRL register set. */ enum rrl_mode { From 56ab3e8e221cc76ecc59bc641da9baa6fc82c1a9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 24 Apr 2025 16:14:54 +0800 Subject: [PATCH 21/23] EDAC/i10nm: Fix the bitwise operation between variables of different sizes commit 2b2408aca90b86c1ef51c19d834e5f6db0a1ff30 upstream. The tool of Smatch static checker reported the following warning: drivers/edac/i10nm_base.c:364 show_retry_rd_err_log() warn: should bitwise negate be 'ullong'? This warning was due to the bitwise NOT/AND operations between 'status_mask' (a u32 type) and 'log' (a u64 type), which resulted in the high 32 bits of 'log' were cleared. This was a false positive warning, as only the low 32 bits of 'log' was written to the first RRL memory controller register (a u32 type). To improve code sanity, fix this warning by changing 'status_mask' to a u64 type, ensuring it matches the size of 'log' for bitwise operations. Intel-SIG: commit 2b2408aca90b EDAC/i10nm: Fix the bitwise operation between variables of different sizes Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/aAih0KmEVq7ch6v2@stanley.mountain/ Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20250424081454.2952632-1-qiuxu.zhuo@intel.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 7d69530107c95..51ed02eab31e0 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -327,10 +327,10 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, { int i, j, n, ch = res->channel, pch = res->cs & 1; struct skx_imc *imc = &res->dev->imc[res->imc]; - u32 offset, status_mask; + u64 log, corr, status_mask; struct reg_rrl *rrl; - u64 log, corr; bool scrub; + u32 offset; u8 width; if (!imc->mbase) From 30d4279e9d6f03441a6f561b9906232749109feb Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 4 Jul 2025 23:16:07 +0800 Subject: [PATCH 22/23] EDAC/i10nm: Add Intel Granite Rapids-D support commit 9ad08c1115646533097c8a799ad046bf5127b04a upstream. The Granite Rapids-D CPU model uses memory controller registers similar to those of the Granite Rapids server CPU but with a different memory controller MMIO base. Add the Granite Rapids-D CPU model ID and use the new memory controller MMIO base for EDAC support. Intel-SIG: commit 9ad08c111564 EDAC/i10nm: Add Intel Granite Rapids-D support Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Tested-by: VikasX Chougule Link: https://lore.kernel.org/r/20250704151609.7833-2-qiuxu.zhuo@intel.com [ Zhang Rui: resolve conflict (use old X86 Macro) and amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 51ed02eab31e0..828b48490617c 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -62,6 +62,7 @@ ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) #define I10NM_GNR_IMC_MMIO_OFFSET 0x24c000 +#define I10NM_GNR_D_IMC_MMIO_OFFSET 0x206000 #define I10NM_GNR_IMC_MMIO_SIZE 0x4000 #define I10NM_HBM_IMC_MMIO_SIZE 0x9000 #define I10NM_DDR_IMC_CH_CNT(reg) GET_BITFIELD(reg, 21, 24) @@ -687,6 +688,14 @@ static struct pci_dev *get_gnr_mdev(struct skx_dev *d, int logical_idx, int *phy return NULL; } +static u32 get_gnr_imc_mmio_offset(void) +{ + if (boot_cpu_data.x86_vfm == INTEL_GRANITERAPIDS_D) + return I10NM_GNR_D_IMC_MMIO_OFFSET; + + return I10NM_GNR_IMC_MMIO_OFFSET; +} + /** * get_ddr_munit() - Get the resource of the i-th DDR memory controller. * @@ -715,7 +724,7 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi return NULL; *offset = I10NM_GET_IMC_MMIO_OFFSET(reg) + - I10NM_GNR_IMC_MMIO_OFFSET + + get_gnr_imc_mmio_offset() + physical_idx * I10NM_GNR_IMC_MMIO_SIZE; *size = I10NM_GNR_IMC_MMIO_SIZE; @@ -1029,6 +1038,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SAPPHIRERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_D, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_DARKMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg), From 83c5a0f3f467495089e66cf2181d1ac7f92e83fc Mon Sep 17 00:00:00 2001 From: Wang Haoran Date: Tue, 15 Jul 2025 21:17:00 +0800 Subject: [PATCH 23/23] EDAC/{skx_common,i10nm}: Use scnprintf() for safer buffer handling commit 35928bc38db69a2af26624e35a250c1e0f9a6a3f upstream. snprintf() is fragile when its return value will be used to append additional data to a buffer. Use scnprintf() instead. Intel-SIG: commit 35928bc38db6 EDAC/{skx_common,i10nm}: Use scnprintf() for safer buffer handling Add EDAC basic support and RRL enhancement for CWF/SRF/GNR/GNR-D Signed-off-by: Wang Haoran Signed-off-by: Tony Luck Tested-by: Qiuxu Zhuo Reviewed-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20250715131700.1092720-1-haoranwangsec@gmail.com [ Zhang Rui: amend commit log ] Signed-off-by: Zhang Rui --- drivers/edac/i10nm_base.c | 18 +++++++++--------- drivers/edac/skx_common.c | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 828b48490617c..68e87dc889cf2 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -344,7 +344,7 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, status_mask = rrl->over_mask | rrl->uc_mask | rrl->v_mask; - n = snprintf(msg, len, " retry_rd_err_log["); + n = scnprintf(msg, len, " retry_rd_err_log["); for (i = 0; i < rrl->set_num; i++) { scrub = (rrl->modes[i] == FRE_SCRUB || rrl->modes[i] == LRE_SCRUB); if (scrub_err != scrub) @@ -356,9 +356,9 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, log = read_imc_reg(imc, ch, offset, width); if (width == 4) - n += snprintf(msg + n, len - n, "%.8llx ", log); + n += scnprintf(msg + n, len - n, "%.8llx ", log); else - n += snprintf(msg + n, len - n, "%.16llx ", log); + n += scnprintf(msg + n, len - n, "%.16llx ", log); /* Clear RRL status if RRL in Linux control mode. */ if (retry_rd_err_log == 2 && !j && (log & status_mask)) @@ -368,10 +368,10 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, /* Move back one space. */ n--; - n += snprintf(msg + n, len - n, "]"); + n += scnprintf(msg + n, len - n, "]"); if (len - n > 0) { - n += snprintf(msg + n, len - n, " correrrcnt["); + n += scnprintf(msg + n, len - n, " correrrcnt["); for (i = 0; i < rrl->cecnt_num && len - n > 0; i++) { offset = rrl->cecnt_offsets[i]; width = rrl->cecnt_widths[i]; @@ -379,20 +379,20 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, /* CPUs {ICX,SPR} encode two counters per 4-byte CORRERRCNT register. */ if (res_cfg->type <= SPR) { - n += snprintf(msg + n, len - n, "%.4llx %.4llx ", + n += scnprintf(msg + n, len - n, "%.4llx %.4llx ", corr & 0xffff, corr >> 16); } else { /* CPUs {GNR} encode one counter per CORRERRCNT register. */ if (width == 4) - n += snprintf(msg + n, len - n, "%.8llx ", corr); + n += scnprintf(msg + n, len - n, "%.8llx ", corr); else - n += snprintf(msg + n, len - n, "%.16llx ", corr); + n += scnprintf(msg + n, len - n, "%.16llx ", corr); } } /* Move back one space. */ n--; - n += snprintf(msg + n, len - n, "]"); + n += scnprintf(msg + n, len - n, "]"); } } diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 9e4719d75aec4..cc5f2ea4318e8 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -669,12 +669,12 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } if (res->decoded_by_adxl) { - len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", + len = scnprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", mscod, errcode, adxl_msg); } else { - len = snprintf(skx_msg, MSG_SIZE, + len = scnprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x ProcessorSocketId:0x%x MemoryControllerId:0x%x PhysicalRankId:0x%x Row:0x%x Column:0x%x Bank:0x%x BankGroup:0x%x", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "",