diff --git a/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb.rule b/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb.rule index 50a0b85e399..2d74e40cbbd 100644 --- a/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb.rule +++ b/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb.rule @@ -1004,7 +1004,7 @@ rule rRDFFIR group gRDFFIR filter singlebit, - cs_root_cause(14,15,17,35,37,40) + cs_root_cause(14,15,17,35,37) { /** RDFFIR[0] * Mainline read MPE on rank 0 @@ -1174,7 +1174,7 @@ group gRDFFIR /** RDFFIR[40] * RDDATA valid error */ - (rRDFFIR, bit(40)) ? rdf_rcd_parity_error_UERE; + (rRDFFIR, bit(40)) ? mem_port_th_32perDay; /** RDFFIR[41] * SCOM status register parity error diff --git a/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb_actions.rule b/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb_actions.rule index 1c7d4354ccf..d5b6e3fad74 100644 --- a/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb_actions.rule +++ b/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb_actions.rule @@ -299,58 +299,77 @@ actionclass verify_chip_mark_5 { funccall("AnalyzeFetchMpe_5"); }; actionclass verify_chip_mark_6 { funccall("AnalyzeFetchMpe_6"); }; actionclass verify_chip_mark_7 { funccall("AnalyzeFetchMpe_7"); }; -# TODO RTC 208211 /** Mainline NCE/TCE handling */ actionclass mainline_nce_tce_handling { - TBDDefaultCallout; + funccall("AnalyzeFetchNceTce"); }; /** Handle Mainline AUEs/IAUEs */ actionclass mainline_aue_iaue_handling { - TBDDefaultCallout; + funccall("AnalyzeFetchAueIaue"); + mem_port_L; + threshold1; }; /** Mainline UE handling */ actionclass mainline_ue_handling { - TBDDefaultCallout; + threshold( field(33 / 30 min ) ); # To prevent flooding. Will be unmasked + # when background scrubbing resumes after + # targeted diagnostics is complete. 
+ funccall("AnalyzeFetchUe"); }; actionclass mainline_ue_handling_UERE { - TBDDefaultCallout; SueSource; + mainline_ue_handling; }; /** Handle Mainline IUEs */ actionclass mainline_iue_handling { - TBDDefaultCallout; + # An IUE itself is not a SUE source, however, a threshold of IUEs will + # trigger a port failure, which will generate SUEs. The port failure could + # also crash the machine so we want to make sure this bit is flagged as an + # SUE just in case it is needed in the checkstop analysis. + SueSource; + # Thresholding done in the plugin + funccall("AnalyzeMainlineIue"); }; /** Handle Maintenance IUEs */ actionclass maintenance_iue_handling { - TBDDefaultCallout; + # An IUE itself is not a SUE source, however, a threshold of IUEs will + # trigger a port failure, which will generate SUEs. The port failure could + # also crash the machine so we want to make sure this bit is flagged as an + # SUE just in case it is needed in the checkstop analysis. + SueSource; + # Thresholding done in the plugin + funccall("AnalyzeMaintIue"); }; actionclass memory_impe_handling { - TBDDefaultCallout; + funccall("AnalyzeImpe"); }; /** Handle Maintenance AUEs */ actionclass maintenance_aue_handling { - TBDDefaultCallout; + funccall("AnalyzeMaintAue"); + mem_port_L; + threshold1; }; /** Handle Maintenance IAUEs */ actionclass maintenance_iaue_handling { - TBDDefaultCallout; + all_dimm_H_memport_L; + threshold1; }; /** RDF RCD Parity Error */ @@ -369,8 +388,7 @@ actionclass rdf_rcd_parity_error_UERE /** SRQ RCD Parity Error */ actionclass srq_rcd_parity_error { - funccall("CalloutAttachedDimmsHigh"); - callout(connected(TYPE_MEM_PORT,0), MRU_LOW); + all_dimm_H_memport_L; threshold32pday; }; @@ -382,14 +400,15 @@ actionclass srq_rcd_parity_error_UERE actionclass mem_port_failure { - TBDDefaultCallout; + all_dimm_H_memport_L; + threshold1; # Threshold 1 }; ################################################################################ # Analyze groups 
################################################################################ -actionclass analyzeOCMB_LFIR { analyze(gOCMB_LFIR); }; +actionclass analyzeOCMB_LFIR { analyze(gOCMB_LFIR); }; actionclass analyzeMMIOFIR { analyze(gMMIOFIR); }; actionclass analyzeSRQFIR { analyze(gSRQFIR); }; actionclass analyzeMCBISTFIR { analyze(gMCBISTFIR); }; diff --git a/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb_regs.rule b/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb_regs.rule index 2a36dd1e541..fcb48e05d72 100644 --- a/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb_regs.rule +++ b/src/usr/diag/prdf/common/plat/explorer/explorer_ocmb_regs.rule @@ -430,3 +430,22 @@ capture group default; }; + ############################################################################ + # Misc + ############################################################################ + + register FARB0 + { + name "MB_SIM.SRQ.MBA_FARB0Q"; + scomaddr 0x08011415; + capture group default; + }; + + register EXP_MSR + { + name "Explorer Mark Shadow Register"; + scomaddr 0x08011C0C; + capture group default; + }; + + diff --git a/src/usr/diag/prdf/common/plat/explorer/prdfExplorerPlugins_common.C b/src/usr/diag/prdf/common/plat/explorer/prdfExplorerPlugins_common.C index 69d63b34557..229af4cd6fb 100644 --- a/src/usr/diag/prdf/common/plat/explorer/prdfExplorerPlugins_common.C +++ b/src/usr/diag/prdf/common/plat/explorer/prdfExplorerPlugins_common.C @@ -114,6 +114,8 @@ int32_t Ddr4PhyInterrupt( ExtensibleChip * i_chip, } PRDF_PLUGIN_DEFINE( explorer_ocmb, Ddr4PhyInterrupt ); +//------------------------------------------------------------------------------ + /** * @brief OCMB_LFIR[39:46] - Foxhound Fatal * @param i_chip An OCMB chip. 
@@ -215,6 +217,8 @@ int32_t CalloutAttachedDimmsHigh( ExtensibleChip * i_chip, } PRDF_PLUGIN_DEFINE( explorer_ocmb, CalloutAttachedDimmsHigh ); +//------------------------------------------------------------------------------ + /** * @brief RDF RCD Parity Error * @param i_chip An OCMB chip. @@ -275,6 +279,8 @@ int32_t RdfRcdParityError( ExtensibleChip * i_chip, } PRDF_PLUGIN_DEFINE( explorer_ocmb, RdfRcdParityError ); +//------------------------------------------------------------------------------ + /** * @brief RDFFIR[0:7] - Mainline MPE. * @param i_chip OCMB chip. @@ -302,6 +308,170 @@ PLUGIN_FETCH_MPE_ERROR( 7 ) #undef PLUGIN_FETCH_MPE_ERROR +//------------------------------------------------------------------------------ + +/** + * @brief RDFFIR[8:9] - Mainline NCE and/or TCE. + * @param i_chip OCMB chip. + * @param io_sc The step code data struct. + * @return SUCCESS + */ +int32_t AnalyzeFetchNceTce( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + MemEcc::analyzeFetchNceTce( i_chip, io_sc ); + return SUCCESS; // nothing to return to rule code +} +PRDF_PLUGIN_DEFINE( explorer_ocmb, AnalyzeFetchNceTce ); + +//------------------------------------------------------------------------------ + +/** + * @brief RDFFIR[14] - Mainline UE. + * @param i_chip OCMB chip. + * @param io_sc The step code data struct. + * @return SUCCESS + */ +int32_t AnalyzeFetchUe( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + MemEcc::analyzeFetchUe( i_chip, io_sc ); + return SUCCESS; // nothing to return to rule code +} +PRDF_PLUGIN_DEFINE( explorer_ocmb, AnalyzeFetchUe ); + +//------------------------------------------------------------------------------ + +/** + * @brief RDFFIR[17] - Mainline read IUE. + * @param i_chip OCMB chip. + * @param io_sc The step code data struct. + * @return PRD_NO_CLEAR_FIR_BITS if IUE threshold is reached, else SUCCESS. 
+ */ +int32_t AnalyzeMainlineIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + int32_t rc = SUCCESS; + MemEcc::analyzeMainlineIue( i_chip, io_sc ); + + #ifdef __HOSTBOOT_MODULE + + if ( MemEcc::queryIueTh(i_chip, io_sc) ) + rc = PRD_NO_CLEAR_FIR_BITS; + + #endif + + return rc; // PRD_NO_CLEAR_FIR_BITS when the IUE threshold was reached +} +PRDF_PLUGIN_DEFINE( explorer_ocmb, AnalyzeMainlineIue ); + +//------------------------------------------------------------------------------ + +/** + * @brief RDFFIR[37] - Maint IUE. + * @param i_chip OCMB chip. + * @param io_sc The step code data struct. + * @return PRD_NO_CLEAR_FIR_BITS if IUE threshold is reached, else SUCCESS. + */ +int32_t AnalyzeMaintIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + int32_t rc = SUCCESS; + MemEcc::analyzeMaintIue( i_chip, io_sc ); + + #ifdef __HOSTBOOT_MODULE + + if ( MemEcc::queryIueTh(i_chip, io_sc) ) + rc = PRD_NO_CLEAR_FIR_BITS; + + #endif + + return rc; // PRD_NO_CLEAR_FIR_BITS when the IUE threshold was reached +} +PRDF_PLUGIN_DEFINE( explorer_ocmb, AnalyzeMaintIue ); + +//------------------------------------------------------------------------------ + +/** + * @brief RDFFIR[19,39] - Mainline and Maint IMPE + * @param i_chip OCMB chip. + * @param io_sc The step code data struct. + * @return SUCCESS + */ +int32_t AnalyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ) +{ + MemEcc::analyzeImpe( i_chip, io_sc ); + return SUCCESS; // nothing to return to rule code +} +PRDF_PLUGIN_DEFINE( explorer_ocmb, AnalyzeImpe ); + +//------------------------------------------------------------------------------ + +/** + * @brief RDFFIR[13,16] - Mainline AUE and IAUE + * @param i_chip OCMB chip. + * @param io_sc The step code data struct.
+ * @return SUCCESS + */ +int32_t AnalyzeFetchAueIaue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[explorer_ocmb::AnalyzeFetchAueIaue] " + + MemAddr addr; + if ( SUCCESS != getMemReadAddr(i_chip, + MemAddr::READ_AUE_ADDR, + addr) ) + { + PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x,READ_AUE_ADDR) failed", + i_chip->getHuid() ); + } + else + { + MemRank rank = addr.getRank(); + MemoryMru mm { i_chip->getTrgt(), rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm, MRU_HIGH ); + } + + return SUCCESS; // nothing to return to rule code + + #undef PRDF_FUNC +} +PRDF_PLUGIN_DEFINE( explorer_ocmb, AnalyzeFetchAueIaue ); + +//------------------------------------------------------------------------------ + +/** + * @brief RDFFIR[33] - Maintenance AUE + * @param i_chip OCMB chip. + * @param io_sc The step code data struct. + * @return SUCCESS + */ +int32_t AnalyzeMaintAue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[explorer_ocmb::AnalyzeMaintAue] " + + MemAddr addr; + if ( SUCCESS != getMemMaintAddr(i_chip, addr) ) + { + PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", + i_chip->getHuid() ); + } + else + { + MemRank rank = addr.getRank(); + MemoryMru mm { i_chip->getTrgt(), rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm, MRU_HIGH ); + } + + return SUCCESS; // nothing to return to rule code + + #undef PRDF_FUNC +} +PRDF_PLUGIN_DEFINE( explorer_ocmb, AnalyzeMaintAue ); + + //############################################################################## // // TLXFIR diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemDbUtils.H b/src/usr/diag/prdf/common/plat/mem/prdfMemDbUtils.H index affd8c5b31f..80586976efd 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemDbUtils.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemDbUtils.H @@ -301,17 +301,6 @@ void banTps( ExtensibleChip * i_chip, getOcmbDataBundle(i_chip)->iv_maskMainlineNceTce = true; } 
-template<> inline -void banTps( ExtensibleChip * i_chip, - const MemRank & i_rank ) -{ - // Call banTps for the parent OCMB - ExtensibleChip * ocmbChip = PlatServices::getConnectedParent( i_chip, - TARGETING::TYPE_OCMB_CHIP ); - banTps( ocmbChip, i_rank ); -} - - #endif // Hostboot Runtime only } // end namespace MemDbUtils diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C index fb2abb0c394..e097114bb95 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C @@ -150,23 +150,21 @@ uint32_t handleMemUe( ExtensibleChip * i_chip, uint32_t o_rc = SUCCESS; - PRDF_ERR( PRDF_FUNC "Function not supported yet" ); - /* TODO RTC 208211 do { // First check to see if this is a side-effect UE. - SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister("DDRPHYFIR"); + SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister("OCMB_LFIR"); o_rc = fir->Read(); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "Read() failed on DDRPHYFIR: i_chip=0x%08x", + PRDF_ERR( PRDF_FUNC "Read() failed on OCMB_LFIR: i_chip=0x%08x", i_chip->getHuid() ); break; } - // Check DDRPHYFIR[54:55,57:59] to determine if this is a side-effect. - if ( 0 != (fir->GetBitFieldJustified(54,6) & 0x37) ) + // Check OCMB_LFIR[38] to determine if this is a side-effect. + if ( fir->IsBitSet(38) ) { - // This is a side-effect. Callout the MCA. + // This is a side-effect. Callout the OCMB. PRDF_TRAC( PRDF_FUNC "Memory UE is side-effect of DDRPHY error" ); io_sc.service_data->SetCallout( i_chip->getTrgt() ); io_sc.service_data->setServiceCall(); @@ -174,7 +172,8 @@ uint32_t handleMemUe( ExtensibleChip * i_chip, else { // Handle the memory UE.
- o_rc = __handleMemUe( i_chip, i_addr, i_type, io_sc ); + o_rc = __handleMemUe( i_chip, i_addr, i_type, + io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "__handleMemUe(0x%08x,%d) failed", @@ -184,7 +184,6 @@ uint32_t handleMemUe( ExtensibleChip * i_chip, } } while (0); - */ return o_rc; @@ -383,6 +382,52 @@ uint32_t maskMemPort( ExtensibleChip * i_chip ) #undef PRDF_FUNC } +template<> +uint32_t maskMemPort( ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[MemEcc::maskMemPort] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // Mask all FIRs on the OCMB in the chiplet FIRs. + SCAN_COMM_REGISTER_CLASS * chipletMask = + i_chip->getRegister("OCMB_CHIPLET_FIR_MASK"); + SCAN_COMM_REGISTER_CLASS * chipletSpaMask = + i_chip->getRegister("OCMB_CHIPLET_SPA_FIR_MASK"); + + chipletMask->setAllBits(); + chipletSpaMask->setAllBits(); + + o_rc = chipletMask->Write() | chipletSpaMask->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x", i_chip->getHuid() ); + break; + } + + #ifdef __HOSTBOOT_RUNTIME + + // Dynamically deallocate the port. 
+ if ( SUCCESS != MemDealloc::port( i_chip ) ) + { + PRDF_ERR( PRDF_FUNC "MemDealloc::port(0x%08x) " + "failed", i_chip->getHuid() ); + } + + #endif + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + #endif // __HOSTBOOT_MODULE //------------------------------------------------------------------------------ @@ -445,6 +490,62 @@ uint32_t triggerPortFail( ExtensibleChip * i_chip ) #undef PRDF_FUNC } +template<> +uint32_t triggerPortFail( ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[MemEcc::triggerPortFail] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + OcmbDataBundle * db = getOcmbDataBundle( i_chip ); + + do + { + // trigger a port fail + // set FARB0[59] - MBA_FARB0Q_CFG_INJECT_PARITY_ERR_CONSTANT and + // FARB0[40] - MBA_FARB0Q_CFG_INJECT_PARITY_ERR_ADDR5 + SCAN_COMM_REGISTER_CLASS * farb0 = i_chip->getRegister("FARB0"); + + o_rc = farb0->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() FARB0 failed: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + + farb0->SetBit(59); + farb0->SetBit(40); + + o_rc = farb0->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() FARB0 failed: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + + // reset thresholds to prevent issuing multiple port failures on + // the same port + for ( auto & resetTh : db->iv_iueTh ) + { + resetTh.second.reset(); + } + + db->iv_iuePortFail = true; + + break; + }while(0); + + + return o_rc; + + #undef PRDF_FUNC +} + #endif // __HOSTBOOT_RUNTIME //------------------------------------------------------------------------------ @@ -475,6 +576,30 @@ bool queryIueTh( ExtensibleChip * i_chip, return iueAtTh; } +template<> +bool queryIueTh( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + bool iueAtTh = false; + + OcmbDataBundle * db = getOcmbDataBundle( i_chip ); + + // 
Loop through all our thresholds + for ( auto & th : db->iv_iueTh ) + { + // If threshold reached + if ( th.second.thReached(io_sc) ) + { + iueAtTh = true; + } + } + + return iueAtTh; +} + #endif //------------------------------------------------------------------------------ @@ -938,6 +1063,9 @@ uint32_t analyzeFetchUe( ExtensibleChip * i_chip, template uint32_t analyzeFetchUe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t analyzeFetchUe( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ @@ -1022,8 +1150,6 @@ uint32_t handleMemIue( ExtensibleChip * i_chip, #undef PRDF_FUNC } -//------------------------------------------------------------------------------ - template<> uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank, @@ -1036,9 +1162,6 @@ uint32_t handleMemIue( ExtensibleChip * i_chip, uint32_t o_rc = SUCCESS; - PRDF_ERR( PRDF_FUNC "Function not supported yet" ); - /* TODO RTC 208211 - // Add the DIMM to the callout list. MemoryMru mm { i_chip->getTrgt(), i_rank, MemoryMruData::CALLOUT_RANK }; io_sc.service_data->SetCallout( mm ); @@ -1051,13 +1174,13 @@ uint32_t handleMemIue( ExtensibleChip * i_chip, if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() ) break; // Get the data bundle from chip. - McaDataBundle * db = getMcaDataBundle( i_chip ); + OcmbDataBundle * db = getOcmbDataBundle( i_chip ); // If we have already caused a port fail, mask the IUE bits. if ( true == db->iv_iuePortFail ) { SCAN_COMM_REGISTER_CLASS * mask_or = - i_chip->getRegister("MCAECCFIR_MASK_OR"); + i_chip->getRegister("RDFFIR_MASK_OR"); mask_or->SetBit(17); mask_or->SetBit(37); @@ -1090,7 +1213,7 @@ uint32_t handleMemIue( ExtensibleChip * i_chip, // the error log has been committed. // Mask off the entire port to avoid collateral. 
- o_rc = MemEcc::maskMemPort( i_chip ); + o_rc = MemEcc::maskMemPort( i_chip ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort(0x%08x) failed", @@ -1103,8 +1226,6 @@ uint32_t handleMemIue( ExtensibleChip * i_chip, #endif // __HOSTBOOT_MODULE - */ - return o_rc; #undef PRDF_FUNC @@ -1112,14 +1233,14 @@ uint32_t handleMemIue( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ -template<> -uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ) +template +uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemEcc::analyzeMainlineIue] " PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); + PRDF_ASSERT( T == i_chip->getType() ); uint32_t o_rc = SUCCESS; @@ -1129,7 +1250,7 @@ uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, // not likely that we will have two independent failure modes at the // same time. So we just assume the address is correct. 
MemAddr addr; - o_rc = getMemReadAddr( i_chip, MemAddr::READ_RCE_ADDR, addr ); + o_rc = getMemReadAddr( i_chip, MemAddr::READ_RCE_ADDR, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x, READ_RCE_ADDR) failed", @@ -1138,7 +1259,7 @@ uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, } MemRank rank = addr.getRank(); - o_rc = handleMemIue( i_chip, rank, io_sc ); + o_rc = handleMemIue( i_chip, rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,m%ds%d) failed", @@ -1153,16 +1274,23 @@ uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, #undef PRDF_FUNC } +template +uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); + //------------------------------------------------------------------------------ -template<> -uint32_t analyzeMaintIue( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ) +template +uint32_t analyzeMaintIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemEcc::analyzeMaintIue] " PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); + PRDF_ASSERT( T == i_chip->getType() ); uint32_t o_rc = SUCCESS; @@ -1170,7 +1298,7 @@ uint32_t analyzeMaintIue( ExtensibleChip * i_chip, { // Use the current address in the MCBMCAT. 
MemAddr addr; - o_rc = getMemMaintAddr( i_chip, addr ); + o_rc = getMemMaintAddr( i_chip, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", @@ -1179,7 +1307,7 @@ uint32_t analyzeMaintIue( ExtensibleChip * i_chip, } MemRank rank = addr.getRank(); - o_rc = handleMemIue( i_chip, rank, io_sc ); + o_rc = handleMemIue( i_chip, rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,m%ds%d) failed", @@ -1194,6 +1322,13 @@ uint32_t analyzeMaintIue( ExtensibleChip * i_chip, #undef PRDF_FUNC } +template +uint32_t analyzeMaintIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t analyzeMaintIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); + //------------------------------------------------------------------------------ template<> @@ -1307,6 +1442,118 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, #undef PRDF_FUNC } +template<> +uint32_t analyzeImpe( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[MemEcc::analyzeImpe] " + + // i_chip is dereferenced by the type check, so validate it first. + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // get the mark shadow register + SCAN_COMM_REGISTER_CLASS * msr = i_chip->getRegister("EXP_MSR"); + + o_rc = msr->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on EXP_MSR: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + + TargetHandle_t trgt = i_chip->getTrgt(); + + // get galois field code - bits 8:15 of MSR + uint8_t galois = msr->GetBitFieldJustified( 8, 8 ); + + // get rank - bits 16:18 of MSR + uint8_t mrnk = msr->GetBitFieldJustified( 16, 3 ); + MemRank rank( mrnk ); + + // get symbol and DRAM + MemSymbol symbol = MemSymbol::fromGalois( trgt, rank, galois ); + if ( !symbol.isValid() ) + { + PRDF_ERR( PRDF_FUNC "Galois 0x%02x from EXP_MSR is invalid: 0x%08x," + "0x%02x", galois, i_chip->getHuid(), rank.getKey() ); + o_rc = FAIL; + break; + } + + // Add the DIMM to the callout
list + MemoryMru memmru( trgt, rank, MemoryMruData::CALLOUT_RANK ); + io_sc.service_data->SetCallout( memmru ); + + #ifdef __HOSTBOOT_MODULE + // get data bundle from chip + OcmbDataBundle * db = getOcmbDataBundle( i_chip ); + uint8_t dram = symbol.getDram(); + + // Increment the count and check threshold. + if ( db->getImpeThresholdCounter()->inc(rank, dram, io_sc) ) + { + // Make the error log predictive if DRAM Repairs are disabled or if + // the number of DRAMs on this rank with IMPEs has reached threshold + if ( areDramRepairsDisabled() || + db->getImpeThresholdCounter()->queryDrams(rank, dram, io_sc) ) + { + io_sc.service_data->setServiceCall(); + } + else // Otherwise, place a chip mark on the failing DRAM. + { + MemMark chipMark( trgt, rank, galois ); + o_rc = MarkStore::writeChipMark( i_chip, rank, + chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) failed", + i_chip->getHuid(), rank.getKey() ); + break; + } + + o_rc = MarkStore::chipMarkCleanup( i_chip, rank, + io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed", + i_chip->getHuid(), rank.getKey() ); + break; + } + } + } + + // If a predictive callout is made, mask both mainline and maintenance + // attentions. 
+ if ( io_sc.service_data->queryServiceCall() ) + { + SCAN_COMM_REGISTER_CLASS * mask + = i_chip->getRegister( "RDFFIR_MASK_OR" ); + mask->SetBit(19); // mainline + mask->SetBit(39); // maintenance + o_rc = mask->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_OR: " + "0x%08x", i_chip->getHuid() ); + break; + } + } + #endif // __HOSTBOOT_MODULE + + } while (0); + + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ template<> diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H index f06df3c0f50..0fd71dd8b8e 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H @@ -51,7 +51,7 @@ namespace MemEcc * @brief Adds the memory CE to the callout list and CE table. Will also issue * dynamic memory deallocation when appropriate. Returns true if TPS is * required. - * @param i_chip MCA, MBA, or MEM_PORT. + * @param i_chip MCA, MBA, or OCMB. * @param i_addr Failed address. * @param i_symbol Failed symbol. * @param o_doTps True if TPS is required. False otherwise. @@ -74,7 +74,7 @@ uint32_t handleMemCe( ExtensibleChip * i_chip, const MemAddr & i_addr, * of the DIMMs, the UE table will not be updated and no dynamic memory * deallocation. * - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param i_addr Failed address. * @param i_type The type of UE. * @param io_sc The step code data struct. @@ -96,7 +96,7 @@ uint32_t handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr, * the port failure is issued in the PostAnalysis plugin after the error log has * been committed. * - * @param i_chip MCA chip. + * @param i_chip MCA or OCMB chip. * @param i_rank Rank containing the IUE. * @param io_sc The step code data struct. * @return Non-SUCCESS if an interal function fails, SUCCESS otherwise. 
@@ -107,7 +107,7 @@ uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank, /** * @brief Handles a MPE attention. - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param i_addr Failed address. * @param i_type The type of UE. * @param io_sc The step code data struct. @@ -119,7 +119,7 @@ uint32_t handleMpe( ExtensibleChip * i_chip, const MemAddr & i_addr, /** * @brief Handles a MPE attention. - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param i_rank Target rank. * @param i_type The type of UE. * @param io_sc The step code data struct. @@ -135,7 +135,7 @@ uint32_t handleMpe( ExtensibleChip * i_chip, const MemRank & i_rank, /** * @brief Analyzes a fetch MPE attention. - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param i_rank Target rank. * @param io_sc The step code data struct. * @return Non-SUCCESS if an interal function fails, SUCCESS otherwise. @@ -146,7 +146,7 @@ uint32_t analyzeFetchMpe( ExtensibleChip * i_chip, const MemRank & i_rank, /** * @brief Analyzes a fetch NCE/TCE attention. - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param io_sc The step code data struct. * @return Non-SUCCESS if an interal function fails, SUCCESS otherwise. */ @@ -156,7 +156,7 @@ uint32_t analyzeFetchNceTce( ExtensibleChip * i_chip, /** * @brief Analyzes a fetch UE attention. - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param io_sc The step code data struct. * @return Non-SUCCESS if an interal function fails, SUCCESS otherwise. */ @@ -166,7 +166,7 @@ uint32_t analyzeFetchUe( ExtensibleChip * i_chip, /** * @brief Analyzes a fetch mainline IUE attention. - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param io_sc The step code data struct. * @return Non-SUCCESS if an interal function fails, SUCCESS otherwise. 
*/ @@ -177,7 +177,7 @@ uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, /** * @brief Analyzes a fetch maint IUE attention. - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param io_sc The step code data struct. * @return Non-SUCCESS if an interal function fails, SUCCESS otherwise. */ @@ -187,7 +187,7 @@ uint32_t analyzeMaintIue( ExtensibleChip * i_chip, /** * @brief Analyzes a maint or mainline IMPE attention. - * @param i_chip MCA or MBA. + * @param i_chip MCA, OCMB, or MBA. * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ @@ -208,7 +208,7 @@ uint32_t analyzeFetchRcePue( ExtensibleChip * i_chip, /** * @brief Will trigger a port fail. - * @param i_chip MCA chip + * @param i_chip MCA/OCMB chip * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise */ template @@ -221,7 +221,7 @@ uint32_t triggerPortFail( ExtensibleChip * i_chip ); /** * @brief Will query the data bundle and return if the IUE threshold has been * reached. - * @param i_chip MCA chip + * @param i_chip MCA/OCMB chip * @param io_sc The step code data struct. * @return True if IUE threshold is reached, false if not. */ @@ -231,7 +231,7 @@ bool queryIueTh( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); /** * @brief Will mask off an entire memory port. At runtime will issue dynamic * memory deallocation of the port. 
- * @param i_chip MCA chip + * @param i_chip MCA/OCMB chip * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise */ template diff --git a/src/usr/diag/prdf/plat/explorer/prdfExplorerPlugins.C b/src/usr/diag/prdf/plat/explorer/prdfExplorerPlugins.C index d4c82b2c1bc..abfaf55cf34 100644 --- a/src/usr/diag/prdf/plat/explorer/prdfExplorerPlugins.C +++ b/src/usr/diag/prdf/plat/explorer/prdfExplorerPlugins.C @@ -44,6 +44,49 @@ using namespace PlatServices; namespace explorer_ocmb { +//############################################################################## +// +// Special plugins +// +//############################################################################## + +/** + * @brief Plugin function called after analysis is complete but before PRD + * exits. + * @param i_chip An OCMB chip. + * @param io_sc The step code data struct. + * @note This is especially useful for any analysis that still needs to be + * done after the framework clears the FIR bits that were at attention. + * @return SUCCESS. + */ +int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[explorer_ocmb::PostAnalysis] " + + #ifdef __HOSTBOOT_RUNTIME + + // If the IUE threshold in our data bundle has been reached, we trigger + // a port fail. Once we trigger the port fail, the system may crash + // right away. Since PRD is running in the hypervisor, it is possible we + // may not get the error log. To better our chances, we trigger the port + // fail here after the error log has been committed. + if ( MemEcc::queryIueTh(i_chip, io_sc) ) + { + if ( SUCCESS != MemEcc::triggerPortFail(i_chip) ) + { + PRDF_ERR( PRDF_FUNC "triggerPortFail(0x%08x) failed", + i_chip->getHuid() ); + } + } + + #endif // __HOSTBOOT_RUNTIME + + return SUCCESS; // Always return SUCCESS for this plugin. 
+ + #undef PRDF_FUNC +} +PRDF_PLUGIN_DEFINE( explorer_ocmb, PostAnalysis ); + //############################################################################## // // MCBISTFIR @@ -85,7 +128,6 @@ int32_t McbistCmdComplete( ExtensibleChip * i_chip, } PRDF_PLUGIN_DEFINE( explorer_ocmb, McbistCmdComplete ); - } // end namespace explorer_ocmb } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C index 9286a31ee77..054a35a277f 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C @@ -1140,6 +1140,7 @@ int32_t rank( ExtensibleChip * i_chip, MemRank i_rank ) } template int32_t rank( ExtensibleChip * i_chip, MemRank i_rank ); template int32_t rank( ExtensibleChip * i_chip, MemRank i_rank ); +template int32_t rank(ExtensibleChip * i_chip, MemRank i_rank); //------------------------------------------------------------------------------ @@ -1189,6 +1190,7 @@ int32_t port( ExtensibleChip * i_chip ) } template int32_t port( ExtensibleChip * i_chip ); template int32_t port( ExtensibleChip * i_chip ); +template int32_t port( ExtensibleChip * i_chip ); //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/prdfPlatServices.C b/src/usr/diag/prdf/plat/prdfPlatServices.C index d61bb7844c9..1a79ffc5e5b 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices.C @@ -396,7 +396,7 @@ uint32_t getMemAddrRange( ExtensibleChip * i_chip, #define PRDF_FUNC "[PlatServices::getMemAddrRange] " PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); /* TODO RTC 207273 - no HWP support yet uint32_t port = i_chip->getPos() % MAX_PORT_PER_OCMB;