Skip to content

Commit

Permalink
(a) Initial implementation of simple two-stages reservation stations …
Browse files Browse the repository at this point in the history
…(b) Improvement for FPU-32 stages controls to achieve higher pipelinization
  • Loading branch information
bandvig committed Oct 1, 2016
1 parent 8067c06 commit b2d50c6
Show file tree
Hide file tree
Showing 15 changed files with 2,856 additions and 937 deletions.
25 changes: 19 additions & 6 deletions doc/marocchino/marrochino_2_status_plans.txt
Expand Up @@ -55,7 +55,7 @@ pipelined module.
(b) Remove last stage of FETCH to improve performance
(c) Branch prediction is replaced by branch forwarding from EXECUTE
(in fact most of the instructions that affect the flag are 1 clock long, so
flag is usually computed by EXECUTED while FETCH is fetching delay slot)
flag is usually computed by EXECUTE while FETCH is fetching delay slot)
(d) CoreMark 109 Iterations/sec, i.e. 109/50 = 2.18 CoreMark/MHz


Expand All @@ -73,6 +73,12 @@ flag is usually computed by EXECUTED while FETCH is fetching delay slot)
exception processing more straight


1 October 2016
(a) Initial implementation of simple two-stage reservation stations
(b) Improved FPU-32 stage controls to achieve deeper
pipelining


Further development ideas:
The design is naturally moving toward the Tomasulo algorithm.
For example, to simplify logic that detects "unit busy" it would be
Expand All @@ -92,12 +98,19 @@ execution module. So, the obvious next steps are:


Primary TODOs:
- Implement two stages reservation stations at input of each
execution unit
- Implement fully functional two-stage reservation stations
for the Tomasulo algorithm
- Implement destination register re-naming, use the extended addresses
for forwarding in reservation stations and remove data hazard
detection from pipe control. Use just "unit busy" flags from
reservation stations instead.
- Improvement control of FPU-32 pipelines
reservation stations instead
- Implement SRT-4 integer division
- Implement FPU-64
(http://opencores.org/or1k/Architecture_Specification#ORFPX64A32)
- Verify DU

- Verify Multi-Core operations

Secondary TODOs:
- Continue improving FPU-32 controls for deeper pipelining (see
MAROCCHINO_TODO comments in pfpu32_muldiv_marocchino.v)
- Implement write-back data cache
18 changes: 9 additions & 9 deletions doc/marocchino/marrochino_3_how_to.txt
Expand Up @@ -11,7 +11,7 @@ logic is removed temporary to reduce number of files for compilation.

mor1kx_marocchino_alone
#(
.FEATURE_DEBUGUNIT("NONE"), // MAROCCHINO_TODO: not implemented
.FEATURE_DEBUGUNIT("NONE"), // MAROCCHINO_TODO: not tested
// insn cache
.OPTION_ICACHE_BLOCK_WIDTH(5),
.OPTION_ICACHE_SET_WIDTH(8),
Expand Down Expand Up @@ -90,15 +90,17 @@ u_mor1kx_marocchino

The following files must be included into project:

your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_addsub.v
your_path/mor1kx/rtl/verilog/mor1kx_spram_en_w1st.v
your_path/mor1kx/rtl/verilog/mor1kx_dpram_en_w1st_sclk.v
your_path/mor1kx/rtl/verilog/mor1kx_ocb_marocchino.v
your_path/mor1kx/rtl/verilog/mor1kx_oman_marocchino.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_addsub_marocchino.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_cmp_marocchino.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_f2i.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_i2f.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_muldiv.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_f2i_marocchino.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_i2f_marocchino.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_muldiv_marocchino.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_rnd_marocchino.v
your_path/mor1kx/rtl/verilog/pfpu32/pfpu32_top_marocchino.v
your_path/mor1kx/rtl/verilog/mor1kx_spram_en_w1st.v
your_path/mor1kx/rtl/verilog/mor1kx_dpram_en_w1st_sclk.v
your_path/mor1kx/rtl/verilog/mor1kx_bus_if_wb32.v
your_path/mor1kx/rtl/verilog/mor1kx_cache_lru.v
your_path/mor1kx/rtl/verilog/mor1kx_immu_marocchino.v
Expand All @@ -115,8 +117,6 @@ u_mor1kx_marocchino
your_path/mor1kx/rtl/verilog/mor1kx_decode_marocchino.v
your_path/mor1kx/rtl/verilog/mor1kx_execute_marocchino.v
your_path/mor1kx/rtl/verilog/mor1kx_rf_marocchino.v
your_path/mor1kx/rtl/verilog/mor1kx_ocb_marocchino.v
your_path/mor1kx/rtl/verilog/mor1kx_oman_marocchino.v
your_path/mor1kx/rtl/verilog/mor1kx_cpu_marocchino.v
your_path/mor1kx/rtl/verilog/mor1kx_marocchino_alone.v

Expand Down
63 changes: 45 additions & 18 deletions rtl/verilog/mor1kx_cpu_marocchino.v
Expand Up @@ -182,7 +182,6 @@ module mor1kx_cpu_marocchino

wire [OPTION_RF_ADDR_WIDTH-1:0] dcod_rfd_adr;
wire dcod_rf_wb;
wire do_rf_wb;
wire [OPTION_RF_ADDR_WIDTH-1:0] wb_rfd_adr;
wire wb_rf_wb;

Expand All @@ -206,9 +205,12 @@ module mor1kx_cpu_marocchino
wire dcod_do_branch;
wire [OPTION_OPERAND_WIDTH-1:0] dcod_do_branch_target;

// Delay conditional fetching till flag computation completion (see OMAN for details)
wire dcod_flag_await; // wait till flag ready & WB
wire dcod_op_brcond; // l.bf or l.bnf
// Signals to stall FETCH while we are waiting for the flag
// # flag is going to be written by multi-cycle instruction
// # like 64-bit FPU comparison or l.swa
wire dcod_flag_wb_mcycle;
// # conditional branch: l.bf or l.bnf
wire dcod_op_brcond;



Expand Down Expand Up @@ -265,7 +267,13 @@ module mor1kx_cpu_marocchino
wire grant_wb_to_mul;

// FPU-32 arithmetic part
wire [`OR1K_FPUOP_WIDTH-1:0] dcod_op_fp32_arith;
wire dcod_op_fp32_arith; // to OMAN and FPU32_ARITH
wire dcod_op_fp32_add; // to FPU32_ARITH
wire dcod_op_fp32_sub; // to FPU32_ARITH
wire dcod_op_fp32_mul; // to FPU32_ARITH
wire dcod_op_fp32_div; // to FPU32_ARITH
wire dcod_op_fp32_i2f; // to FPU32_ARITH
wire dcod_op_fp32_f2i; // to FPU32_ARITH
wire fp32_arith_busy; // indicates that arithmetic units are busy
wire fp32_arith_valid;
wire grant_wb_to_fp32_arith;
Expand All @@ -274,7 +282,8 @@ module mor1kx_cpu_marocchino
wire wb_except_fp32_arith; // generate FPx exception by FPx flags

// FPU-32 comparison part
wire [`OR1K_FPUOP_WIDTH-1:0] dcod_op_fp32_cmp;
wire dcod_op_fp32_cmp;
wire [2:0] dcod_opc_fp32_cmp;
wire wb_fp32_flag_set;
wire wb_fp32_flag_clear;
wire wb_fp32_cmp_inv;
Expand All @@ -283,6 +292,7 @@ module mor1kx_cpu_marocchino
wire wb_except_fp32_cmp;

// Forwarding comparison flag
wire busy_op_1clk_cmp; // integer or fp32
wire exec_op_1clk_cmp; // integer or fp32
wire exec_flag_set; // integer or fp32 comparison result

Expand Down Expand Up @@ -528,8 +538,11 @@ module mor1kx_cpu_marocchino
.dcod_rfb_i (dcod_rfb), // DECODE & DECODE->EXE
.dcod_do_branch_o (dcod_do_branch), // DECODE & DECODE->EXE
.dcod_do_branch_target_o (dcod_do_branch_target), // DECODE & DECODE->EXE
// Delay conditional fetching till flag computation completion (see OMAN for details)
.dcod_flag_await_o (dcod_flag_await), // DECODE & DECODE->EXE
// Signals to stall FETCH while we are waiting for the flag
// # flag is going to be written by multi-cycle instruction
// # like 64-bit FPU comparison or l.swa
.dcod_flag_wb_mcycle_o (dcod_flag_wb_mcycle), // DECODE & DECODE->EXE
// # conditional branch
.dcod_op_brcond_o (dcod_op_brcond), // DECODE & DECODE->EXE
// LSU related
.dcod_imm16_o (dcod_imm16), // DECODE & DECODE->EXE
Expand Down Expand Up @@ -562,6 +575,7 @@ module mor1kx_cpu_marocchino
// Set flag related
.dcod_op_setflag_o (dcod_op_setflag), // DECODE & DECODE->EXE
.dcod_op_fp32_cmp_o (dcod_op_fp32_cmp), // DECODE & DECODE->EXE
.dcod_opc_fp32_cmp_o (dcod_opc_fp32_cmp), // DECODE & DECODE->EXE
// Multiplier related
.dcod_op_mul_o (dcod_op_mul), // DECODE & DECODE->EXE
// Divider related
Expand All @@ -570,6 +584,12 @@ module mor1kx_cpu_marocchino
.dcod_op_div_unsigned_o (dcod_op_div_unsigned), // DECODE & DECODE->EXE
// FPU arithmetic related
.dcod_op_fp32_arith_o (dcod_op_fp32_arith), // DECODE & DECODE->EXE
.dcod_op_fp32_add_o (dcod_op_fp32_add), // DECODE & DECODE->EXE
.dcod_op_fp32_sub_o (dcod_op_fp32_sub), // DECODE & DECODE->EXE
.dcod_op_fp32_mul_o (dcod_op_fp32_mul), // DECODE & DECODE->EXE
.dcod_op_fp32_div_o (dcod_op_fp32_div), // DECODE & DECODE->EXE
.dcod_op_fp32_i2f_o (dcod_op_fp32_i2f), // DECODE & DECODE->EXE
.dcod_op_fp32_f2i_o (dcod_op_fp32_f2i), // DECODE & DECODE->EXE
// MTSPR / MFSPR
.dcod_op_mfspr_o (dcod_op_mfspr), // DECODE & DECODE->EXE
.dcod_op_mtspr_o (dcod_op_mtspr), // DECODE & DECODE->EXE
Expand Down Expand Up @@ -758,6 +778,7 @@ module mor1kx_cpu_marocchino

// FP32 comparison flag
.dcod_op_fp32_cmp_i (dcod_op_fp32_cmp), // 1CLK
.dcod_opc_fp32_cmp_i (dcod_opc_fp32_cmp), // 1CLK
.except_fpu_enable_i (except_fpu_enable), // 1CLK
.ctrl_fpu_mask_flags_inv_i (ctrl_fpu_mask_flags[`OR1K_FPCSR_IVF - `OR1K_FPCSR_OVF]), // 1CLK
.ctrl_fpu_mask_flags_inf_i (ctrl_fpu_mask_flags[`OR1K_FPCSR_INF - `OR1K_FPCSR_OVF]), // 1CLK
Expand All @@ -770,6 +791,7 @@ module mor1kx_cpu_marocchino
.wb_except_fp32_cmp_o (wb_except_fp32_cmp), // 1CLK

// Forwarding comparison flag result for conditional branch take/not
.busy_op_1clk_cmp_o (busy_op_1clk_cmp), // 1CLK
.exec_op_1clk_cmp_o (exec_op_1clk_cmp), // 1CLK
.exec_flag_set_o (exec_flag_set) // 1CLK
);
Expand All @@ -790,7 +812,7 @@ module mor1kx_cpu_marocchino
.rst (rst), // FPU32_ARITH

// pipeline control inputs
.flush_i (pipeline_flush), // FPU32_ARITH
.pipeline_flush_i (pipeline_flush), // FPU32_ARITH
.padv_decode_i (padv_decode), // FPU32_ARITH
.padv_wb_i (padv_wb), // FPU32_ARITH
.grant_wb_to_fp32_arith_i (grant_wb_to_fp32_arith), // FPU32_ARITH
Expand All @@ -806,6 +828,12 @@ module mor1kx_cpu_marocchino

// Operands and commands
.dcod_op_fp32_arith_i (dcod_op_fp32_arith), // FPU32_ARITH
.dcod_op_fp32_add_i (dcod_op_fp32_add), // FPU32_ARITH
.dcod_op_fp32_sub_i (dcod_op_fp32_sub), // FPU32_ARITH
.dcod_op_fp32_mul_i (dcod_op_fp32_mul), // FPU32_ARITH
.dcod_op_fp32_div_i (dcod_op_fp32_div), // FPU32_ARITH
.dcod_op_fp32_i2f_i (dcod_op_fp32_i2f), // FPU32_ARITH
.dcod_op_fp32_f2i_i (dcod_op_fp32_f2i), // FPU32_ARITH
// from DECODE
.dcod_rfa_i (dcod_rfa), // FPU32_ARITH
.dcod_rfb_i (dcod_rfb), // FPU32_ARITH
Expand Down Expand Up @@ -1030,20 +1058,20 @@ module mor1kx_cpu_marocchino
.dcod_op_1clk_i (dcod_op_1clk), // OMAN
.dcod_op_div_i (dcod_op_div), // OMAN
.dcod_op_mul_i (dcod_op_mul), // OMAN
.dcod_op_fp32_arith_i (dcod_op_fp32_arith[(`OR1K_FPUOP_WIDTH-1)]), // OMAN
.dcod_op_fp32_arith_i (dcod_op_fp32_arith), // OMAN
.dcod_op_ls_i (dcod_op_lsu_load | dcod_op_lsu_store), // OMAN
.dcod_op_lsu_atomic_i (dcod_op_lsu_atomic), // OMAN
.dcod_op_rfe_i (dcod_op_rfe), // OMAN

// DECODE non-latched additional information related instruction
// part #1: information stored in order control buffer
.dcod_delay_slot_i (dcod_delay_slot), // OMAN
.dcod_flag_await_i (dcod_flag_await), // OMAN
.dcod_flag_wb_i (dcod_flag_wb), // OMAN
.dcod_carry_wb_i (dcod_carry_wb), // OMAN
.dcod_rf_wb_i (dcod_rf_wb), // OMAN
.dcod_rfd_adr_i (dcod_rfd_adr), // OMAN
.pc_decode_i (pc_decode), // OMAN
.dcod_rfd_adr_i (dcod_rfd_adr), // OMAN
.dcod_rf_wb_i (dcod_rf_wb), // OMAN
.dcod_carry_wb_i (dcod_carry_wb), // OMAN
.dcod_flag_wb_mcycle_i (dcod_flag_wb_mcycle), // OMAN
.dcod_flag_wb_i (dcod_flag_wb), // OMAN
.dcod_delay_slot_i (dcod_delay_slot), // OMAN
// part #2: information required for data dependency detection
.dcod_rfa_req_i (dcod_rfa_req), // OMAN
.dcod_rfa_adr_i (dcod_rfa_adr), // OMAN
Expand All @@ -1053,6 +1081,7 @@ module mor1kx_cpu_marocchino
.dcod_carry_req_i (dcod_carry_req), // OMAN
.dcod_op_jr_i (dcod_op_jr), // OMAN
.dcod_op_brcond_i (dcod_op_brcond), // OMAN
.busy_op_1clk_cmp_i (busy_op_1clk_cmp), // OMAN
// part #3: information required to create the enable
// for external (timer/ethernet/uart/etc) interrupts
.dcod_op_lsu_store_i (dcod_op_lsu_store), // OMAN
Expand Down Expand Up @@ -1100,8 +1129,6 @@ module mor1kx_cpu_marocchino
.grant_wb_to_mul_o (grant_wb_to_mul), // OMAN
.grant_wb_to_fp32_arith_o (grant_wb_to_fp32_arith), // OMAN
.grant_wb_to_lsu_o (grant_wb_to_lsu), // OMAN
// common flag signaling that WB ir required
.do_rf_wb_o (do_rf_wb), // OMAN

// Support IBUS error handling in CTRL
.exec_jump_or_branch_o (exec_jump_or_branch), // OMAN
Expand Down
68 changes: 49 additions & 19 deletions rtl/verilog/mor1kx_decode_marocchino.v
Expand Up @@ -81,9 +81,12 @@ module mor1kx_decode_marocchino
output dcod_do_branch_o,
output [OPTION_OPERAND_WIDTH-1:0] dcod_do_branch_target_o,

// Delay conditional fetching till flag computation completion (see OMAN for details)
output dcod_flag_await_o, // wait till flag ready & WB
output dcod_op_brcond_o, // l.bf or l.bnf
// Signals to stall FETCH while we are waiting for the flag
// # flag is going to be written by multi-cycle instruction
// # like 64-bit FPU comparison or l.swa
output dcod_flag_wb_mcycle_o,
// # conditional branch: l.bf or l.bnf
output dcod_op_brcond_o,

// LSU related
output [`OR1K_IMM_WIDTH-1:0] dcod_imm16_o,
Expand Down Expand Up @@ -117,7 +120,8 @@ module mor1kx_decode_marocchino
output [OPTION_OPERAND_WIDTH-1:0] dcod_jal_result_o,
// Set flag related
output dcod_op_setflag_o,
output [`OR1K_FPUOP_WIDTH-1:0] dcod_op_fp32_cmp_o,
output dcod_op_fp32_cmp_o,
output [2:0] dcod_opc_fp32_cmp_o,

// Multiplier related
output dcod_op_mul_o,
Expand All @@ -127,8 +131,14 @@ module mor1kx_decode_marocchino
output dcod_op_div_signed_o,
output dcod_op_div_unsigned_o,

// FPU related
output [`OR1K_FPUOP_WIDTH-1:0] dcod_op_fp32_arith_o,
// FPU arithmetic part related
output dcod_op_fp32_arith_o,
output dcod_op_fp32_add_o,
output dcod_op_fp32_sub_o,
output dcod_op_fp32_mul_o,
output dcod_op_fp32_div_o,
output dcod_op_fp32_i2f_o,
output dcod_op_fp32_f2i_o,

// MTSPR / MFSPR
output dcod_op_mfspr_o,
Expand Down Expand Up @@ -262,14 +272,34 @@ module mor1kx_decode_marocchino


// --- FPU-32 arithmetic part ---
assign dcod_op_fp32_arith_o =
{(FEATURE_FPU != "NONE") & (opc_insn == `OR1K_OPCODE_FPU) & ~dcod_insn_i[3],
dcod_insn_i[`OR1K_FPUOP_WIDTH-2:0]};
assign dcod_op_fp32_arith_o = (FEATURE_FPU != "NONE") & (opc_insn == `OR1K_OPCODE_FPU) & (~dcod_insn_i[3]);
// fpu arithmetic opc:
// ===================
// 0000 = add
// 0001 = subtract
// 0010 = multiply
// 0011 = divide
// 0100 = i2f
// 0101 = f2i
assign dcod_op_fp32_add_o = dcod_op_fp32_arith_o & (dcod_insn_i[2:0] == 3'd0);
assign dcod_op_fp32_sub_o = dcod_op_fp32_arith_o & (dcod_insn_i[2:0] == 3'd1);
assign dcod_op_fp32_mul_o = dcod_op_fp32_arith_o & (dcod_insn_i[2:0] == 3'd2);
assign dcod_op_fp32_div_o = dcod_op_fp32_arith_o & (dcod_insn_i[2:0] == 3'd3);
assign dcod_op_fp32_i2f_o = dcod_op_fp32_arith_o & (dcod_insn_i[2:0] == 3'd4);
assign dcod_op_fp32_f2i_o = dcod_op_fp32_arith_o & (dcod_insn_i[2:0] == 3'd5);

// --- FPU-32 comparison part ---
assign dcod_op_fp32_cmp_o =
{(FEATURE_FPU != "NONE") & (opc_insn == `OR1K_OPCODE_FPU) & dcod_insn_i[3],
dcod_insn_i[`OR1K_FPUOP_WIDTH-2:0]};
assign dcod_op_fp32_cmp_o = (FEATURE_FPU != "NONE") & (opc_insn == `OR1K_OPCODE_FPU) & dcod_insn_i[3];
// fpu comparison opc:
// ===================
// 1000 = EQ
// 1001 = NE
// 1010 = GT
// 1011 = GE
// 1100 = LT
// 1101 = LE
assign dcod_opc_fp32_cmp_o = dcod_insn_i[2:0];



// Immediate in l.mtspr is broken up, reassemble
Expand Down Expand Up @@ -687,19 +717,19 @@ module mor1kx_decode_marocchino


// Which instructions write the comparison flag?
assign dcod_flag_wb_o = dcod_op_setflag_o |
dcod_op_fp32_cmp_o[(`OR1K_FPUOP_WIDTH-1)] |
assign dcod_flag_wb_o = dcod_op_setflag_o |
dcod_op_fp32_cmp_o |
(opc_insn == `OR1K_OPCODE_SWA);
// Which instructions require comparison flag?
// # l.cmov
assign dcod_flag_req_o = dcod_op_cmov_o;


// Multicycle instructions which cause stall branch taking in FETCH
// They are multicycle "set flag" like l.swa or float64 comparison,
// so they couldn't be forwarded into FETCH immediately from EXECUTE.
assign dcod_flag_await_o = (opc_insn == `OR1K_OPCODE_SWA);
// Conditional branches to stall FETCH if we are waiting flag
// Signals to stall FETCH while we are waiting for the flag
// # flag is going to be written by multi-cycle instruction
// # like 64-bit FPU comparison or l.swa
assign dcod_flag_wb_mcycle_o = (opc_insn == `OR1K_OPCODE_SWA);
// # conditional branch
assign dcod_op_brcond_o = dcod_op_bf | dcod_op_bnf;


Expand Down

0 comments on commit b2d50c6

Please sign in to comment.