Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

TMU prefetch (WIP): tag memory (untested)

  • Loading branch information...
commit cfaf39947ad494d6ebab27e9f328d9a48898b2ab 1 parent 8cfa0da
@sbourdeauducq sbourdeauducq authored
View
40 cores/tmu2/rtl/tmu2.v
@@ -1,6 +1,6 @@
/*
* Milkymist SoC
- * Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
+ * Copyright (C) 2007, 2008, 2009, 2010, 2011 Sebastien Bourdeauducq
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -136,7 +136,7 @@ tmu2_ctlif #(
.alpha(alpha)
);
-/* Stage 1 - Fetch vertices */
+/* Stage - Fetch vertices */
wire fetchvertex_busy;
wire fetchvertex_pipe_stb;
wire fetchvertex_pipe_ack;
@@ -187,7 +187,7 @@ tmu2_fetchvertex fetchvertex(
.dry(dry)
);
-/* Stage 2 - Vertical interpolation division operands */
+/* Stage - Vertical interpolation division operands */
wire vdivops_busy;
wire vdivops_pipe_stb;
wire vdivops_pipe_ack;
@@ -243,7 +243,7 @@ tmu2_vdivops vdivops(
.dry_f(dry_f)
);
-/* Stage 3 - Vertical division */
+/* Stage - Vertical division */
wire vdiv_busy;
wire vdiv_pipe_stb;
wire vdiv_pipe_ack;
@@ -313,7 +313,7 @@ tmu2_vdiv vdiv(
.dry_f(dry_f2)
);
-/* Stage 4 - Vertical interpolation */
+/* Stage - Vertical interpolation */
wire vinterp_busy;
wire vinterp_pipe_stb;
wire vinterp_pipe_ack;
@@ -363,7 +363,7 @@ tmu2_vinterp vinterp(
.tey(tey)
);
-/* Stage 5 - Horizontal interpolation division operands */
+/* Stage - Horizontal interpolation division operands */
wire hdivops_busy;
wire hdivops_pipe_stb;
wire hdivops_pipe_ack;
@@ -403,7 +403,7 @@ tmu2_hdivops hdivops(
.diff_y(diff_y)
);
-/* Stage 6 - Horizontal division */
+/* Stage - Horizontal division */
wire hdiv_busy;
wire hdiv_pipe_stb;
wire hdiv_pipe_ack;
@@ -451,7 +451,7 @@ tmu2_hdiv hdiv(
.diff_y_r(diff_y_r)
);
-/* Stage 7 - Horizontal interpolation */
+/* Stage - Horizontal interpolation */
wire hinterp_busy;
wire hinterp_pipe_stb;
wire hinterp_pipe_ack;
@@ -489,7 +489,7 @@ tmu2_hinterp hinterp(
.ty(ty)
);
-/* Stage 8 - Mask texture coordinates */
+/* Stage - Mask texture coordinates */
wire mask_busy;
wire mask_pipe_stb;
wire mask_pipe_ack;
@@ -522,7 +522,7 @@ tmu2_mask mask(
.ty_m(ty_m)
);
-/* Stage 9 - Clamp texture coordinates and filter out off-screen points */
+/* Stage - Clamp texture coordinates and filter out off-screen points */
wire clamp_busy;
wire clamp_pipe_stb;
wire clamp_pipe_ack;
@@ -557,7 +557,7 @@ tmu2_clamp clamp(
.ty_c(ty_c)
);
-/* Stage 10 - Address generator */
+/* Stage - Address generator */
wire adrgen_busy;
wire adrgen_pipe_stb;
wire adrgen_pipe_ack;
@@ -600,7 +600,7 @@ tmu2_adrgen #(
.y_frac(y_frac)
);
-/* Stage 11a - Buffer */
+/* Stage - Buffer */
wire buffer1_busy;
wire buffer1_pipe_stb;
wire buffer1_pipe_ack;
@@ -629,7 +629,7 @@ tmu2_buffer #(
.dat_o({dadr_buf, tadra_buf, tadrb_buf, tadrc_buf, tadrd_buf, x_frac_buf, y_frac_buf})
);
-/* Stage 11b - Texel cache */
+/* Stage - Texel cache */
wire texcache_busy;
wire texcache_pipe_stb;
wire texcache_pipe_ack;
@@ -677,7 +677,7 @@ tmu2_texcache #(
.y_frac_f(y_frac_f)
);
-/* Stage 11c - Buffer */
+/* Stage - Buffer */
wire buffer2_busy;
wire buffer2_pipe_stb;
wire buffer2_pipe_ack;
@@ -706,7 +706,7 @@ tmu2_buffer #(
.dat_o({dadr_f_buf, tcolora_buf, tcolorb_buf, tcolorc_buf, tcolord_buf, x_frac_f_buf, y_frac_f_buf})
);
-/* Stage 11 - Blend neighbouring pixels for bilinear filtering */
+/* Stage - Blend neighbouring pixels for bilinear filtering */
wire blend_busy;
wire blend_pipe_stb;
wire blend_pipe_ack;
@@ -736,7 +736,7 @@ tmu2_blend #(
.color(color)
);
-/* Stage 11 - Apply decay effect and chroma key filtering. */
+/* Stage - Apply decay effect and chroma key filtering. */
wire decay_busy;
wire decay_pipe_stb;
wire decay_pipe_ack;
@@ -767,7 +767,7 @@ tmu2_decay #(
);
`ifdef TMU_HAS_ALPHA
-/* Stage 12 - Fetch destination pixel for alpha blending */
+/* Stage - Fetch destination pixel for alpha blending */
wire fdest_busy;
wire fdest_pipe_stb;
wire fdest_pipe_ack;
@@ -803,7 +803,7 @@ tmu2_fdest #(
.dcolor(dcolor)
);
-/* Stage 13 - Alpha blending */
+/* Stage - Alpha blending */
wire alpha_busy;
wire alpha_pipe_stb;
wire alpha_pipe_ack;
@@ -837,7 +837,7 @@ assign fmldr_adr = {fml_depth{1'bx}};
assign fmldr_stb = 1'b0;
`endif
-/* Stage 14 - Burst assembler */
+/* Stage - Burst assembler */
reg burst_flush;
wire burst_busy;
wire burst_pipe_stb;
@@ -874,7 +874,7 @@ tmu2_burst #(
.burst_do(burst_do)
);
-/* Stage 15 - Pixel output */
+/* Stage - Pixel output */
wire pixout_busy;
tmu2_pixout #(
View
50 cores/tmu2/rtl/tmu2_dpram.v
@@ -1,6 +1,6 @@
/*
* Milkymist SoC
- * Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
+ * Copyright (C) 2007, 2008, 2009, 2010, 2011 Sebastien Bourdeauducq
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -15,61 +15,39 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-/* Double-port RAM with double write-capable port */
+/* Double port RAM (1 read-only + 1 write-only), read-through */
module tmu2_dpram #(
parameter depth = 11, /* < log2 of the capacity in words */
parameter width = 32
) (
input sys_clk,
- input ce,
- input [depth-1:0] a,
- input we,
- input [width-1:0] di,
- output reg [width-1:0] do,
+ input [depth-1:0] ra,
+ input re,
+ output [width-1:0] rd,
- input [depth-1:0] a2,
- input we2,
- input [width-1:0] di2,
- output reg [width-1:0] do2
+ input [depth-1:0] wa,
+ input we,
+ input [width-1:0] wd
);
reg [width-1:0] ram[0:(1 << depth)-1];
-reg [width-1:0] do_tmp;
+reg [depth-1:0] rar;
always @(posedge sys_clk) begin
- if(ce) begin
- if(we)
- ram[a] <= di;
- do_tmp <= ram[a];
- do <= do_tmp;
- end
+ if(re)
+ rar <= ra;
+ if(we)
+ ram[wa] <= wd;
end
-reg [width-1:0] do2_tmp;
+assign rd = ram[rar];
-always @(posedge sys_clk) begin
- if(ce) begin
- if(we2)
- ram[a2] <= di2;
- do2_tmp <= ram[a2];
- do2 <= do2_tmp;
- end
-end
// synthesis translate_off
-/*
- * For some reason, in Verilog the result of an undefined multiplied by zero
- * seems to be undefined.
- * This causes problems with pixels that texcache won't fetch because some fractional
- * parts are zero: the blend unit yields an undefined result on those, instead of ignoring
- * the contribution of the undefined pixel.
- * Work around this by initializing the memories.
- */
-
integer i;
initial begin
for(i=0;i<(1 << depth);i=i+1)
View
72 cores/tmu2/rtl/tmu2_dpram_sw.v
@@ -1,72 +0,0 @@
-/*
- * Milkymist SoC
- * Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-/* Double-port RAM with single write-capable port */
-
-module tmu2_dpram_sw #(
- parameter depth = 11, /* < log2 of the capacity in words */
- parameter width = 32
-) (
- input sys_clk,
- input ce,
-
- input [depth-1:0] a,
- input we,
- input [width-1:0] di,
- output reg [width-1:0] do,
-
- input [depth-1:0] a2,
- output reg [width-1:0] do2
-);
-
-reg [width-1:0] ram[0:(1 << depth)-1];
-
-reg [width-1:0] do_tmp;
-reg [width-1:0] do2_tmp;
-
-always @(posedge sys_clk) begin
- if(ce) begin
- if(we)
- ram[a] <= di;
- do_tmp <= ram[a];
- do2_tmp <= ram[a2];
- do <= do_tmp;
- do2 <= do2_tmp;
- end
-end
-
-
-// synthesis translate_off
-
-/*
- * For some reason, in Verilog the result of an undefined multiplied by zero
- * seems to be undefined.
- * This causes problems with pixels that texcache won't fetch because some fractional
- * parts are zero: the blend unit yields an undefined result on those, instead of ignoring
- * the contribution of the undefined pixel.
- * Work around this by initializing the memories.
- */
-
-integer i;
-initial begin
- for(i=0;i<(1 << depth);i=i+1)
- ram[i] = 0;
-end
-
-// synthesis translate_on
-
-endmodule
View
79 cores/tmu2/rtl/tmu2_qpram.v
@@ -1,79 +0,0 @@
-/*
- * Milkymist SoC
- * Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-module tmu2_qpram #(
- parameter depth = 11, /* < log2 of the capacity in words */
- parameter width = 8
-) (
- input sys_clk,
- input ce,
-
- /* Read port 1 */
- input [depth-1:0] a1,
- output [width-1:0] d1,
-
- /* Read port 2 */
- input [depth-1:0] a2,
- output [width-1:0] d2,
-
- /* Read port 3 */
- input [depth-1:0] a3,
- output [width-1:0] d3,
-
- /* Read port 4 */
- input [depth-1:0] a4,
- output [width-1:0] d4,
-
- /* Write port - we=1 disables read ports 1 and 3 */
- input we,
- input [depth-1:0] aw,
- input [width-1:0] dw
-);
-
-tmu2_dpram_sw #(
- .depth(depth),
- .width(width)
-) ram1 (
- .sys_clk(sys_clk),
- .ce(ce),
-
- .a(we ? aw : a1),
- .we(we),
- .di(dw),
- .do(d1),
-
- .a2(a2),
- .do2(d2)
-);
-
-tmu2_dpram_sw #(
- .depth(depth),
- .width(width)
-) ram2 (
- .sys_clk(sys_clk),
- .ce(ce),
-
- .a(we ? aw : a3),
- .we(we),
- .di(dw),
- .do(d3),
-
- .a2(a4),
- .do2(d4)
-);
-
-endmodule
View
82 cores/tmu2/rtl/tmu2_qpram32.v
@@ -1,82 +0,0 @@
-/*
- * Milkymist SoC
- * Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-module tmu2_qpram32 #(
- parameter depth = 11 /* < log2 of the capacity in 32-bit words */
-) (
- input sys_clk,
- input ce,
-
- /* 32-bit read port 1 */
- input [depth-1:0] a1,
- output [31:0] d1,
-
- /* 32-bit read port 2 */
- input [depth-1:0] a2,
- output [31:0] d2,
-
- /* 32-bit read port 3 */
- input [depth-1:0] a3,
- output [31:0] d3,
-
- /* 32-bit read port 4 */
- input [depth-1:0] a4,
- output [31:0] d4,
-
- /* 64-bit write port - we=1 disables read ports */
- input we,
- input [depth-1-1:0] aw,
- input [63:0] dw
-);
-
-tmu2_dpram #(
- .depth(depth),
- .width(32)
-) ram1 (
- .sys_clk(sys_clk),
- .ce(ce),
-
- .a(we ? {aw, 1'b0} : a1),
- .we(we),
- .di(dw[63:32]),
- .do(d1),
-
- .a2(we ? {aw, 1'b1} : a2),
- .we2(we),
- .di2(dw[31:0]),
- .do2(d2)
-);
-
-tmu2_dpram #(
- .depth(depth),
- .width(32)
-) ram2 (
- .sys_clk(sys_clk),
- .ce(ce),
-
- .a(we ? {aw, 1'b0} : a3),
- .we(we),
- .di(dw[63:32]),
- .do(d3),
-
- .a2(we ? {aw, 1'b1} : a4),
- .we2(we),
- .di2(dw[31:0]),
- .do2(d4)
-);
-
-endmodule
View
281 cores/tmu2/rtl/tmu2_tagmem.v
@@ -0,0 +1,281 @@
+/*
+ * Milkymist SoC
+ * Copyright (C) 2007, 2008, 2009, 2010, 2011 Sebastien Bourdeauducq
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Tag memory stage of the texel cache.
+ *
+ * For each request, the tags of the four texel addresses (channels a-d) are
+ * looked up in four tag RAMs that share a single write port and therefore
+ * always hold the same contents. A miss flag is reported for each 'leader'
+ * channel whose stored tag differs from the requested one, and the tags of
+ * the missed lines are rewritten, one per clock cycle.
+ *
+ * Address split, as used below:
+ *   bits [cache_depth-1:5]          cache index (tag RAM address)
+ *   bits [fml_depth-1:cache_depth]  tag
+ *
+ * Handshake invariant: pipe_ack_o and tag_re are always asserted together,
+ * so an upstream request is acknowledged exactly when it is captured.
+ *
+ * NOTE(review): 'flush' is not acted upon and the tags carry no valid bit;
+ * cache invalidation still has to be implemented.
+ */
+module tmu2_tagmem #(
+	parameter cache_depth = 13,
+	parameter fml_depth = 26
+) (
+	input sys_clk,
+	input sys_rst,
+
+	input flush,
+	output reg busy,
+
+	/* Upstream pipeline interface */
+	input pipe_stb_i,
+	output reg pipe_ack_o,
+	input [fml_depth-1-1:0] dadr,
+	input [fml_depth-1:0] tadra,
+	input [fml_depth-1:0] tadrb,
+	input [fml_depth-1:0] tadrc,
+	input [fml_depth-1:0] tadrd,
+	input [5:0] x_frac,
+	input [5:0] y_frac,
+
+	/* Downstream pipeline interface: registered request + miss flags */
+	output reg pipe_stb_o,
+	input pipe_ack_i,
+	output reg [fml_depth-1-1:0] dadr_f,
+	output reg [fml_depth-1:0] tadra_f,
+	output reg [fml_depth-1:0] tadrb_f,
+	output reg [fml_depth-1:0] tadrc_f,
+	output reg [fml_depth-1:0] tadrd_f,
+	output reg [5:0] x_frac_f,
+	output reg [5:0] y_frac_f,
+	output miss_a,
+	output miss_b,
+	output miss_c,
+	output miss_d
+);
+
+/* Extract cache indices. */
+wire [cache_depth-1-5:0] ci_a = tadra[cache_depth-1:5];
+wire [cache_depth-1-5:0] ci_b = tadrb[cache_depth-1:5];
+wire [cache_depth-1-5:0] ci_c = tadrc[cache_depth-1:5];
+wire [cache_depth-1-5:0] ci_d = tadrd[cache_depth-1:5];
+
+/* Determine 'valid' channels, i.e. channels that will have
+ * an influence on the result of the bilinear filter.
+ * Channel a is always valid.
+ */
+wire valid_b = x_frac != 6'd0;
+wire valid_c = y_frac != 6'd0;
+wire valid_d = (x_frac != 6'd0) & (y_frac != 6'd0);
+
+/* Group channels that have the same cache address.
+ * In each group, elect a 'leader' channel.
+ * The leader is chosen arbitrarily among the valid channels in the group.
+ * The consequence of this is that it is sufficient and necessary
+ * to take care of cache misses on the leader channels only.
+ * Leaders always have pairwise distinct cache indices.
+ */
+wire lead_a = ~(valid_b & (ci_a == ci_b)) & ~(valid_c & (ci_a == ci_c)) & ~(valid_d & (ci_a == ci_d));
+wire lead_b = valid_b & ~(valid_c & (ci_b == ci_c)) & ~(valid_d & (ci_b == ci_d));
+wire lead_c = valid_c & ~(valid_d & (ci_c == ci_d));
+wire lead_d = valid_d;
+
+/* Tag memory: four RAMs, one read port per channel, common write port. */
+reg tag_re;
+reg tag_we;
+reg [cache_depth-5-1:0] tag_wa;
+reg [fml_depth-cache_depth-1:0] tag_wd;
+
+wire [fml_depth-cache_depth-1:0] tag_rd_a;
+tmu2_dpram #(
+	.depth(cache_depth-5),
+	.width(fml_depth-cache_depth)
+) tag_a (
+	.sys_clk(sys_clk),
+
+	.ra(ci_a),
+	.re(tag_re),
+	.rd(tag_rd_a),
+
+	.wa(tag_wa),
+	.we(tag_we),
+	.wd(tag_wd)
+);
+
+wire [fml_depth-cache_depth-1:0] tag_rd_b;
+tmu2_dpram #(
+	.depth(cache_depth-5),
+	.width(fml_depth-cache_depth)
+) tag_b (
+	.sys_clk(sys_clk),
+
+	.ra(ci_b),
+	.re(tag_re),
+	.rd(tag_rd_b),
+
+	.wa(tag_wa),
+	.we(tag_we),
+	.wd(tag_wd)
+);
+
+wire [fml_depth-cache_depth-1:0] tag_rd_c;
+tmu2_dpram #(
+	.depth(cache_depth-5),
+	.width(fml_depth-cache_depth)
+) tag_c (
+	.sys_clk(sys_clk),
+
+	.ra(ci_c),
+	.re(tag_re),
+	.rd(tag_rd_c),
+
+	.wa(tag_wa),
+	.we(tag_we),
+	.wd(tag_wd)
+);
+
+wire [fml_depth-cache_depth-1:0] tag_rd_d;
+tmu2_dpram #(
+	.depth(cache_depth-5),
+	.width(fml_depth-cache_depth)
+) tag_d (
+	.sys_clk(sys_clk),
+
+	.ra(ci_d),
+	.re(tag_re),
+	.rd(tag_rd_d),
+
+	.wa(tag_wa),
+	.we(tag_we),
+	.wd(tag_wd)
+);
+
+/* Miss detection.
+ * The request is registered with the same enable as the tag RAM read
+ * address, so the *_r values always describe the request whose tags
+ * are currently present on tag_rd_*.
+ */
+reg req_valid;
+reg [cache_depth-1-5:0] ci_a_r;
+reg [cache_depth-1-5:0] ci_b_r;
+reg [cache_depth-1-5:0] ci_c_r;
+reg [cache_depth-1-5:0] ci_d_r;
+reg [fml_depth-cache_depth-1:0] ct_a_r;
+reg [fml_depth-cache_depth-1:0] ct_b_r;
+reg [fml_depth-cache_depth-1:0] ct_c_r;
+reg [fml_depth-cache_depth-1:0] ct_d_r;
+reg lead_a_r;
+reg lead_b_r;
+reg lead_c_r;
+reg lead_d_r;
+
+always @(posedge sys_clk) begin
+	if(sys_rst)
+		req_valid <= 1'b0;
+	else if(tag_re)
+		req_valid <= pipe_stb_i;
+	if(tag_re) begin
+		ci_a_r <= ci_a;
+		ci_b_r <= ci_b;
+		ci_c_r <= ci_c;
+		ci_d_r <= ci_d;
+		ct_a_r <= tadra[fml_depth-1:cache_depth];
+		ct_b_r <= tadrb[fml_depth-1:cache_depth];
+		ct_c_r <= tadrc[fml_depth-1:cache_depth];
+		ct_d_r <= tadrd[fml_depth-1:cache_depth];
+		lead_a_r <= lead_a;
+		lead_b_r <= lead_b;
+		lead_c_r <= lead_c;
+		lead_d_r <= lead_d;
+	end
+end
+
+/* The tag RAMs are read-through: once the tag of a missed line has been
+ * rewritten, the corresponding miss flag clears by itself on the next cycle.
+ */
+assign miss_a = lead_a_r & (ct_a_r != tag_rd_a);
+assign miss_b = lead_b_r & (ct_b_r != tag_rd_b);
+assign miss_c = lead_c_r & (ct_c_r != tag_rd_c);
+assign miss_d = lead_d_r & (ct_d_r != tag_rd_d);
+
+wire more_than_one_miss = (miss_a & miss_b) | (miss_a & miss_c) | (miss_a & miss_d)
+	| (miss_b & miss_c) | (miss_b & miss_d)
+	| (miss_c & miss_d);
+
+/* Tag rewrite: tag_sel picks the channel whose line gets its tag replaced. */
+reg [1:0] tag_sel;
+always @(*) begin
+	case(tag_sel)
+		2'd0: begin
+			tag_wa = ci_a_r;
+			tag_wd = ct_a_r;
+		end
+		2'd1: begin
+			tag_wa = ci_b_r;
+			tag_wd = ct_b_r;
+		end
+		2'd2: begin
+			tag_wa = ci_c_r;
+			tag_wd = ct_c_r;
+		end
+		default: begin
+			tag_wa = ci_d_r;
+			tag_wd = ct_d_r;
+		end
+	endcase
+end
+
+/* Control logic */
+reg state;
+reg next_state;
+
+parameter RUNNING = 1'd0;
+parameter RESOLVE_MISS = 1'd1;
+
+always @(posedge sys_clk) begin
+	if(sys_rst)
+		state <= RUNNING;
+	else
+		state <= next_state;
+end
+
+always @(*) begin
+	next_state = state;
+
+	busy = 1'b1;
+	pipe_ack_o = 1'b0;
+	pipe_stb_o = 1'b0;
+
+	tag_re = 1'b0;
+	tag_we = 1'b0;
+	tag_sel = 2'd0;
+
+	case(state)
+		RUNNING: begin
+			/* Idle only when no request is held in the stage. */
+			busy = req_valid;
+			pipe_ack_o = 1'b1;
+			tag_re = 1'b1;
+			if(req_valid) begin
+				pipe_stb_o = 1'b1;
+				/* Rewrite the tag of the first missed line, if any. */
+				tag_we = 1'b1;
+				if(miss_a)
+					tag_sel = 2'd0;
+				else if(miss_b)
+					tag_sel = 2'd1;
+				else if(miss_c)
+					tag_sel = 2'd2;
+				else if(miss_d)
+					tag_sel = 2'd3;
+				else
+					tag_we = 1'b0;
+				/* Downstream stalled: hold the request, and do not touch
+				 * the tags so that the miss flags remain stable.
+				 */
+				if(~pipe_ack_i) begin
+					tag_re = 1'b0;
+					tag_we = 1'b0;
+					pipe_ack_o = 1'b0;
+				end
+				/* Several lines missed: the remaining tags are rewritten in
+				 * RESOLVE_MISS. No new request may be accepted meanwhile,
+				 * as the registered request is still needed.
+				 */
+				if(more_than_one_miss) begin
+					tag_re = 1'b0;
+					pipe_ack_o = 1'b0;
+					if(pipe_ack_i)
+						next_state = RESOLVE_MISS;
+				end
+			end
+		end
+		RESOLVE_MISS: begin
+			/* The request has already been forwarded downstream;
+			 * rewrite one outstanding tag per cycle.
+			 */
+			tag_we = 1'b1;
+			if(miss_a)
+				tag_sel = 2'd0;
+			else if(miss_b)
+				tag_sel = 2'd1;
+			else if(miss_c)
+				tag_sel = 2'd2;
+			else if(miss_d)
+				tag_sel = 2'd3;
+			else begin
+				/* All tags up to date: capture (and acknowledge)
+				 * the next request.
+				 */
+				tag_we = 1'b0;
+				tag_re = 1'b1;
+				pipe_ack_o = 1'b1;
+				next_state = RUNNING;
+			end
+		end
+	endcase
+end
+
+/* Forward data */
+always @(posedge sys_clk) begin
+	if(tag_re) begin
+		dadr_f <= dadr;
+		tadra_f <= tadra;
+		tadrb_f <= tadrb;
+		tadrc_f <= tadrc;
+		tadrd_f <= tadrd;
+		x_frac_f <= x_frac;
+		y_frac_f <= y_frac;
+	end
+end
+
+endmodule
View
370 cores/tmu2/rtl/tmu2_texcache.v
@@ -1,6 +1,6 @@
/*
* Milkymist SoC
- * Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
+ * Copyright (C) 2007, 2008, 2009, 2010, 2011 Sebastien Bourdeauducq
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -15,6 +15,12 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+/*
+ * The general idea behind this module is explained in the paper
+ * "Prefetching in a Texture Cache Architecture"
+ * by Homan Igehy, Matthew Eldridge, and Kekoa Proudfoot, Stanford University
+ */
+
module tmu2_texcache #(
parameter cache_depth = 13, /* < log2 of the capacity in 8-bit words */
parameter fml_depth = 26
@@ -65,374 +71,12 @@ module tmu2_texcache #(
*
*/
-/* MEMORIES */
-wire [fml_depth-1:0] indexa;
-wire [fml_depth-1:0] indexb;
-wire [fml_depth-1:0] indexc;
-wire [fml_depth-1:0] indexd;
-
-reg ram_ce;
-
-wire [31:0] datamem_d1;
-wire [31:0] datamem_d2;
-wire [31:0] datamem_d3;
-wire [31:0] datamem_d4;
-
-reg datamem_we;
-wire [cache_depth-3-1:0] datamem_aw;
-
-tmu2_qpram32 #(
- .depth(cache_depth-2)
-) datamem (
- .sys_clk(sys_clk),
- .ce(ram_ce),
-
- .a1(indexa[cache_depth-1:2]),
- .d1(datamem_d1),
- .a2(indexb[cache_depth-1:2]),
- .d2(datamem_d2),
- .a3(indexc[cache_depth-1:2]),
- .d3(datamem_d3),
- .a4(indexd[cache_depth-1:2]),
- .d4(datamem_d4),
-
- .we(datamem_we),
- .aw(datamem_aw),
- .dw(fml_di)
-);
-
-wire [1+fml_depth-cache_depth-1:0] tagmem_d1; /* < valid bit + tag */
-wire [1+fml_depth-cache_depth-1:0] tagmem_d2;
-wire [1+fml_depth-cache_depth-1:0] tagmem_d3;
-wire [1+fml_depth-cache_depth-1:0] tagmem_d4;
-
-reg tagmem_we;
-wire [cache_depth-1-5:0] tagmem_aw;
-wire [1+fml_depth-cache_depth-1:0] tagmem_dw;
-
-tmu2_qpram #(
- .depth(cache_depth-5),
- .width(1+fml_depth-cache_depth)
-) tagmem (
- .sys_clk(sys_clk),
- .ce(ram_ce),
- .a1(indexa[cache_depth-1:5]),
- .d1(tagmem_d1),
- .a2(indexb[cache_depth-1:5]),
- .d2(tagmem_d2),
- .a3(indexc[cache_depth-1:5]),
- .d3(tagmem_d3),
- .a4(indexd[cache_depth-1:5]),
- .d4(tagmem_d4),
-
- .we(tagmem_we),
- .aw(tagmem_aw),
- .dw(tagmem_dw)
-);
-
-/* REQUEST TRACKER */
-reg invalidate_req;
-wire rqvalid_0 = pipe_stb_i & ~invalidate_req;
-wire [fml_depth-1-1:0] dadr_0 = dadr;
-wire [5:0] x_frac_0 = x_frac;
-wire [5:0] y_frac_0 = y_frac;
wire [fml_depth-1:0] tadra8_0 = {tadra, 1'b0};
wire [fml_depth-1:0] tadrb8_0 = {tadrb, 1'b0};
wire [fml_depth-1:0] tadrc8_0 = {tadrc, 1'b0};
wire [fml_depth-1:0] tadrd8_0 = {tadrd, 1'b0};
-reg rqvalid_1;
-reg [fml_depth-1-1:0] dadr_1;
-reg [5:0] x_frac_1;
-reg [5:0] y_frac_1;
-reg [fml_depth-1:0] tadra8_1;
-reg [fml_depth-1:0] tadrb8_1;
-reg [fml_depth-1:0] tadrc8_1;
-reg [fml_depth-1:0] tadrd8_1;
-
-reg rqvalid_2;
-reg [fml_depth-1-1:0] dadr_2;
-reg [5:0] x_frac_2;
-reg [5:0] y_frac_2;
-reg ignore_b_2;
-reg ignore_c_2;
-reg ignore_d_2;
-reg [fml_depth-1:0] tadra8_2;
-reg [fml_depth-1:0] tadrb8_2;
-reg [fml_depth-1:0] tadrc8_2;
-reg [fml_depth-1:0] tadrd8_2;
-
-wire rqt_ce;
-
-always @(posedge sys_clk) begin
- if(sys_rst) begin
- rqvalid_1 <= 1'b0;
- rqvalid_2 <= 1'b0;
- end else begin
- if(rqt_ce) begin
- rqvalid_1 <= rqvalid_0;
- dadr_1 <= dadr_0;
- x_frac_1 <= x_frac_0;
- y_frac_1 <= y_frac_0;
- tadra8_1 <= tadra8_0;
- tadrb8_1 <= tadrb8_0;
- tadrc8_1 <= tadrc8_0;
- tadrd8_1 <= tadrd8_0;
-
- rqvalid_2 <= rqvalid_1;
- dadr_2 <= dadr_1;
- x_frac_2 <= x_frac_1;
- y_frac_2 <= y_frac_1;
- ignore_b_2 <= x_frac_1 == 6'd0;
- ignore_c_2 <= y_frac_1 == 6'd0;
- ignore_d_2 <= (x_frac_1 == 6'd0) | (y_frac_1 == 6'd0);
- tadra8_2 <= tadra8_1;
- tadrb8_2 <= tadrb8_1;
- tadrc8_2 <= tadrc8_1;
- tadrd8_2 <= tadrd8_1;
- end
- end
-end
-
-/* OUTPUT DATA GENERATOR */
-assign dadr_f = dadr_2;
-assign x_frac_f = x_frac_2;
-assign y_frac_f = y_frac_2;
-
-assign tcolora = tadra8_2[1] ? datamem_d1[15:0] : datamem_d1[31:16];
-assign tcolorb = tadrb8_2[1] ? datamem_d2[15:0] : datamem_d2[31:16];
-assign tcolorc = tadrc8_2[1] ? datamem_d3[15:0] : datamem_d3[31:16];
-assign tcolord = tadrd8_2[1] ? datamem_d4[15:0] : datamem_d4[31:16];
-
-/* INDEX GENERATOR */
-reg index_sel;
-
-assign indexa = index_sel ? tadra8_2 : tadra8_0;
-assign indexb = index_sel ? tadrb8_2 : tadrb8_0;
-assign indexc = index_sel ? tadrc8_2 : tadrc8_0;
-assign indexd = index_sel ? tadrd8_2 : tadrd8_0;
-
-/* HIT DETECTION */
-wire valid_a = tagmem_d1[1+fml_depth-cache_depth-1];
-wire [fml_depth-1-cache_depth:0] tag_a = tagmem_d1[fml_depth-cache_depth-1:0];
-wire valid_b = tagmem_d2[1+fml_depth-cache_depth-1];
-wire [fml_depth-1-cache_depth:0] tag_b = tagmem_d2[fml_depth-cache_depth-1:0];
-wire valid_c = tagmem_d3[1+fml_depth-cache_depth-1];
-wire [fml_depth-1-cache_depth:0] tag_c = tagmem_d3[fml_depth-cache_depth-1:0];
-wire valid_d = tagmem_d4[1+fml_depth-cache_depth-1];
-wire [fml_depth-1-cache_depth:0] tag_d = tagmem_d4[fml_depth-cache_depth-1:0];
-
-wire hit_a = valid_a & (tag_a == tadra8_2[fml_depth-1:cache_depth]);
-wire hit_b = ignore_b_2 | (valid_b & (tag_b == tadrb8_2[fml_depth-1:cache_depth]));
-wire hit_c = ignore_c_2 | (valid_c & (tag_c == tadrc8_2[fml_depth-1:cache_depth]));
-wire hit_d = ignore_d_2 | (valid_d & (tag_d == tadrd8_2[fml_depth-1:cache_depth]));
-
-`ifdef VERIFY_TEXCACHE
-integer x, y;
-reg [15:0] expected;
-always @(posedge sys_clk) begin
- if(pipe_stb_o & pipe_ack_i) begin
- x = (tadra8_2/2) % 512;
- y = (tadra8_2/2) / 512;
- $image_get(0, x, y, expected);
- if(tcolora !== expected) begin
- $display("CACHE TEST FAILED [A]! (%d, %d): expected %x, got %x", x, y, expected, tcolora);
- $finish;
- end
- if(~ignore_b_2) begin
- x = (tadrb8_2/2) % 512;
- y = (tadrb8_2/2) / 512;
- $image_get(0, x, y, expected);
- if(tcolorb !== expected) begin
- $display("CACHE TEST FAILED [B]! (%d, %d): expected %x, got %x", x, y, expected, tcolorb);
- $finish;
- end
- end
- if(~ignore_c_2) begin
- x = (tadrc8_2/2) % 512;
- y = (tadrc8_2/2) / 512;
- $image_get(0, x, y, expected);
- if(tcolorc !== expected) begin
- $display("CACHE TEST FAILED [C]! (%d, %d): expected %x, got %x", x, y, expected, tcolorc);
- $finish;
- end
- end
- if(~ignore_d_2) begin
- x = (tadrd8_2/2) % 512;
- y = (tadrd8_2/2) / 512;
- $image_get(0, x, y, expected);
- if(tcolord !== expected) begin
- $display("CACHE TEST FAILED [D]! (%d, %d): expected %x, got %x", x, y, expected, tcolord);
- $finish;
- end
- end
- end
-end
-`endif
-
-/* FLUSH & MISS HANDLING */
-reg [fml_depth-1:0] fetch_adr;
-reg fetch_adr_ce;
-
-always @(posedge sys_clk) begin
- if(fetch_adr_ce) begin
- if(~hit_a)
- fetch_adr <= tadra8_2;
- else if(~hit_b)
- fetch_adr <= tadrb8_2;
- else if(~hit_c)
- fetch_adr <= tadrc8_2;
- else if(~hit_d)
- fetch_adr <= tadrd8_2;
- end
-end
-
-reg flush_mode;
-wire flush_done;
-reg [cache_depth-1-5:0] flush_counter;
-always @(posedge sys_clk) begin
- if(flush_mode)
- flush_counter <= flush_counter + 1'd1;
- else
- flush_counter <= {cache_depth-5{1'b0}};
-end
-assign flush_done = &flush_counter;
-
-reg write_valid;
-assign tagmem_aw = flush_mode ? flush_counter : fetch_adr[cache_depth-1:5];
-assign tagmem_dw = {write_valid, fetch_adr[fml_depth-1:cache_depth]};
-
-reg [1:0] burst_counter;
-assign datamem_aw = {fetch_adr[cache_depth-1:5], burst_counter};
-
-assign fml_adr = {fetch_adr[fml_depth-1:5], 5'd0};
-
-/* FSM-BASED CONTROLLER */
-reg [3:0] state;
-reg [3:0] next_state;
-
-parameter IDLE = 4'd0;
-parameter DATA1 = 4'd1;
-parameter DATA2 = 4'd2;
-parameter DATA3 = 4'd3;
-parameter DATA4 = 4'd4;
-parameter HANDLED_MISS0 = 4'd5;
-parameter HANDLED_MISS1 = 4'd6;
-parameter HANDLED_MISS = 4'd7;
-parameter FLUSHPIPE1 = 4'd8;
-parameter FLUSHPIPE2 = 4'd9;
-parameter FLUSH = 4'd10;
-
-always @(posedge sys_clk) begin
- if(sys_rst)
- state <= IDLE;
- else
- state <= next_state;
-end
-
-assign rqt_ce = pipe_ack_o | invalidate_req;
-
-always @(*) begin
- next_state = state;
-
- tagmem_we = 1'b0;
- write_valid = 1'b1;
-
- datamem_we = 1'b0;
- burst_counter = 2'bx;
-
- flush_mode = 1'b0;
-
- fml_stb = 1'b0;
-
- busy = 1'b1;
- pipe_stb_o = 1'b0;
- pipe_ack_o = 1'b0;
-
- invalidate_req = 1'b0;
- fetch_adr_ce = 1'b0;
-
- index_sel = 1'b0;
-
- ram_ce = 1'b1;
- case(state)
- IDLE: begin
- busy = rqvalid_1|rqvalid_2;
- pipe_stb_o = rqvalid_2 & hit_a & hit_b & hit_c & hit_d;
- pipe_ack_o = ~rqvalid_2 | ((hit_a & hit_b & hit_c & hit_d) & pipe_ack_i);
- ram_ce = ~rqvalid_2 | ((hit_a & hit_b & hit_c & hit_d) & pipe_ack_i);
- fetch_adr_ce = 1'b1;
- if(rqvalid_2 & (~hit_a | ~hit_b | ~hit_c | ~hit_d)) begin
- next_state = DATA1;
- end else if(flush)
- next_state = FLUSH;
- end
- DATA1: begin
- index_sel = 1'b1;
- fml_stb = 1'b1;
- burst_counter = 2'd0;
- datamem_we = 1'b1;
- tagmem_we = 1'b1;
- if(fml_ack)
- next_state = DATA2;
- end
- DATA2: begin
- index_sel = 1'b1;
- burst_counter = 2'd1;
- datamem_we = 1'b1;
- next_state = DATA3;
- end
- DATA3: begin
- index_sel = 1'b1;
- burst_counter = 2'd2;
- datamem_we = 1'b1;
- next_state = DATA4;
- end
- DATA4: begin
- index_sel = 1'b1;
- burst_counter = 2'd3;
- datamem_we = 1'b1;
- fetch_adr_ce = 1'b1;
- if(~hit_a | ~hit_b | ~hit_c | ~hit_d)
- next_state = DATA1;
- else
- next_state = HANDLED_MISS0;
- end
- /* wait for the written data to make its way through the pipelined RAM */
- HANDLED_MISS0: begin
- index_sel = 1'b1;
- next_state = HANDLED_MISS1;
- end
- HANDLED_MISS1: begin
- index_sel = 1'b1;
- next_state = HANDLED_MISS;
- end
- HANDLED_MISS: begin
- index_sel = 1'b1;
- pipe_stb_o = 1'b1;
- if(pipe_ack_i) begin
- invalidate_req = 1'b1;
- next_state = FLUSHPIPE1;
- end
- end
- FLUSHPIPE1: begin
- index_sel = 1'b1;
- next_state = FLUSHPIPE2;
- end
- FLUSHPIPE2: begin
- index_sel = 1'b1;
- next_state = IDLE;
- end
- FLUSH: begin
- tagmem_we = 1'b1;
- write_valid = 1'b0;
- flush_mode = 1'b1;
- if(flush_done)
- next_state = IDLE;
- end
- endcase
-end
endmodule
View
2  cores/tmu2/test/Makefile
@@ -1,7 +1,7 @@
# Usage: make TB=test_bench_file.v [ARBSRC=/path_to/fmlarb.v]
#
-ARBSRC?=../../fmlarb/rtl/fmlarb.v
+ARBSRC?=$(wildcard ../../fmlarb/rtl/*)
SOURCES= \
../rtl/tmu2_adrgen.v \
../rtl/tmu2_clamp.v \
Please sign in to comment.
Something went wrong with that request. Please try again.