Skip to content

Commit

Permalink
crude kernel copy timer works
Browse files Browse the repository at this point in the history
  • Loading branch information
R. Clint Whaley committed Sep 16, 2016
1 parent 08ba6f6 commit 9cc32b8
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 14 deletions.
26 changes: 19 additions & 7 deletions AtlasBase/Clint/atlas-make.base
Expand Up @@ -2427,6 +2427,13 @@ x@(rout) : $(INCAdir)/atlas_type.h @(rout).o @(dep) $(parsedeps)
@endwhile
@ENDSKIP

# help: print one-line usage reminders for several timing/testing/generator
# targets and the make variables each expects on the command line.
# "PRE"/"pre" in the messages is presumably a placeholder for the precision
# prefix -- confirm against the generated Makefile.
# NOTE(review): every message ends in "\n" inside an echo string; how echo
# treats backslash sequences differs between shells, so the output may show
# either an extra blank line or a literal backslash-n -- confirm on the
# shell make uses.
help :
@@echo "flags for amm kerns preMCFLAGS='', else preKCFLAGS=''\n"
@@echo "make PREcpytime kfnam=FILENAME mu=X nu=X knam=ROUTINE_NAME ta=[N,T] betan=[X,1,n]\n"
@@echo "make xPREammtime_pt mmrout=FILENAME mu=X nu=X ku=X mb=X nb=X kb=X M=X N=X K=X\n"
@@echo "make PREammmtst mu=X nu=X M=X N=X K=X mmrout=<FILE>\n"
@@echo "make gen_amm_[sse,avx,gvec,scalar] mu=X nu=X vlen=VL kmaj=VL rt=FILE\n"

DoSzBlk :
rm -f szblk.c xszblk
echo "int getsz(int M, int N, int mu, int nu, int vlen)" > szblk.c
Expand Down Expand Up @@ -2988,13 +2995,18 @@ x@(pre)nbtune : @(pre)nbtune.o @(pre)ammlib
# Build the @(trsm) kernel by invoking make on that target from inside
# $(L3Bdir)/kernel.
# NOTE(review): the two commands are joined with ';', so make is still run
# (in the current directory) if the cd fails -- confirm this is acceptable.
$(L3Bdir)/kernel/@(trsm) :
cd $(L3Bdir)/kernel ; make @(trsm)
@ptyp !
# Link the x@(pre)trstime_pt executable from @(pre)trstime_pt.o, the timer
# object $(SYSdir)/time.o, the @(trsm) kernel found in $(L3Bdir)/kernel,
# $(ATLASlib) and $(LIBS).
x@(pre)trstime_pt : @(pre)trstime_pt.o $(L3Bdir)/kernel/@(trsm)
$(CLINKER) $(CLINKFLAGS) -o x@(pre)trstime_pt @(pre)trstime_pt.o \
$(SYSdir)/time.o $(L3Bdir)/kernel/@(trsm) $(ATLASlib) $(LIBS)
# Run x@(pre)trstime_pt (built as a prerequisite) through $(ATLRUN) in
# $(MMTdir).  Both -m and -k are given $(mb), -n is given $(nb), all three
# unrollings are passed as 1, and the -Ma/-Mb/-Mc flags are fixed at 0/0/1.
# $(FMFS) and $(outF) are appended as supplied by the caller.
@(pre)trsmKtime : x@(pre)trstime_pt
$(ATLRUN) $(MMTdir) x@(pre)trstime_pt -p $(NPROC) $(TIDLIST) \
-m $(mb) -n $(nb) -k $(mb) -um 1 -un 1 -uk 1 $(FMFS) \
-Ma 0 -Mb 0 -Mc 1 $(outF)
# Build and run the copy-kernel timer.
#   1. compile the timer driver mmtime_pt.c with -DTIME_COPY and
#      -DCOPYK=$(knam) so it calls the kernel routine named by knam;
#   2. compile the kernel source $(kfnam) with the transpose/alpha/beta
#      selectors built from $(ta) and $(betan), and -DATL_MU/-DATL_NU taken
#      from $(mu)/$(nu);
#   3. link both objects with $(SYSdir)/time.o into x@(pre)cpytime_pt;
#   4. run that executable through $(ATLRUN) in $(MMTdir).
# The run step previously invoked the fixed name xsammtime_pt, which this
# rule neither builds nor depends on; it now runs the program linked in 3.
# NOTE(review): ${bn} appears in the object name but is not among the
# variables listed by the help target; an unset make variable expands to
# nothing, and it is used identically in the compile and link lines.
@(pre)cpytime :
	$(KC) $(KCFLAGS) -c -DBETA${betan}=1 -D@(typ)=1 -DTIME_COPY \
           -DCOPYK=$(knam) -o @(pre)cpytime_pt${bn}.o $(mySRCdir)/mmtime_pt.c
	$(@up@(pre)KC) $(@up@(pre)KCFLAGS) $(CDEFS) -D@(typ)=1 -DBETA${betan}=1 \
           -DTRANS$(ta)_=1 -DALPHA${betan}=1 -DATL_MU=$(mu) -DATL_NU=$(nu) \
           -o @(pre)cpykern.o -c $(kfnam)
	$(CLINKER) $(CLINKFLAGS) -o x@(pre)cpytime_pt @(pre)cpytime_pt${bn}.o \
           @(pre)cpykern.o $(SYSdir)/time.o $(LIBS)
	$(ATLRUN) $(MMTdir) x@(pre)cpytime_pt -p $(NPROC) $(TIDLIST) \
           -m $(mb) -n $(nb) -k $(kb) -um $(mu) -un $(nu) -uk $(ku) $(FMFS) \
           -V $(vlen) -Ma $(mvA) -Mb $(mvB) -Mc $(mvC) $(outF)

x@(pre)trstime_pt3f : @(pre)trstime_pt3f.o $(L3Bdir)/kernel/@(trsm)
$(CLINKER) $(CLINKFLAGS) -o $@ @(pre)trstime_pt3f.o \
$(SYSdir)/time.o $(L3Bdir)/kernel/@(trsm) $(ATLASlib) $(LIBS)
Expand Down
151 changes: 144 additions & 7 deletions AtlasBase/Clint/atlas.base
Expand Up @@ -36498,6 +36498,27 @@ int main(int nargs, char **args)
const TYPE *T, TYPE *B, ATL_CINT ldb, TYPE *W);
#endif
#endif
#elif defined(TIME_COPY)
#ifndef COPYK
#define COPYK ATL_USERCPMM
#endif
#ifdef REAL
#ifdef COPY_C
void COPYK(const size_t, const size_t, const TYPE, const TYPE*,
const size_t, const TYPE, TYPE*);
#else
void COPYK(const size_t, const size_t, const TYPE,
const TYPE*, const size_t, TYPE*);
#endif
#else
#ifdef COPY_C
void COPYK(const size_t, const size_t, const TYPE*, const TYPE*,
const TYPE*, const TYPE*, TYPE *, const size_t);
#else
void COPYK(const size_t, const size_t, const TYPE,
const TYPE*, const size_t, TYPE*);
#endif
#endif
#elif defined(TCPLX)
size_t rszA, rszB, rszC;
void CAMM_b0(ATL_CSZT mblks, ATL_CSZT nblks, ATL_CSZT K, const TYPE *A,
Expand Down Expand Up @@ -36580,6 +36601,11 @@ int main(int nargs, char **args)
#endif

struct kmm_struct{
#ifdef TIME_COPY
TYPE *A;
size_t nmblks, nnblks, lda;
int COLWISE;
#endif
int mb, nb, kb; /* C: mbxnb, At: kbxmb, B: kbXnb */
int mu, nu, ku; /* needed to compute mblks/nblks */
int movA, movB, movC; /* which mat move in flush array? */
Expand All @@ -36599,6 +36625,8 @@ static double getMflops(double m, double n, double k)
return(1e-6*m*m*n);
#elif defined(TIME_SYRKK)
return(1e-6*n*(n+1)*k);
#elif defined(TIME_COPY)
return(m*n);
#else
return(1e-6*2.0*m*n*k);
#endif
Expand Down Expand Up @@ -36629,28 +36657,69 @@ double GetKmmMflop
CINT mb, CINT nb, CINT kb, /* C: mbxnb, At: kbxmb, B: kbXnb */
CINT mu, CINT nu, CINT ku, CINT vlen,
int movA, int movB, int movC, /* which mat move in flush array? */
struct kmm_struct *pd, /* problem definition */
size_t FLSIZE, /* min area to move in in bytes */
CINT reps /* # calls to kmm in one timing */
)
/*
* Returns MFLOP rate of matmul kernel KMM
*/
{
CINT mblks = mb/mu, nblks = nb/nu;
const int NOMOVE = !(movA|movB|movC);
size_t szA, szB, szC, extra, setsz, nsets, tsz, i, j, incA, incB, incC, n;
void *vp=NULL;
TYPE *C, *A, *B, *a, *b, *c;
TYPE *ep, *sp; /* extra & set ptrs */
double t0, t1, mf;
size_t szA, szB, szC, extra, setsz, nsets, tsz, i, j, incA, incB, incC, n;
#ifdef TIME_COPY
const size_t lda=pd->lda;
size_t INCBLK, INCPAN, II, JJ, NN, MM;
unsigned int B0, B1;
TYPE *AA=pd->A, *aa;
#ifdef TCPLX
TYPE alpha[2] = {0.0, 0.0}, *beta = alpha;
#else
TYPE alpha=0.0, beta=0.0;
#endif
#else
const TYPE alpha=1.0;
TYPE beta=1.0;
void *vp=NULL;
#endif
CINT mblks = mb/mu, nblks = nb/nu;
const int NOMOVE = !(movA|movB|movC);
unsigned int seed = mb*kb + (nb<<14);

#ifdef TIME_COPY
/*
 * Set up the walk over the source matrix for the copy timer.
 *   MM     : number of blocks visited before moving to the next panel
 *   NN     : number of panels visited before wrapping to the start
 *   INCBLK : pointer step between consecutive blocks
 *   INCPAN : pointer step between consecutive panels
 *   B0, B1 : the two block dimensions handed to the copy kernel
 * COLWISE chooses which of the M/N directions is walked first.
 */
   if (pd->COLWISE)
   {
      NN = pd->nnblks;
      MM = pd->nmblks;
      INCBLK = mb SHIFT;
      INCPAN = lda*(nb SHIFT);
      B0 = mb;
      B1 = nb;
   }
   else
   {
      NN = pd->nmblks;
      MM = pd->nnblks;
      INCBLK = lda * (nb SHIFT);
      INCPAN = (mb SHIFT);
      B0 = nb;
      B1 = mb;
   }
/*
 * Offset the base pointer by pd->iam panels (iam is presumably this
 * thread's rank -- confirm against the struct definition).
 */
   AA += pd->iam * INCPAN;
/*
 * The timing loop adds INCBLK MM times before adding INCPAN, so remove
 * that accumulated distance: the net move per panel is one original INCPAN.
 */
   INCPAN -= INCBLK*MM;
/*
 * Start the walking pointer at this thread's first block; without this the
 * first "aa += INCBLK" in the timing loop reads an indeterminate pointer.
 */
   aa = AA;
   II = JJ = 0;
#endif
/*
* Get size for each matrix, and round up to ensure we keep alignment
*/
szA = ATL_getszA(mb, kb, mu, ku, vlen);
szB = ATL_getszB(kb, nb, ku, nu, vlen);
#ifdef TIME_COPY
szA = szB = 0;
#else
szA = ATL_getszA(mb, kb, mu, ku, vlen);
szB = ATL_getszB(kb, nb, ku, nu, vlen);
#endif
szC = ATL_getszC(mb, nb, mu, nu, vlen);
/*
* Compute the setsz & extra
Expand Down Expand Up @@ -36732,6 +36801,26 @@ double GetKmmMflop
Mjoin(PATL,ktrsmLLN_rk4)(mb, nb, 1.0, a, c, mb, B);
#elif defined(TIME_SYRKK)
KMM(mblks, nblks, kb, a, a, c, an, an, cn);
#elif defined(TIME_COPY)
/*
 * Copy timing: make one COPYK call, then advance the block pointer aa.
 * II counts blocks within the current panel (MM of them) and JJ counts
 * panels (NN of them); after the last panel aa is reset to AA.
 * NOTE(review): COPYK is passed AA, not the stepped pointer aa, so every
 * call reads the same source block -- confirm whether aa was intended.
 * NOTE(review): verify aa is set to AA before this loop is first entered.
 * NOTE(review): the COPY_C case below is empty, so no kernel call is made
 * when COPY_C is defined.
 */
#ifdef COPY_C
#else
#ifdef TREAL
COPYK(B0, B1, alpha, AA, lda, c);
#else
/* complex: two destination pointers, the first offset by half of incC */
COPYK(B0, B1, alpha, AA, lda, c+(incC>>1), c);
#endif
#endif
aa += INCBLK;
if (++II == MM)
{
II = 0;
aa += INCPAN;
if (++JJ == NN)
{
JJ = 0;
aa = AA;
}
}
#else
KMM(mblks, nblks, kb, a, b, c, an, bn, cn);
#endif
Expand Down Expand Up @@ -36785,7 +36874,7 @@ void *TimeOnCore(void *vp)
while(!chkin[i]);
#endif
kp->mf = GetKmmMflop(kp->mb, kp->nb, kp->kb, kp->mu, kp->nu, kp->ku,
kp->vlen, kp->movA, kp->movB, kp->movC,
kp->vlen, kp->movA, kp->movB, kp->movC, kp,
kp->FLSIZE, kp->reps);
return(NULL);
}
Expand Down Expand Up @@ -36885,6 +36974,9 @@ void PrintUsage(char *name, int iarg, char *arg)
fprintf(stderr, " -n <#> : nb = #\n");
fprintf(stderr, " -k <#> : kb = #\n");
fprintf(stderr, " -V <veclen>\n");
#ifdef TIME_COPY
fprintf(stderr, " -D <M> <lda> <N> <COLWISE>: copy only\n");
#endif
fprintf(stderr, " -u[mnk] <#> : M/N/K loop unrolling is #\n");
fprintf(stderr, " -r <#> : set the # of times to call KMM\n");
fprintf(stderr, " -R <mf>: set # reps to force <mf> MFLOPs\n");
Expand All @@ -36904,6 +36996,10 @@ struct kmm_struct *GetFlags(int nargs, char **args, FILE **fpout)
*fpout = NULL;
kp = malloc(sizeof(struct kmm_struct));
assert(kp);
#ifdef TIME_COPY
kp->nmblks = kp->nnblks = kp->lda = 2000;
kp->COLWISE = 0;
#endif
kp->pids = NULL;
kp->p = 1;
kp->mb = kp->nb = kp->kb = 40;
Expand All @@ -36929,6 +37025,17 @@ struct kmm_struct *GetFlags(int nargs, char **args, FILE **fpout)
PrintUsage(args[0], i, "out of arguments");
kp->vlen = atoi(args[i]);
break;
#ifdef TIME_COPY
/*
 * -D consumes the four arguments that follow the flag, in the order
 * M, lda, N, COLWISE.  M and N are stored as given in nmblks and nnblks;
 * no range or syntax checking is applied to the converted values.
 * NOTE(review): the code after the argument check assumes PrintUsage does
 * not return -- confirm.
 */
case 'D': /* -D <M> <lda> <N> <COL> */
if (i+4 >= nargs)
PrintUsage(args[0], i, "out of arguments");
kp->nmblks = atol(args[i+1]);
kp->lda = atol(args[i+2]);
kp->nnblks = atol(args[i+3]);
kp->COLWISE = atoi(args[i+4]);
i += 4;   /* skip the four values just read */
break;
#endif
case 'F':
if (++i >= nargs)
PrintUsage(args[0], i, "out of arguments");
Expand Down Expand Up @@ -37079,6 +37186,22 @@ struct kmm_struct *GetFlags(int nargs, char **args, FILE **fpout)
kp->pids[j] = j;
#endif
}
#ifdef TIME_COPY  /* Reduce M/N to nmblks, nnblks, handle P */
   {
/*
 *    On entry nmblks and nnblks hold the M and N given to -D and lda the
 *    requested leading dimension.  M and N are turned into block counts
 *    (rounding up), and the count along the direction split between
 *    threads is raised to at least p.
 *    lda stays in elements: it is rebuilt as the padded row count
 *    (nmblks*mb) plus the original lda-M gap, since the allocation and the
 *    pointer increments multiply lda by a column count.
 *    A request with lda < M gets a gap of 0 rather than wrapping around.
 */
      const size_t M = kp->nmblks;
      const size_t gap = (kp->lda > M) ? kp->lda - M : 0;
      unsigned int nb = kp->mb;

      kp->nmblks = (M+nb-1)/nb;
      if (!kp->COLWISE)
         kp->nmblks = (kp->nmblks >= kp->p) ? kp->nmblks : kp->p;
      kp->lda = kp->nmblks*nb + gap;
      nb = kp->nb;
      kp->nnblks = (kp->nnblks+nb-1)/nb;
      if (kp->COLWISE)
         kp->nnblks = (kp->nnblks >= kp->p) ? kp->nnblks : kp->p;
      kp->movA = kp->movB = 0;   /* szA/szB are 0 when timing copies */
   }
#endif
#ifdef TIME_SYRKK
kp->movB = 0;
#endif
Expand All @@ -37098,6 +37221,20 @@ int main(int nargs, char **args)
#ifndef ATL_NCPU
assert(p < 2);
#endif
#ifdef TIME_COPY
/*
 * Create the source matrix read by the copy timer: lda * (nnblks*nb)
 * elements, filled in index order with values from dumb_prand, and
 * published through kp->A.  The buffer is not released in this block.
 */
   {
      const size_t nelts = kp->lda * kp->nnblks * kp->nb;
      unsigned int seed = kp->lda+(kp->nnblks<<8);
      TYPE *src = malloc(ATL_MulBySize(nelts));
      size_t i = 0;

      kp->A = src;
      assert(src);
      while (i < nelts)
      {
         src[i] = dumb_prand(&seed);
         i++;
      }
   }
#endif
dp = TimeOnCores(kp);
free(kp);
GetStat(p, dp, &min, &max, &avg);
Expand Down

0 comments on commit 9cc32b8

Please sign in to comment.