#define _CUDA_NDARRAY_C

#include <Python.h>
#include <structmember.h>
#include "theano_mod_helper.h"

#include <numpy/arrayobject.h>
#include <iostream>

#include "cuda_ndarray.cuh"

#ifndef CNMEM_DLLEXPORT
#define CNMEM_DLLEXPORT
#endif

#include "cnmem.h"
#include "cnmem.cpp"

//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0

//If true, we fill allocated device memory with NAN.
#define ALLOC_MEMSET 0

//If true, we print out when we free a device pointer, uninitialize a
//CudaNdarray, or allocate a device pointer
#define PRINT_FREE_MALLOC 0

//If true, we do error checking at the start of functions, to make sure there
//is not a pre-existing error when the function is called.
//You probably need to set the environment variable
//CUDA_LAUNCH_BLOCKING=1, and/or modify the CNDA_THREAD_SYNC
//preprocessor macro in cuda_ndarray.cuh
//if you want this to work.
#define PRECHECK_ERROR 0

cublasHandle_t handle = NULL;
int* err_var = NULL;

/////////////////////////
// Alloc and Free
/////////////////////////

static int g_gpu_context_active = 0;


PyObject *
CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args);
static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);


/**
 *
 * In the test program I'm using, the _outstanding_mallocs decreases with every call.
 * This suggests there are more free() calls being made than alloc(), but I can't figure out why.
 *
 */
int _outstanding_mallocs[] = {0,0};

#if COMPUTE_GPU_MEM_USED
size_t _allocated_size = 0;
size_t _max_allocated_size = 0;

const int TABLE_SIZE = 10000;
struct table_struct{
    void* ptr;
    size_t size;
};
table_struct _alloc_size_table[TABLE_SIZE];
#endif

void * device_malloc(size_t size)
{
    return device_malloc(size, VERBOSE_DEVICE_MALLOC);
}

static bool g_use_cnmem = false;
static const int g_max_devices = 8;
int initCnmem(int card_number_provided, int card_nb, size_t mem) {
    static bool cnmemInitialized = false;
    if(cnmemInitialized) {
        return 0;
    }
    // On stderr to be at the same place as "Using gpu device..."
    int numDevices = 0;
    cnmemDevice_t devices[g_max_devices];
    if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
        PyErr_Format(PyExc_RuntimeError,
                     "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
                     cudaGetErrorString(cudaGetLastError()));
        return -1;
    }
    if(card_number_provided){
        numDevices = 1;
        int i = 0;
        devices[i].device = card_nb;
        devices[i].size = mem;
        ///@TODO: thejaswi: add support for multiple streams
        devices[i].numStreams = 0;
        devices[i].streams = NULL;
        devices[i].streamSizes = NULL;
    }else{
        for(int i=0;i<numDevices;++i){

void CudaNdarray_null_init(CudaNdarray* self)
{
    self->base = NULL;
    self->nd = -1;
    self->host_structure = NULL;
    self->data_allocated = 0;
    self->dev_structure_fresh = 1;
    self->dev_structure = NULL;
    self->devdata = NULL;
}

static int
CudaNdarray_uninit(CudaNdarray*self)
{
#if PRINT_FREE_MALLOC
    fprintf(stderr, "CudaNdarray_uninit %p\n", self);
#endif
    int rval = 0;
    if (self->data_allocated) {
        assert(self->devdata);
        if (device_free(self->devdata))
        {
            fprintf(stderr,
                    "CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devdata=%p)\n",
                    self, self->devdata);
            rval = -1;
        }
        self->devdata = NULL;
        self->data_allocated = 0;
    }
    if (self->dev_structure)
    {
        if (device_free(self->dev_structure))
        {
            fprintf(stderr,
                    "CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
                    self->dev_structure, self);
            rval = -1;
        }
        self->dev_structure = NULL;
    }
    if (self->host_structure)
    {
        free(self->host_structure);
        self->host_structure = NULL;
    }
    self->nd = -1;
    Py_XDECREF(self->base);
    self->base = NULL;
    return rval;
}


//make the rightmost coords change fastest
//TODO: why does a downward for-loop not work????
//TODO: use the log2_dims and driver code to remove / and %
//TODO: skip the last division (when d == 0)
#define decl_k_elemwise_unary_rowmajor(name, F) \
__global__ void name (unsigned int numEls,  \
        unsigned int nd, \
        const int * dim,  \
        const float * a_data, const int * a_str, \
        float * z_data, const int * z_str) \
{ \
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; \
    const unsigned int numThreads = blockDim.x * gridDim.x; \
    \
    for (unsigned int i = idx; i < numEls; i += numThreads) \
    { \
        unsigned int ii = i; \
        const float * a_i = a_data; \
        float * z_i = z_data; \
        for (unsigned int _d = 0; _d < nd; ++_d) \
        { \
            unsigned int d = nd - _d-1;  \
            int i_d = ii % dim[d]; /* i_d is our position in the d'th dimension */ \
            ii = ii / dim[d]; \
            a_i += i_d * a_str[d]; /* increment our a and z pointers by i_d elements */ \
            z_i += i_d * z_str[d]; \
        } \
        z_i[0] = F(a_i[0]); \
    } \
}

template<typename T> __device__ T unary_copy(T a) { return a; }
decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy, unary_copy)

template<typename T> __device__ T unary_exp(T a) { return exp(a); }
decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp)
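
// Illustrative sketch (not part of the original file): the index walk performed by the
// kernels generated by decl_k_elemwise_unary_rowmajor above, written as plain host code.
// A flat element index i is peeled apart with % and / against dim[] (rightmost dimension
// fastest), and each per-dimension position is multiplied by that dimension's stride,
// so the generated kernels work on any strided layout, not only contiguous arrays.
#if 0
static const float * example_element_address(const float * a_data, unsigned int i,
                                             unsigned int nd, const int * dim,
                                             const int * a_str)
{
    const float * a_i = a_data;
    for (unsigned int _d = 0; _d < nd; ++_d)
    {
        unsigned int d = nd - _d - 1;   // walk dimensions from the last to the first
        int i_d = i % dim[d];           // position within dimension d
        i = i / dim[d];
        a_i += i_d * a_str[d];          // strides are expressed in elements
    }
    return a_i;
}
#endif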

/////////////////////////////
// Satisfying reqs to be Type
/////////////////////////////

//DON'T use directly! If other CudaNdarrays point to it, this will cause problems; use Py_DECREF() instead.
static void
CudaNdarray_dealloc(CudaNdarray* self)
{
    if (0) std::cerr << "CudaNdarray dealloc " << self << " " << self->devdata << '\n';
    if(Py_REFCNT(self) > 1)
        printf("WARNING:CudaNdarray_dealloc called when there is still active reference to it.\n");
    CudaNdarray_uninit(self);
    Py_TYPE(self)->tp_free((PyObject*)self);
    --_outstanding_mallocs[1];
    if (0)
    {
        fprintf(stderr, "device_malloc_counts: (device) %i (obj) %i\n",
                _outstanding_mallocs[0],
                _outstanding_mallocs[1]);
    }
}

static PyObject *
CudaNdarray_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    CudaNdarray *self;

    self = (CudaNdarray *)type->tp_alloc(type, 0);
    if (self != NULL)
    {
        CudaNdarray_null_init(self);
        ++_outstanding_mallocs[1];
    }
    return (PyObject *)self;
}
static int
CudaNdarray_init(CudaNdarray *self, PyObject *args, PyObject *kwds)
{
    PyObject *arr=NULL;

    if (! PyArg_ParseTuple(args, "O", &arr))
        return -1;
    if (! PyArray_Check(arr))
    {
        PyErr_SetString(PyExc_TypeError, "PyArray arg required");
        return -1;
    }
    int rval = CudaNdarray_CopyFromArray(self, (PyArrayObject*)arr);
    return rval;
}
static PyMemberDef CudaNdarray_members[] =
{
    /*
    {"first", T_OBJECT_EX, offsetof(CudaNdarray, first), 0,
     "first name"},
    {"last", T_OBJECT_EX, offsetof(CudaNdarray, last), 0,
     "last name"},
    {"number", T_INT, offsetof(CudaNdarray, number), 0,
     "noddy number"},
    */
    {NULL}  /* Sentinel */
};

PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
{
    PyObject * dtype = NULL;
    if (args && !PyArg_ParseTuple(args, "|O", &dtype))
        return NULL;
    if (dtype) {
        PyArray_Descr* dtype2;
        // PyArray_DescrConverter tries to convert anything to a PyArray_Descr.
        if(!PyArray_DescrConverter(dtype, &dtype2))
        {
            PyObject * str = PyObject_Repr(dtype);
            PyErr_Format(PyExc_TypeError,
                         "CudaNdarray dtype parameter not understood: %s",
                         PyString_AsString(str)
            );
            Py_CLEAR(str);
            return NULL;
        }
        int typeNum = dtype2->type_num;
        Py_DECREF(dtype2);
        if (typeNum != NPY_FLOAT32)
        {
            PyObject * str = PyObject_Repr(dtype);
            PyErr_Format(PyExc_TypeError,
                         "CudaNdarray only supports the float32 dtype, provided: %d",
                         typeNum
            );
            Py_CLEAR(str);
            return NULL;
        }
    }

    int verbose = 0;
    if(self->nd>=0 && CudaNdarray_SIZE(self)==0){
        npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
        assert (npydims);
        for (int i = 0; i < self->nd; ++i)
            npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
        PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
        free(npydims);
        if (!rval){
            return NULL;
        }
        assert (PyArray_ITEMSIZE((PyArrayObject *)rval) == sizeof(real));
        return rval;
    }
    if ((self->nd < 0) || (self->devdata == 0))
    {
        PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized CudaNdarray");
        return NULL;
    }
    CudaNdarray * contiguous_self = NULL;
    if (CudaNdarray_is_c_contiguous(self))
    {
        contiguous_self = self;
        Py_INCREF(contiguous_self);
        if (verbose) std::cerr << "CreateArrayObj already contiguous" << contiguous_self << '\n';
    }
    else
    {
        contiguous_self = (CudaNdarray*)CudaNdarray_Copy(self);
        if (verbose) std::cerr << "CreateArrayObj created contiguous" << contiguous_self << '\n';
    }
    if (!contiguous_self)
    {
        return NULL;
    }

    npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
    assert (npydims);
    for (int i = 0; i < self->nd; ++i)
        npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
    PyArrayObject * rval = (PyArrayObject *) PyArray_SimpleNew(self->nd,
                                                               npydims,
                                                               REAL_TYPENUM);
    free(npydims);
    if (!rval)
    {
        Py_DECREF(contiguous_self);
        return NULL;
    }

    assert (PyArray_ITEMSIZE(rval) == sizeof(real));

    npy_intp rval_size = PyArray_SIZE(rval);
    void *rval_data = PyArray_DATA(rval);
    cudaError_t err;
    CNDA_BEGIN_ALLOW_THREADS;

    err = cudaMemcpy(rval_data, contiguous_self->devdata,
                     rval_size * sizeof(real),
                     cudaMemcpyDeviceToHost
                     );
    //CNDA_THREAD_SYNC;  // unneeded because cudaMemcpy is blocking anyway
    CNDA_END_ALLOW_THREADS;

    if (cudaSuccess != err)
    {
        PyErr_Format(PyExc_RuntimeError, "error (%s) copying data to host",
                     cudaGetErrorString(err));
        Py_DECREF(rval);
        rval = NULL;
    }

    Py_DECREF(contiguous_self);
    return (PyObject *)rval;
}

// TODO-- we have two functions here, ZEROS and Zeros.
// ZEROS is meant to be called just from C code (you don't need to pass it PyObject * s)
// but this naming is very weird, makes it look like a macro
// we should figure out the correct convention and change to that
PyObject* CudaNdarray_ZEROS(int n, int * dims)
{

    size_t total_elements = 1;

    for(size_t i=0;i<n;i++){
        // Detect overflow before it happens.
        if (dims[i] != 0 && total_elements > (SIZE_MAX / dims[i])) {
            PyErr_Format(PyExc_RuntimeError,
                         "Can't store in size_t for the bytes requested %llu * %llu",
                         (unsigned long long)total_elements,
                         (unsigned long long)dims[i]);
            return NULL;
        }
        total_elements*=dims[i];
    }

    // total_elements now contains the size of the array, in reals
    if (total_elements > (SIZE_MAX / sizeof(real))){
        PyErr_Format(PyExc_RuntimeError,
                     "Can't store in size_t for the bytes requested %llu * 4",
                     (unsigned long long)total_elements);
        return NULL;
    }
    size_t total_size = total_elements * sizeof(real);

    CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
    if (!rval)
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to New failed");
        return NULL;
    }

    if (CudaNdarray_alloc_contiguous(rval, n, dims))
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: allocation failed.");
        Py_DECREF(rval);
        return NULL;
    }

    // Fill with zeros
    //fprintf(stdout, "Sizeof: %d\n", total_size);
    if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
    {
        PyErr_Format(PyExc_MemoryError,
                     "CudaNdarray_ZEROS: Error memsetting %llu bytes of device memory.",
                     (unsigned long long)total_size);
        Py_DECREF(rval);
        return NULL;
    }

    if (cnda_copy_structure_to_device(rval))
    {
        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: syncing structure to device failed");
        Py_DECREF(rval);
        return NULL;
    }
    return (PyObject*) rval;
}
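
// Illustrative sketch (not part of the original file): as the comment above notes,
// ZEROS is the C-level entry point while Zeros below is the Python-facing static
// method.  From C, a zero-filled 2x3 float32 array would be requested roughly like
// this; the result is a new reference, or NULL with a Python error set.
#if 0
static PyObject * example_make_zeros_2x3()
{
    int dims[2] = {2, 3};
    return CudaNdarray_ZEROS(2, dims);
}
#endif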

// declared as a static method (hence 1st parameter is not used)
// Based on _Copy and _dimshuffle
PyObject* CudaNdarray_Zeros(PyObject* _unused, PyObject* shape)
{
    if(!shape)
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_Zeros: function takes at least 1 argument (0 given)");
        return NULL;
    }
    if(!PySequence_Check(shape))
    {
        PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence");
        return NULL;
    }

    int shplen = PySequence_Length(shape);

    if (shplen == 0)
    {
        return CudaNdarray_ZEROS(0, NULL);
    }

    int* newdims = (int *)malloc(sizeof(int) * shplen);

    if (!newdims)
    {
        PyErr_SetString(PyExc_MemoryError,
                        "CudaNdarray_Zeros: Failed to allocate temporary space");
        return NULL;
    }

    // start from the end to compute strides
    for (int i = shplen-1; i >= 0; --i)
    {
        PyObject* shp_el_obj = PySequence_GetItem(shape, i);
        if(shp_el_obj == NULL)
        {
            // shouldn't happen since we checked length before...
            PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: Index out of bounds in sequence");
            free(newdims);
            return NULL;
        }

        int shp_el = PyInt_AsLong(shp_el_obj);
        Py_DECREF(shp_el_obj);

        if (shp_el < 0)
        {
            PyErr_SetString(PyExc_ValueError, "CudaNdarray_Zeros: shape must contain only non-negative values for size of a dimension");
            free(newdims);
            return NULL;
        }

        newdims[i] = shp_el;
    }

    PyObject* rval = CudaNdarray_ZEROS(shplen,newdims);

    free(newdims);

    return (PyObject*)rval;
}




PyObject * CudaNdarray_Copy(const CudaNdarray * self)
{
    PyObject * rval = CudaNdarray_New();
    if ((!rval) || (-1 == self->nd))
    {
        return rval;
    }
    if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, self->nd, CudaNdarray_HOST_DIMS(self)))
    {
        Py_DECREF(rval);
        return NULL;
    }
    if (CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)rval, self))
    {
        Py_DECREF(rval);
        return NULL;
    }
    return rval;
}
PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo)
{
    assert(PyDict_Check(memo));
    PyObject * selfkey = PyInt_FromLong((long)self);
    assert(selfkey);
    if (PyDict_Contains(memo, selfkey))
    {
        PyObject * rval = PyDict_GetItem(memo, selfkey);
        Py_DECREF(selfkey);
        Py_XINCREF(rval);
        return rval;
    }
    else
    {
        PyObject * rval = CudaNdarray_Copy(self);
        if (0) std::cerr << "DeepCopy created " << rval << " devdata " << ((CudaNdarray*)rval)->devdata << "\n";
        if (NULL == rval)
        {
            Py_DECREF(selfkey);
            return NULL;
        }
        if (PyDict_SetItem(memo, selfkey, rval))
        {
            Py_DECREF(rval);
            Py_DECREF(selfkey);
            return NULL;
        }
        Py_DECREF(selfkey);
        return rval;
    }
}
PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
{
    if (!PySequence_Check(py_reduce_mask))
    {
        PyErr_SetString(PyExc_TypeError, "reduce_mask must be sequence of ints");
        return NULL;
    }
    int len = PySequence_Length(py_reduce_mask);
    if (len != self->nd)
    {
        PyErr_SetString(PyExc_TypeError, "length of reduce_mask must match self->nd");
        return NULL;
    }
    CudaNdarray * self_sum = (CudaNdarray*)CudaNdarray_New();
    if (!self_sum)
    {
        return NULL;
    }
    //TODO: allocate a fixed size dimshuffle_pattern_cache on the stack,
    //      and use it if it is big enough.
    int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int));
    int * sum_dims = dimshuffle_pattern + len;
    int n_remaining_dims = 0;
    if (!dimshuffle_pattern)
    {
        Py_DECREF(self_sum);
        PyErr_SetString(PyExc_MemoryError, "failed to alloc internal storage");
        return NULL;
    }
    for (int i = 0; i < len; ++i)
    {
        PyObject *o_i = PySequence_GetItem(py_reduce_mask, i);
        int o_i_int = PyInt_AsLong(o_i);
        Py_XDECREF(o_i);
        if (PyErr_Occurred())
        {
            Py_DECREF(self_sum);
            free(dimshuffle_pattern);
            return NULL;
        }
        if (o_i_int) // this is a dimension over which we are reducing
        {
            sum_dims[i] = 1;
        }
        else
        {
            sum_dims[i] = CudaNdarray_HOST_DIMS(self)[i];
            dimshuffle_pattern[n_remaining_dims++] = i;
        }
    }
    if (0 || CudaNdarray_alloc_contiguous(self_sum, len, sum_dims)
          || CudaNdarray_reduce_sum(self_sum, self)
          || CudaNdarray_dimshuffle(self_sum, n_remaining_dims, dimshuffle_pattern))
    {
        Py_DECREF(self_sum);
        free(dimshuffle_pattern);
        return NULL;
    }
    free(dimshuffle_pattern);
    return (PyObject*)self_sum;
}

// Reshape self to the new shape given by the tuple shape.
//
// If self is c contiguous, it returns a view. Otherwise it always makes a copy.
// TODO: make it return a view when the strides allow it even if it is not
//       c contiguous
PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
{
    if(!CudaNdarray_is_c_contiguous(self))
    {
        // allocate new space
        //TODO: test to see if we can re-use old one and take a new param to
        //      use this
        CudaNdarray* rval = (CudaNdarray*) CudaNdarray_Copy(self);
        if (!rval)
        {
            return NULL;
        }

        CudaNdarray* ret = (CudaNdarray*) CudaNdarray_Reshape(rval, shape);
        Py_XDECREF(rval);
        return (PyObject*)ret;
    }

    // check shape tuple
    unsigned int rval_nd;
    unsigned int * rval_dims;
    size_t rval_size = 1;

    if (PyTuple_Check(shape)){
        // copy shape to integer array
        rval_nd = PyTuple_Size(shape);
    }else if (PyInt_Check(shape)){
        rval_nd = 1;
    }else{
        PyErr_SetString(PyExc_TypeError, "shape must be tuple of integers or an integer");
        return NULL;
    }
    rval_dims = (unsigned int*)malloc(rval_nd * sizeof(int));

    if(PyTuple_Check(shape)){
        for (int i = 0; i < rval_nd; ++i)
        {
            rval_dims[i] = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns borrowed reference
            if (PyErr_Occurred()) //error in AsLong
            {
                free(rval_dims);
                return NULL;
            }
            if(rval_dims[i]<0){
                PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %i (must be >=0)",rval_dims[i]);
                free(rval_dims);
                return NULL;
            }
            rval_size = rval_size * rval_dims[i];
        }
    }else{
        rval_size = PyInt_AsLong(shape);
        rval_dims[0] = rval_size;
    }
    // calculate new size, assert same as old size
    if (rval_size != CudaNdarray_SIZE(self))
    {
        PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
        free(rval_dims);
        return NULL;
    }
    if (rval_size==0)
    {
        PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims);
        free(rval_dims);
        return rval;
    }

    //return a view, not a copy
    //we can do this as we checked self is c_contiguous
    CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd);

    if (!rval || 0 != rval->data_allocated
        || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
    {
        Py_XDECREF(rval);
        free(rval_dims);
        return NULL;
    }
    //set dim and stride
    int size = 1;
    for (int i = rval_nd-1; i >= 0; --i)
    {
        CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size);
        CudaNdarray_set_dim(rval, i, rval_dims[i]);
        size = size * rval_dims[i];
    }
    free(rval_dims);
    return (PyObject*)rval;
}
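
// Illustrative sketch (not part of the original file): what the stride loop at the end
// of CudaNdarray_Reshape computes.  Strides here are in elements (as everywhere in this
// file), filled right-to-left, and broadcastable (size-1) dimensions get stride 0.
// Reshaping a c-contiguous array of 12 elements to (3, 4) would yield strides {4, 1}.
#if 0
static void example_reshape_strides()
{
    unsigned int rval_dims[2] = {3, 4};
    int strides[2];
    int size = 1;
    for (int i = 1; i >= 0; --i)            // same right-to-left walk as above
    {
        strides[i] = (rval_dims[i] == 1) ? 0 : size;
        size = size * rval_dims[i];
    }
    // strides == {4, 1}, size == 12
}
#endif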

PyObject * CudaNdarray_View(const CudaNdarray * self)
{
    CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
    if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
    {
        Py_XDECREF(rval);
        rval = NULL;
    }
    else
    {
        for (int i = 0; i < self->nd; ++i)
        {
            CudaNdarray_set_dim(rval, i, CudaNdarray_HOST_DIMS(self)[i]);
            CudaNdarray_set_stride(rval, i, CudaNdarray_HOST_STRIDES(self)[i]);
        }
    }
    return (PyObject*)rval;
}

/*
 * d0,... are the output dims
 * indices are a list of index to operate on
 *         They are int32 viewed as float32.
 * a is the output
 * b is the input
 * dB0, the source leading dimensions size
 */
template
__global__ void k_take_3(const int d0, const int d1, const int d2,
                         const npy_int64* indices,
                         float* a,
                         const int sA0, const int sA1, const int sA2,
                         const float* b, const int dB0,
                         const int sB0, const int sB1, const int sB2,
                         int* err){
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
        npy_int64 idx = indices[i0];
        if (idx<0)
            idx += dB0; // To allow negative indexing.
        if ((idx < 0) || (idx >= dB0)){
            // Any value other the 0 probably work. But to be more safe, I want
            // to change all bits to prevent problem with concurrent write that
            // could cross cache line. But this should not happen with the
            // current code and driver.
            *err = 0xFFFF;
            continue;
        }
        for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
            for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
                int a_idx = i0*sA0 + i1*sA1 + i2*sA2;
                int b_idx = idx*sB0 + i1*sB1 + i2*sB2;
                a[a_idx] = b[b_idx];
            }
        }
    }
}

// We try to be similar to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip modes than raise (clip, wrap)
//self is the input that we copy data from.
//The indices that we receive MUST be an CudaNdarray(float32)
//    that is in fact a view to int64 indices
PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    int verbose = 0;
    PyObject * indices_obj = NULL;
    //int axis; Default None, that mean the flattened array.
    PyObject * axis_obj = Py_None;
    PyObject * out_obj = Py_None;
    PyObject * clipmode_obj = NULL;
    int max_threads = 1; // max threads per blocks

    if (! PyArg_ParseTuple(args, "O|OOOi", &indices_obj, &axis_obj,
                           &out_obj, &clipmode_obj, &max_threads))
        return NULL;

    //Check argument indices
    //TODO: if not a numpy.ndarray, convert to numpy.ndarray
    //TODO: If a CudaNdarray, accept it and suppose the data is int32? is float32 number of int?
    //TODO: Support ndarray of other dtype than int32
    //TODO: support list of indices that are not c_contiguous
    CudaNdarray * indices = NULL;
    if (CudaNdarray_Check(indices_obj)) {
        if (verbose) printf("cudandarray indices\n");
        indices = (CudaNdarray*) indices_obj;
        Py_INCREF(indices);
    } else if (PyArray_Check(indices_obj)) {
        if (verbose) printf("ndarray indices\n");
        if (PyArray_TYPE((PyArrayObject *)indices_obj) != NPY_INT64) {
            PyErr_SetString(PyExc_TypeError,
                            "CudaNdarray_TakeFrom: need a ndarray for indices"
                            " with dtype int64");
            return NULL;
        }
        if (PyArray_NDIM(((PyArrayObject*)indices_obj)) != 1) {
            PyErr_SetString(PyExc_TypeError,
                            "CudaNdarray_TakeFrom: need a CudaNdarray of"
                            " indices with only 1 dimension");
            return NULL;
        }
        // We need indices_obj to be contiguous, in order to take a view
        // with a different dtype.
        if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*) indices_obj)) {
            PyObject* indices_obj_contig = PyArray_NewCopy((PyArrayObject*) indices_obj, NPY_CORDER);
            if (!indices_obj_contig)
                return NULL;
            indices_obj = indices_obj_contig;
        } else {
            // Keep the refcount consistent
            Py_INCREF(indices_obj);
        }
        PyArray_Descr* float32_descr = PyArray_DescrFromType(NPY_FLOAT32);
        PyObject * indices_float32 = NULL;
        indices_float32 = PyArray_View((PyArrayObject*)indices_obj,
                                       float32_descr, NULL);
        if (verbose) printf("ndarray indices\n");
        if (!indices_float32) {
            Py_DECREF(indices_obj);
            return NULL;
        }

        indices = (CudaNdarray*) CudaNdarray_New();
        if (verbose) printf("\nndarray after new\n");
        if (! indices){
            Py_DECREF(indices_obj);
            Py_DECREF(indices_float32);
            return NULL;
        }
        if (CudaNdarray_CopyFromArray(indices,
                                      (PyArrayObject *)indices_float32)){
            Py_DECREF(indices_obj);
            Py_DECREF(indices_float32);
            return NULL;
        }
        Py_DECREF(indices_obj);
        Py_DECREF(indices_float32);
    } else {
        PyObject* py_s = PyObject_Str(indices_obj);
        const char* s = PyString_AsString(py_s);
        Py_DECREF(py_s);
        PyErr_Format(PyExc_TypeError,
                     "CudaNdarray_TakeFrom: need an ndarray of int64 or a"
                     " CudaNdarray(float32) that is a view from int64 data"
                     " for indices. Got %s", s);
        return NULL;
    }

    if (verbose) {
        printf("indices used on the gpu\n");
        fprint_CudaNdarray(stdout, indices);
        PyObject * used_indices = CudaNdarray_CreateArrayObj(indices);
        PyObject_Print(used_indices, stdout, 0);
        Py_DECREF(used_indices);
    }
    if (verbose) printf("after print of object\n");
    if(!CudaNdarray_is_c_contiguous(indices)) {
        PyErr_SetString(PyExc_NotImplementedError,
                        "CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
        Py_DECREF(indices);
        return NULL;
    }
    int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices) / 2; // int64 are 8 bytes, float32 are 4 bytes

    //Check argument axis
    //TODO: implement the default and other axis
    long axis = PyInt_AsLong(axis_obj);

    if (axis != 0) {
1159 " Got %ld.", axis); 1160 Py_DECREF(indices); 1161 return NULL; 1162 } 1163 1164 //Check argument out_obj 1165 CudaNdarray * out = NULL; 1166 if (out_obj && CudaNdarray_Check(out_obj)) 1167 out = (CudaNdarray*) out_obj; 1168 if (out && (out->nd != self->nd || 1169 CudaNdarray_HOST_DIMS(out)[0] != nb_indices)) 1170 out = NULL; 1171 int * dims = (int *)malloc(sizeof(int) * self->nd); 1172 dims[0] = nb_indices; 1173 1174 for (int i=1 ; ind ; i++) { 1175 dims[i] = CudaNdarray_HOST_DIMS(self)[i]; 1176 if (out && CudaNdarray_HOST_DIMS(out)[i] != dims[i]) { 1177 out = NULL; 1178 } 1179 } 1180 if (!out) { 1181 out = (CudaNdarray*)CudaNdarray_New(); 1182 if (!out){ 1183 Py_DECREF(indices); 1184 free(dims); 1185 return NULL; 1186 } 1187 if (CudaNdarray_alloc_contiguous(out, self->nd, dims)) { 1188 Py_DECREF(out); 1189 Py_DECREF(indices); 1190 free(dims); 1191 return NULL; 1192 } 1193 }else { 1194 Py_INCREF(out); 1195 } 1196 1197 //Check argument clipmode 1198 if (clipmode_obj) { 1199 char * clipmode = PyString_AsString(clipmode_obj); 1200 if (! clipmode){ 1201 Py_DECREF(indices); 1202 Py_DECREF(out); 1203 free(dims); 1204 return NULL; 1205 } 1206 if (strcmp(clipmode, "raise") != 0) { 1207 PyErr_Format(PyExc_NotImplementedError, 1208 "CudaNdarray_TakeFrom: only the raise mode is currently supported. Got '%s'", 1209 clipmode); 1210 Py_DECREF(indices); 1211 Py_DECREF(out); 1212 free(dims); 1213 return NULL; 1214 } 1215 } 1216 void (*k3)(const int, const int, const int, 1217 const npy_int64*, 1218 float*, const int, const int, const int, 1219 const float*, const int, 1220 const int, const int, const int, 1221 int*); 1222 k3 = k_take_3; 1223 1224 // Create the memory place that will store the error information. 1225 if(init_err_var() != 0) return NULL; 1226 1227 dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1); 1228 if(CudaNdarray_HOST_DIMS(out)[0] == 0){ 1229 // We take 0 elements, so no need for the rest of the code. 1230 // This speed up that case AND fix crash otherwise. 
        free(dims);
        Py_DECREF(indices);
        return (PyObject *)out;
    }

    switch (self->nd) {
        case 1:
            {
                dim3 n_threads(1, 1, 1);
                if (verbose)
                    printf("cudaGetLastError=%d, nd=%d"
                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0],
                        1,
                        1,
                        (npy_int64*) CudaNdarray_DEV_DATA(indices),
                        CudaNdarray_DEV_DATA(out),
                        CudaNdarray_HOST_STRIDES(out)[0], //strides
                        1,
                        1,
                        CudaNdarray_DEV_DATA(self),
                        CudaNdarray_HOST_DIMS(self)[0], //For indices check
                        CudaNdarray_HOST_STRIDES(self)[0], //strides
                        1,
                        1,
                        err_var);
            }
            break;
        case 2:
            {
                dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads), 1, 1);

                if (verbose)
                    printf("cudaGetLastError=%d, nd=%d"
                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);

                k3<<<n_blocks, n_threads>>>(
                        dims[0], //dimensions
                        dims[1],
                        1,
                        (npy_int64*) CudaNdarray_DEV_DATA(indices),
                        CudaNdarray_DEV_DATA(out),
                        CudaNdarray_HOST_STRIDES(out)[0], //strides
                        CudaNdarray_HOST_STRIDES(out)[1],
                        1,
                        CudaNdarray_DEV_DATA(self),
                        CudaNdarray_HOST_DIMS(self)[0], //For indices check
                        CudaNdarray_HOST_STRIDES(self)[0], //strides
                        CudaNdarray_HOST_STRIDES(self)[1],
                        1,
                        err_var);
            }
            break;
        case 3:
            {
                int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], max_threads);
                int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads / ty);
                dim3 n_threads(tx, ty, 1);
                if (verbose)
                    printf("cudaGetLastError=%d, nd=%d"
                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0], //dimensions
                        dims[1],
                        dims[2],
                        (npy_int64*) CudaNdarray_DEV_DATA(indices),
                        CudaNdarray_DEV_DATA(out),
                        CudaNdarray_HOST_STRIDES(out)[0], //strides
                        CudaNdarray_HOST_STRIDES(out)[1],
                        CudaNdarray_HOST_STRIDES(out)[2],
                        CudaNdarray_DEV_DATA(self),
                        CudaNdarray_HOST_DIMS(self)[0], //For indices check
                        CudaNdarray_HOST_STRIDES(self)[0], //strides
                        CudaNdarray_HOST_STRIDES(self)[1],
                        CudaNdarray_HOST_STRIDES(self)[2],
                        err_var);
            }
            break;
        default:
            PyErr_SetString(PyExc_NotImplementedError,
                            "CudaNdarray_TakeFrom: only input with 1, 2 or 3"
                            " dimensions are currently supported");

    }
    free(dims);
    CNDA_THREAD_SYNC;
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        PyErr_Format(PyExc_RuntimeError,
                     "Cuda error: %s: %s.\n",
                     "CudaNdarray_TakeFrom",
                     cudaGetErrorString(err));
        Py_DECREF(indices);
        Py_DECREF(out);
        return NULL;
    }

    int index_err = check_err_var();
    Py_DECREF(indices);
    if (index_err != 0) {
        Py_DECREF(out);
        return NULL;
    }

    if (verbose) printf("TAKE SUCCEEDED\n");
    return (PyObject *)out;
}
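
// Illustrative sketch (not part of the original file): the take() method above mirrors
// numpy.take along axis 0 only, with indices supplied as an int64 ndarray (or a GPU
// CudaNdarray holding a float32-typed view over int64 data).  Out-of-range indices are
// reported through err_var, which check_err_var() turns into a Python exception after
// the kernel has finished.  A minimal C-level call, with idx such an index array:
#if 0
static PyObject * example_take_rows(CudaNdarray * x, PyObject * idx)
{
    PyObject * args = Py_BuildValue("(Oi)", idx, 0);    // indices and axis=0
    PyObject * rows = CudaNdarray_TakeFrom(x, args);    // new reference, or NULL with error set
    Py_DECREF(args);
    return rows;
}
#endif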

PyObject * CudaNdarray_SetStride(CudaNdarray * self, PyObject *args)
{
    int pos, stride;
    if (! PyArg_ParseTuple(args, "ii", &pos, &stride))
        return NULL;
    if ((pos < 0) || (pos >= self->nd))
    {
        PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
        return NULL;
    }
    CudaNdarray_set_stride(self, pos, stride);
    if (cnda_copy_structure_to_device(self))
    {
        return NULL;
    }
    Py_INCREF(Py_None);
    return Py_None;
}
PyObject * CudaNdarray_SetShapeI(CudaNdarray * self, PyObject *args)
{
    int pos, dim;
    if (! PyArg_ParseTuple(args, "ii", &pos, &dim))
        return NULL;
    if ((pos < 0) || (pos >= self->nd))
    {
        PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
        return NULL;
    }
    CudaNdarray_set_dim(self, pos, dim);
    if (cnda_copy_structure_to_device(self))
    {
        return NULL;
    }
    Py_INCREF(Py_None);
    return Py_None;
}

static PyObject *
CudaNdarray_exp(CudaNdarray* self)
{
    CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
    if ((NULL == rval) || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
    {
        Py_XDECREF(rval);
        return NULL;
    }
    unsigned int size = 1;
    for (int i = 0; i < self->nd; i++)
    {
        size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
    }
    unsigned int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
    unsigned int n_blocks = std::min(ceil_intdiv(size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
    k_elemwise_unary_rowmajor_exp<<<n_blocks,threads_per_block>>>(size, self->nd, CudaNdarray_DEV_DIMS(self),
            CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_STRIDES(self),
            CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_STRIDES(rval));

    //TODO: don't do this right away, do it when we need the result
    CNDA_THREAD_SYNC;
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        Py_DECREF(rval);
        PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
        return NULL;
    }

    return (PyObject*)rval;
}

static PyMethodDef CudaNdarray_methods[] =
{
    {"__array__",
        (PyCFunction)CudaNdarray_CreateArrayObj, METH_VARARGS,
        "Copy from the device to a numpy ndarray"},
    {"__copy__",
        (PyCFunction)CudaNdarray_View, METH_NOARGS,
        "Create a shallow copy of this object. used by module copy"},
    {"__deepcopy__",
        (PyCFunction)CudaNdarray_DeepCopy, METH_O,
        "Create a copy of this object"},
    {"zeros",
        (PyCFunction)CudaNdarray_Zeros, METH_STATIC | METH_O,
        "Create a new CudaNdarray with specified shape, filled with zeros."},
    {"copy",
        (PyCFunction)CudaNdarray_Copy, METH_NOARGS,
        "Create a copy of this object"},
    {"is_c_contiguous",
        (PyCFunction)CudaNdarray_IS_C_Contiguous, METH_NOARGS,
        "Return True if the object is c contiguous. False otherwise."},
    {"reduce_sum",
        (PyCFunction)CudaNdarray_ReduceSum, METH_O,
        "Reduce over the given dimensions by summation"},
    {"exp",
        (PyCFunction)CudaNdarray_exp, METH_NOARGS,
        "Return the exponential of all elements"},
    {"reshape",
        (PyCFunction)CudaNdarray_Reshape, METH_O,
        "Return a reshaped view (or copy) of this ndarray\n\
            The required argument is a tuple of integers specifying the shape of the new ndarray."},
    {"view",
        (PyCFunction)CudaNdarray_View, METH_NOARGS,
        "Return an alias of this ndarray"},
    {"_set_stride",
        (PyCFunction)CudaNdarray_SetStride, METH_VARARGS,
        "For integer arguments (i, s), set the 'i'th stride to 's'"},
    {"take",
        (PyCFunction)CudaNdarray_TakeFrom, METH_VARARGS,
        "Equivalent of numpy.take"},
    {"_set_shape_i",
        (PyCFunction)CudaNdarray_SetShapeI, METH_VARARGS,
        "For integer arguments (i, s), set the 'i'th shape to 's'"},
    {NULL, NULL, NULL, NULL}  /* Sentinel */
};


////////////////////
// Number protocol
////////////////////

__global__ void kAdd_contiguous(float* a, float* b, float* dest, unsigned int numEls) {
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int numThreads = blockDim.x * gridDim.x;

    for (unsigned int i = idx; i < numEls; i += numThreads) {
        dest[i] = a[i] + b[i];
    }
}

// Will be called by __add__ in Python
static PyObject *
CudaNdarray_add(PyObject* py_self, PyObject * py_other)
{
    if (! CudaNdarray_Check(py_self)) {
        PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on left");
        return NULL;
    }
    if (! CudaNdarray_Check(py_other)) {
        PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on right");
        return NULL;
    }
    CudaNdarray * self = (CudaNdarray *)py_self;
    CudaNdarray * other = (CudaNdarray *)py_other;
    if(!CudaNdarray_is_c_contiguous(self) || !CudaNdarray_is_c_contiguous(other)){
        PyErr_SetString(PyExc_TypeError, "We have implemented only the c_contiguous version for now.");
        return NULL;
    }

    //standard elemwise size checks
    if (self->nd != other->nd)
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_add: need same number of dims");
        return NULL;
    }
    //standard elemwise dim checks
    unsigned int size = 1;
    for (int i = 0; i< self->nd; ++i)
    {
        if (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
        {
            PyErr_SetString(PyExc_TypeError, "need same dimensions");
            return NULL;
        }
        size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
    }
    CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
    if (!rval || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
    {
        Py_XDECREF(rval);
        return NULL;
    }

    if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
        return (PyObject *) rval;
    }

    int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
    int n_blocks = std::min(ceil_intdiv(size,(unsigned int)threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
    kAdd_contiguous<<<n_blocks,threads_per_block>>>(
            self->devdata, other->devdata, rval->devdata, size);
    CNDA_THREAD_SYNC;
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kAdd", cudaGetErrorString(err));
        Py_DECREF(rval);
        return NULL;
    }
    return (PyObject *) rval;
}
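
// Illustrative note (not part of the original file): kAdd_contiguous above uses a
// grid-stride loop, so the launch in CudaNdarray_add can cap the block and grid sizes
// at NUM_VECTOR_OP_THREADS_PER_BLOCK / NUM_VECTOR_OP_BLOCKS and still cover every
// element: thread idx handles elements idx, idx + numThreads, idx + 2*numThreads, ...
// The CPU reference of what it computes is simply:
#if 0
static void add_contiguous_reference(const float* a, const float* b,
                                     float* dest, unsigned int numEls)
{
    for (unsigned int i = 0; i < numEls; ++i)
        dest[i] = a[i] + b[i];
}
#endif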

template <int operator_num>
__global__ void k_ielem_3(const int d0, const int d1, const int d2,
        float* a, const int sA0, const int sA1, const int sA2,
        const float* b, const int sB0, const int sB1, const int sB2){
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
        for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
            for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
                switch (operator_num)
                {
                  case IADD:
                    a[i0*sA0 + i1*sA1 + i2*sA2] += b[i0*sB0 + i1*sB1 + i2*sB2];
                    break;
                  case IDIV:
                    a[i0*sA0 + i1*sA1 + i2*sA2] /= b[i0*sB0 + i1*sB1 + i2*sB2];
                    break;
                  case CPY:
                    a[i0*sA0 + i1*sA1 + i2*sA2] = b[i0*sB0 + i1*sB1 + i2*sB2];
                    break;
                }
            }
        }
    }
}

template <int operator_num>
__global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3,
                          float* a, const int sA0, const int sA1,
                          const int sA2, const int sA3,
                          const float* b, const int sB0, const int sB1,
                          const int sB2, const int sB3){
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
        for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
            for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
                for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){
                    switch (operator_num) {
                      case IADD:
                        a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
                            += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
                        break;
                      case IDIV:
                        a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
                            /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
                        break;
                      case CPY:
                        a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
                            = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
                        break;
                    }
                }
            }
        }
    }
}

template <int operator_num>
__global__ void k_ielem_6(const int d0, const int d1,
                          const int d2, const int d3,
                          const int d4, const int d5,
                          float* a, const int sA0, const int sA1,
                          const int sA2, const int sA3,
                          const int sA4, const int sA5,
                          const float* b, const int sB0, const int sB1,
                          const int sB2, const int sB3,
                          const int sB4, const int sB5
                          ){
    for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
        for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
            for (int i2 = blockIdx.z; i2 < d2; i2 += gridDim.z){
                for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x){
                    for (int i4 = threadIdx.y; i4 < d4; i4 += blockDim.y){
                        for (int i5 = threadIdx.z; i5 < d5; i5 += blockDim.z){
                            switch (operator_num) {
                              case IADD:
                                a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
                                    += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
                                break;
                              case IDIV:
                                a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
                                    /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
                                break;
                              case CPY:
                                a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
                                    = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
                                break;
                            }
                        }
                    }
                }
            }
        }
    }
}
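
// Illustrative note (not part of the original file): the operator is a template
// parameter, so each instantiation of k_ielem_3/_4/_6 keeps only one arm of its
// switch; the dispatch happens at compile time, not per element.  This is why
// CudaNdarray_inplace_elemwise below selects a fully specialized kernel pointer
// per operation, roughly like:
#if 0
void (*add3)(const int, const int, const int,
             float*, const int, const int, const int,
             const float*, const int, const int, const int) = k_ielem_3<IADD>;
#endif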

/*
CudaNdarray_inplace_elemwise
Compute elemwise, working inplace on A.
Currently implemented: A / B, A + B and A = B
(the last is not tested and not used!)

py_self - the CudaNdarray that we'll modify (A)
py_other - the other argument (B)
fct_nb - which operation to perform (operator_t)

Returns 0 on success.
Returns -1 on failure, and sets Python exception.

*/
int
CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb)
{
    int verbose = 0;
    void (*k3)(const int, const int, const int,
               float*, const int, const int, const int,
               const float*, const int, const int, const int);
    void (*k4)(const int, const int, const int, const int,
               float*, const int, const int,
               const int, const int,
               const float*, const int, const int,
               const int, const int);
    void (*k6)(const int, const int,
               const int, const int,
               const int, const int,
               float*, const int, const int,
               const int, const int,
               const int, const int,
               const float*, const int, const int,
               const int, const int,
               const int, const int);
    switch (fct_nb)
    {
        case IADD:
            k3 = k_ielem_3<IADD>;
            k4 = k_ielem_4<IADD>;
            k6 = k_ielem_6<IADD>;
            break;
        case IDIV:
            k3 = k_ielem_3<IDIV>;
            k4 = k_ielem_4<IDIV>;
            k6 = k_ielem_6<IDIV>;
            break;
        case CPY:
            k3 = k_ielem_3<CPY>;
            k4 = k_ielem_4<CPY>;
            k6 = k_ielem_6<CPY>;
            break;
        default:
            assert (0);
            PyErr_Format(
                PyExc_TypeError,
                "CudaNdarray_inplace_elemwise invalid fct_nb (%i).",
                (int)fct_nb);
            return -1;
    }
    if (!CudaNdarray_Check(py_self)) {
        PyErr_SetString(
            PyExc_TypeError,
            "CudaNdarray_inplace_elemwise need a CudaNdarray on left");
        return -1;
    }
    CudaNdarray * new_other = NULL;
    if (!CudaNdarray_Check(py_other)) {
        new_other = (CudaNdarray*) CudaNdarray_New();
        if(!new_other)
        {
            return -1;
        }
        if(CudaNdarray_CopyFromArray(new_other, (PyArrayObject *) py_other))
        {
            Py_XDECREF(new_other);
            return -1;
        }
        py_other = (PyObject *) new_other;
    }

    CudaNdarray * self = (CudaNdarray *)py_self;
    CudaNdarray * other = (CudaNdarray *)py_other;

    if (verbose)
    {
        fprintf(stderr,
                "INPLACE ADD/DIV for self->nd=%d other->nd=%d\n",
                self->nd, other->nd);
    }

    //standard elemwise nb dim checks
    if (self->nd < other->nd)
    {
        PyErr_Format(
            PyExc_TypeError,
            "CudaNdarray_inplace_elemwise: The destination needs at least as"
            " many dimensions as the source. Got %d and %d.",
            self->nd, other->nd);
        Py_XDECREF(new_other);
        return -1;
    }

    //broadcast to the same number of dimensions.
    int* other_dims = (int*) alloca(self->nd * sizeof(int));
    int* other_strides = (int*) alloca(self->nd * sizeof(int));
    int added_dims = self->nd - other->nd;
    // Add the added broadcasted dimensions
    for (int i = 0; i< added_dims; ++i)
    {
        other_dims[i] = 1;
        other_strides[i] = 0;
    }
    // Copy the existing dimensions
    for (int i = 0; i< other->nd; ++i)
    {
        other_dims[i+added_dims] = CudaNdarray_HOST_DIMS(other)[i];
        other_strides[i+added_dims] = CudaNdarray_HOST_STRIDES(other)[i];
    }

    //standard elemwise dim checks
    unsigned int size = 1;
    for (int i = 0; i< self->nd; ++i)
    {
        if ((CudaNdarray_HOST_DIMS(self)[i] != other_dims[i])
            && (other_dims[i] != 1))
        {
            PyErr_SetString(
                PyExc_ValueError,
                "CudaNdarray_inplace_elemwise need same dimensions (or broadcastable dimension)");
            Py_XDECREF(new_other);
            return -1;
        }
        // if we're broadcasting other, then make sure it has stride 0
        assert ((CudaNdarray_HOST_DIMS(self)[i] == other_dims[i])
                || (other_strides[i] == 0));
        size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
    }

    if (size==0)
    {
        int other_size = CudaNdarray_SIZE((CudaNdarray *)py_other);
        if (!(other_size == 0 || other_size == 1))
        {
            PyErr_SetString(
                PyExc_ValueError,
                "CudaNdarray_inplace_elemwise cannot work inplace on"
                " an un-initialized array when the new value has more than"
                " 0 or 1 broadcastable dimensions");
            Py_XDECREF(new_other);
            return 0;
        }
        Py_XDECREF(new_other);
        return 0;
    }

    switch(self->nd)
    {
        case 0:
            {
                dim3 n_blocks(1, 1, 1);
                dim3 n_threads(1);
                k3<<<n_blocks, n_threads>>>(
                        1, //d0
                        1, //d1
                        1, //d2
                        CudaNdarray_DEV_DATA(self),
                        1, //strides
                        1,
                        1,
                        CudaNdarray_DEV_DATA(other),
                        1, //strides
                        1,
                        1);
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if (cudaSuccess != err)
                {
                    PyErr_Format(
                        PyExc_RuntimeError,
                        "CudaNdarray_inplace_elemwise case0: Cuda error: %s: %s.\n",
                        "k3",
                        cudaGetErrorString(err));
                    Py_XDECREF(new_other);
                    return -1;
                }
            }
            break;
        case 1:
            {
                dim3 n_blocks(1, 1, 1);
                dim3 n_threads(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[0],
                            NUM_VECTOR_OP_THREADS_PER_BLOCK));
                k3<<<n_blocks, n_threads>>>(
                        1, //dimensions
                        1,
                        CudaNdarray_HOST_DIMS(self)[0],
                        CudaNdarray_DEV_DATA(self),
                        1, //strides
                        1,
                        CudaNdarray_HOST_STRIDES(self)[0],
                        CudaNdarray_DEV_DATA(other),
                        1, //strides
                        1,
                        other_strides[0]);
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if (cudaSuccess != err)
                {
                    PyErr_Format(
                        PyExc_RuntimeError,
                        "CudaNdarray_inplace_elemwise case1: Cuda error: %s: %s.\n",
                        "k3",
                        cudaGetErrorString(err));
                    Py_XDECREF(new_other);
                    return -1;
                }
            }
            break;
        case 2:
            {
                //TODO: if both self and other are f-contiguous
                //      Then flip the block and thread dimensions
                //      to make contiguous reads & writes
                dim3 n_blocks(1,
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[0],
                            NUM_VECTOR_OP_BLOCKS));
                dim3 n_threads(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[1],
                            NUM_VECTOR_OP_THREADS_PER_BLOCK));
                k3<<<n_blocks, n_threads>>>(1,
                        CudaNdarray_HOST_DIMS(self)[0],
                        CudaNdarray_HOST_DIMS(self)[1],
                        CudaNdarray_DEV_DATA(self),
                        1,
                        CudaNdarray_HOST_STRIDES(self)[0],
                        CudaNdarray_HOST_STRIDES(self)[1],
                        CudaNdarray_DEV_DATA(other),
                        1,
                        other_strides[0],
                        other_strides[1]);
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if (cudaSuccess != err)
                {
                    PyErr_Format(
                        PyExc_RuntimeError,
                        "CudaNdarray_inplace_elemwise case2: Cuda error: %s: %s.\n",
                        "k3",
                        cudaGetErrorString(err));
                    Py_XDECREF(new_other);
                    return -1;
                }
            }
            break;
        case 3:
            {
                //TODO: Dimshuffle so that at least one of the arrays
                //      has a contiguous dimension on the thread idx.
                dim3 n_blocks(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[0],
                            NUM_VECTOR_OP_BLOCKS),
                        CudaNdarray_HOST_DIMS(self)[1]);
                while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
                    n_blocks.y /= 2;
                dim3 n_threads(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[2],
                            NUM_VECTOR_OP_THREADS_PER_BLOCK));
                k3<<<n_blocks, n_threads>>>(
                        CudaNdarray_HOST_DIMS(self)[0],
                        CudaNdarray_HOST_DIMS(self)[1],
                        CudaNdarray_HOST_DIMS(self)[2],
                        CudaNdarray_DEV_DATA(self),
                        CudaNdarray_HOST_STRIDES(self)[0],
                        CudaNdarray_HOST_STRIDES(self)[1],
                        CudaNdarray_HOST_STRIDES(self)[2],
                        CudaNdarray_DEV_DATA(other),
                        other_strides[0],
                        other_strides[1],
                        other_strides[2]);
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if (cudaSuccess != err)
                {
                    PyErr_Format(
                        PyExc_RuntimeError,
                        "CudaNdarray_inplace_elemwise case3: Cuda error: %s: %s.\n",
                        "k3",
                        cudaGetErrorString(err));
                    Py_XDECREF(new_other);
                    return -1;
                }
            }
            break;
        case 4:
            {
                dim3 n_blocks(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[0],
                            NUM_VECTOR_OP_BLOCKS),
                        CudaNdarray_HOST_DIMS(self)[1]
                        );
                while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
                    n_blocks.y /= 2;
                dim3 n_threads(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[2],
                            NUM_VECTOR_OP_THREADS_PER_BLOCK)
                        //TODO: DON'T YOU NEED TO PUT DIMS[3] in here???
                        );
                k4<<<n_blocks, n_threads>>>(
                        CudaNdarray_HOST_DIMS(self)[0],
                        CudaNdarray_HOST_DIMS(self)[1],
                        CudaNdarray_HOST_DIMS(self)[2],
                        CudaNdarray_HOST_DIMS(self)[3],
                        CudaNdarray_DEV_DATA(self),
                        CudaNdarray_HOST_STRIDES(self)[0],
                        CudaNdarray_HOST_STRIDES(self)[1],
                        CudaNdarray_HOST_STRIDES(self)[2],
                        CudaNdarray_HOST_STRIDES(self)[3],
                        CudaNdarray_DEV_DATA(other),
                        other_strides[0],
                        other_strides[1],
                        other_strides[2],
                        other_strides[3]);
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if (cudaSuccess != err)
                {
                    PyErr_Format(
                        PyExc_RuntimeError,
                        "CudaNdarray_inplace_elemwise case4: Cuda error: %s: %s.\n",
                        "k4",
                        cudaGetErrorString(err));
                    Py_XDECREF(new_other);
                    return -1;
                }
            }
            break;
        case 5:
            {
                dim3 n_blocks(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[1],
                            NUM_VECTOR_OP_BLOCKS),
                        CudaNdarray_HOST_DIMS(self)[2]);
                while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
                    n_blocks.y /= 2;
                dim3 n_threads(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[3],
                            NUM_VECTOR_OP_THREADS_PER_BLOCK)
                        //TODO: DON'T YOU NEED TO PUT DIMS[3] in here???
                        );
                for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i)
                {
                    k4<<<n_blocks, n_threads>>>(
                            CudaNdarray_HOST_DIMS(self)[1],
                            CudaNdarray_HOST_DIMS(self)[2],
                            CudaNdarray_HOST_DIMS(self)[3],
                            CudaNdarray_HOST_DIMS(self)[4],
                            CudaNdarray_DEV_DATA(self) + i * CudaNdarray_HOST_STRIDES(self)[0],
                            CudaNdarray_HOST_STRIDES(self)[1],
                            CudaNdarray_HOST_STRIDES(self)[2],
                            CudaNdarray_HOST_STRIDES(self)[3],
                            CudaNdarray_HOST_STRIDES(self)[4],
                            CudaNdarray_DEV_DATA(other) + i * other_strides[0],
                            other_strides[1],
                            other_strides[2],
                            other_strides[3],
                            other_strides[4]);
                    CNDA_THREAD_SYNC;
                    cudaError_t err = cudaGetLastError();
                    if( cudaSuccess != err)
                    {
                        PyErr_Format(
                            PyExc_RuntimeError,
                            "CudaNdarray_inplace_elemwise case5: Cuda error: %s: %s. n_block=(%ld,%ld) n_threads=%ld\n",
                            "k5 with loop over k4",
                            cudaGetErrorString(err),
                            (long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
                        Py_XDECREF(new_other);
                        return -1;
                    }
                }
            }
            break;
        case 6:
            {
                dim3 n_blocks(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[0],
                            NUM_VECTOR_OP_BLOCKS),
                        CudaNdarray_HOST_DIMS(self)[1],
                        CudaNdarray_HOST_DIMS(self)[2]
                        );
                while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
                    n_blocks.y /= 2;
                // GTX285(compute capabilities 1.3) don't support n_blocks.z > 1
                // (compute capabilities 2.0) support 65535 for n_blocks.z
                //while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
                //    n_blocks.z /= 2;
                n_blocks.z = 1;
                dim3 n_threads(
                        std::min(
                            CudaNdarray_HOST_DIMS(self)[3],
                            NUM_VECTOR_OP_THREADS_PER_BLOCK)
                        //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
                        //TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
                        );
                k6<<<n_blocks, n_threads>>>(
                        CudaNdarray_HOST_DIMS(self)[0],
                        CudaNdarray_HOST_DIMS(self)[1],
                        CudaNdarray_HOST_DIMS(self)[2],
                        CudaNdarray_HOST_DIMS(self)[3],
                        CudaNdarray_HOST_DIMS(self)[4],
                        CudaNdarray_HOST_DIMS(self)[5],
                        CudaNdarray_DEV_DATA(self),
                        CudaNdarray_HOST_STRIDES(self)[0],
                        CudaNdarray_HOST_STRIDES(self)[1],
                        CudaNdarray_HOST_STRIDES(self)[2],
                        CudaNdarray_HOST_STRIDES(self)[3],
                        CudaNdarray_HOST_STRIDES(self)[4],
                        CudaNdarray_HOST_STRIDES(self)[5],
                        CudaNdarray_DEV_DATA(other),
                        other_strides[0],
                        other_strides[1],
                        other_strides[2],
                        other_strides[3],
                        other_strides[4],
                        other_strides[5]);
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if (cudaSuccess != err)
                {
                    PyErr_Format(
                        PyExc_RuntimeError,
                        "CudaNdarray_inplace_elemwise case6: Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
                        "k6",
                        cudaGetErrorString(err),
                        (long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
                        (long) n_threads.x);
                    Py_XDECREF(new_other);
                    return -1;
                }
            }
            break;
        default:
            {
                PyErr_Format(
                    PyExc_NotImplementedError,
                    "inplace_elemwise w nd=%i\n",
                    self->nd);
                Py_XDECREF(new_other);
                return -1;
            }
    }
    if (verbose)
        fprintf(stderr, "INPLACE ADD/DIV end\n");
    Py_XDECREF(new_other);
    return 0;
}
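
// Illustrative sketch (not part of the original file): how the broadcasting above
// behaves.  A source with fewer dimensions is left-padded with size-1 dimensions of
// stride 0, so adding a vector b of shape (3,) in place to a matrix A of shape (2, 3)
// re-reads b for every row, as in numpy.  A C-level call would look like:
#if 0
static int example_row_broadcast_iadd(CudaNdarray * A /* shape (2, 3) */,
                                      CudaNdarray * b /* shape (3,)  */)
{
    // b is treated as shape (1, 3), with stride 0 on the added leading dimension.
    return CudaNdarray_inplace_elemwise((PyObject*)A, (PyObject*)b, IADD); // 0 on success
}
#endif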

/*
 * We need this inplace Add to support IncSubTensor
 * It returns py_self on success with an additional reference. Else NULL.
 */
// Will be called by __iadd__ in Python
PyObject *
CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
{
    if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
    {
        return NULL;
    }
    Py_INCREF(py_self);
    return py_self;
}

/*
 * We need this inplace div for cuda/tests/test_basic_ops.py:test_shared_options
 * It returns py_self on success with an additional reference. Else NULL.
 */
// Will be called by __idiv__ in Python
static PyObject *
CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other)
{
    if (CudaNdarray_inplace_elemwise(py_self, py_other, IDIV))
    {
        return NULL;
    }
    Py_INCREF(py_self);
    return py_self;
}

// The PyNumberMethods struct layout changed in a non-trivial way from 2 to 3.
#if PY_MAJOR_VERSION == 3
static PyNumberMethods CudaNdarrayNumberMethods =
{
    (binaryfunc)CudaNdarray_add,  //binaryfunc nb_add;  __add__
    0,  //binaryfunc nb_subtract;
    0,  //binaryfunc nb_multiply;
    0,  //binaryfunc nb_remainder;
    0,  //binaryfunc nb_divmod;
    0,  //ternaryfunc nb_power;
    0,  //unaryfunc nb_negative;
    0,  //unaryfunc nb_positive;
    0,  //unaryfunc nb_absolute;
    0,  //inquiry nb_bool;
    0,  //unaryfunc nb_invert;
    0,  //binaryfunc nb_lshift;
    0,  //binaryfunc nb_rshift;
    0,  //binaryfunc nb_and;
    0,  //binaryfunc nb_xor;
    0,  //binaryfunc nb_or;
    0,  //unaryfunc nb_int;
    0,  //void *nb_reserved;
    0,  //unaryfunc nb_float;

    (binaryfunc)CudaNdarray_inplace_add,  //binaryfunc nb_inplace_add;  __iadd__
    0,  //binaryfunc nb_inplace_subtract;
    0,  //binaryfunc nb_inplace_multiply;
    0,  //binaryfunc nb_inplace_remainder;
    0,  //ternaryfunc nb_inplace_power;
    0,  //binaryfunc nb_inplace_lshift;
    0,  //binaryfunc nb_inplace_rshift;
    0,  //binaryfunc nb_inplace_and;
    0,  //binaryfunc nb_inplace_xor;
    0,  //binaryfunc nb_inplace_or;

    0,  //binaryfunc nb_floor_divide;
    0,  //binaryfunc nb_true_divide;
    0,  //binaryfunc nb_inplace_floor_divide;
    (binaryfunc)CudaNdarray_inplace_div,  //binaryfunc nb_inplace_true_divide;  __idiv__

    0,  //unaryfunc nb_index
};
#else
nb_inplace_multiply; __imul__ 2208 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_divide; __idiv__ 2209 0, //binaryfunc nb_inplace_remainder; __imod__ 2210 0, //ternaryfunc nb_inplace_power; __ipow__ 2211 0, //binaryfunc nb_inplace_lshift; __ilshift__ 2212 0, //binaryfunc nb_inplace_rshift; __irshift__ 2213 0, //binaryfunc nb_inplace_and; __iand__ 2214 0, //binaryfunc nb_inplace_xor; __ixor__ 2215 0, //binaryfunc nb_inplace_or; __ior__ 2216 2217 /* Added in release 2.2 */ 2218 0, //binaryfunc nb_floor_divide; __floordiv__ 2219 0, //binaryfunc nb_true_divide; __truediv__ 2220 0, //binaryfunc nb_inplace_floor_divide; __ifloordiv__ 2221 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_true_divide; __itruediv__ 2222 2223 #if PY_MINOR_VERSION > 4 2224 /* Added in release 2.5 */ 2225 0 //unaryfunc nb_index; __index__ 2226 #endif 2227 }; 2228 #endif 2229 2230 2231 ///////////////////// 2232 // Mapping protocol 2233 ///////////////////// 2234 2235 // Will by called by __len__ in Python 2236 static Py_ssize_t 2237 CudaNdarray_len(PyObject * py_self) 2238 { 2239 CudaNdarray * self = (CudaNdarray*) py_self; 2240 if (self->nd <= 0) 2241 { 2242 return (Py_ssize_t) 0; 2243 } 2244 else 2245 { 2246 return (Py_ssize_t) CudaNdarray_HOST_DIMS(self)[0]; 2247 } 2248 } 2249 2250 // Will by called by __getitem__ in Python 2251 PyObject * 2252 CudaNdarray_Subscript(PyObject * py_self, PyObject * key) 2253 { 2254 int verbose = 0; 2255 if (verbose) fprintf(stderr, "Subscript .... \n"); 2256 CudaNdarray * self = (CudaNdarray*) py_self; 2257 PyObject * py_rval = NULL; 2258 CudaNdarray * rval = NULL; 2259 PyObject * intobj = NULL; 2260 2261 //PyObject_Print(key, stderr, 0); 2262 2263 if (key == Py_Ellipsis) 2264 { 2265 Py_INCREF(py_self); 2266 return py_self; 2267 } 2268 if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER 2269 //else if (PyInt_Check(key)) //INDEXING BY INTEGER 2270 { 2271 int d_idx = PyInt_AsLong(intobj); 2272 Py_DECREF(intobj); intobj=NULL; 2273 //int d_idx = PyInt_AsLong(key); 2274 if (self->nd == 0) 2275 { 2276 PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed"); 2277 return NULL; 2278 } 2279 int d_dim = CudaNdarray_HOST_DIMS(self)[0]; 2280 int offset = 0; 2281 2282 if ((d_idx >= 0) && (d_idx < d_dim)) 2283 { 2284 //normal indexing 2285 offset += d_idx * CudaNdarray_HOST_STRIDES(self)[0]; 2286 } 2287 else if ((d_idx < 0) && (d_idx >= -d_dim)) 2288 { 2289 //end-based indexing 2290 // d_idx is negative 2291 offset += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[0]; 2292 } 2293 else 2294 { 2295 PyErr_Format(PyExc_IndexError, 2296 "index out of bounds. Asked %d, but size of %d", 2297 d_idx, d_dim); 2298 return NULL; 2299 } 2300 2301 //allocate our subtensor view 2302 py_rval = CudaNdarray_new_nd(self->nd - 1); 2303 rval = (CudaNdarray*) py_rval; 2304 if (!rval) return NULL; 2305 assert (0 == rval->data_allocated); 2306 2307 //initialize the view's data pointer to our own. 
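            // Note that the result is a view rather than a copy: it points into
            // the parent's device memory at the computed offset, and
            // CudaNdarray_set_device_data below records `self` as the base so
            // the parent stays alive for as long as the view does.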
2308 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self) + offset, self)) 2309 { 2310 Py_DECREF(rval); 2311 return NULL; 2312 } 2313 for (int d = 1; d < self->nd; ++d) 2314 { 2315 CudaNdarray_set_stride(rval, d-1, CudaNdarray_HOST_STRIDES(self)[d]); 2316 CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]); 2317 } 2318 } 2319 else 2320 { 2321 PyErr_Clear(); 2322 } 2323 if (PySlice_Check(key)) //INDEXING BY SLICE 2324 { 2325 if (verbose) fprintf(stderr, "by slice\n"); 2326 if (self->nd == 0) 2327 { 2328 PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array"); 2329 return NULL; 2330 } 2331 2332 int d_dim = CudaNdarray_HOST_DIMS(self)[0]; 2333 Py_ssize_t start, stop, step, slen; 2334 if (PySlice_GetIndicesEx(SLICE_CAST(key), d_dim, &start, &stop, &step, &slen)) 2335 { 2336 if (verbose) 2337 fprintf(stderr, "PySlice_GetIndicesEx failed\n"); 2338 return NULL; 2339 } 2340 if (verbose) 2341 { 2342 std::cerr << "start " << start << "\n"; 2343 std::cerr << "stop " << stop << "\n"; 2344 std::cerr << "step " << step << "\n"; 2345 std::cerr << "slen " << slen << "\n"; 2346 } 2347 2348 //allocate our subtensor view 2349 py_rval = CudaNdarray_new_nd(self->nd); 2350 rval = (CudaNdarray*) py_rval; 2351 if (!rval) return NULL; 2352 assert (0 == rval->data_allocated); 2353 2354 2355 //initialize the view's data pointer to our own. 2356 if (CudaNdarray_set_device_data(rval, 2357 CudaNdarray_DEV_DATA(self) + start * CudaNdarray_HOST_STRIDES(self)[0], 2358 self)) 2359 { 2360 Py_DECREF(rval); 2361 return NULL; 2362 } 2363 //initialize dimension 0 of rval 2364 CudaNdarray_set_stride(rval, 0, 2365 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[0]); 2366 CudaNdarray_set_dim(rval, 0, slen); 2367 if (verbose) std::cerr << "rval stride " << CudaNdarray_HOST_STRIDES(rval)[0] << "\n"; 2368 // initialize dimensions > 0 of rval 2369 for (int d = 1; d < self->nd; ++d) 2370 { 2371 CudaNdarray_set_stride(rval, d, CudaNdarray_HOST_STRIDES(self)[d]); 2372 CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]); 2373 } 2374 } 2375 if (PyTuple_Check(key)) //INDEXING BY TUPLE 2376 { 2377 if (verbose) fprintf(stderr, "by tuple\n"); 2378 //elements of the tuple can be either integers or slices 2379 //the dimensionality of the view we will return is diminished for each slice in the tuple 2380 2381 if (PyTuple_Size(key) > self->nd) 2382 { 2383 PyErr_SetString(PyExc_IndexError, "index error"); 2384 return NULL; 2385 } 2386 2387 //calculate the number of dimensions in the return value 2388 int rval_nd = self->nd; 2389 for (int d = 0; d < PyTuple_Size(key); ++d) 2390 { 2391 //On some paltform PyInt_Check() return true, other it return false. 2392 //So we use PyArray_IsAnyScalar that should covert everything. 2393 rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, d)); 2394 } 2395 2396 //allocate our subtensor view 2397 py_rval = CudaNdarray_new_nd(rval_nd); 2398 rval = (CudaNdarray*) py_rval; 2399 if (!rval) return NULL; 2400 assert (0 == rval->data_allocated); 2401 2402 //initialize the view's data pointer to our own. 2403 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self)) 2404 { 2405 Py_DECREF(rval); 2406 return NULL; 2407 } 2408 2409 // rval_d will refer to the current dimension in the rval. 2410 // It will not be incremented for integer keys, but will be incremented for slice 2411 // keys 2412 int rval_d = 0; 2413 2414 for (int d = 0; d < self->nd; ++d) 2415 { 2416 // keys can be shorter than self->nd. 
2417 // when that happens, it means that the remaining dimensions are "full slices" 2418 if (d >=PyTuple_Size(key)) 2419 { 2420 CudaNdarray_set_stride(rval, rval_d, CudaNdarray_HOST_STRIDES(self)[d]); 2421 CudaNdarray_set_dim(rval, rval_d, CudaNdarray_HOST_DIMS(self)[d]); 2422 ++rval_d; 2423 } 2424 else 2425 { 2426 PyObject * key_d = PyTuple_GetItem(key, d); 2427 2428 if (PySlice_Check(key_d)) 2429 { 2430 Py_ssize_t start, stop, step, slen; 2431 if (PySlice_GetIndicesEx(SLICE_CAST(key_d), CudaNdarray_HOST_DIMS(self)[d], &start, &stop, &step, &slen)) 2432 { 2433 Py_DECREF(rval); 2434 return NULL; 2435 } 2436 rval->devdata += start * CudaNdarray_HOST_STRIDES(self)[d]; 2437 CudaNdarray_set_stride(rval, rval_d, 2438 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[d]); 2439 CudaNdarray_set_dim(rval, rval_d, slen); 2440 if (0) 2441 { 2442 std::cerr << "start " << start << "\n"; 2443 std::cerr << "stop " << stop << "\n"; 2444 std::cerr << "step " << step << "\n"; 2445 std::cerr << "slen " << slen << "\n"; 2446 } 2447 ++rval_d; 2448 } 2449 else if ((intobj=PyNumber_Int(key_d))) 2450 { 2451 assert(PyArray_IsAnyScalar(key_d)); 2452 int d_idx = PyInt_AsLong(intobj); 2453 Py_DECREF(intobj); 2454 intobj = NULL; 2455 int d_dim = CudaNdarray_HOST_DIMS(self)[d]; 2456 2457 if ((d_idx >= 0) && (d_idx < d_dim)) 2458 { 2459 //normal indexing 2460 rval->devdata += d_idx * CudaNdarray_HOST_STRIDES(self)[d]; 2461 } 2462 else if ((d_idx < 0) && (d_idx >= -d_dim)) 2463 { 2464 //end-based indexing 2465 rval->devdata += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[d]; 2466 } 2467 else 2468 { 2469 PyErr_Format(PyExc_IndexError, 2470 "index out of bounds. Asked %d for dimensions %d, but size of %d", 2471 d_idx, d, d_dim); 2472 Py_DECREF(rval); 2473 return NULL; 2474 } 2475 } 2476 else 2477 { 2478 PyErr_Clear(); // clear the error set by PyNumber_Int 2479 PyErr_SetString(PyExc_IndexError, "index must be either int or slice"); 2480 Py_DECREF(rval); 2481 return NULL; 2482 } 2483 } 2484 } 2485 } 2486 if (py_rval) 2487 { 2488 if (verbose) fprint_CudaNdarray(stderr, self); 2489 if (verbose) fprint_CudaNdarray(stderr, rval); 2490 } 2491 else 2492 { 2493 PyErr_SetString(PyExc_NotImplementedError, "Unknown key type"); 2494 return NULL; 2495 } 2496 return py_rval; 2497 } 2498 2499 // Will by called by __setitem__ in Python 2500 // See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem 2501 // Doesn't handle broadcasting, e.g. a[:] = 5 2502 // Can only be assigned from a CudaNdarray on the right side 2503 // Or a ndarray 2504 // Or a python scalar with value 0 when the left side part is c contiguous. 2505 static int 2506 CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) 2507 { 2508 int verbose = 0; 2509 if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n"); 2510 // We try to copy directly into this CudaNdarray from the ndarray 2511 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key); 2512 CudaNdarray* new_value = NULL; 2513 2514 if(!rval){ 2515 // CudaNdarray_Subscript failed and set the error msg. 
2516 Py_XDECREF(rval); 2517 return -1; 2518 } 2519 2520 if(rval != (CudaNdarray*)o && 2521 (rval->data_allocated || 2522 // The new array should have a base 2523 !(((CudaNdarray*)rval)->base) || 2524 // If the original array has no base, the base of the new 2525 // array should be the original one 2526 (!((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != o) || 2527 // Else, the two arrays should have the same base 2528 (((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != ((CudaNdarray*)o)->base))) 2529 { 2530 // This case shouldn't happen, based on what I see in Subscript 2531 // but just in case it happens sometime in the future 2532 2533 PyErr_Format(PyExc_RuntimeError, 2534 "__getitem__ must return a CudaNdarray that refers to" 2535 " the original CudaNdarray, not a copy. rval.base=%p" 2536 " o.base=%p o=%p", 2537 (((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o); 2538 Py_DECREF(rval); 2539 return -1; 2540 } 2541 2542 PyObject * intobj = NULL; 2543 if (CudaNdarray_Check(o) && PyArray_Check(value)){ 2544 if (verbose) 2545 fprintf(stderr, 2546 "CudaNdarray_setitem dest is a CudaNdarray and" 2547 " value is a ndarray\n"); 2548 new_value = (CudaNdarray*) CudaNdarray_New(); 2549 if(!new_value) 2550 { 2551 return -1; 2552 } 2553 if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value)) 2554 { 2555 Py_XDECREF(new_value); 2556 Py_XDECREF(rval); 2557 return -1; 2558 } 2559 value = (PyObject *) new_value; 2560 } 2561 else if ((intobj=PyNumber_Int(value))) 2562 { 2563 if (verbose) 2564 fprintf(stderr, 2565 "CudaNdarray_setitem dest and value is a python number\n"); 2566 if(! CudaNdarray_is_c_contiguous(rval)){ 2567 PyErr_SetString(PyExc_NotImplementedError, 2568 "CudaNdarray.__setitem__: When the new value is a scalar" 2569 " of value 0 the part where we copy to must be c contiguous."); 2570 Py_XDECREF(rval); 2571 return -1; 2572 } 2573 2574 long val = PyInt_AsLong(intobj); 2575 Py_DECREF(intobj); intobj=NULL; 2576 if (val == 0) 2577 { 2578 cudaError_t err = cudaMemset(rval->devdata, 0, 2579 CudaNdarray_SIZE(rval) * sizeof(real)); 2580 Py_XDECREF(rval); 2581 if (err) 2582 { 2583 // Clear the error flag, cudaMemset doesn't do it. 2584 // Currently this returns the same thing as err, but if in future 2585 // it returns something else I still don't see why we should ignore 2586 // it. All we want to do here is reset the flag. 2587 cudaGetLastError(); 2588 PyErr_SetString(PyExc_RuntimeError, 2589 "CudaNdarray.__setitem__: cudaMemset failed"); 2590 return -1; 2591 } 2592 return 0; 2593 } else { 2594 Py_XDECREF(rval); 2595 PyErr_SetString(PyExc_NotImplementedError, 2596 "CudaNdarray.__setitem__: we support setting only python" 2597 " scalar of value 0, numpy nd array and CudaNdarray."); 2598 return -1; 2599 } 2600 } 2601 2602 PyErr_Clear(); // clear PyNumber_Int error. 
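    // If we reach this point, `value` is either the CudaNdarray built from an
    // ndarray above or whatever object the caller passed directly; the only
    // remaining supported case, checked next, is a CudaNdarray-to-CudaNdarray copy.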
2603 2604 if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value)) 2605 { 2606 PyErr_SetString(PyExc_TypeError, 2607 "CudaNdarray.__setitem__: left must be a CudaNdarrays and right" 2608 " must be a CudaNdarrays, an ndarray or a python scalar of value 0."); 2609 Py_XDECREF(new_value); 2610 return -1; 2611 } 2612 2613 if (verbose) 2614 fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n"); 2615 2616 if (cnda_copy_structure_to_device(rval)) 2617 { 2618 PyErr_SetString(PyExc_RuntimeError, 2619 "CudaNdarray.__setitem__: syncing structure to device failed"); 2620 Py_DECREF(rval); 2621 Py_XDECREF(new_value); 2622 2623 if (verbose) 2624 fprintf(stderr, "CudaNdarray_setitem error end\n"); 2625 return -1; 2626 } 2627 2628 PyObject *baseSavedForComparison = rval->base; 2629 2630 if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true)) 2631 { 2632 Py_DECREF((PyObject*)rval); 2633 Py_XDECREF(new_value); 2634 2635 if (verbose) 2636 fprintf(stderr, "CudaNdarray_setitem error end\n"); 2637 return -1; 2638 } 2639 2640 assert (rval->base == baseSavedForComparison); 2641 assert (rval->dev_structure_fresh); 2642 2643 // Clean up locally-created references 2644 Py_DECREF(rval); 2645 Py_XDECREF(new_value); 2646 2647 return 0; 2648 } 2649 2650 2651 PyMappingMethods CudaNdarrayMappingMethods = { 2652 CudaNdarray_len, //lenfunc mp_length; __len__ 2653 CudaNdarray_Subscript, //binaryfunc mp_subscript; __getitem__ 2654 CudaNdarray_setitem //objobjargproc mp_ass_subscript; __setitem__ 2655 }; 2656 2657 //////////////////// 2658 // 2659 //////////////////// 2660 2661 static PyObject * 2662 CudaNdarray_get_shape(CudaNdarray *self, void *closure) 2663 { 2664 if (self->nd < 0) 2665 { 2666 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized"); 2667 return NULL; 2668 } 2669 PyObject * rval = PyTuple_New(self->nd); 2670 for (int i = 0; i < self->nd; ++i) 2671 { 2672 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_DIMS(self)[i]))) 2673 { 2674 Py_XDECREF(rval); 2675 return NULL; 2676 } 2677 2678 } 2679 return rval; 2680 } 2681 2682 static int 2683 CudaNdarray_set_shape(CudaNdarray *self, PyObject *value, void *closure) 2684 { 2685 PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape"); 2686 return -1; 2687 } 2688 2689 static PyObject * 2690 CudaNdarray_get_strides(CudaNdarray *self, void *closure) 2691 { 2692 if (self->nd < 0) 2693 { 2694 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized"); 2695 return NULL; 2696 } 2697 PyObject * rval = PyTuple_New(self->nd); 2698 for (int i = 0; i < self->nd; ++i) 2699 { 2700 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_STRIDES(self)[i]))) 2701 { 2702 Py_XDECREF(rval); 2703 return NULL; 2704 } 2705 2706 } 2707 return rval; 2708 } 2709 2710 static int 2711 CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure) 2712 { 2713 //npy_intp newstrides_bytes[PyTuple_Size(value)]; 2714 if (PyTuple_Check(value)){ 2715 if (PyTuple_Size(value) != CudaNdarray_NDIM(self)){ 2716 PyErr_SetString(PyExc_ValueError, 2717 "The new strides tuple must have the same length" 2718 " as the number of dimensions"); 2719 return -1; 2720 } 2721 }else if (PyList_Check(value)){ 2722 if (PyList_Size(value) != CudaNdarray_NDIM(self)){ 2723 PyErr_SetString(PyExc_ValueError, 2724 "The new strides list must have the same length" 2725 " as the number of dimensions"); 2726 return -1; 2727 } 2728 }else{ 2729 PyErr_SetString(PyExc_ValueError, 2730 "The new strides need to be encoded in a tuple or 
list"); 2731 return -1; 2732 } 2733 npy_intp* newstrides = (npy_intp*) alloca(CudaNdarray_NDIM(self) * sizeof(npy_intp)); 2734 if (PyTuple_Check(value)){ 2735 for(int i=0; i < CudaNdarray_NDIM(self); i++){ 2736 newstrides[i] = PyInt_AsLong(PyTuple_GetItem(value, Py_ssize_t(i))); 2737 //newstrides_bytes[i] = newstrides[i] * 4; 2738 } 2739 }else if (PyList_Check(value)){ 2740 for(int i=0; i < CudaNdarray_NDIM(self); i++){ 2741 newstrides[i] = PyInt_AsLong(PyList_GetItem(value, Py_ssize_t(i))); 2742 //newstrides_bytes[i] = newstrides[i] * 4; 2743 } 2744 } 2745 /* 2746 // Do not do this check, as ExtractDiag needs that, and NumPy does not seem 2747 // to do it. 2748 npy_intp dims[PyTuple_Size(value)]; 2749 for(int i=0; i < CudaNdarray_NDIM(self); i++){ 2750 dims[i] = CudaNdarray_HOST_DIMS(self)[i]; 2751 } 2752 if (!PyArray_CheckStrides(4, 2753 CudaNdarray_NDIM(self), 2754 0, 0, 2755 dims, 2756 newstrides_bytes)){ 2757 PyErr_SetString(PyExc_ValueError, "bad new strides"); 2758 return -1; 2759 } 2760 */ 2761 for(int i=0; i < CudaNdarray_NDIM(self); i++){ 2762 CudaNdarray_set_stride(self, i, newstrides[i]); 2763 } 2764 return 0; 2765 } 2766 2767 static PyObject * 2768 CudaNdarray_get_dev_data(CudaNdarray *self, void *closure) 2769 { 2770 float * p = CudaNdarray_DEV_DATA(self); 2771 //printf("get_dev_data %p %li \n", p, (long int)p ); 2772 return PyInt_FromSize_t((size_t) CudaNdarray_DEV_DATA(self)); 2773 } 2774 2775 static int 2776 CudaNdarray_set_dev_data(CudaNdarray *self, PyObject *value, void *closure) 2777 { 2778 Py_ssize_t newdevdata = PyInt_AsSsize_t(value); 2779 //printf("set_dev_data %p %li \n",(float*)newdevdata ,newdevdata); 2780 if (PyErr_Occurred()) 2781 { 2782 return -1; 2783 } 2784 return CudaNdarray_set_device_data(self, (float*)newdevdata, (CudaNdarray*)self->base); 2785 } 2786 2787 static PyObject * 2788 CudaNdarray_get_dtype(CudaNdarray *self, void *closure) 2789 { 2790 return PyString_FromString("float32"); 2791 } 2792 2793 static PyObject * 2794 CudaNdarray_get_ndim(CudaNdarray *self, void *closure) 2795 { 2796 return PyInt_FromLong(self->nd); 2797 } 2798 2799 static PyObject * 2800 CudaNdarray_get_base(CudaNdarray *self, void *closure) 2801 { 2802 PyObject * base = self->base; 2803 if (!base) 2804 { 2805 // We cannot return a NULL pointer, use None instead 2806 base = Py_None; 2807 } 2808 Py_INCREF(base); 2809 return base; 2810 } 2811 2812 void put_in_dict(PyObject * dict, const char * key, int val) 2813 { 2814 PyObject * k = PyString_FromString(key); 2815 PyObject * v = PyInt_FromLong(val); 2816 PyDict_SetItem(dict, k, v); 2817 Py_DECREF(k); 2818 Py_DECREF(v); 2819 } 2820 2821 PyObject * 2822 GetDeviceProperties(PyObject* _unused, PyObject* args) 2823 { 2824 int dev_id = -1; 2825 if (! 
PyArg_ParseTuple(args, "i", &dev_id)) 2826 return NULL; 2827 cudaDeviceProp deviceProp; 2828 cudaGetDeviceProperties(&deviceProp, dev_id); 2829 2830 PyObject * dict = PyDict_New(); 2831 PyObject * str= PyString_FromString("name"); 2832 PyObject * i = PyString_FromString(deviceProp.name); 2833 PyDict_SetItem(dict, str, i); 2834 Py_DECREF(str); 2835 Py_DECREF(i); 2836 2837 put_in_dict(dict, "major", deviceProp.major); 2838 put_in_dict(dict, "minor", deviceProp.minor); 2839 #if CUDART_VERSION >= 2020 2840 int driverVersion = 0, runtimeVersion = 0; 2841 cudaDriverGetVersion(&driverVersion); 2842 cudaRuntimeGetVersion(&runtimeVersion); 2843 put_in_dict(dict, "driverVersion", driverVersion); 2844 put_in_dict(dict, "runtimeVersion", runtimeVersion); 2845 #endif 2846 #if CUDART_VERSION >= 2000 2847 2848 put_in_dict(dict, "multiProcessorCount", deviceProp.multiProcessorCount); 2849 //if ConvertSMVer2Cores is not defined in cuda_runtime_api.h, the run time is too old. 2850 int sm_cores = -1; 2851 if(deviceProp.major==1) 2852 sm_cores = 32; 2853 else if(deviceProp.major==2 && deviceProp.minor==0) 2854 sm_cores = 32; 2855 else if(deviceProp.major==2 && deviceProp.minor==1) 2856 sm_cores = 48; 2857 put_in_dict(dict, "coresCount", sm_cores * deviceProp.multiProcessorCount); 2858 #endif 2859 put_in_dict(dict, "totalConstMem", deviceProp.totalConstMem); 2860 put_in_dict(dict, "sharedMemPerBlock", deviceProp.sharedMemPerBlock); 2861 put_in_dict(dict, "regsPerBlock", deviceProp.regsPerBlock); 2862 put_in_dict(dict, "warpSize", deviceProp.warpSize); 2863 put_in_dict(dict, "maxThreadsPerBlock", deviceProp.maxThreadsPerBlock); 2864 put_in_dict(dict, "maxThreadsDim0", deviceProp.maxThreadsDim[0]); 2865 put_in_dict(dict, "maxThreadsDim1", deviceProp.maxThreadsDim[1]); 2866 put_in_dict(dict, "maxThreadsDim2", deviceProp.maxThreadsDim[2]); 2867 put_in_dict(dict, "maxGridSize0", deviceProp.maxGridSize[0]); 2868 put_in_dict(dict, "maxGridSize1", deviceProp.maxGridSize[1]); 2869 put_in_dict(dict, "maxGridSize2", deviceProp.maxGridSize[2]); 2870 put_in_dict(dict, "memPitch", deviceProp.memPitch); 2871 put_in_dict(dict, "textureAlignment", deviceProp.textureAlignment); 2872 put_in_dict(dict, "clockRate", deviceProp.clockRate); 2873 #if CUDART_VERSION >= 2000 2874 put_in_dict(dict, "deviceOverlap", deviceProp.deviceOverlap); 2875 #endif 2876 #if CUDART_VERSION >= 2020 2877 put_in_dict(dict, "kernelExecTimeoutEnabled", deviceProp.kernelExecTimeoutEnabled); 2878 put_in_dict(dict, "integrated", deviceProp.integrated); 2879 put_in_dict(dict, "canMapHostMemory", deviceProp.canMapHostMemory); 2880 put_in_dict(dict, "computeMode", deviceProp.computeMode); 2881 //in the doc of this fct tell that 0 - Normal mode, 1 - only 1 context, 2 - no context 2882 #endif 2883 #if CUDART_VERSION >= 3000 2884 put_in_dict(dict, "concurrentKernels", deviceProp.concurrentKernels); 2885 #endif 2886 #if CUDART_VERSION >= 3010 2887 put_in_dict(dict, "ECCEnabled", deviceProp.ECCEnabled); 2888 #endif 2889 #if CUDART_VERSION >= 3020 2890 put_in_dict(dict, "tccDriver", deviceProp.tccDriver); 2891 #endif 2892 2893 return dict; 2894 } 2895 2896 /* 2897 * Returns in *free and *total respectively, the free and total amount of memory available for allocation by the device in bytes. 2898 */ 2899 PyObject * 2900 GetDeviceMemInfo(PyObject* _unused, PyObject* dummy) 2901 { 2902 size_t free = 0, total = 0; 2903 if(g_gpu_context_active == 0){ 2904 PyErr_Format(PyExc_RuntimeError, "No gpu device selected yet. 
Please make sure the gpu device was initialized by Theano before."); 2905 return NULL; 2906 } 2907 2908 cudaError_t err = cudaMemGetInfo(&free, &total); 2909 if (err != cudaSuccess){ 2910 // Clear the error flag, cudaMemGetInfo doesn't do it. 2911 // Currently this returns the same thing as err, but if in future 2912 // it returns something else I still don't see why we should ignore 2913 // it. All we want to do here is reset the flag. 2914 cudaGetLastError(); 2915 PyErr_Format(PyExc_RuntimeError, 2916 "Error while getting memory info about the gpu: %s", 2917 cudaGetErrorString(err)); 2918 return NULL; 2919 } 2920 return PyTuple_Pack(2, PyLong_FromLong(free), PyLong_FromLong(total)); 2921 } 2922 2923 /* 2924 * Synchronize with all the gpu device stream. 2925 */ 2926 PyObject * 2927 CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy) 2928 { 2929 CNDA_BEGIN_ALLOW_THREADS 2930 cudaThreadSynchronize(); 2931 CNDA_END_ALLOW_THREADS 2932 Py_INCREF(Py_None); 2933 return Py_None; 2934 } 2935 2936 /* 2937 * Exist and return true if we link with cublas v2. 2938 */ 2939 PyObject * 2940 CudaNdarray_cublasv2(PyObject* _unused, PyObject* dummy) 2941 { 2942 Py_INCREF(Py_True); 2943 return Py_True; 2944 } 2945 2946 PyObject * 2947 CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy) 2948 { 2949 void * rval = NULL; 2950 cudaError_t err; 2951 int num_gpus = 0; 2952 2953 err = cudaGetDeviceCount(&num_gpus); 2954 if (cudaSuccess != err){ 2955 printf("ERR!\\n"); 2956 PyErr_Format(PyExc_RuntimeError, 2957 "Not able to get number of GPUs (%s).", 2958 cudaGetErrorString(err)); 2959 return NULL; 2960 } 2961 2962 for (int device = 0; device < num_gpus; device++) { 2963 cudaSetDevice(device); 2964 err = cudaDeviceSynchronize(); // << CUDA context gets created here. 2965 cudaGetLastError(); // reset the error state 2966 if (cudaSuccess == err) 2967 break; 2968 } 2969 2970 if (cudaSuccess != err){ 2971 printf("ERR!\\n"); 2972 PyErr_Format(PyExc_RuntimeError, 2973 "Not able to select available GPU from %d cards (%s).", 2974 num_gpus, cudaGetErrorString(err)); 2975 return NULL; 2976 } 2977 2978 Py_INCREF(Py_None); 2979 return Py_None; 2980 } 2981 2982 #if COMPUTE_GPU_MEM_USED 2983 /* 2984 * Return the size in bytes that Theano currently have allocated on the gpu. 2985 */ 2986 PyObject * 2987 GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy) 2988 { 2989 PyObject* a = PyLong_FromLong(_allocated_size); 2990 PyObject* b = PyLong_FromLong(_max_allocated_size); 2991 2992 PyObject* tuple = PyTuple_New(2); 2993 PyTuple_SetItem(tuple, 0, a); 2994 PyTuple_SetItem(tuple, 1, b); 2995 return tuple; 2996 } 2997 #endif 2998 2999 static PyGetSetDef CudaNdarray_getset[] = { 3000 {"shape", 3001 (getter)CudaNdarray_get_shape, 3002 (setter)CudaNdarray_set_shape, 3003 "shape of this ndarray (tuple)", 3004 NULL}, 3005 {"_strides", 3006 (getter)CudaNdarray_get_strides, 3007 (setter)CudaNdarray_set_strides, 3008 "data pointer strides (in elements)", 3009 NULL}, 3010 {"strides", 3011 (getter)CudaNdarray_get_strides, 3012 (setter)CudaNdarray_set_strides, 3013 "data pointer strides (in elements)", 3014 NULL}, 3015 //gpudata is needed to allow calling pycuda fct with CudaNdarray input. 3016 {"gpudata", 3017 (getter)CudaNdarray_get_dev_data, 3018 NULL, 3019 "device data pointer", 3020 NULL}, 3021 {"_dev_data", 3022 (getter)CudaNdarray_get_dev_data, 3023 (setter)CudaNdarray_set_dev_data, 3024 "device data pointer", 3025 NULL}, 3026 {"dtype", 3027 (getter)CudaNdarray_get_dtype, 3028 NULL, 3029 "The dtype of the element. 
Now always float32", 3030 NULL}, 3031 {"size", 3032 (getter)CudaNdarray_SIZE_Object, 3033 NULL, 3034 "The number of elements in this object.", 3035 NULL}, 3036 //mem_size is neede for pycuda.elementwise.ElementwiseKernel Why do they use size and mem_size of the same value? 3037 {"mem_size", 3038 (getter)CudaNdarray_SIZE_Object, 3039 NULL, 3040 "The number of elements in this object.", 3041 NULL}, 3042 {"ndim", 3043 (getter)CudaNdarray_get_ndim, 3044 NULL, 3045 "The number of dimensions in this object.", 3046 NULL}, 3047 {"base", 3048 (getter)CudaNdarray_get_base, 3049 NULL, 3050 "If this ndarray is a view, base is the original ndarray.", 3051 NULL}, 3052 3053 {NULL, NULL, NULL, NULL} /* Sentinel */ 3054 }; 3055 3056 PyObject *CudaNdarray_repr(PyObject *self) 3057 { 3058 CudaNdarray *object = (CudaNdarray *)self; 3059 PyObject * np_object = CudaNdarray_CreateArrayObj(object); 3060 PyObject * str = PyObject_Str((PyObject *) np_object); 3061 char * cstr = PyString_AsString(str); 3062 PyObject * out = PyString_FromFormat("%s%s%s", 3063 "CudaNdarray(", 3064 cstr, 3065 ")"); 3066 Py_DECREF(str); 3067 Py_DECREF(np_object); 3068 #if PY_MAJOR_VERSION >= 3 3069 // In Python 3 PyString_FromFormat return a Bytes object 3070 PyObject* out2 = PyObject_Str(out); 3071 Py_DECREF(out); 3072 return out2; 3073 #endif 3074 return out; 3075 } 3076 3077 static PyTypeObject CudaNdarrayType = 3078 { 3079 #if PY_MAJOR_VERSION >= 3 3080 PyVarObject_HEAD_INIT(NULL, 0) 3081 #else 3082 PyObject_HEAD_INIT(NULL) 3083 0, /*ob_size*/ 3084 #endif 3085 "CudaNdarray", /*tp_name*/ 3086 sizeof(CudaNdarray), /*tp_basicsize*/ 3087 0, /*tp_itemsize*/ 3088 (destructor)CudaNdarray_dealloc, /*tp_dealloc*/ 3089 0, /*tp_print*/ 3090 0, /*tp_getattr*/ 3091 0, /*tp_setattr*/ 3092 0, /*tp_compare*/ 3093 CudaNdarray_repr, /*tp_repr*/ 3094 &CudaNdarrayNumberMethods, /*tp_as_number*/ 3095 0, /*tp_as_sequence*/ 3096 &CudaNdarrayMappingMethods,/*tp_as_mapping*/ 3097 0, /*tp_hash */ 3098 0, /*tp_call*/ 3099 0, /*tp_str*/ 3100 0, /*tp_getattro*/ 3101 0, /*tp_setattro*/ 3102 0, /*tp_as_buffer*/ 3103 #if PY_MAJOR_VERSION >= 3 3104 // Py_TPFLAGS_CHECKTYPES is always true and was removed in Python 3. 3105 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ 3106 #else 3107 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/ 3108 #endif 3109 "CudaNdarray objects", /* tp_doc */ 3110 0, /* tp_traverse */ 3111 0, /* tp_clear */ 3112 0, /* tp_richcompare */ 3113 0, /* tp_weaklistoffset */ 3114 0, /* tp_iter */ 3115 0, /* tp_iternext */ 3116 CudaNdarray_methods, /* tp_methods */ 3117 CudaNdarray_members, /* tp_members */ 3118 CudaNdarray_getset, /* tp_getset */ 3119 0, /* tp_base */ 3120 0, /* tp_dict */ 3121 0, /* tp_descr_get */ 3122 0, /* tp_descr_set */ 3123 0, /* tp_dictoffset */ 3124 (initproc)CudaNdarray_init,/* tp_init */ 3125 0, /* tp_alloc */ 3126 CudaNdarray_new, /* tp_new */ 3127 }; 3128 3129 static __global__ void get_gpu_ptr_size(int* dst) 3130 { 3131 dst[0] = sizeof(float*); 3132 dst[1] = sizeof(int); 3133 } 3134 3135 PyObject * 3136 CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args) 3137 { 3138 int *gpu_data = (int*)device_malloc(sizeof(int)*2); 3139 if(gpu_data == NULL){ 3140 return NULL; 3141 } 3142 get_gpu_ptr_size<<<1,1>>>(gpu_data); 3143 3144 cudaError_t cudaErr = cudaGetLastError(); 3145 if (cudaSuccess != cudaErr){ 3146 3147 device_free(gpu_data); 3148 return PyErr_Format(PyExc_RuntimeError, 3149 "CudaNdarray_ptr_int_size: error when calling the gpu code. 
(%s)", 3150 cudaGetErrorString(cudaErr)); 3151 } 3152 3153 // Transfer the result to cpu 3154 int gpu_sizes[] = {-1,-1}; 3155 cublasStatus_t err; 3156 err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1); 3157 device_free(gpu_data); 3158 3159 if (CUBLAS_STATUS_SUCCESS != err){ 3160 PyErr_SetString(PyExc_RuntimeError, "error copying data to from memory"); 3161 return NULL; 3162 } 3163 return Py_BuildValue("iiii", (int) gpu_sizes[0], (int)sizeof(float*), 3164 (int)sizeof(int), (int) gpu_sizes[1]); 3165 } 3166 3167 static int cublas_init(); 3168 static void cublas_shutdown(); 3169 // Initialize the gpu. 3170 // Takes two optional parameters, the device number and if we should use cnmem. 3171 // If the device number is provided, it sets that device to be the active device. 3172 // If not provided (usually just to test whether the gpu is available at all), 3173 // it does not set an active device. 3174 // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed. 3175 // cnmem is threaded like a bool. If converted to 0, don't use cnmem. Otherwise, use it. 3176 PyObject * 3177 CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) 3178 { 3179 int card_nb = 0; 3180 int card_number_provided = 1; 3181 float cnmem = 0; // Theano flag lib.cnmem 3182 // if we're given something wildly invalid, this will throw a TypeError 3183 if(!PyArg_ParseTuple(args, "|if", &card_nb, &cnmem)) 3184 return NULL; 3185 if(cnmem) 3186 g_use_cnmem = true; 3187 3188 if(PyTuple_Size(args) == 0) { 3189 card_number_provided = 0; 3190 card_nb = 0; 3191 } 3192 3193 int deviceCount; 3194 cudaError err = cudaGetDeviceCount(&deviceCount); 3195 if(cudaSuccess != err) { 3196 return PyErr_Format(PyExc_EnvironmentError, 3197 "Unable to get the number of gpus available: %s", 3198 cudaGetErrorString(cudaGetLastError())); 3199 } 3200 3201 // as soon as the first successful call to a cuda* function is made, a 3202 // gpu context has been created 3203 g_gpu_context_active = 1; 3204 3205 if(deviceCount <= 0) { 3206 return PyErr_Format(PyExc_EnvironmentError, 3207 "Can't use the GPU, no devices support CUDA"); 3208 } 3209 if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) { 3210 return PyErr_Format(PyExc_ValueError, 3211 "Bad device number %d. Only %d devices available.", 3212 card_nb, 3213 deviceCount); 3214 } 3215 3216 cudaDeviceProp deviceProp; 3217 err = cudaGetDeviceProperties(&deviceProp, card_nb); 3218 if(cudaSuccess != err) { 3219 return PyErr_Format(PyExc_EnvironmentError, 3220 "Unable to get properties of gpu %i: %s", 3221 card_nb, 3222 cudaGetErrorString(cudaGetLastError())); 3223 } 3224 3225 if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){ 3226 return PyErr_Format(PyExc_EnvironmentError, 3227 "There is no device that supports CUDA"); 3228 } 3229 3230 if(card_number_provided) { 3231 err = cudaSetDevice(card_nb); 3232 if(cudaSuccess != err) { 3233 return PyErr_Format(PyExc_EnvironmentError, 3234 "Unable to set device %i: %s", 3235 card_nb, 3236 cudaGetErrorString(cudaGetLastError())); 3237 } 3238 if (cublas_init() == -1) 3239 return NULL; 3240 } 3241 if(card_number_provided && g_use_cnmem) { 3242 size_t mem = 0; 3243 if (cnmem > 1) 3244 mem = cnmem * 1024 * 1024; 3245 else{ 3246 // Clip to 95% to let memory for the driver. 3247 // 98% didn't worked in some cases. 
            if (cnmem > .95){
                cnmem = .95;
            }
            size_t free = 0, total = 0;
            cudaError_t err = cudaMemGetInfo(&free, &total);
            if (err != cudaSuccess){
                // Clear the error flag, cudaMemGetInfo doesn't do it.
                // Currently this returns the same thing as err, but if in future
                // it returns something else I still don't see why we should ignore
                // it. All we want to do here is reset the flag.
                cudaGetLastError();
                PyErr_Format(PyExc_RuntimeError,
                             "Error while getting memory info about the gpu: %s",
                             cudaGetErrorString(err));
                return NULL;
            }
            mem = total * cnmem;
        }
        if(initCnmem(card_number_provided, card_nb, mem) == -1){
            return NULL;
        }
    }

    Py_INCREF(Py_None);
    return Py_None;
}

PyObject *
CudaNdarray_active_device_number(PyObject* _unused, PyObject* _unused_args) {
    // NB: No cuda error checking here; keeps things simple, and it's not
    // really necessary.
    int currentDevice;
    cudaGetDevice(&currentDevice);
    return PyInt_FromLong(currentDevice);
}

PyObject *
CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
    // NB: No cuda error checking here; keeps things simple, and it's not
    // really necessary.
    int currentDevice;
    cudaGetDevice(&currentDevice);

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, currentDevice);
    return PyString_FromString(deviceProp.name);
}

PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
    // Don't handle errors here
    cublas_shutdown();
    g_gpu_context_active = 0; // context has now been closed down
    if(g_use_cnmem) {
        cnmemStatus_t status = cnmemFinalize();
        if(status != CNMEM_STATUS_SUCCESS) {
            fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
                    cnmemGetErrorString(status));
            if(status == CNMEM_STATUS_CUDA_ERROR) {
                fprintf(stderr, "  Cuda-Reason=%s\n",
                        cudaGetErrorString(cudaGetLastError()));
            }
        }
    }

    Py_INCREF(Py_None);
    return Py_None;
}

/*
 * This function is tested in theano/misc/test_pycuda_theano_simple.py
 */
PyObject *
CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
{
    int verbose = 0;
    PyObject *gpu_ptr = NULL;
    PyObject *shapes = NULL;
    PyObject *strides = NULL;
    PyObject *base = NULL;
    PyObject *rval = NULL;

    //args should consist of 4 python objects:
    //The first is the gpu ptr
    //The second is the shape
    //The third is the strides
    //The fourth is the base
    if (!
PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base)) 3335 return NULL; 3336 3337 if (verbose) printf("In CudaNdarray_from_gpu_pointer\n"); 3338 if (!PyLong_Check(gpu_ptr)) 3339 { 3340 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointor is not an long"); 3341 return NULL; 3342 } 3343 3344 Py_ssize_t nd = PyObject_Length(shapes); 3345 if (nd < 0) 3346 { 3347 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument"); 3348 return NULL; 3349 } 3350 Py_ssize_t nd_stride = PyObject_Length(strides); 3351 if (nd_stride < 0) 3352 { 3353 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument"); 3354 return NULL; 3355 } 3356 3357 if (nd != nd_stride) 3358 { 3359 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides"); 3360 return NULL; 3361 } 3362 3363 rval = CudaNdarray_New(); 3364 3365 if (CudaNdarray_set_nd((CudaNdarray *)rval, nd)) 3366 { 3367 //CudaNdarray_set_nd set the error msg 3368 return NULL; 3369 } 3370 // set gpu pointeur 3371 assert(((CudaNdarray *)rval)->data_allocated == 0); 3372 if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base)) 3373 { 3374 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointor"); 3375 return NULL; 3376 3377 } 3378 3379 // Set dims and strides 3380 for (int i = nd-1; i >= 0; --i) 3381 { 3382 PyObject * idx = PyLong_FromLong(i); 3383 if (idx == NULL) 3384 { 3385 PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple"); 3386 return NULL; 3387 } 3388 PyObject* dim_ = PyObject_GetItem(shapes, idx); 3389 PyObject* strd_ = PyObject_GetItem(strides, idx); 3390 if (!PyInt_Check(dim_)) 3391 { 3392 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i); 3393 return NULL; 3394 } 3395 if (!PyInt_Check(strd_)) 3396 { 3397 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i); 3398 return NULL; 3399 } 3400 int dim = PyInt_AsLong(dim_); 3401 int strd = PyInt_AsLong(strd_); 3402 CudaNdarray_set_stride((CudaNdarray *)rval, i, strd); 3403 CudaNdarray_set_dim((CudaNdarray *)rval, i, dim); 3404 Py_DECREF(idx); 3405 Py_DECREF(dim_); 3406 Py_DECREF(strd_); 3407 } 3408 if (verbose) printf("CudaNdarray_from_gpu_pointer normal return\n"); 3409 return rval; 3410 } 3411 3412 PyObject * 3413 CudaNdarray_Dot(PyObject* _unused, PyObject* args) 3414 { 3415 PyObject *l=NULL; 3416 PyObject *r=NULL; 3417 PyObject * rval = NULL; 3418 3419 //args should consist of two python objects ("OO") 3420 if (! 
PyArg_ParseTuple(args, "OO", &l, &r)) 3421 return NULL; 3422 3423 if (!CudaNdarray_Check(l) || !CudaNdarray_Check(r)) 3424 { 3425 PyErr_SetString(PyExc_TypeError, "CudaNdarray arguments required "); 3426 goto CudaNdarray_dot_fail; 3427 } 3428 if (((CudaNdarray*)l)->nd != 2) 3429 { 3430 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now"); 3431 goto CudaNdarray_dot_fail; 3432 } 3433 if (((CudaNdarray*)r)->nd != 2) 3434 { 3435 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now"); 3436 goto CudaNdarray_dot_fail; 3437 } 3438 rval = CudaNdarray_New(); 3439 if (!rval) 3440 { 3441 goto CudaNdarray_dot_fail; 3442 } 3443 int dims[2]; 3444 dims[0] = CudaNdarray_HOST_DIMS((CudaNdarray*)l)[0]; 3445 dims[1] = CudaNdarray_HOST_DIMS((CudaNdarray*)r)[1]; 3446 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, 2, dims)) 3447 { 3448 goto CudaNdarray_dot_fail; 3449 } 3450 if (CudaNdarray_gemm(1.0, (CudaNdarray*)l, (CudaNdarray*)r, 0.0, (CudaNdarray*)rval)) 3451 { 3452 goto CudaNdarray_dot_fail; 3453 } 3454 3455 return rval; 3456 3457 CudaNdarray_dot_fail: 3458 Py_XDECREF(rval); 3459 return NULL; 3460 } 3461 3462 static PyObject * 3463 filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage) 3464 { 3465 /* 3466 * TODO: DOC what this function should do in the various cases of 3467 * What is 'strict' supposed to mean in the context of this function? 3468 * What do we do with input that could be interpreted as matching the broadcastable pattern in strict vs. non-strict cases? 3469 * 3470 */ 3471 PyObject *py_data=NULL; 3472 PyArrayObject * data = NULL; 3473 int strict = 0; 3474 PyObject * broadcastable=NULL; 3475 PyObject * storage=NULL; 3476 CudaNdarray * rval=NULL; 3477 3478 //Python object references which are provided to the caller are borrowed references 3479 if (!PyArg_ParseTuple(args, "OOiO", &py_data, &broadcastable, &strict, &storage)) return NULL; 3480 3481 if (!PyTuple_Check(broadcastable)){ 3482 PyErr_SetString(PyExc_TypeError, "broadcastable arg should be a tuple of int."); 3483 return NULL; 3484 } 3485 Py_INCREF(py_data); 3486 Py_INCREF(broadcastable); 3487 3488 CudaNdarray * cnda = (CudaNdarray*)py_data; 3489 3490 if (strict || CudaNdarray_Check(py_data)) 3491 { 3492 //TODO: support non-strict "casting" from a vt to the broadcastable/type/size that we need. 
3493 if (!CudaNdarray_Check(py_data)) 3494 { 3495 Py_DECREF(py_data); 3496 Py_DECREF(broadcastable); 3497 PyErr_SetString(PyExc_TypeError, "strict mode requires CudaNdarray"); 3498 return NULL; 3499 } 3500 if (cnda->nd != PyTuple_Size(broadcastable)) 3501 { 3502 Py_DECREF(py_data); 3503 Py_DECREF(broadcastable); 3504 PyErr_Format(PyExc_TypeError, "Wrong rank: %i vs %li", cnda->nd, (long)PyTuple_Size(broadcastable)); 3505 return NULL; 3506 } 3507 for (int i = 0; i < cnda->nd; ++i) 3508 { 3509 if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i)))) 3510 { 3511 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i); 3512 Py_DECREF(py_data); 3513 Py_DECREF(broadcastable); 3514 return NULL; 3515 }else if (CudaNdarray_HOST_DIMS(cnda)[i] == 1 && CudaNdarray_HOST_STRIDES(cnda)[i] != 0){ 3516 PyErr_Format(PyExc_TypeError, "Non-zeros strides(%d) on dimension %d of size 1", 3517 CudaNdarray_HOST_STRIDES(cnda)[i], i); 3518 Py_DECREF(py_data); 3519 Py_DECREF(broadcastable); 3520 return NULL; 3521 } 3522 } 3523 Py_DECREF(broadcastable); 3524 return py_data; 3525 } 3526 else 3527 { 3528 data = (PyArrayObject*)PyArray_FromObject(py_data, REAL_TYPENUM, PyTuple_Size(broadcastable), PyTuple_Size(broadcastable)); 3529 if (!data) 3530 { 3531 //err message already defined 3532 Py_DECREF(py_data); 3533 Py_DECREF(broadcastable); 3534 return NULL; 3535 } 3536 for (int i = 0; i < PyArray_NDIM(data); ++i) 3537 { 3538 if ((PyArray_DIMS(data)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i)))) 3539 { 3540 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable dimension %i", i); 3541 Py_DECREF(data); 3542 Py_DECREF(py_data); 3543 Py_DECREF(broadcastable); 3544 return NULL; 3545 } 3546 } 3547 if (storage && CudaNdarray_Check(storage)) 3548 { 3549 rval = (CudaNdarray*) storage; 3550 Py_INCREF(rval); 3551 } 3552 else 3553 { 3554 rval = (CudaNdarray*) CudaNdarray_New(); 3555 } 3556 if (rval) 3557 { 3558 if (CudaNdarray_CopyFromArray(rval, data)) 3559 { 3560 Py_DECREF(rval); 3561 rval = NULL; 3562 } 3563 } 3564 Py_DECREF(data); 3565 Py_DECREF(py_data); 3566 Py_DECREF(broadcastable); 3567 return (PyObject*)rval; 3568 } 3569 } 3570 3571 //TODO-- CudaNdarray_Dot and CudaNdarray_active_device_name are following different capitalization conventions. 
3572 // Pick one and standardize it, this file is already annoying enough to grep through 3573 static PyMethodDef module_methods[] = { 3574 {"dimshuffle", CudaNdarray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a CudaNdarray."}, 3575 {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."}, 3576 {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."}, 3577 {"select_a_gpu", CudaNdarray_select_a_gpu, METH_NOARGS, "Call this method if you want to select a GPU before gpu_init call and let the driver choose the GPU."}, 3578 {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."}, 3579 {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."}, 3580 {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."}, 3581 {"device_properties", GetDeviceProperties, METH_VARARGS, "Return a dictionary with the device properties."}, 3582 {"mem_info", GetDeviceMemInfo, METH_NOARGS, "Return a tuple with the free and total memory on the gpu in bytes."}, 3583 #if COMPUTE_GPU_MEM_USED 3584 {"theano_allocated", GetTheanoAllocInfo, METH_NOARGS, "Return the size in bytes of memory Theano currently have allocated on the gpu."}, 3585 #endif 3586 {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."}, 3587 {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."}, 3588 {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, 3589 {"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from already allocated memory on the gpu.(example by pycuda)"}, 3590 {"synchronize", CudaNdarray_synchronize, METH_NOARGS, "Used to synchronize the device"}, 3591 {"cublas_v2", CudaNdarray_cublasv2, METH_NOARGS, 3592 "Used to know if this version of cuda_ndarray is linked with cublas v2."}, 3593 {NULL, NULL, NULL, NULL} /* Sentinel */ 3594 }; 3595 3596 #define CNDA_MOD_NAME "cuda_ndarray" 3597 #define CNDA_DOCSTRING "CUDA implementation of a numpy ndarray-like object." 3598 3599 #if PY_MAJOR_VERSION == 3 3600 static struct PyModuleDef cuda_ndarray_moduledef = 3601 { 3602 PyModuleDef_HEAD_INIT, 3603 CNDA_MOD_NAME, 3604 CNDA_DOCSTRING, 3605 -1, /* size of per-interpreter state of the module, 3606 or -1 if the module keeps state in global variables. 
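                        (This module does keep state in C globals such as `handle`
                        and `g_gpu_context_active`, hence the -1 here.)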
*/ 3607 module_methods 3608 }; 3609 3610 PyMODINIT_FUNC 3611 PyInit_cuda_ndarray(void) 3612 #else 3613 PyMODINIT_FUNC 3614 initcuda_ndarray(void) 3615 #endif 3616 { 3617 import_array(); 3618 3619 PyObject* m; 3620 3621 if (PyType_Ready(&CudaNdarrayType) < 0) { 3622 #if PY_MAJOR_VERSION == 3 3623 return NULL; 3624 #else 3625 return; 3626 #endif 3627 } 3628 3629 #if PY_MAJOR_VERSION == 3 3630 m = PyModule_Create(&cuda_ndarray_moduledef); 3631 #else 3632 m = Py_InitModule3(CNDA_MOD_NAME, module_methods, CNDA_DOCSTRING); 3633 #endif 3634 3635 if (m == NULL) { 3636 #if PY_MAJOR_VERSION == 3 3637 return NULL; 3638 #else 3639 return; 3640 #endif 3641 } 3642 3643 Py_INCREF(&CudaNdarrayType); 3644 PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType); 3645 #if COMPUTE_GPU_MEM_USED 3646 for(int i=0;ind = 0; 3705 } 3706 else if (nd > 0) 3707 { 3708 if (CudaNdarray_set_nd(self, nd)) 3709 { 3710 Py_DECREF(self); 3711 return NULL; 3712 } 3713 } 3714 ++_outstanding_mallocs[1]; 3715 return (PyObject *)self; 3716 } 3717 3718 3719 3720 ////////////////////////////// 3721 // 3722 // Published helper functions 3723 // 3724 ////////////////////////////// 3725 3726 static int 3727 cublas_init() 3728 { 3729 cublasStatus_t err; 3730 err = cublasCreate(&handle); 3731 if (CUBLAS_STATUS_SUCCESS != err) 3732 { 3733 if(CUBLAS_STATUS_NOT_INITIALIZED == err) 3734 PyErr_SetString(PyExc_RuntimeError, 3735 "cublasCreate() returned this error " 3736 "'the CUDA Runtime initialization failed'"); 3737 else if(CUBLAS_STATUS_ALLOC_FAILED == err) 3738 PyErr_SetString(PyExc_RuntimeError, 3739 "cublasCreate() returned this error " 3740 "'the resources could not be allocated'"); 3741 else 3742 PyErr_SetString(PyExc_RuntimeError, 3743 "unknow error during returned by cublasCreate()"); 3744 return -1; 3745 } 3746 // Set the default stream as the one to execute on (default) 3747 cublasSetStream(handle, NULL); 3748 // Pointer to scalars are on the host (also default) 3749 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); 3750 #if CUDA_VERSION >= 5000 3751 // atomics can be used in kernels to speed up operations (not default) 3752 // This may lead to a slight variance from run to run in some operations 3753 cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED); 3754 #endif 3755 return 0; 3756 } 3757 3758 static void 3759 cublas_shutdown() 3760 { 3761 if (handle != NULL) 3762 cublasDestroy(handle); 3763 // No point in handling any errors here 3764 handle = NULL; 3765 } 3766 3767 int 3768 CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj) 3769 { 3770 int err = CudaNdarray_alloc_contiguous(self, PyArray_NDIM(obj), 3771 PyArray_DIMS(obj)); 3772 if (err) { 3773 return err; 3774 } 3775 3776 int typenum = PyArray_TYPE(obj); 3777 if (typenum != REAL_TYPENUM) 3778 { 3779 PyErr_SetString(PyExc_TypeError, "can only copy from float arrays"); 3780 return -1; 3781 } 3782 assert( 4 == PyArray_ITEMSIZE(obj)); 3783 PyArrayObject * py_src = (PyArrayObject *)PyArray_ContiguousFromAny( 3784 (PyObject*)obj, typenum, self->nd, self->nd); 3785 if (!py_src) { 3786 return -1; 3787 } 3788 npy_intp py_src_size = PyArray_SIZE(py_src); 3789 void *py_src_data = PyArray_DATA(py_src); 3790 cudaError_t cerr; 3791 CNDA_BEGIN_ALLOW_THREADS; 3792 cerr = cudaMemcpy(self->devdata, py_src_data, 3793 py_src_size * sizeof(real), 3794 cudaMemcpyHostToDevice); 3795 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway 3796 CNDA_END_ALLOW_THREADS; 3797 if (cudaSuccess != cerr) 3798 { 3799 PyErr_Format(PyExc_RuntimeError, 3800 "Cuda 
error '%s' while copying %lli data element" 3801 " to device memory. str ptr=%p. dst ptr=%p", 3802 cudaGetErrorString(cerr), 3803 (long long)py_src_size, 3804 py_src_data, 3805 self->devdata); 3806 Py_DECREF(py_src); 3807 return -1; 3808 } 3809 Py_DECREF(py_src); 3810 return 0; 3811 } 3812 3813 PyObject * 3814 CudaNdarray_new_nd(int nd) 3815 { 3816 CudaNdarray * rval = (CudaNdarray*) CudaNdarray_New(); 3817 if (!rval || CudaNdarray_set_nd(rval, nd)) 3818 { 3819 Py_XDECREF(rval); 3820 rval = NULL; 3821 } 3822 return (PyObject *) rval; 3823 } 3824 3825 3826 /** 3827 * Initialize 'self' as a view of 'base', with memory storage 'data' 3828 */ 3829 3830 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base) 3831 { 3832 if (self->data_allocated) 3833 { 3834 assert(self->devdata); 3835 if (device_free(self->devdata)) 3836 { 3837 self->devdata = NULL; 3838 self->data_allocated = 0; 3839 return -1; 3840 } 3841 } 3842 // Get the original base object (base.base.base...) 3843 PyObject * orig_base = base; 3844 // base is not always a CudaNdarray. It can be a GpuArray from pycuda, ... 3845 while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base) 3846 { 3847 // base_base is itself a view 3848 orig_base = ((CudaNdarray*) orig_base)->base; 3849 } 3850 //N.B. XDECREF and XINCREF are no-ops for NULL pointers 3851 if (self->base != orig_base) 3852 { 3853 Py_XDECREF(self->base); 3854 self->base = orig_base; 3855 Py_XINCREF(self->base); 3856 } 3857 self->data_allocated = 0; 3858 self->devdata = data; 3859 return 0; 3860 } 3861 3862 static __global__ void k_copy_1d(const int N, const float * x, const int sx, float * y, const int sy) 3863 { 3864 for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x) 3865 { 3866 y[i*sy] = x[i*sx]; 3867 } 3868 } 3869 3870 // N1 through N4 are the size of y 3871 static __global__ void k_copy_4d(const int N1, 3872 const int N2, const int N3, const int N4, 3873 const float * x, const int sx1, const int sx2, const int sx3, 3874 const int sx4, float * y, const int sy1, const int sy2, 3875 const int sy3, const int sy4) 3876 { 3877 // These must be made int instead of unsigned int due to a bug in nvcc 3878 int bx = blockIdx.x; 3879 int by = blockIdx.y; 3880 3881 for (int i = bx; i < N1; i += gridDim.x) 3882 { 3883 for (int j = by; j < N2; j += gridDim.y) 3884 { 3885 for (int k = threadIdx.x; k < N3; k += (int) blockDim.x) 3886 { 3887 for (int l = threadIdx.y; l < N4; l += (int) blockDim.y) 3888 { 3889 y[i * sy1 + j * sy2 + k * sy3 + l * sy4] = 3890 x[i * sx1 + j * sx2 + k * sx3 + l * sx4]; 3891 } 3892 } 3893 } 3894 } 3895 } 3896 3897 //copy from other into self 3898 int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, 3899 const CudaNdarray * other, 3900 bool unbroadcast) 3901 { 3902 int verbose = 0; 3903 if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n"); 3904 3905 //standard elemwise size checks 3906 if (self->nd == -1) 3907 { 3908 PyErr_SetString(PyExc_TypeError, 3909 "can't copy into un-initialized CudaNdarray"); 3910 return -1; 3911 } 3912 CudaNdarray * new_other = NULL; 3913 3914 if (self->nd < other->nd) 3915 { 3916 PyErr_Format(PyExc_NotImplementedError, 3917 "CudaNdarray_CopyFromCudaNdarray: The number of dimensions of the " 3918 "destination needs to be >= the number of dimensions of the " 3919 "source. 
Got %d and %d.", self->nd, other->nd);
        return -1;
    }
    else if (self->nd != other->nd)
    {
        new_other = (CudaNdarray *) CudaNdarray_View(other);
        int added_dims = self->nd - other->nd;
        int* pattern = (int*) alloca(self->nd * sizeof(int));
        for(int i = 0; i < added_dims; i++)
            pattern[i] = -1;
        for(int i = 0; i < other->nd; i++)
            pattern[i + added_dims] = i;
        CudaNdarray_dimshuffle(new_other, self->nd, pattern);
        other = new_other;
    }
    assert(self->nd == other->nd);
    //standard elemwise dim checks (also compute total size)
    unsigned int size = 1;
    unsigned int size_source = 1;
    for (int i = 0; i < self->nd; ++i)
    {
        if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
            && (1 != CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast))
        {
            PyErr_Format(PyExc_ValueError,
                         "CudaNdarray_CopyFromCudaNdarray:"
                         " need same dimensions for dim %d,"
                         " destination=%d, source=%d",
                         i, CudaNdarray_HOST_DIMS(self)[i],
                         CudaNdarray_HOST_DIMS(other)[i]);
            Py_XDECREF(new_other);
            return -1;
        }
        size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
        size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
    }
    if (0 == size)
    {
        Py_XDECREF(new_other);
        return 0; //nothing to copy, we're done.
    }
    if (CudaNdarray_is_c_contiguous(self) &&
        CudaNdarray_is_c_contiguous(other) &&
        size == size_source)
    {
        if (verbose)
            fprintf(stderr, "Copying contiguous vector with cublasScopy\n");

        cublasStatus_t err;
        err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
                          CudaNdarray_DEV_DATA(self), 1);
        CNDA_THREAD_SYNC;
        Py_XDECREF(new_other);
        if (CUBLAS_STATUS_SUCCESS != err)
        {
            PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
            return -1;
        }
        return 0;
    }
    //TODO: rewrite these copy operations to be more efficient.
    //      See, for example, the transpose example in the cuda_sdk.
    switch (self->nd)
    {
        case 0: // scalar
        {
            // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
            assert(0);
        }; break;
        case 1: // vector
        {
            if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
            if (verbose) fprint_CudaNdarray(stderr, other);
            unsigned int n_blocks = std::min(size,
                                             (unsigned int)NUM_VECTOR_OP_BLOCKS);
            unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
                                              (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
            k_copy_1d<<<n_blocks, n_threads>>>(size,
                                               CudaNdarray_DEV_DATA(other),
                                               CudaNdarray_HOST_STRIDES(other)[0],
                                               CudaNdarray_DEV_DATA(self),
                                               CudaNdarray_HOST_STRIDES(self)[0]);
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %s: %s. (n_blocks=%i,"
                             " n_threads_per_block=%i)\n", "k_copy_1d",
                             cudaGetErrorString(err), n_blocks, n_threads);
                Py_XDECREF(new_other);
                return -1;
            }
        }; break;
        case 4: // 4-tensor
        {
            if (verbose)
            {
                if (0 != fprint_CudaNdarray(stderr, other))
                {
                    Py_XDECREF(new_other);
                    return -1;
                }
            }

            // The blocks implement the looping over the first two axes so
            // this needs to be (N1, N2)
            dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(self)[0],
                                   NUM_VECTOR_OP_BLOCKS),
                          std::min(CudaNdarray_HOST_DIMS(self)[1],
                                   NUM_VECTOR_OP_BLOCKS));
            // For the threads, just make as many as possible
            dim3 n_threads(std::min((unsigned int) CudaNdarray_HOST_DIMS(self)[2],
                                    (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
                           std::min((unsigned int) CudaNdarray_HOST_DIMS(self)[3],
                                    (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));

            n_threads.x = std::min((unsigned int) 32, (unsigned int) n_threads.x);
            n_threads.y = std::min(n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);

            k_copy_4d<<<n_blocks, n_threads>>>(
                    // size of y
                    (unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
                    (unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
                    (unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
                    (unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
                    CudaNdarray_DEV_DATA(other), // x
                    // x strides
                    CudaNdarray_HOST_STRIDES(other)[0],
                    CudaNdarray_HOST_STRIDES(other)[1],
                    CudaNdarray_HOST_STRIDES(other)[2],
                    CudaNdarray_HOST_STRIDES(other)[3],
                    CudaNdarray_DEV_DATA(self), // y
                    // y strides
                    CudaNdarray_HOST_STRIDES(self)[0],
                    CudaNdarray_HOST_STRIDES(self)[1],
                    CudaNdarray_HOST_STRIDES(self)[2],
                    CudaNdarray_HOST_STRIDES(self)[3]
            );
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %s: %s.",
                             "k_copy_4d",
                             cudaGetErrorString(err));
                Py_XDECREF(new_other);
                return -1;
            }
        }; break;
        default:
        {
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err){
                PyErr_Format(PyExc_RuntimeError,
                             "Unexpected Cuda error: %s: %s\n",
                             "CudaNdarray_CopyFromCudaNdarray",
                             cudaGetErrorString(err));
                Py_XDECREF(new_other);
                return -1;
            }

            if (verbose)
                fprintf(stderr,
                        "Copying with default version unbroadcast=%d\n",
                        unbroadcast);
            // call worker routine
            unsigned int threads_per_block = std::min(size,
                                                      (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
            unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
                                             (unsigned int)NUM_VECTOR_OP_BLOCKS);
            const CudaNdarray * cuda_dims = other;
            if (unbroadcast)
                cuda_dims = self;
            //copy from other into self
            k_elemwise_unary_rowmajor_copy<<<n_blocks, threads_per_block>>>(
                    size,
                    (unsigned int)other->nd,
                    (const int *)CudaNdarray_DEV_DIMS(cuda_dims),
                    (const float*)CudaNdarray_DEV_DATA(other),
                    (const int *)CudaNdarray_DEV_STRIDES(other),
                    CudaNdarray_DEV_DATA(self),
                    (const int *)CudaNdarray_DEV_STRIDES(self));
            CNDA_THREAD_SYNC;
            err = cudaGetLastError();
            if (verbose > 1)
                fprintf(stderr,
                        "INFO k_elemwise_unary_rowmajor_copy (n_blocks=%i,"
                        " n_threads_per_block=%i)\n",
                        n_blocks, threads_per_block);
            if (cudaSuccess != err)
            {
                //fprint_CudaNdarray(stderr, self);
                //fprint_CudaNdarray(stderr, other);
PyErr_Format(PyExc_RuntimeError, 4115 "Cuda error: %s: %s. (n_blocks=%i," 4116 " n_threads_per_block=%i)\n", 4117 "k_elemwise_unary_rowmajor_copy", 4118 cudaGetErrorString(err), n_blocks, 4119 threads_per_block); 4120 Py_XDECREF(new_other); 4121 return -1; 4122 } 4123 } 4124 }; 4125 Py_XDECREF(new_other); 4126 return 0; 4127 } 4128 4129 int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C) 4130 { 4131 if (A->nd != 2) 4132 { 4133 PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm"); 4134 return -1; 4135 } 4136 if (B->nd != 2) 4137 { 4138 PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm"); 4139 return -1; 4140 } 4141 if (C->nd != 2) 4142 { 4143 PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm"); 4144 return -1; 4145 } 4146 4147 // We must allow dimensions to be zeros. 4148 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0]) 4149 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]) 4150 || (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1])) 4151 { 4152 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemm (%i,%i)x(%i,%i)->(%i,%i)", 4153 CudaNdarray_HOST_DIMS(A)[0], 4154 CudaNdarray_HOST_DIMS(A)[1], 4155 CudaNdarray_HOST_DIMS(B)[0], 4156 CudaNdarray_HOST_DIMS(B)[1], 4157 CudaNdarray_HOST_DIMS(C)[0], 4158 CudaNdarray_HOST_DIMS(C)[1]); 4159 return -1; 4160 } 4161 4162 // If matrix A or B has non-unit size and non-unit stride in both 4163 // dimensions, we can make a copy. 4164 CudaNdarray * A_new = NULL; 4165 CudaNdarray * B_new = NULL; 4166 if (((CudaNdarray_HOST_DIMS(A)[0] > 1) 4167 && (CudaNdarray_HOST_STRIDES(A)[0] != 1) 4168 && (CudaNdarray_HOST_DIMS(A)[1] > 1) 4169 && (CudaNdarray_HOST_STRIDES(A)[1] != 1)) 4170 || (CudaNdarray_HOST_STRIDES(A)[0] < 0) 4171 || (CudaNdarray_HOST_STRIDES(A)[1] < 0)) 4172 { 4173 A_new = (CudaNdarray*) CudaNdarray_Copy(A); 4174 if (!A_new) 4175 return -1; 4176 A = A_new; 4177 } 4178 4179 if (((CudaNdarray_HOST_DIMS(B)[0] > 1) 4180 && (CudaNdarray_HOST_STRIDES(B)[0] != 1) 4181 && (CudaNdarray_HOST_DIMS(B)[1] > 1) 4182 && (CudaNdarray_HOST_STRIDES(B)[1] != 1)) 4183 || (CudaNdarray_HOST_STRIDES(B)[0] < 0) 4184 || (CudaNdarray_HOST_STRIDES(B)[1] < 0)) 4185 { 4186 B_new = (CudaNdarray*) CudaNdarray_Copy(B); 4187 if (!B_new) 4188 { 4189 // If A_new is NULL, meaning A was not copied nothing happens 4190 Py_XDECREF(A_new); 4191 return -1; 4192 } 4193 B = B_new; 4194 } 4195 4196 // If matrix C has non-unit size and non-unit stride in both 4197 // dimensions, or negative strides, we can't operate. We cannot copy 4198 // C either, because the calling code will expect the result to be 4199 // in the original C container. 
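/*
 * Illustration (hypothetical shapes, not part of the original code): a 3x4 C
 * with strides {4, 1} (C-contiguous) or {1, 3} (Fortran-contiguous) passes the
 * check below because at least one stride is 1, while a view with a negative
 * stride such as {-4, 1} is rejected. A caller that hits the AssertionError
 * below could materialize a well-behaved C itself; a rough sketch using the
 * helpers defined elsewhere in this file:
 *
 *     CudaNdarray * C_tmp = (CudaNdarray *) CudaNdarray_Copy(C);   // contiguous scratch copy of C
 *     if (!C_tmp) return -1;
 *     int status = CudaNdarray_gemm(alpha, A, B, beta, C_tmp);     // gemm into the copy
 *     if (status == 0)
 *         status = CudaNdarray_CopyFromCudaNdarray(C, C_tmp);      // write the result back into C
 *     Py_DECREF(C_tmp);
 *     return status;
 */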
4200 if (((CudaNdarray_HOST_DIMS(C)[0] > 1) 4201 && (CudaNdarray_HOST_STRIDES(C)[0] != 1) 4202 && (CudaNdarray_HOST_DIMS(C)[1] > 1) 4203 && (CudaNdarray_HOST_STRIDES(C)[1] != 1)) 4204 || (CudaNdarray_HOST_STRIDES(C)[0] < 0) 4205 || (CudaNdarray_HOST_STRIDES(C)[1] < 0)) 4206 { 4207 PyErr_Format(PyExc_AssertionError, 4208 "non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)", 4209 CudaNdarray_HOST_STRIDES(C)[0], 4210 CudaNdarray_HOST_STRIDES(C)[1], 4211 CudaNdarray_HOST_DIMS(C)[0], 4212 CudaNdarray_HOST_DIMS(C)[1]); 4213 Py_XDECREF(A_new); 4214 Py_XDECREF(B_new); 4215 return -1; 4216 } 4217 4218 // the unit integer is divided logically into three fields of 4 bits 4219 // the lowermost 4 bits encode the stride pattern of the output 4220 // the next higher 4 bits encode the B variable (or y) 4221 // the next higher 4 bits encode the C variable (or x) 4222 // 4223 // the stride pattern for each input is encoded as 0 for unit stride from col to col (Row major) 4224 // 1 for unit stride from row to row (Col major) 4225 4226 // a stride of 0 implies a dimension of 1 - so we can actually define 4227 // a stride of 0 as a 'unit' stride because gemm will never use it. 4228 // If a dimension is 0, its stride will not be used either, so we can 4229 // consider it a 'unit' stride too. 4230 int unit = 0; 4231 if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) { 4232 unit |= (0x0 << 8); 4233 } else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) { 4234 unit |= (0x1 << 8); 4235 } else { 4236 unit |= (0x2 << 8); 4237 } 4238 if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) { 4239 unit |= (0x0 << 4); 4240 } else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) { 4241 unit |= (0x1 << 4); 4242 } else { 4243 unit |= (0x2 << 4); 4244 } 4245 if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) { 4246 unit |= (0x0 << 0); 4247 } else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) { 4248 unit |= (0x1 << 0); 4249 } else { 4250 unit |= (0x2 << 0); 4251 } 4252 4253 /* create appropriate strides for malformed matrices that are row or column 4254 * vectors 4255 */ 4256 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1]; 4257 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0]; 4258 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : CudaNdarray_HOST_DIMS(B)[1]; 4259 int sb_1 = (CudaNdarray_HOST_DIMS(B)[1] > 1) ? CudaNdarray_HOST_STRIDES(B)[1] : CudaNdarray_HOST_DIMS(B)[0]; 4260 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : CudaNdarray_HOST_DIMS(C)[1]; 4261 int sc_1 = (CudaNdarray_HOST_DIMS(C)[1] > 1) ? 
CudaNdarray_HOST_STRIDES(C)[1] : CudaNdarray_HOST_DIMS(C)[0]; 4262 4263 float* a = CudaNdarray_DEV_DATA(A); 4264 float* b = CudaNdarray_DEV_DATA(B); 4265 float* c = CudaNdarray_DEV_DATA(C); 4266 cublasOperation_t N = CUBLAS_OP_N; 4267 cublasOperation_t T = CUBLAS_OP_T; 4268 //std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n'; 4269 // There should be no negative stride at that point 4270 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \ 4271 if (sx == 0){sx = 1;}\ 4272 if (sy == 0){sy = 1;}\ 4273 if (sz == 0){sz = 1;}\ 4274 if ((sx > 0) && (sy > 0) && (sz > 0)) { \ 4275 err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \ 4276 } else { \ 4277 PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\ 4278 Py_XDECREF(A_new);\ 4279 Py_XDECREF(B_new);\ 4280 return -1; \ 4281 } 4282 4283 cublasStatus_t err; 4284 switch(unit) 4285 { 4286 case 0x000: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_0, beta, c, sc_0); break; 4287 case 0x100: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_1, beta, c, sc_0); break; 4288 case 0x010: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_0, beta, c, sc_0); break; 4289 case 0x110: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_1, beta, c, sc_0); break; 4290 case 0x001: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_0, beta, c, sc_1); break; 4291 case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break; 4292 case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break; 4293 case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break; 4294 default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit); 4295 return -1; 4296 }; 4297 CNDA_THREAD_SYNC; 4298 Py_XDECREF(A_new); 4299 Py_XDECREF(B_new); 4300 4301 if (CUBLAS_STATUS_SUCCESS != err) 4302 { 4303 PyErr_Format(PyExc_RuntimeError, 4304 "cublasSgemm failed (%i) %s\n" 4305 " unit=%x N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p" 4306 " sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d", 4307 err, cublasGetErrorString(err), 4308 unit, N, 4309 CudaNdarray_HOST_DIMS(C)[0], 4310 CudaNdarray_HOST_DIMS(C)[1], 4311 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1], 4312 alpha, beta, a, b, c, sa_0, sa_1, sb_0, sb_1, sc_0, sc_1); 4313 4314 return -1; 4315 } 4316 return 0; 4317 } 4318 4319 int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C) 4320 { 4321 /** 4322 * C <- alpha A B + beta C 4323 * A : matrix 4324 * B, C: vector 4325 * alpha, beta: scalars 4326 */ 4327 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; } 4328 if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; } 4329 if (C->nd != 1) { 
PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; } 4330 4331 // We must allow dimensions to be zeros. 4332 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0]) 4333 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])) 4334 { 4335 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)", 4336 CudaNdarray_HOST_DIMS(A)[0], 4337 CudaNdarray_HOST_DIMS(A)[1], 4338 CudaNdarray_HOST_DIMS(B)[0], 4339 CudaNdarray_HOST_DIMS(C)[0]); 4340 return -1; 4341 } 4342 4343 // If matrix A has non-unit size and non-unit stride in both 4344 // dimensions, or negative strides, we cannot operate, but we can 4345 // make a copy. 4346 CudaNdarray * A_new = NULL; 4347 CudaNdarray * B_new = NULL; 4348 if (((CudaNdarray_HOST_DIMS(A)[0] > 1) 4349 && (CudaNdarray_HOST_STRIDES(A)[0] != 1) 4350 && (CudaNdarray_HOST_DIMS(A)[1] > 1) 4351 && (CudaNdarray_HOST_STRIDES(A)[1] != 1)) 4352 || (CudaNdarray_HOST_STRIDES(A)[0] < 0) 4353 || (CudaNdarray_HOST_STRIDES(A)[1] < 0)) 4354 { 4355 A_new = (CudaNdarray*) CudaNdarray_Copy(A); 4356 if (!A_new) 4357 return -1; 4358 A = A_new; 4359 } 4360 4361 // If vector B as a negative stride, we also have to make a copy. 4362 if (CudaNdarray_HOST_STRIDES(B)[0] < 0) 4363 { 4364 B_new = (CudaNdarray*) CudaNdarray_Copy(B); 4365 if (!B_new) 4366 { 4367 // If A was not copied, A_new is NULL, and Py_XDECREF does not 4368 // do anything 4369 Py_XDECREF(A_new); 4370 return -1; 4371 } 4372 B = B_new; 4373 } 4374 4375 // cudablas does not handle negative strides as expected 4376 if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0) 4377 || (CudaNdarray_HOST_STRIDES(A)[1] < 0)) 4378 { 4379 PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)", 4380 CudaNdarray_HOST_STRIDES(A)[0], 4381 CudaNdarray_HOST_STRIDES(A)[1]); 4382 Py_XDECREF(A_new); 4383 Py_XDECREF(B_new); 4384 return -1; 4385 } 4386 4387 /* create appropriate strides for malformed matrices that are row or column 4388 * vectors 4389 */ 4390 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1]; 4391 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0]; 4392 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1; 4393 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1; 4394 4395 if (sa_0 == 0) sa_0 = 1; 4396 if (sa_1 == 0) sa_1 = 1; 4397 4398 int used_dot = 0; 4399 4400 // This is important because we can end up not calling Sgemv at all 4401 cublasStatus_t err = CUBLAS_STATUS_SUCCESS; 4402 if (CudaNdarray_SIZE(C)) { 4403 // A is row vector & alpha==1 & beta==0 -> use cublasSdot 4404 if (CudaNdarray_HOST_DIMS(A)[0] == 1 && alpha==1.f && beta==0.f) { 4405 //replace this with custom inner product kernel with alpha and beta parameter? 
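/*
 * Note on the cuBLAS pointer mode used just below: cublasSdot writes its
 * scalar result through its last argument, and whether that pointer is read
 * as host or device memory is governed by the handle's pointer mode
 * (CUBLAS_POINTER_MODE_HOST by default). Switching to
 * CUBLAS_POINTER_MODE_DEVICE lets the dot product land directly in C's device
 * buffer with no host round trip. A minimal stand-alone sketch of the same
 * pattern (d_x, d_y, d_result are hypothetical device pointers, n a length):
 *
 *     cublasPointerMode_t old_mode;
 *     cublasGetPointerMode(handle, &old_mode);
 *     cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
 *     cublasSdot(handle, n, d_x, 1, d_y, 1, d_result);  // result stays on the GPU
 *     cublasSetPointerMode(handle, old_mode);           // restore the previous mode
 */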
4406 cublasPointerMode_t pmode; 4407 //set pointer mode to make sure cublas not storing on host pointer 4408 cublasGetPointerMode(handle, &pmode); 4409 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); 4410 err = cublasSdot( 4411 handle, CudaNdarray_HOST_DIMS(A)[1], 4412 CudaNdarray_DEV_DATA(A), sa_1, 4413 CudaNdarray_DEV_DATA(B), sb_0, 4414 CudaNdarray_DEV_DATA(C)); 4415 cublasSetPointerMode(handle, pmode); 4416 used_dot = 1; 4417 } 4418 // A is row-contiguous | row vector 4419 else if ((CudaNdarray_HOST_DIMS(A)[0] <= 1) 4420 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1) 4421 && (CudaNdarray_HOST_STRIDES(A)[1] > 0))) 4422 { 4423 err = cublasSgemv(handle, CUBLAS_OP_N, 4424 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1], 4425 &alpha, 4426 CudaNdarray_DEV_DATA(A), sa_1, 4427 CudaNdarray_DEV_DATA(B), sb_0, 4428 &beta, 4429 CudaNdarray_DEV_DATA(C), sc_0); 4430 } 4431 // A is column-contiguous | column vector 4432 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1) 4433 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1) 4434 && (CudaNdarray_HOST_STRIDES(A)[0] > 0))) 4435 { 4436 err = cublasSgemv(handle, CUBLAS_OP_T, 4437 CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0], 4438 &alpha, 4439 CudaNdarray_DEV_DATA(A), sa_0, 4440 CudaNdarray_DEV_DATA(B), sb_0, 4441 &beta, 4442 CudaNdarray_DEV_DATA(C), sc_0); 4443 } 4444 // A is non vector and have malformed strides 4445 else 4446 { 4447 PyErr_Format(PyExc_AssertionError, 4448 "Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n" 4449 "Shapes are: (%i, %i) x %i -> %i\n", 4450 CudaNdarray_HOST_STRIDES(A)[0], 4451 CudaNdarray_HOST_STRIDES(A)[1], 4452 CudaNdarray_HOST_STRIDES(B)[0], 4453 CudaNdarray_HOST_STRIDES(C)[0], 4454 CudaNdarray_HOST_DIMS(A)[0], 4455 CudaNdarray_HOST_DIMS(A)[1], 4456 CudaNdarray_HOST_DIMS(B)[0], 4457 CudaNdarray_HOST_DIMS(C)[0]); 4458 Py_XDECREF(A_new); 4459 Py_XDECREF(B_new); 4460 return -1; 4461 } 4462 } 4463 4464 CNDA_THREAD_SYNC; 4465 Py_XDECREF(A_new); 4466 Py_XDECREF(B_new); 4467 4468 if (CUBLAS_STATUS_SUCCESS != err) 4469 { 4470 if (!used_dot) 4471 { 4472 PyErr_Format(PyExc_RuntimeError, 4473 "cublasSgemv failed (%i)", 4474 err); 4475 } else { 4476 PyErr_Format(PyExc_RuntimeError, 4477 "cublasSdot failed (%i)", 4478 err); 4479 } 4480 return -1; 4481 } 4482 return 0; 4483 } 4484 4485 int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) { 4486 if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; } 4487 if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; } 4488 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; } 4489 4490 if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0]) 4491 || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) { 4492 PyErr_Format(PyExc_ValueError, 4493 "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)", 4494 CudaNdarray_HOST_DIMS(x)[0], 4495 CudaNdarray_HOST_DIMS(y)[0], 4496 CudaNdarray_HOST_DIMS(A)[0], 4497 CudaNdarray_HOST_DIMS(A)[1]); 4498 return -1; 4499 } 4500 4501 int x_strides = CudaNdarray_HOST_STRIDES(x)[0]; 4502 CudaNdarray * x_new = NULL; 4503 if(x_strides == 0){ 4504 if(CudaNdarray_HOST_DIMS(x)[0] != 1){ 4505 PyErr_Format(PyExc_RuntimeError, 4506 "CudaNdarray_sger: Invalid input x (should not happen)." 
4507 " We received a CudaNdarray vector with a stride of 0" 4508 " that has more than 1 element!"); 4509 return -1; 4510 } 4511 x_strides = 1; 4512 } else if(x_strides < 0){ 4513 x_new = (CudaNdarray*) CudaNdarray_Copy(x); 4514 x = x_new; 4515 x_strides = CudaNdarray_HOST_STRIDES(x)[0]; 4516 } 4517 4518 int y_strides = CudaNdarray_HOST_STRIDES(y)[0]; 4519 CudaNdarray * y_new = NULL; 4520 if(y_strides == 0){ 4521 if(CudaNdarray_HOST_DIMS(y)[0] != 1){ 4522 PyErr_Format(PyExc_RuntimeError, 4523 "CudaNdarray_sger: Invalid input y (should not happen)." 4524 " We received a CudaNdarray vector with a stride of 0" 4525 " that has more than 1 elements!"); 4526 Py_XDECREF(x_new); 4527 return -1; 4528 } 4529 y_strides = 1; 4530 } else if(y_strides < 0){ 4531 y_new = (CudaNdarray*) CudaNdarray_Copy(y); 4532 y = y_new; 4533 y_strides = CudaNdarray_HOST_STRIDES(y)[0]; 4534 } 4535 4536 // Create appropriate strides if A is a row or column vector 4537 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] 4538 : CudaNdarray_HOST_DIMS(A)[1]; 4539 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] 4540 : CudaNdarray_HOST_DIMS(A)[0]; 4541 4542 // This is important because we can end up not calling Sger at all 4543 cublasStatus_t err = CUBLAS_STATUS_SUCCESS; 4544 if(CudaNdarray_SIZE(A)){ 4545 // If A is in col-major 4546 if ((CudaNdarray_HOST_DIMS(A)[0] <= 1) 4547 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1) 4548 && (CudaNdarray_HOST_STRIDES(A)[1] > 0))) 4549 { 4550 err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha, 4551 CudaNdarray_DEV_DATA(x), x_strides, 4552 CudaNdarray_DEV_DATA(y), y_strides, 4553 CudaNdarray_DEV_DATA(A), sa_1); 4554 } 4555 // Since Sger expects A in col-major, we invert x and y to fake this. 4556 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1) 4557 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1) 4558 && (CudaNdarray_HOST_STRIDES(A)[0] > 0))) 4559 { 4560 err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha, 4561 CudaNdarray_DEV_DATA(y), y_strides, 4562 CudaNdarray_DEV_DATA(x), x_strides, 4563 CudaNdarray_DEV_DATA(A), sa_0); 4564 } 4565 // A has to be either c- or f-contiguous, with no negative strides 4566 else 4567 { 4568 PyErr_SetString(PyExc_NotImplementedError, 4569 "non-contiguous A, or negative strides, in sger"); 4570 Py_XDECREF(x_new); 4571 Py_XDECREF(y_new); 4572 return -1; 4573 } 4574 } 4575 CNDA_THREAD_SYNC; 4576 Py_XDECREF(x_new); 4577 Py_XDECREF(y_new); 4578 4579 if (CUBLAS_STATUS_SUCCESS != err) 4580 { 4581 PyErr_Format(PyExc_RuntimeError, 4582 "cublasSger failed (%i)", 4583 err); 4584 return -1; 4585 } 4586 4587 return 0; 4588 } 4589 4590 /** 4591 * 4592 * Precondition: 4593 * a->dim[d] == (dims_a[d]==0) ? (1 << log2_dims_a[d]) : dims_a[d] 4594 * z->dim[d] == (z_str[d]==0) ? 1 : dims_a[d]; 4595 * 4596 * TODO: templatize this function to support other reductions. 4597 * All that needs to change is the initial value for sum, and the reduction operator. 4598 */ 4599 4600 static __global__ void kernel_reduce_sum(const unsigned int size_z, 4601 const unsigned int nd, 4602 const int * dims_a, 4603 const int * log2_dims_a, 4604 const int * a_str, 4605 const float * a_data, 4606 const int * z_str, 4607 float * z_data) 4608 { 4609 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 4610 const unsigned int numThreads = blockDim.x * gridDim.x; 4611 4612 //structure data contains the strides and dimensions of both a and z 4613 // a_dim[0], a_dim[1], ... 
a_dim[nd-1], 4614 // a_log2dim[0], a_log2dim[1], ... a_log2dim[nd-1], 4615 // a_str[0], ... a_str[nd-1], 4616 // z_str[0], ... z_str[nd-1] 4617 extern __shared__ int structure_data[]; 4618 for (unsigned int i = threadIdx.x; i < nd; i += blockDim.x) 4619 { 4620 structure_data[i+0*nd] = dims_a[i]; 4621 structure_data[i+1*nd] = log2_dims_a[i]; 4622 structure_data[i+2*nd] = a_str[i]; 4623 structure_data[i+3*nd] = z_str[i]; 4624 } 4625 dims_a = structure_data; 4626 log2_dims_a = structure_data + nd; 4627 a_str = structure_data + 2*nd; 4628 z_str = structure_data + 3*nd; 4629 4630 __syncthreads(); //wait for all the shared structure to be loaded 4631 4632 for (unsigned int i = idx; i < size_z; i += numThreads) 4633 { 4634 unsigned int ii = i; 4635 const float * a_data_i = a_data; 4636 float * z_data_i = z_data; 4637 unsigned int n_reduce_elements = 1; 4638 unsigned int n_reduce_dims = 0; 4639 unsigned int reduce_dim0 = nd-1; 4640 4641 4642 //In this loop, we locate the initial element of the slice that we'd like to reduce with this thread 4643 // At the same time, we [re]calculate the size of that slice (n_reduce_elements) 4644 for (unsigned int d = 0; d < nd; ++d) 4645 { 4646 if (a_str[d] && (!z_str[d])) // this means 'd' is a dimension we are reducing over 4647 { 4648 n_reduce_elements *= dims_a[d]; 4649 n_reduce_dims += 1; 4650 reduce_dim0 = (d < reduce_dim0) ? d : reduce_dim0; 4651 } 4652 else //'d' is not a dimension that we are reducing over 4653 { 4654 unsigned int pos_d; 4655 if (log2_dims_a[d]==-1) //TODO: when things are working, use this switch 4656 { 4657 // this branch is not preferred, 4658 // because the manual said that integer mod and div operations are slow on gpu 4659 pos_d = (ii % dims_a[d]); 4660 ii = (ii / dims_a[d]); 4661 } 4662 else 4663 { 4664 pos_d = (ii & ((1 << log2_dims_a[d])-1)); //take the lower log2_dims bits 4665 ii = (ii >> log2_dims_a[d]); //shift those lower log2_dims bits off of ii 4666 } 4667 a_data_i += pos_d * a_str[d]; 4668 z_data_i += pos_d * z_str[d]; 4669 } 4670 } 4671 // now we've got pointers a_data_i and z_data_i into element 0 of the slice over which we are reducing 4672 // do a similar loop 4673 4674 float sum = 0.0f; 4675 switch(n_reduce_dims) 4676 { 4677 case 0: 4678 { 4679 sum = a_data_i[0]; 4680 } 4681 break; 4682 case 1: 4683 { 4684 const int stride = a_str[reduce_dim0]; 4685 const float * a_data_i_max = a_data_i + dims_a[reduce_dim0] * stride; 4686 while (a_data_i != a_data_i_max) 4687 { 4688 sum += a_data_i[0]; 4689 a_data_i += stride; 4690 } 4691 } 4692 break; 4693 case 2: 4694 { 4695 int rd = reduce_dim0+1; 4696 for (; rd < nd; ++rd) 4697 { 4698 if (a_str[rd] && (!z_str[rd])) // this means 'rd' is a dimension we are reducing over 4699 break; 4700 } 4701 const int stride0 = a_str[reduce_dim0]; 4702 const int stride1 = a_str[rd]; 4703 for (int ii = 0; ii < dims_a[rd]; ++ii) 4704 { 4705 const float * a_data_ri = a_data_i + ii * stride1; 4706 const float * a_data_ri_max = a_data_ri + dims_a[reduce_dim0] * stride0; 4707 while (a_data_ri != a_data_ri_max) 4708 { 4709 sum += a_data_ri[0]; 4710 a_data_ri += stride0; 4711 } 4712 } 4713 }; 4714 break; 4715 default: 4716 { 4717 for (unsigned int reduce_i = 0; reduce_i < n_reduce_elements; ++reduce_i) 4718 { 4719 //TODO: optimize this loop to work more like theano's Elemwise. It's serial code. 4720 unsigned int reduce_ii = reduce_i; 4721 const float * a_data_ri = a_data_i; 4722 4723 //This loop finds the element in the a slice to add. 
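/*
 * Worked example of the decomposition below (hypothetical sizes, not taken
 * from the original code): suppose the reduced dimensions are rd=1 with
 * dims_a[1]==4 and rd=3 with dims_a[3]==3, so n_reduce_elements == 12. For
 * reduce_i == 7 the loop peels off pos_1 = 7 % 4 == 3 (reduce_ii becomes 1),
 * then pos_3 = 1 % 3 == 1, so this iteration adds the element at offset
 * 3*a_str[1] + 1*a_str[3] from a_data_i. Over reduce_i = 0..11 every element
 * of the reduced slice is visited exactly once.
 */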
4724 for (unsigned int rd = reduce_dim0; rd < nd; ++rd) 4725 { 4726 unsigned int pos_d; 4727 if (a_str[rd] && (!z_str[rd])) // this means 'd' is a dimension we are reducing over 4728 { 4729 if (log2_dims_a[rd]==-1) 4730 { 4731 // this branch is not preferred, 4732 // because the manual said that integer mod and div operations are slow on gpu 4733 pos_d = (reduce_ii % dims_a[rd]); 4734 reduce_ii = (reduce_ii / dims_a[rd]); 4735 } 4736 else 4737 { 4738 pos_d = (reduce_ii & ((1 << log2_dims_a[rd])-1)); //take the lower log2_dims bits 4739 reduce_ii = (reduce_ii >> log2_dims_a[rd]); //shift those lower log2_dims bits off of ii 4740 } 4741 a_data_ri += pos_d * a_str[rd]; 4742 } 4743 } 4744 sum += a_data_ri[0]; 4745 } 4746 } 4747 } 4748 z_data_i[0] = sum; 4749 } 4750 } 4751 4752 static __global__ void kernel_reduce_sum_1011( 4753 const unsigned int d0, 4754 const unsigned int d1, 4755 const unsigned int d2, 4756 const unsigned int d3, 4757 const float *A, const int sA0, const int sA1, const int sA2, const int sA3, 4758 float * Z, const int sZ0) 4759 { 4760 const int threadCount = blockDim.x * blockDim.y * blockDim.z; 4761 const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; 4762 extern __shared__ float buf[]; 4763 float mysum = 0.0f; 4764 4765 if (warpSize != 32) 4766 { 4767 return; //TODO: set error code 4768 } 4769 4770 for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z) 4771 { 4772 float Ai = A[i0 * sA0 + blockIdx.x * sA1 + threadIdx.y * sA2 + threadIdx.x * sA3]; 4773 mysum += Ai; 4774 } 4775 buf[threadNum] = mysum; 4776 __syncthreads(); 4777 4778 // rest of function is handled by one warp 4779 if (threadNum < warpSize) 4780 { 4781 for (int i = threadNum + warpSize; i < threadCount; i += warpSize) 4782 { 4783 mysum += buf[i]; 4784 } 4785 buf[threadNum] = mysum; 4786 if (threadNum < 16) 4787 { 4788 //reduce so that threadNum 0 has the sum of everything 4789 if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16]; 4790 if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8]; 4791 if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4]; 4792 if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2]; 4793 if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1]; 4794 if (threadNum == 0) 4795 { 4796 Z[blockIdx.x*sZ0] = buf[0]; 4797 } 4798 } 4799 } 4800 } 4801 /** 4802 * Dimensions in which the self has size 1 and A has size > 1 are considered summing dimensions 4803 * Dimensions in which self has size > 1 and A has size > 1 are considered non-summing dimensions, and in this case their sizes must be equal. 
4804 */ 4805 int 4806 CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A) 4807 { 4808 int verbose = 0; 4809 //check input rank 4810 if (self->nd != A->nd) 4811 { 4812 PyErr_Format(PyExc_TypeError, "Rank mismatch in CudaNdarray_sum: %i vs %i", self->nd, A->nd); 4813 return -1; 4814 } 4815 for (int i = 0; i < self->nd; ++i) 4816 { 4817 if ((CudaNdarray_HOST_DIMS(self)[i] > 1) && (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(A)[i])) 4818 { 4819 PyErr_Format(PyExc_TypeError, "Dimension mismatch in CudaNdarray_sum: self->dim[%i] == %i , A->dim[%i] = %i", 4820 i, CudaNdarray_HOST_DIMS(self)[i], i, CudaNdarray_HOST_DIMS(A)[i]); 4821 return -1; 4822 } 4823 } 4824 4825 int n_summations = (unsigned int)CudaNdarray_SIZE(self); 4826 if (verbose) 4827 { 4828 std::cerr << "reduce_sum n_summations " << n_summations << '\n'; 4829 std::cerr << "reduce_sum nd " << self->nd << '\n'; 4830 fprint_CudaNdarray(stderr, A); 4831 fprint_CudaNdarray(stderr, self); 4832 } 4833 if (0 && (A->nd == 4) //check to see if kernel_reduce_sum_1011 applies 4834 && (CudaNdarray_HOST_DIMS(self)[0] == 1) 4835 && (CudaNdarray_HOST_DIMS(self)[2] == 1) 4836 && (CudaNdarray_HOST_DIMS(self)[3] == 1) 4837 ) 4838 { 4839 dim3 n_threads(CudaNdarray_HOST_DIMS(A)[3], CudaNdarray_HOST_DIMS(A)[2]); 4840 dim3 n_blocks(CudaNdarray_HOST_DIMS(A)[1]); 4841 while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z; 4842 n_threads.z -= 1; 4843 if (n_threads.z > 64) n_threads.z = 64; 4844 if (n_threads.z) 4845 { 4846 if (verbose) printf("trying kernel_reduce_sum_1011\n"); 4847 int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; 4848 kernel_reduce_sum_1011<<<n_blocks, n_threads, n_shared>>>( 4849 CudaNdarray_HOST_DIMS(A)[0], 4850 CudaNdarray_HOST_DIMS(A)[1], 4851 CudaNdarray_HOST_DIMS(A)[2], 4852 CudaNdarray_HOST_DIMS(A)[3], 4853 CudaNdarray_DEV_DATA(A), 4854 CudaNdarray_HOST_STRIDES(A)[0], 4855 CudaNdarray_HOST_STRIDES(A)[1], 4856 CudaNdarray_HOST_STRIDES(A)[2], 4857 CudaNdarray_HOST_STRIDES(A)[3], 4858 CudaNdarray_DEV_DATA(self), 4859 CudaNdarray_HOST_STRIDES(self)[1]); 4860 CNDA_THREAD_SYNC; 4861 if (cudaSuccess == cudaGetLastError()) return 0; 4862 if (verbose) printf("failed, falling back to kernel_reduce_sum\n"); 4863 } 4864 } 4865 4866 int n_threads_per_block = std::min(n_summations, 4867 NUM_VECTOR_OP_THREADS_PER_BLOCK); 4868 int n_blocks = std::min(ceil_intdiv(n_summations,n_threads_per_block), 4869 NUM_VECTOR_OP_BLOCKS); 4870 int n_structure_cache = self->nd * 4 * sizeof(int); 4871 4872 if (verbose) 4873 { 4874 std::cerr << "n_blocks, n_threads_per_block " << n_blocks << ' ' << n_threads_per_block << '\n'; 4875 } 4876 assert (self->nd > 0); 4877 assert (self->nd == A->nd); 4878 kernel_reduce_sum<<<n_blocks, n_threads_per_block, n_structure_cache>>>( 4879 n_summations, 4880 self->nd, 4881 CudaNdarray_DEV_DIMS(A), 4882 CudaNdarray_DEV_LOG2DIMS(A), 4883 CudaNdarray_DEV_STRIDES(A), 4884 CudaNdarray_DEV_DATA(A), 4885 CudaNdarray_DEV_STRIDES(self), 4886 CudaNdarray_DEV_DATA(self)); 4887 CNDA_THREAD_SYNC; 4888 cudaError_t err = cudaGetLastError(); 4889 if (cudaSuccess != err) 4890 { 4891 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kernel_reduce_sum", cudaGetErrorString(err)); 4892 return -1; 4893 } 4894 return 0; 4895 } 4896 int 4897 CudaNdarray_reduce_prod(CudaNdarray * self, const CudaNdarray * A) 4898 { 4899 PyErr_SetString(PyExc_NotImplementedError, ""); 4900 return -1; 4901 } 4902 int 4903 CudaNdarray_reduce_min(CudaNdarray * self, const CudaNdarray * A) 4904 { 4905 PyErr_SetString(PyExc_NotImplementedError, ""); 4906 return -1; 
4907 } 4908 int 4909 CudaNdarray_reduce_max(CudaNdarray * self, const CudaNdarray * A) 4910 { 4911 PyErr_SetString(PyExc_NotImplementedError, ""); 4912 return -1; 4913 } 4914 4915 4916 /** 4917 * 4918 * pattern is a permutation of [0, 1, ... self->nd-1] with the following twists: 4919 * - an element 'd' of the permutation can be dropped if CudaNdarray_HOST_DIMS(self)[d] == 1 4920 * - any number of '-1' elements can be in the pattern, and they will cause new ranks (with dim==1) to be inserted. 4921 * 4922 * For example, if CudaNdarray_HOST_DIMS(self) == [4, 5, 1, 6], and pattern = [0,3,-1,-1, 1], then CudaNdarray_HOST_DIMS(self) would be modified to become: 4923 * [4, 6, 1, 1, 5] (we dropped the original dim[2]==1, and inserted two singleton dimensions with the -1s. 4924 */ 4925 int 4926 CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern) 4927 { 4928 //TODO: pass a workspace pointer to avoid the internal malloc 4929 int * newdims = (int *)malloc(sizeof(int) * (len + len + self->nd)); //we tack on the taken buffer here for speed of not having to malloc twice. 4930 int * newstrides = newdims + len; 4931 int * dims_taken = newstrides + len; 4932 if (!newdims) 4933 { 4934 PyErr_SetString(PyExc_MemoryError, "CudaNdarray_dimshuffle: Failed to allocate temporary space"); 4935 return -1; 4936 } 4937 for (int i = 0; i < self->nd; ++i) 4938 { 4939 dims_taken[i] = 0; 4940 } 4941 for (int i = 0; i < len; ++i) 4942 { 4943 if (pattern[i] < 0) 4944 { 4945 newdims[i] = 1; 4946 newstrides[i] = 0; 4947 } 4948 else if(dims_taken[pattern[i]]) 4949 { 4950 PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You used the dimensions %d multiple time", 4951 pattern[i]); 4952 free(newdims); 4953 return -1; 4954 } 4955 else if (pattern[i]>= self->nd) 4956 { 4957 PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You asked for a dimensions that don't exist %d for a %d dims CudaNdarray", 4958 pattern[i], self->nd); 4959 free(newdims); 4960 return -1; 4961 } 4962 else 4963 { 4964 newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]]; 4965 newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]]; 4966 dims_taken[pattern[i]] = 1; 4967 } 4968 } 4969 //Check if we dropped not broadcastable dims 4970 for (int i = 0; i < self->nd; ++i) 4971 { 4972 if (dims_taken[i]==0 && CudaNdarray_HOST_DIMS(self)[i]!=1) 4973 { 4974 PyErr_SetString(PyExc_ValueError, "Cudandarray_dimshuffle: You cannot drop a non-broadcastable dimension."); 4975 free(newdims); 4976 return -1; 4977 } 4978 } 4979 //swap this structure in for the one in self, and sync to the card 4980 if (CudaNdarray_set_nd(self, len)) 4981 { 4982 free(newdims); 4983 return -1; 4984 } 4985 for (int i = 0; i < len; ++i) 4986 { 4987 CudaNdarray_set_dim(self, i, newdims[i]); 4988 CudaNdarray_set_stride(self, i, newstrides[i]); 4989 } 4990 if (cnda_copy_structure_to_device(self)) 4991 { 4992 free(newdims); 4993 return -1; 4994 } 4995 free(newdims); 4996 return 0; 4997 } 4998 4999 5000 5001 /** 5002 * 5003 * This is the function that bind to python. 5004 * See CudaNdarray_dimshuffle to call from C. 5005 * We use -1 to mean 'x' as in Tensor Dimshuffle. 
5006 */ 5007 PyObject * 5008 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args) 5009 { 5010 PyObject * self = NULL; 5011 PyObject * pattern_object = NULL; 5012 int * pattern = NULL; 5013 PyObject * rval = NULL; 5014 int success = -1; 5015 //const int * dims = NULL; 5016 5017 //args should consist of two python objects ("OO") 5018 if (! PyArg_ParseTuple(args, "OO", &self, &pattern_object)) 5019 return NULL; 5020 5021 if (!CudaNdarray_Check(self) ) 5022 { 5023 PyErr_SetString(PyExc_TypeError, "First argument to cuda_ndarray.dimshuffle must be a CudaNdarray"); 5024 return NULL; 5025 } 5026 5027 //parse pattern_object into int * pattern 5028 5029 Py_ssize_t pattern_dim = PyObject_Length(pattern_object); 5030 5031 if (pattern_dim < 0) 5032 { 5033 PyErr_SetString(PyExc_TypeError, "Couldn't get length of third argument to cuda_ndarray.dimshuffle"); 5034 return NULL; 5035 } 5036 5037 pattern = (int *) malloc( pattern_dim * sizeof(int)); 5038 5039 for (Py_ssize_t i = 0; i < pattern_dim; i++) 5040 { 5041 PyObject * idx = PyLong_FromLong(i); 5042 5043 if (idx == NULL) 5044 { 5045 PyErr_SetString(PyExc_Exception, "Couldn't make long object to loop over list/tuple"); 5046 goto CudaNdarray_dimshuffle_fail; 5047 } 5048 5049 long elem_value = 0; 5050 5051 PyObject * elem = PyObject_GetItem(pattern_object, idx); 5052 5053 if (elem == NULL) 5054 { 5055 Py_XDECREF( elem); 5056 PyErr_SetString(PyExc_ValueError, "Third argument to dimshuffle must be list or tuple of integers"); 5057 goto CudaNdarray_dimshuffle_fail; 5058 } 5059 5060 elem_value = PyInt_AsLong(elem); 5061 5062 if (elem_value == -1 && PyErr_Occurred() ) 5063 { 5064 Py_XDECREF(elem); 5065 PyErr_SetString(PyExc_ValueError, "Third argument to dimshuffle must be list or tuple of integers"); 5066 goto CudaNdarray_dimshuffle_fail; 5067 } 5068 5069 pattern[i] = elem_value; 5070 5071 Py_XDECREF( elem ); 5072 Py_XDECREF( idx ); 5073 } 5074 5075 //allocate rval 5076 rval = (PyObject *) CudaNdarray_View((CudaNdarray *) self); 5077 5078 if (rval == NULL) 5079 { 5080 //CudaNdarray_New should have set the exception string 5081 goto CudaNdarray_dimshuffle_fail; 5082 } 5083 5084 5085 //printf("pattern_dim: %d\n",pattern_dim); 5086 //printf("pattern: %d %d\n",pattern[0],pattern[1]); 5087 //dims = CudaNdarray_HOST_DIMS( (CudaNdarray *) self); 5088 //printf("dims before: %d %d\n",dims[0],dims[1]); 5089 5090 success = CudaNdarray_dimshuffle((CudaNdarray *) rval, pattern_dim, pattern); 5091 5092 if (success != 0) 5093 { 5094 //Exception string should already be set by CudaNdarray_dimshuffle 5095 goto CudaNdarray_dimshuffle_fail; 5096 } 5097 5098 free(pattern); 5099 5100 return rval; 5101 5102 CudaNdarray_dimshuffle_fail: 5103 5104 if (pattern != NULL) 5105 free(pattern); 5106 5107 Py_XDECREF(rval); 5108 return NULL; 5109 } 5110 5111 5112 int 5113 cnda_structure_size(int nd) 5114 { 5115 // dim0, dim1, ... 5116 // str0, str1, ... 5117 // log2(dim0), log2(dim1), ... 
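/*
 * For example (hypothetical nd, consistent with the accessors defined below):
 * with nd == 3 the structure holds 9 ints laid out as
 * dims[0..2] | strides[0..2] | log2dims[0..2], which is what
 * CudaNdarray_HOST_DIMS / _HOST_STRIDES / _HOST_LOG2DIMS index at offsets
 * 0, nd and 2*nd respectively.
 */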
5118 return nd + nd + nd; 5119 } 5120 5121 const int * 5122 CudaNdarray_HOST_DIMS(const CudaNdarray * self) 5123 { 5124 return self->host_structure; 5125 } 5126 5127 const int * 5128 CudaNdarray_HOST_STRIDES(const CudaNdarray * self) 5129 { 5130 return self->host_structure + self->nd; 5131 } 5132 const int * 5133 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self) 5134 { 5135 return self->host_structure + 2*self->nd; 5136 } 5137 5138 int 5139 CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase) 5140 { 5141 int verbose = 0; 5142 5143 if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh) 5144 { 5145 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n"); 5146 return 0; 5147 } 5148 5149 if (cnda1->nd != cnda2->nd) 5150 { 5151 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n"); 5152 return 0; 5153 } 5154 5155 for (int i=0; i < 2*cnda1->nd; i++) 5156 { 5157 if (cnda1->host_structure[i] != cnda2->host_structure[i]) 5158 { 5159 if(verbose) 5160 fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]); 5161 return 0; 5162 } 5163 } 5164 5165 if (!ignoreBase && cnda1->base != cnda2->base) 5166 { 5167 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4"); 5168 return 0; 5169 } 5170 else if (cnda1->data_allocated != cnda2->data_allocated) 5171 { 5172 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5"); 5173 return 0; 5174 } 5175 else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata) 5176 { 5177 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6"); 5178 // no need to check devdata if data is not allocated 5179 return 0; 5180 } 5181 5182 return 1; 5183 } 5184 5185 5186 int 5187 CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2) 5188 { 5189 return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0); 5190 } 5191 5192 int 5193 cnda_copy_structure_to_device(const CudaNdarray * self) 5194 { 5195 //If the device structure do not exists, create it. 5196 //We allocate it here as we do not need it often. 5197 //In fact, we need it so infrequently that we expect 5198 //that most object won't need it. Not allocating it 5199 //save a significant when creating object. 5200 //This speed up a benchmark by 8% with the gc. 
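/*
 * In other words: dev_structure is allocated lazily, on the first call that
 * actually needs the dims/strides on the device. CudaNdarray_DEV_DIMS,
 * CudaNdarray_DEV_STRIDES and CudaNdarray_DEV_LOG2DIMS (defined below) call
 * back into this function whenever dev_structure_fresh is 0, so most
 * CudaNdarrays never pay for the device-side allocation at all, which is
 * where the benchmark speed-up mentioned above comes from.
 */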
5201 if (!self->dev_structure) 5202 { 5203 int struct_size = cnda_structure_size(self->nd); 5204 if (struct_size) 5205 { 5206 self->dev_structure = (int*)device_malloc(struct_size* sizeof(int)); 5207 if (NULL == self->dev_structure) 5208 { 5209 return -1; 5210 } 5211 } 5212 } 5213 if (cublasSetVector(cnda_structure_size(self->nd), 5214 sizeof(int), 5215 self->host_structure, 5216 1, 5217 self->dev_structure, 5218 1) != CUBLAS_STATUS_SUCCESS) 5219 { 5220 PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory"); 5221 return -1; 5222 } 5223 self->dev_structure_fresh = 1; 5224 return 0; 5225 } 5226 5227 const int * 5228 CudaNdarray_DEV_DIMS(const CudaNdarray * self) 5229 { 5230 if (!self->dev_structure_fresh) 5231 { 5232 if (cnda_copy_structure_to_device(self)) 5233 return NULL; 5234 } 5235 return self->dev_structure; 5236 } 5237 const int * 5238 CudaNdarray_DEV_STRIDES(const CudaNdarray * self) 5239 { 5240 if (!self->dev_structure_fresh) 5241 { 5242 if (cnda_copy_structure_to_device(self)) 5243 return NULL; 5244 } 5245 return self->dev_structure + self->nd; 5246 } 5247 const int * 5248 CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self) 5249 { 5250 if (!self->dev_structure_fresh) 5251 { 5252 if (cnda_copy_structure_to_device(self)) 5253 return NULL; 5254 } 5255 return self->dev_structure + 2*self->nd; 5256 } 5257 float * 5258 CudaNdarray_DEV_DATA(const CudaNdarray * self) 5259 { 5260 return self->devdata; 5261 } 5262 5263 /** 5264 * Return the number of elements in the ndarray (product of the dimensions) 5265 */ 5266 size_t 5267 CudaNdarray_SIZE(const CudaNdarray *self) 5268 { 5269 if (self->nd == -1) return 0; 5270 size_t size = 1; 5271 for (int i = 0; i < self->nd; ++i) 5272 { 5273 size *= CudaNdarray_HOST_DIMS(self)[i]; 5274 } 5275 return size; 5276 } 5277 5278 PyObject * 5279 CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure) 5280 { 5281 return PyInt_FromLong(CudaNdarray_SIZE(self)); 5282 } 5283 5284 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base) 5285 { 5286 return CudaNdarray_set_device_data(self, data, (PyObject *) base); 5287 } 5288 5289 PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self) 5290 { 5291 return PyBool_FromLong(CudaNdarray_is_c_contiguous(self)); 5292 } 5293 5294 int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) 5295 { 5296 cudaError_t err = cudaGetLastError(); 5297 if( cudaSuccess != err) 5298 { 5299 PyErr_Format(PyExc_RuntimeError, 5300 "Cuda error: %s: %s.", 5301 "fprint_CudaNdarray was called with an uncleared error", 5302 cudaGetErrorString(err)); 5303 return -1; 5304 } 5305 fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n", 5306 self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated); 5307 fprintf(fd, "\tHOST_DIMS: "); 5308 for (int i = 0; i < self->nd; ++i) 5309 { 5310 fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]); 5311 } 5312 fprintf(fd, "\n\tHOST_STRIDES: "); 5313 for (int i = 0; i < self->nd; ++i) 5314 { 5315 fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]); 5316 } 5317 5318 if (self->dev_structure) 5319 { 5320 int data=0; 5321 fprintf(fd, "\n\tDEV_DIMS: "); 5322 for (int i = 0; i < self->nd; ++i) 5323 { 5324 cublasGetVector(1, sizeof(int), 5325 self->dev_structure+i, 1, 5326 &data, 1); 5327 fprintf(fd, "%i\t", data); 5328 } 5329 fprintf(fd, "\n\tDEV_STRIDES: "); 5330 for (int i = 0; i < self->nd; ++i) 5331 { 5332 cublasGetVector(1, sizeof(int), 5333 self->dev_structure + self->nd+i, 1, 5334 &data, 
1); 5335 fprintf(fd, "%i \t", data); 5336 } 5337 fprintf(fd, "\n"); 5338 } 5339 else 5340 { 5341 fprintf(fd, "\n\tdev_structure not allocated\n"); 5342 } 5343 5344 err = cudaGetLastError(); 5345 if( cudaSuccess != err) 5346 { 5347 PyErr_Format(PyExc_RuntimeError, 5348 "Cuda error: %s: %s.", 5349 "fprint_CudaNdarray", 5350 cudaGetErrorString(err)); 5351 return -1; 5352 } 5353 return 0; 5354 } 5355 5356 5357 int CudaNdarray_prep_output(CudaNdarray ** arr, int nd, 5358 const int * dims, int fortran) 5359 { 5360 bool allocated = false; 5361 if (*arr == NULL) 5362 { 5363 // This allocates the metadata but not the data 5364 *arr = (CudaNdarray *) CudaNdarray_new_nd(nd); 5365 if (*arr == NULL) 5366 return -1; 5367 allocated = true; 5368 } 5369 5370 if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran)) 5371 { 5372 if (allocated) 5373 { 5374 Py_DECREF(*arr); 5375 *arr = NULL; 5376 } 5377 return -1; 5378 } 5379 return 0; 5380 } 5381 5382 5383 /* 5384 Local Variables: 5385 mode:c++ 5386 c-basic-offset:4 5387 c-file-style:"stroustrup" 5388 indent-tabs-mode:nil 5389 fill-column:79 5390 End: 5391 */ 5392 // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : 5393
===============================
mod.cu(940): warning: pointless comparison of unsigned integer with zero
mod.cu(3000): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3003): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3005): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3008): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3010): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3013): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3016): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3019): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3021): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3024): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3026): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3029): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3031): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3034): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3037): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3040): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3042): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3045): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3047): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3050): warning: conversion from a string literal to "char *" is deprecated
mod.cu(940): warning: pointless comparison of unsigned integer with zero
mod.cu(3000): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3003): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3005): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3008): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3010): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3013): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3016): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3019): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3021): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3024): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3026): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3029): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3031): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3034): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3037): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3040): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3042): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3045): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3047): warning: conversion from a string literal to "char *" is deprecated
mod.cu(3050): warning: conversion from a string literal to "char *" is deprecated
/usr/include/string.h: In function ‘void* __mempcpy_inline(void*, const void*, size_t)’:
/usr/include/string.h:652:42: error: ‘memcpy’ was not declared in this scope
     return (char *) memcpy (__dest, __src, __n) + __n;
                                          ^
mod.cu: In function ‘PyObject* CudaNdarray_Reshape(CudaNdarray*, PyObject*)’:
mod.cu:954:122: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 3 has type ‘size_t {aka long unsigned int}’ [-Wformat=]
     PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
                                                                                                                          ^
mod.cu:954:122: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 4 has type ‘size_t {aka long unsigned int}’ [-Wformat=]
ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: ('nvcc return status', 1, 'for cmd', 'nvcc -shared -O3 -m64 -Xcompiler -DCUDA_NDARRAY_CUH=c72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden -Xlinker -rpath,/home/byungho/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.12-64/cuda_ndarray -I/home/byungho/.local/lib/python2.7/site-packages/theano/sandbox/cuda -I/home/byungho/.local/lib/python2.7/site-packages/numpy/core/include -I/usr/include/python2.7 -I/home/byungho/.local/lib/python2.7/site-packages/theano/gof -L/usr/lib -o /home/byungho/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.12-64/cuda_ndarray/cuda_ndarray.so mod.cu -lcublas -lpython2.7 -lcudart')
Warning: couldn't load settings file:
R.L.E: Retro Learning Environment (version 1.0.0)
[Based upon the Arcade Learning Environment (A.L.E)]
[Powered by LibRetro]
Use -help for help screen.
Traceback (most recent call last):
  File "./run_nips.py", line 62, in <module>
    launcher.launch(sys.argv[1:], Defaults, __doc__)
  File "/home/byungho/sehar/deep_q_rl/deep_q_rl/launcher.py", line 222, in launch
    ale.loadROM(full_rom_path, full_core_path)
  File "/home/byungho/.local/lib/python2.7/site-packages/rle_python_interface/rle_python_interface.py", line 118, in loadROM
    raise ValueError('core must be atari|snes|genesis|game_gear|sg1000')
ValueError: core must be atari|snes|genesis|game_gear|sg1000
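
The compilation itself is killed by the /usr/include/string.h:652 error ("'memcpy' was not declared in this scope"); the string-literal and format warnings are harmless, and the final ValueError comes from rle_python_interface.loadROM being called without a valid core argument, independently of the CUDA build. The memcpy error is the known clash between older nvcc releases (e.g. CUDA 7.5) and the glibc shipped with Ubuntu 16.04. A commonly suggested workaround, offered here only as a sketch and not verified against this exact setup, is to define _FORCE_INLINES for nvcc through Theano's nvcc.flags option, e.g. in ~/.theanorc:

    [nvcc]
    flags = -D_FORCE_INLINES

or equivalently on the command line: THEANO_FLAGS=nvcc.flags=-D_FORCE_INLINES python ./run_nips.py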