diff --git a/pyGPs/Core/__init__.py b/pyGPs/Core/__init__.py index 1c0c201..a7beaff 100644 --- a/pyGPs/Core/__init__.py +++ b/pyGPs/Core/__init__.py @@ -1,9 +1,10 @@ -import inf -import cov -import mean -import lik -import gp -import opt +from __future__ import absolute_import +from . import inf +from . import cov +from . import mean +from . import lik +from . import gp +from . import opt diff --git a/pyGPs/Core/cov.py b/pyGPs/Core/cov.py index 269b075..b03a6e9 100644 --- a/pyGPs/Core/cov.py +++ b/pyGPs/Core/cov.py @@ -1,3 +1,9 @@ +from __future__ import division +from __future__ import print_function +from builtins import str +from builtins import range +from past.utils import old_div +from builtins import object #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -66,8 +72,8 @@ def __init__(self): def __repr__(self): strvalue =str(type(self))+': to get the kernel matrix or kernel derviatives use: \n'+\ - 'model.covfunc.getCovMatrix()\n'+\ - 'model.covfunc.getDerMatrix()' + 'model.covfunc.getCovMatrix()\n'+\ + 'model.covfunc.getDerMatrix()' return strvalue @@ -176,7 +182,7 @@ def __mul__(self,other): elif isinstance(other, Kernel): return ProductOfKernel(self,other) else: - print "only numbers and Kernels are supported operand types for *" + print("only numbers and Kernels are supported operand types for *") @@ -411,9 +417,9 @@ def getCovMatrix(self,x=None,z=None,mode=None): nn, D = z.shape A = np.zeros((nn, 1)) elif mode == 'train': # compute covariance matix for dataset x - A = spdist.cdist(x / ell, x / ell, 'sqeuclidean') + A = spdist.cdist(old_div(x, ell), old_div(x, ell), 'sqeuclidean') elif mode == 'cross': # compute covariance between data sets x and z - A = spdist.cdist(x / ell, z / ell, 'sqeuclidean') + A = spdist.cdist(old_div(x, ell), old_div(z, ell), 'sqeuclidean') dp = 2 * np.pi * np.sqrt(A) * ell / p A = np.exp(-0.5 * A) * np.cos(dp) return A @@ -426,9 +432,9 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): nn, D = z.shape A = np.zeros((nn, 1)) elif mode == 'train': # compute covariance matix for dataset x - A = spdist.cdist(x / ell, x / ell, 'sqeuclidean') + A = spdist.cdist(old_div(x, ell), old_div(x, ell), 'sqeuclidean') elif mode == 'cross': # compute covariance between data sets x and z - A = spdist.cdist(x / ell, z / ell, 'sqeuclidean') + A = spdist.cdist(old_div(x, ell), old_div(z, ell), 'sqeuclidean') dp = 2 * np.pi * np.sqrt(A) * ell / p if der == 0: # compute derivative matrix wrt 1st parameter @@ -475,8 +481,8 @@ def __init__(self, Q=0, hyps=[], D=None): if D: self.hyp = np.random.random(Q*(1+2*D)) else: - self.hyp = hyps - self.para = [Q] + self.hyp = hyps + self.para = [Q] def initSMhypers(self, x, y): """ @@ -491,7 +497,7 @@ def initSMhypers(self, x, y): w = np.zeros(Q) m = np.zeros((D, Q)) s = np.zeros((D, Q)) - w[:] = np.std(y) / Q + w[:] = old_div(np.std(y), Q) hypinit = np.zeros(Q + 2 * D * Q) for i in range(D): @@ -503,10 +509,10 @@ def initSMhypers(self, x, y): else: d2[d2 == 0] = 1 minshift = np.min(np.min(np.sqrt(d2))) - nyquist = 0.5 / minshift + nyquist = old_div(0.5, minshift) m[i, :] = nyquist * np.random.ranf((1, Q)) maxshift = np.max(np.max(np.sqrt(d2))) - s[i, :] = 1. 
/ np.abs(maxshift * np.random.ranf((1, Q))) + s[i, :] = old_div(1., np.abs(maxshift * np.random.ranf((1, Q)))) hypinit[:Q] = np.log(w) hypinit[Q + np.arange(0, Q * D)] = np.log(m[:]).T hypinit[Q + Q * D + np.arange(0, Q * D)] = np.log(s[:]).T @@ -519,7 +525,7 @@ def getCovMatrix(self,x=None,z=None,mode=None): nn, D = z.shape else: nn, D = x.shape - assert Q == len(self.hyp) / (1 + 2 * D) + assert Q == old_div(len(self.hyp), (1 + 2 * D)) w = np.exp(self.hyp[:Q]) m = np.exp(np.reshape(self.hyp[Q:Q + Q * D], (D, Q))) @@ -540,13 +546,13 @@ def getCovMatrix(self,x=None,z=None,mode=None): d2[:, :, j] = spdist.cdist(xslice, zslice, 'sqeuclidean') d = np.sqrt(d2) - k = lambda (d2v, dm): np.exp(-2 * np.pi ** 2 * d2v) * np.cos(2* np.pi * dm) # evaluation of the covariance + k = lambda d2v_dm: np.exp(-2 * np.pi ** 2 * d2v_dm[0]) * np.cos(2* np.pi * d2v_dm[1]) # evaluation of the covariance km = lambda dm: -2 * np.pi * np.tan(2 * np.pi * dm) * dm # remainder when differentiating w.r.t. m kv = lambda d2v: -d2v * (2 * np.pi) ** 2 # remainder when differentiating w.r.t. v A = 0. c = 1. - qq = range(Q) + qq = list(range(Q)) for q in qq: C = w[q] * c for j in range(D): @@ -561,7 +567,7 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): nn, D = z.shape else: nn, D = x.shape - assert Q == len(self.hyp) / (1 + 2 * D) + assert Q == old_div(len(self.hyp), (1 + 2 * D)) w = np.exp(self.hyp[:Q]) m = np.exp(np.reshape(self.hyp[Q:Q + Q * D], (D, Q))) @@ -582,24 +588,24 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): d2[:, :, j] = spdist.cdist(xslice, zslice, 'sqeuclidean') d = np.sqrt(d2) - k = lambda (d2v, dm): np.exp(-2 * np.pi ** 2 * d2v) * np.cos(2* np.pi * dm) # evaluation of the covariance + k = lambda d2v_dm1: np.exp(-2 * np.pi ** 2 * d2v_dm1[0]) * np.cos(2* np.pi * d2v_dm1[1]) # evaluation of the covariance km = lambda dm: -2 * np.pi * np.tan(2 * np.pi * dm) * dm # remainder when differentiating w.r.t. m kv = lambda d2v: -d2v * (2 * np.pi) ** 2 # remainder when differentiating w.r.t. v A = 0. c = 1. 
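Note (illustration, not part of the patch): the rewritten lambdas above (k = lambda (d2v, dm): ... becoming k = lambda d2v_dm: ... with explicit indexing) are forced by Python 3, which removed tuple parameter unpacking in function signatures (PEP 3113). A minimal sketch of the pattern as used here, assuming the callable always receives a 2-tuple:

    import numpy as np

    # Python 2 only -- the tuple is unpacked in the signature:
    #   k = lambda (d2v, dm): np.exp(-2 * np.pi**2 * d2v) * np.cos(2 * np.pi * dm)

    # Python 2/3 -- take the tuple as a single argument and index into it:
    k = lambda d2v_dm: np.exp(-2 * np.pi**2 * d2v_dm[0]) * np.cos(2 * np.pi * d2v_dm[1])

    print(k((0.25, 0.5)))   # same value as before; only the parameter syntax changed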
- qq = range(Q) + qq = list(range(Q)) if der < Q: # compute derivative matrix wrt w c = 1 qq = [der] elif der < Q + Q * D: # compute derivative matrix wrt sig p = (der - Q) % D - q = (der - Q - p) / D + q = old_div((der - Q - p), D) c = km(d[:, :, p] * m[p, q]) qq = [q] elif der < 2 * Q * D + Q: # compute derivative matrix wrt mu p = (der - (D + 1) * Q) % D - q = (der - (D + 1) * Q - p) / D + q = old_div((der - (D + 1) * Q - p), D) c = kv(d2[:, :, p] * v[p, q]) qq = [q] else: @@ -738,9 +744,9 @@ def getCovMatrix(self,x=None,z=None,mode=None): nn,D = z.shape A = np.zeros((nn,1)) elif mode == 'train': # compute covariance matix for dataset x - A = np.sqrt( spdist.cdist(x/ell, x/ell, 'sqeuclidean') ) + A = np.sqrt( spdist.cdist(old_div(x,ell), old_div(x,ell), 'sqeuclidean') ) elif mode == 'cross': # compute covariance between data sets x and z - A = np.sqrt( spdist.cdist(x/ell, z/ell, 'sqeuclidean') ) + A = np.sqrt( spdist.cdist(old_div(x,ell), old_div(z,ell), 'sqeuclidean') ) A = sf2 * self.pp(A,j,v,self.func) return A @@ -762,9 +768,9 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): nn,D = z.shape A = np.zeros((nn,1)) elif mode == 'train': # compute covariance matix for dataset x - A = np.sqrt( spdist.cdist(x/ell, x/ell, 'sqeuclidean') ) + A = np.sqrt( spdist.cdist(old_div(x,ell), old_div(x,ell), 'sqeuclidean') ) elif mode == 'cross': # compute covariance between data sets x and z - A = np.sqrt( spdist.cdist(x/ell, z/ell, 'sqeuclidean') ) + A = np.sqrt( spdist.cdist(old_div(x,ell), old_div(z,ell), 'sqeuclidean') ) if der == 0: # compute derivative matrix wrt 1st parameter A = sf2 * self.dpp(A,j,v,self.func,self.dfunc) elif der == 1: # compute derivative matrix wrt 2nd parameter @@ -795,9 +801,9 @@ def getCovMatrix(self,x=None,z=None,mode=None): nn,D = z.shape A = np.zeros((nn,1)) elif mode == 'train': # compute covariance matix for training set - A = spdist.cdist(x/ell,x/ell,'sqeuclidean') + A = spdist.cdist(old_div(x,ell),old_div(x,ell),'sqeuclidean') elif mode == 'cross': # compute covariance between data sets x and z - A = spdist.cdist(x/ell,z/ell,'sqeuclidean') + A = spdist.cdist(old_div(x,ell),old_div(z,ell),'sqeuclidean') A = sf2 * np.exp(-0.5*A) return A @@ -810,9 +816,9 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): nn,D = z.shape A = np.zeros((nn,1)) elif mode == 'train': # compute covariance matix for dataset x - A = spdist.cdist(x/ell,x/ell,'sqeuclidean') + A = spdist.cdist(old_div(x,ell),old_div(x,ell),'sqeuclidean') elif mode == 'cross': # compute covariance between data sets x and z - A = spdist.cdist(x/ell,z/ell,'sqeuclidean') + A = spdist.cdist(old_div(x,ell),old_div(z,ell),'sqeuclidean') if der == 0: # compute derivative matrix wrt 1st parameter A = sf2 * np.exp(-0.5*A) * A elif der == 1: # compute derivative matrix wrt 2nd parameter @@ -840,9 +846,9 @@ def getCovMatrix(self,x=None,z=None,mode=None): nn,D = z.shape A = np.zeros((nn,1)) elif mode == 'train': # compute covariance matix for dataset x - A = spdist.cdist(x/ell,x/ell,'sqeuclidean') + A = spdist.cdist(old_div(x,ell),old_div(x,ell),'sqeuclidean') elif mode == 'cross': # compute covariance between data sets x and z - A = spdist.cdist(x/ell,z/ell,'sqeuclidean') + A = spdist.cdist(old_div(x,ell),old_div(z,ell),'sqeuclidean') A = np.exp(-0.5*A) return A @@ -853,9 +859,9 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): nn,D = z.shape A = np.zeros((nn,1)) elif mode == 'train': # compute covariance matix for dataset x - A = spdist.cdist(x/ell,x/ell,'sqeuclidean') + A = 
spdist.cdist(old_div(x,ell),old_div(x,ell),'sqeuclidean') elif mode == 'cross': # compute covariance between data sets x and z - A = spdist.cdist(x/ell,z/ell,'sqeuclidean') + A = spdist.cdist(old_div(x,ell),old_div(z,ell),'sqeuclidean') if der == 0: # compute derivative matrix wrt 1st parameter A = np.exp(-0.5*A) * A else: @@ -875,7 +881,7 @@ class RBFard(Kernel): ''' def __init__(self, D=None, log_ell_list=None, log_sigma=0.): if log_ell_list is None: - self.hyp = [0. for i in xrange(D)] + [log_sigma] + self.hyp = [0. for i in range(D)] + [log_sigma] else: self.hyp = log_ell_list + [log_sigma] @@ -885,7 +891,7 @@ def getCovMatrix(self,x=None,z=None,mode=None): n, D = x.shape if not z is None: nn, D = z.shape - ell = 1./np.exp(self.hyp[0:D]) # characteristic length scale + ell = old_div(1.,np.exp(self.hyp[0:D])) # characteristic length scale sf2 = np.exp(2.*self.hyp[D]) # signal variance if mode == 'self_test': # self covariances for the test cases nn,D = z.shape @@ -904,7 +910,7 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): n, D = x.shape if not z is None: nn, D = z.shape - ell = 1./np.exp(self.hyp[0:D]) # characteristic length scale + ell = old_div(1.,np.exp(self.hyp[0:D])) # characteristic length scale sf2 = np.exp(2.*self.hyp[D]) # signal variance if mode == 'self_test': # self covariances for the test cases nn,D = z.shape @@ -919,10 +925,10 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): if mode == 'self_test': A = A*0 elif mode == 'train': - tem = np.atleast_2d(x[:,der])/ell[der] + tem = old_div(np.atleast_2d(x[:,der]),ell[der]) A *= spdist.cdist(tem,tem,'sqeuclidean') elif mode == 'cross': - A *= spdist.cdist(np.atleast_2d(x[:,der]).T/ell[der],np.atleast_2d(z[:,der]).T/ell[der],'sqeuclidean') + A *= spdist.cdist(old_div(np.atleast_2d(x[:,der]).T,ell[der]),old_div(np.atleast_2d(z[:,der]).T,ell[der]),'sqeuclidean') elif der == D: # compute derivative matrix wrt magnitude parameter A = 2.*A else: @@ -1028,7 +1034,7 @@ class LINard(Kernel): ''' def __init__(self, D=None, log_ell_list=None): if log_ell_list is None: - self.hyp = [0. for i in xrange(D)] + self.hyp = [0. for i in range(D)] else: self.hyp = log_ell_list @@ -1042,7 +1048,7 @@ def getCovMatrix(self,x=None,z=None,mode=None): n, D = x.shape A = np.dot(x,x.T)+ np.eye(n)*1e-10 elif mode == 'cross': # compute covariance between data sets x and z - z = np.dot(z,np.diag(1./ell)) + z = np.dot(z,np.diag(old_div(1.,ell))) A = np.dot(x,z.T) return A @@ -1060,7 +1066,7 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): elif mode == 'train': A = -2.*np.dot(np.atleast_2d(x[:,der]).T,np.atleast_2d(x[:,der])) elif mode == 'cross': - z = np.dot(z,np.diag(1./ell)) + z = np.dot(z,np.diag(old_div(1.,ell))) A = -2.*np.dot(np.atleast_2d(x[:,der]).T, np.atleast_2d(z[:,der])) # cross covariances else: raise Exception("Wrong derivative index in covLINard") @@ -1102,9 +1108,9 @@ def dfunc(self,d,t): # Note, this is func - d func/dt elif d == 3: return t elif d == 5: - return (1./3.)*(t + t*t) + return (old_div(1.,3.))*(t + t*t) elif d == 7: - return (1./15.)*(t + 3.*t*t + t*t*t) + return (old_div(1.,15.))*(t + 3.*t*t + t*t*t) else: raise Exception("Wrong value for d in Matern") @@ -1125,7 +1131,7 @@ def getCovMatrix(self,x=None,z=None,mode=None): try: assert(d in [1,3,5,7]) # check for valid values of d except AssertionError: - print "Warning: You specified d to be neither 1,3,5 nor 7. We set it to d=3. " + print("Warning: You specified d to be neither 1,3,5 nor 7. We set it to d=3. 
") d = 3 if mode == 'self_test': # self covariances for the test cases nn,D = z.shape @@ -1151,7 +1157,7 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): try: assert(d in [1,3,5,7]) # check for valid values of d except AssertionError: - print "Warning: You specified d to be neither 1,3,5 nor 7. We set to d=3. " + print("Warning: You specified d to be neither 1,3,5 nor 7. We set to d=3. ") d = 3 if mode == 'self_test': # self covariances for the test cases nn,D = z.shape @@ -1205,7 +1211,7 @@ def getCovMatrix(self,x=None,z=None,mode=None): elif mode == 'cross': # compute covariance between data sets x and z A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean')) A = np.pi*A/p - A = np.sin(A)/ell + A = old_div(np.sin(A),ell) A = A * A A = sf2 *np.exp(-2.*A) return A @@ -1228,14 +1234,14 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): A = np.sqrt(spdist.cdist(x, z, 'sqeuclidean')) A = np.pi*A/p if der == 0: # compute derivative matrix wrt 1st parameter - A = np.sin(A)/ell + A = old_div(np.sin(A),ell) A = A * A A = 4. *sf2 *np.exp(-2.*A) * A elif der == 1: # compute derivative matrix wrt 2nd parameter - R = np.sin(A)/ell + R = old_div(np.sin(A),ell) A = 4 * sf2/ell * np.exp(-2.*R*R)*R*np.cos(A)*A elif der == 2: # compute derivative matrix wrt 3rd parameter - A = np.sin(A)/ell + A = old_div(np.sin(A),ell) A = A * A A = 2. * sf2 * np.exp(-2.*A) else: @@ -1315,9 +1321,9 @@ def getCovMatrix(self,x=None,z=None,mode=None): nn,D = z.shape D2 = np.zeros((nn,1)) elif mode == 'train': # compute covariance matix for dataset x - D2 = spdist.cdist(x/ell, x/ell, 'sqeuclidean') + D2 = spdist.cdist(old_div(x,ell), old_div(x,ell), 'sqeuclidean') elif mode == 'cross': # compute covariance between data sets x and z - D2 = spdist.cdist(x/ell, z/ell, 'sqeuclidean') + D2 = spdist.cdist(old_div(x,ell), old_div(z,ell), 'sqeuclidean') A = sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) ) return A @@ -1330,9 +1336,9 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): nn,D = z.shape D2 = np.zeros((nn,1)) elif mode == 'train': # compute covariance matix for dataset x - D2 = spdist.cdist(x/ell, x/ell, 'sqeuclidean') + D2 = spdist.cdist(old_div(x,ell), old_div(x,ell), 'sqeuclidean') elif mode == 'cross': # compute covariance between data sets x and z - D2 = spdist.cdist(x/ell, z/ell, 'sqeuclidean') + D2 = spdist.cdist(old_div(x,ell), old_div(z,ell), 'sqeuclidean') if der == 0: # compute derivative matrix wrt 1st parameter A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * D2 elif der == 1: # compute derivative matrix wrt 2nd parameter @@ -1359,7 +1365,7 @@ class RQard(Kernel): ''' def __init__(self, D=None, log_ell_list=None, log_sigma=0., log_alpha=0.): if log_ell_list is None: - self.hyp = [0. for i in xrange(D)] + [ log_sigma, log_alpha ] + self.hyp = [0. 
for i in range(D)] + [ log_sigma, log_alpha ] else: self.hyp = log_ell_list + [ log_sigma, log_alpha ] @@ -1369,7 +1375,7 @@ def getCovMatrix(self,x=None,z=None,mode=None): n, D = x.shape if not z is None: nn, D = z.shape - ell = 1./np.exp(self.hyp[0:D]) # characteristic length scale + ell = old_div(1.,np.exp(self.hyp[0:D])) # characteristic length scale sf2 = np.exp(2.*self.hyp[D]) # signal variance alpha = np.exp(self.hyp[D+1]) if mode == 'self_test': # self covariances for the test cases @@ -1389,7 +1395,7 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): n, D = x.shape if not z is None: nn, D = z.shape - ell = 1./np.exp(self.hyp[0:D]) # characteristic length scale + ell = old_div(1.,np.exp(self.hyp[0:D])) # characteristic length scale sf2 = np.exp(2.*self.hyp[D]) # signal variance alpha = np.exp(self.hyp[D+1]) if mode == 'self_test': # self covariances for the test cases @@ -1404,10 +1410,10 @@ def getDerMatrix(self,x=None,z=None,mode=None,der=None): if mode == 'self_test': A = D2*0 elif mode == 'train': - tmp = np.atleast_2d(x[:,der])/ell[der] + tmp = old_div(np.atleast_2d(x[:,der]),ell[der]) A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * spdist.cdist(tmp, tmp, 'sqeuclidean') elif mode == 'cross': - A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * spdist.cdist(np.atleast_2d(x[:,der]).T/ell[der], np.atleast_2d(z[:,der]).T/ell[der], 'sqeuclidean') + A = sf2 * ( 1.0 + 0.5*D2/alpha )**(-alpha-1) * spdist.cdist(old_div(np.atleast_2d(x[:,der]).T,ell[der]), old_div(np.atleast_2d(z[:,der]).T,ell[der]), 'sqeuclidean') elif der==D: # compute derivative matrix wrt magnitude parameter A = 2. * sf2 * ( ( 1.0 + 0.5*D2/alpha )**(-alpha) ) elif der==(D+1): # compute derivative matrix wrt magnitude parameter diff --git a/pyGPs/Core/gp.py b/pyGPs/Core/gp.py index d9cec56..7010005 100644 --- a/pyGPs/Core/gp.py +++ b/pyGPs/Core/gp.py @@ -1,3 +1,9 @@ +from __future__ import division +from __future__ import absolute_import +from builtins import str +from builtins import range +from builtins import object +from past.utils import old_div #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -42,8 +48,8 @@ import itertools import numpy as np import matplotlib.pyplot as plt -import inf, mean, lik, cov, opt -from tools import unique, jitchol, solve_chol +from . 
import inf, mean, lik, cov, opt +from .tools import unique, jitchol, solve_chol from copy import deepcopy import pyGPs @@ -183,7 +189,7 @@ def plotData_2d(self,x1,x2,t1,t2,p1,p2,axisvals=None): fig = plt.figure() plt.plot(x1[:,0], x1[:,1], 'b+', markersize = 12) plt.plot(x2[:,0], x2[:,1], 'r+', markersize = 12) - pc = plt.contour(t1, t2, np.reshape(p2/(p1+p2), (t1.shape[0],t1.shape[1]) )) + pc = plt.contour(t1, t2, np.reshape(old_div(p2,(p1+p2)), (t1.shape[0],t1.shape[1]) )) fig.colorbar(pc) plt.grid() if axisvals: @@ -375,7 +381,7 @@ def predict(self, xs, ys=None): L = self.posterior.L sW = self.posterior.sW - nz = range(len(alpha[:,0])) # non-sparse representation + nz = list(range(len(alpha[:,0]))) # non-sparse representation if L == []: # in case L is not provided, we compute it K = covfunc.getCovMatrix(x=x[nz,:], mode='train') #L = np.linalg.cholesky( (np.eye(nz) + np.dot(sW,sW.T)*K).T ) @@ -390,13 +396,13 @@ def predict(self, xs, ys=None): fs2 = np.zeros((ns,1)) lp = np.zeros((ns,1)) while nact<=ns-1: # process minibatches of test cases to save memory - id = range(nact,min(nact+nperbatch,ns)) # data points to process + id = list(range(nact,min(nact+nperbatch,ns))) # data points to process kss = covfunc.getCovMatrix(z=xs[id,:], mode='self_test') # self-variances Ks = covfunc.getCovMatrix(x=x[nz,:], z=xs[id,:], mode='cross') # cross-covariances ms = meanfunc.getMean(xs[id,:]) N = (alpha.shape)[1] # number of alphas (usually 1; more in case of sampling) Fmu = np.tile(ms,(1,N)) + np.dot(Ks.T,alpha[nz]) # conditional mean fs|f - fmu[id] = np.reshape(Fmu.sum(axis=1)/N,(len(id),1)) # predictive means + fmu[id] = np.reshape(old_div(Fmu.sum(axis=1),N),(len(id),1)) # predictive means if Ltril: # L is triangular => use Cholesky parameters (alpha,sW,L) V = np.linalg.solve(L.T,np.tile(sW,(1,len(id)))*Ks) fs2[id] = kss - np.array([(V*V).sum(axis=0)]).T # predictive variances @@ -408,9 +414,9 @@ def predict(self, xs, ys=None): Lp, Ymu, Ys2 = likfunc.evaluate(None,Fmu[:],Fs2[:],None,None,3) else: Lp, Ymu, Ys2 = likfunc.evaluate(np.tile(ys[id],(1,N)), Fmu[:], Fs2[:],None,None,3) - lp[id] = np.reshape( np.reshape(Lp,(np.prod(Lp.shape),N)).sum(axis=1)/N , (len(id),1) ) # log probability; sample averaging - ymu[id] = np.reshape( np.reshape(Ymu,(np.prod(Ymu.shape),N)).sum(axis=1)/N ,(len(id),1) ) # predictive mean ys|y and ... - ys2[id] = np.reshape( np.reshape(Ys2,(np.prod(Ys2.shape),N)).sum(axis=1)/N , (len(id),1) ) # .. variance + lp[id] = np.reshape( old_div(np.reshape(Lp,(np.prod(Lp.shape),N)).sum(axis=1),N) , (len(id),1) ) # log probability; sample averaging + ymu[id] = np.reshape( old_div(np.reshape(Ymu,(np.prod(Ymu.shape),N)).sum(axis=1),N) ,(len(id),1) ) # predictive mean ys|y and ... + ys2[id] = np.reshape( old_div(np.reshape(Ys2,(np.prod(Ys2.shape),N)).sum(axis=1),N) , (len(id),1) ) # .. 
variance nact = id[-1]+1 # set counter to index of next data point self.ym = ymu self.ys2 = ys2 @@ -465,7 +471,7 @@ def predict_with_posterior(self, post, xs, ys=None): L = post.L sW = post.sW - nz = range(len(alpha[:,0])) # non-sparse representation + nz = list(range(len(alpha[:,0]))) # non-sparse representation if L == []: # in case L is not provided, we compute it K = covfunc.getCovMatrix(x=x[nz,:], mode='train') #L = np.linalg.cholesky( (np.eye(nz) + np.dot(sW,sW.T)*K).T ) @@ -480,13 +486,13 @@ def predict_with_posterior(self, post, xs, ys=None): fs2 = np.zeros((ns,1)) lp = np.zeros((ns,1)) while nact<=ns-1: # process minibatches of test cases to save memory - id = range(nact,min(nact+nperbatch,ns)) # data points to process + id = list(range(nact,min(nact+nperbatch,ns))) # data points to process kss = covfunc.getCovMatrix(z=xs[id,:], mode='self_test') # self-variances Ks = covfunc.getCovMatrix(x=x[nz,:], z=xs[id,:], mode='cross') # cross-covariances ms = meanfunc.getMean(xs[id,:]) N = (alpha.shape)[1] # number of alphas (usually 1; more in case of sampling) Fmu = np.tile(ms,(1,N)) + np.dot(Ks.T,alpha[nz]) # conditional mean fs|f - fmu[id] = np.reshape(Fmu.sum(axis=1)/N,(len(id),1)) # predictive means + fmu[id] = np.reshape(old_div(Fmu.sum(axis=1),N),(len(id),1)) # predictive means if Ltril: # L is triangular => use Cholesky parameters (alpha,sW,L) V = np.linalg.solve(L.T,np.tile(sW,(1,len(id)))*Ks) fs2[id] = kss - np.array([(V*V).sum(axis=0)]).T # predictive variances @@ -498,9 +504,9 @@ def predict_with_posterior(self, post, xs, ys=None): [Lp, Ymu, Ys2] = likfunc.evaluate(None,Fmu[:],Fs2[:],None,None,3) else: [Lp, Ymu, Ys2] = likfunc.evaluate(np.tile(ys[id],(1,N)), Fmu[:], Fs2[:],None,None,3) - lp[id] = np.reshape( np.reshape(Lp,(np.prod(Lp.shape),N)).sum(axis=1)/N , (len(id),1) ) # log probability; sample averaging - ymu[id] = np.reshape( np.reshape(Ymu,(np.prod(Ymu.shape),N)).sum(axis=1)/N ,(len(id),1) ) # predictive mean ys|y and ... - ys2[id] = np.reshape( np.reshape(Ys2,(np.prod(Ys2.shape),N)).sum(axis=1)/N , (len(id),1) ) # .. variance + lp[id] = np.reshape( old_div(np.reshape(Lp,(np.prod(Lp.shape),N)).sum(axis=1),N) , (len(id),1) ) # log probability; sample averaging + ymu[id] = np.reshape( old_div(np.reshape(Ymu,(np.prod(Ymu.shape),N)).sum(axis=1),N) ,(len(id),1) ) # predictive mean ys|y and ... + ys2[id] = np.reshape( old_div(np.reshape(Ys2,(np.prod(Ys2.shape),N)).sum(axis=1),N) , (len(id),1) ) # .. variance nact = id[-1]+1 # set counter to index of next data point self.ym = ymu self.ys2 = ys2 @@ -784,7 +790,7 @@ def useLikelihood(self,newLik): :param str newLik: 'Logistic' ''' if newLik == "Logistic": - raise "Logistic likelihood is currently not implemented." + raise Exception("Logistic likelihood is currently not implemented.") #self.likfunc = lik.Logistic() else: raise Exception('Possible lik values are "Logistic".') @@ -830,8 +836,8 @@ def fitAndPredict(self, xs): xs = np.reshape(xs, (xs.shape[0],1)) predictive_vote = np.zeros((xs.shape[0],self.n_class)) - for i in xrange(self.n_class): # classifier for class i... - for j in xrange(i+1,self.n_class): # ...and class j + for i in range(self.n_class): # classifier for class i... + for j in range(i+1,self.n_class): # ...and class j x,y = self.createBinaryClass(i,j) model = GPC() if self.newPrior: @@ -868,8 +874,8 @@ def optimizeAndPredict(self, xs): xs = np.reshape(xs, (xs.shape[0],1)) predictive_vote = np.zeros((xs.shape[0],self.n_class)) - for i in xrange(self.n_class): # classifier for class i... 
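Note (illustration, not part of the patch): the list(range(...)) wrappers introduced in predict() and the xrange -> range changes above follow from the same Python 3 rule: xrange is gone, and range now returns a lazy sequence rather than a list. Indexing and len() still work on a range object, so most call sites here behave the same either way, but futurize adds list() wherever the old code may have relied on getting a real Python 2 list. A short sketch of the difference:

    r = range(5)              # Python 3: lazy sequence; Python 2: the list [0, 1, 2, 3, 4]
    print(r[-1], len(r))      # indexing and len() still work: prints "4 5"
    print(list(r) + [5])      # concatenation needs a real list: [0, 1, 2, 3, 4, 5]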
- for j in xrange(i+1,self.n_class): # ...and class j + for i in range(self.n_class): # classifier for class i... + for j in range(i+1,self.n_class): # ...and class j x,y = self.createBinaryClass(i,j) model = GPC() if self.newPrior: @@ -904,7 +910,7 @@ def createBinaryClass(self, i,j): ''' class_i = [] class_j = [] - for index in xrange(len(self.y_all)): # check all classes + for index in range(len(self.y_all)): # check all classes target = self.y_all[index] if target == i: class_i.append(index) @@ -961,7 +967,7 @@ def setData(self, x, y, value_per_axis=5): # get range of x in each dimension # 5 uniformally selected value for each dimension gridAxis=[] - for d in xrange(x.shape[1]): + for d in range(x.shape[1]): column = x[:,d] mini = np.min(column) maxi = np.max(column) diff --git a/pyGPs/Core/inf.py b/pyGPs/Core/inf.py index e40f7ce..c3bc088 100644 --- a/pyGPs/Core/inf.py +++ b/pyGPs/Core/inf.py @@ -1,3 +1,9 @@ +from __future__ import division +from __future__ import absolute_import +from builtins import str +from builtins import range +from past.utils import old_div +from builtins import object #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -43,9 +49,9 @@ import numpy as np -import lik, cov +from . import lik, cov from copy import copy, deepcopy -from tools import solve_chol, brentmin, cholupdate, jitchol +from .tools import solve_chol, brentmin, cholupdate, jitchol np.seterr(all='ignore') @@ -72,7 +78,7 @@ def __init__(self): def __repr__(self): value = "posterior: to get the parameters of the posterior distribution use:\n"+\ "model.posterior.alpha\n"+"model.posterior.L\n"+"model.posterior.sW\n"+\ - "See documentation and gpml book chapter 2.3 and chapter 3.4.3 for these parameters." + "See documentation and gpml book chapter 2.3 and chapter 3.4.3 for these parameters." return value def __str__(self): @@ -96,11 +102,11 @@ def __init__(self, m, c, l): self.cov = [] self.lik = [] if m.hyp: - self.mean = [0 for i in xrange(len(m.hyp))] + self.mean = [0 for i in range(len(m.hyp))] if c.hyp: - self.cov = [0 for i in xrange(len(c.hyp))] + self.cov = [0 for i in range(len(c.hyp))] if l.hyp: - self.lik = [0 for i in xrange(len(l.hyp))] + self.lik = [0 for i in range(len(l.hyp))] def __str__(self): value = "Derivatives of mean, cov and lik functions:\n" +\ @@ -166,12 +172,12 @@ def _epComputeParams(self, K, y, ttau, tnu, likfunc, m, inffunc): Sigma = K - np.dot(V.T,V) mu = np.dot(Sigma,tnu) Dsigma = np.reshape(np.diag(Sigma),(np.diag(Sigma).shape[0],1)) - tau_n = 1/Dsigma - ttau # compute the log marginal likelihood - nu_n = mu/Dsigma-tnu + m*tau_n # vectors of cavity parameters - lZ = likfunc.evaluate(y, nu_n/tau_n, 1/tau_n, inffunc) - nlZ = np.log(np.diag(L)).sum() - lZ.sum() - np.dot(tnu.T,np.dot(Sigma,tnu))/2 \ - - np.dot((nu_n-m*tau_n).T,((ttau/tau_n*(nu_n-m*tau_n)-2*tnu) / (ttau+tau_n)))/2 \ - + (tnu**2/(tau_n+ttau)).sum()/2.- np.log(1.+ttau/tau_n).sum()/2. 
+ tau_n = old_div(1,Dsigma) - ttau # compute the log marginal likelihood + nu_n = old_div(mu,Dsigma)-tnu + m*tau_n # vectors of cavity parameters + lZ = likfunc.evaluate(y, old_div(nu_n,tau_n), old_div(1,tau_n), inffunc) + nlZ = np.log(np.diag(L)).sum() - lZ.sum() - old_div(np.dot(tnu.T,np.dot(Sigma,tnu)),2) \ + - old_div(np.dot((nu_n-m*tau_n).T,(old_div((ttau/tau_n*(nu_n-m*tau_n)-2*tnu), (ttau+tau_n)))),2) \ + + old_div((old_div(tnu**2,(tau_n+ttau))).sum(),2.)- old_div(np.log(1.+old_div(ttau,tau_n)).sum(),2.) return Sigma, mu, nlZ[0], L def _logdetA(self,K,w,nargout): @@ -187,7 +193,7 @@ def _logdetA(self,K,w,nargout): u = np.diag(U) signU = np.prod(np.sign(u)) # sign of U detP = 1 # compute sign (and det) of the permutation matrix P - p = np.dot(P,np.array(range(n)).T) + p = np.dot(P,np.array(list(range(n))).T) for ii in range(n): if ii != p[ii]: detP = -detP @@ -215,7 +221,7 @@ def _Psi_line(self,s,dalpha,alpha,K,m,likfunc,y,inffunc): f = np.dot(K,alpha) + m [lp,dlp,d2lp] = likfunc.evaluate(y,f,None,inffunc,None,3) W = -d2lp - Psi = np.dot(alpha.T,(f-m))/2. - lp.sum() + Psi = old_div(np.dot(alpha.T,(f-m)),2.) - lp.sum() return Psi[0],alpha,f,dlp,W def _epfitcZ(self,d,P,R,nn,gg,ttau,tnu,d0,R0,P0,y,likfunc,m,inffunc): @@ -226,18 +232,18 @@ def _epfitcZ(self,d,P,R,nn,gg,ttau,tnu,d0,R0,P0,y,likfunc,m,inffunc): T = np.dot(np.dot(R,R0),P) # temporary variable diag_sigma = d + np.array([(T*T).sum(axis=0)]).T mu = nn + np.dot(P.T,gg) # post moments O(n*nu^2) - tau_n = 1./diag_sigma-ttau # compute the log marginal likelihood - nu_n = mu/diag_sigma - tnu + m*tau_n # vectors of cavity parameters - lZ = likfunc.evaluate(y, nu_n/tau_n, 1/tau_n, inffunc, None, 1) + tau_n = old_div(1.,diag_sigma)-ttau # compute the log marginal likelihood + nu_n = old_div(mu,diag_sigma) - tnu + m*tau_n # vectors of cavity parameters + lZ = likfunc.evaluate(y, old_div(nu_n,tau_n), old_div(1,tau_n), inffunc, None, 1) nu = gg.shape[0] - U = np.dot(R0,P0).T*np.tile(1/np.sqrt(d0+1/ttau),(1,nu)) + U = np.dot(R0,P0).T*np.tile(old_div(1,np.sqrt(d0+old_div(1,ttau))),(1,nu)) #L = np.linalg.cholesky(np.eye(nu)+np.dot(U.T,U)).T L = jitchol(np.eye(nu)+np.dot(U.T,U)).T - ld = 2.*np.log(np.diag(L)).sum() + (np.log(d0+1/ttau)).sum() + (np.log(ttau)).sum() + ld = 2.*np.log(np.diag(L)).sum() + (np.log(d0+old_div(1,ttau))).sum() + (np.log(ttau)).sum() t = np.dot(T,tnu); tnu_Sigma_tnu = np.dot(tnu.T,(d*tnu)) + np.dot(t.T,t) - nlZ = ld/2. - lZ.sum() -tnu_Sigma_tnu/2. \ - -np.dot((nu_n-m*tau_n).T,((ttau/tau_n*(nu_n-m*tau_n)-2.*tnu)/(ttau+tau_n)))/2. \ - + (tnu**2/(tau_n+ttau)).sum()/2. - (np.log(1+ttau/tau_n)).sum()/2. + nlZ = old_div(ld,2.) - lZ.sum() -old_div(tnu_Sigma_tnu,2.) \ + -old_div(np.dot((nu_n-m*tau_n).T,(old_div((ttau/tau_n*(nu_n-m*tau_n)-2.*tnu),(ttau+tau_n)))),2.) \ + + old_div((old_div(tnu**2,(tau_n+ttau))).sum(),2.) - old_div((np.log(1+old_div(ttau,tau_n))).sum(),2.) 
return nlZ,nu_n,tau_n def _epfitcRefresh(self,d0,P0,R0,R0P0,w,b): @@ -251,7 +257,7 @@ def _epfitcRefresh(self,d0,P0,R0,R0P0,w,b): rot180 = lambda A: np.rot90(np.rot90(A)) # little helper functions #chol_inv = lambda A: np.linalg.solve( rot180( np.linalg.cholesky(rot180(A)) ),np.eye(nu)) # chol(inv(A)) chol_inv = lambda A: np.linalg.solve( rot180( jitchol(rot180(A)) ),np.eye(nu)) # chol(inv(A)) - t = 1/(1+d0*w) # temporary variable O(n) + t = old_div(1,(1+d0*w)) # temporary variable O(n) d = d0*t # O(n) P = np.tile(t.T,(nu,1))*P0 # O(n*nu) T = np.tile((w*t).T,(nu,1))*R0P0 # temporary variable O(n*nu^2) @@ -265,21 +271,21 @@ def _epfitcUpdate(self,d,P_i,R,nn,gg,w,b,ii,w_i,b_i,m,d0,P0,R0): dbi = b_i-b[ii] hi = nn[ii] + m[ii] + np.dot(P_i.T,gg) # posterior mean of site i O(nu) t = 1+dwi*d[ii] - d[ii] = d[ii]/t # O(1) + d[ii] = old_div(d[ii],t) # O(1) nn[ii] = d[ii]*b_i # O(1) r = 1+d0[ii]*w[ii] - r = (r*r)/dwi + r*d0[ii] + r = old_div((r*r),dwi) + r*d0[ii] v = np.dot(R,np.dot(R0,P0[:,ii])) v = np.reshape(v,(v.shape[0],1)) - r = 1/(r+np.dot(v.T,v)) + r = old_div(1,(r+np.dot(v.T,v))) if r>0: R = cholupdate(R,np.sqrt(r)*np.dot(R.T,v),'-') else: R = cholupdate(R,np.sqrt(-r)*np.dot(R.T,v),'+') ttemp = np.dot(R0.T,np.dot(R.T,np.dot(R,np.dot(R0,P_i)))) - gg = gg + ((dbi-dwi*(hi-m[ii]))/t) * np.reshape(ttemp,(ttemp.shape[0],1)) # O(nu^2) + gg = gg + (old_div((dbi-dwi*(hi-m[ii])),t)) * np.reshape(ttemp,(ttemp.shape[0],1)) # O(nu^2) w[ii] = w_i; b[ii] = b_i; # update site parameters O(1) - P_i = P_i/t # O(nu) + P_i = old_div(P_i,t) # O(nu) return d,P_i,R,nn,gg,w,b def _mvmZ(self,x,RVdd,t): @@ -305,7 +311,7 @@ def _Psi_lineFITC(self,s,dalpha,alpha,V,d0,m,likfunc,y,inffunc): vargout = likfunc.evaluate(y,f,None,inffunc,None,3) lp = vargout[0]; dlp = vargout[1]; d2lp = vargout[2] W = -d2lp - Psi = np.dot(alpha.T,(f-m))/2. - lp.sum() + Psi = old_div(np.dot(alpha.T,(f-m)),2.) - lp.sum() return Psi[0],alpha,f,dlp,W def _fitcRefresh(self,d0,P0,R0,R0P0, w): @@ -319,7 +325,7 @@ def _fitcRefresh(self,d0,P0,R0,R0P0, w): rot180 = lambda A: np.rot90(np.rot90(A)) # little helper functions #chol_inv = lambda A: np.linalg.solve( rot180( np.linalg.cholesky(rot180(A)) ),np.eye(nu)) # chol(inv(A)) chol_inv = lambda A: np.linalg.solve( rot180( jitchol(rot180(A)) ),np.eye(nu)) # chol(inv(A)) - t = 1/(1+d0*w) # temporary variable O(n) + t = old_div(1,(1+d0*w)) # temporary variable O(n) d = d0*t # O(n) P = np.tile(t.T,(nu,1))*P0; # O(n*nu) T = np.tile((w*t).T,(nu,1))*R0P0; # temporary variable O(n*nu^2) @@ -344,22 +350,22 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): sn2 = np.exp(2*likfunc.hyp[0]) # noise variance of likGauss #L = np.linalg.cholesky(K/sn2+np.eye(n)).T # Cholesky factor of covariance with noise - L = jitchol(K/sn2+np.eye(n)).T # Cholesky factor of covariance with noise - alpha = solve_chol(L,y-m)/sn2 + L = jitchol(old_div(K,sn2)+np.eye(n)).T # Cholesky factor of covariance with noise + alpha = old_div(solve_chol(L,y-m),sn2) post = postStruct() post.alpha = alpha # return the posterior parameters - post.sW = np.ones((n,1))/np.sqrt(sn2) # sqrt of noise precision vector + post.sW = old_div(np.ones((n,1)),np.sqrt(sn2)) # sqrt of noise precision vector post.L = L # L = chol(eye(n)+sW*sW'.*K) if nargout>1: # do we want the marginal likelihood? - nlZ = np.dot((y-m).T,alpha)/2. + np.log(np.diag(L)).sum() + n*np.log(2*np.pi*sn2)/2. # -log marg lik + nlZ = old_div(np.dot((y-m).T,alpha),2.) + np.log(np.diag(L)).sum() + n*np.log(2*np.pi*sn2)/2. # -log marg lik if nargout>2: # do we want derivatives? 
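Note (illustration, not part of the patch): the old_div(a, b) calls that dominate these inference hunks come from past.utils in the future package. old_div keeps Python 2 division semantics after from __future__ import division is in force: floor division when both operands are integers, true division otherwise, so the floating-point maths above is numerically unchanged. Most of the rewritten expressions operate on floats or NumPy arrays (e.g. K/sn2), where old_div is identical to plain /; a few, such as the index computation q = old_div((der - Q - p), D) in cov.py, genuinely rely on the integer floor. A minimal sketch of the semantics:

    from __future__ import division
    from past.utils import old_div
    import numpy as np

    print(1 / 2)                     # 0.5 -- true division once __future__ division is imported
    print(old_div(1, 2))             # 0   -- Python 2 result for two ints (floor division)
    print(old_div(1., 2))            # 0.5 -- unchanged when either operand is a float
    print(old_div(np.ones(3), 2.))   # [0.5 0.5 0.5] -- float arrays are unaffected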
dnlZ = dnlZStruct(meanfunc, covfunc, likfunc) # allocate space for derivatives - Q = solve_chol(L,np.eye(n))/sn2 - np.dot(alpha,alpha.T) # precompute for convenience + Q = old_div(solve_chol(L,np.eye(n)),sn2) - np.dot(alpha,alpha.T) # precompute for convenience dnlZ.lik = [sn2*np.trace(Q)] if covfunc.hyp: for ii in range(len(covfunc.hyp)): - dnlZ.cov[ii] = (Q*covfunc.getDerMatrix(x=x, mode='train', der=ii)).sum()/2. + dnlZ.cov[ii] = old_div((Q*covfunc.getDerMatrix(x=x, mode='train', der=ii)).sum(),2.) if meanfunc.hyp: for ii in range(len(meanfunc.hyp)): dnlZ.mean[ii] = np.dot(-meanfunc.getDerMatrix(x, ii).T,alpha) @@ -398,37 +404,37 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): V = np.linalg.solve(Luu.T,Ku) # V = inv(Luu')*Ku => V'*V = Q g_sn2 = diagK + sn2 - np.array([(V*V).sum(axis=0)]).T # g + sn2 = diag(K) + sn2 - diag(Q) #Lu = np.linalg.cholesky(np.eye(nu) + np.dot(V/np.tile(g_sn2.T,(nu,1)),V.T)).T # Lu'*Lu=I+V*diag(1/g_sn2)*V' - Lu = jitchol(np.eye(nu) + np.dot(V/np.tile(g_sn2.T,(nu,1)),V.T)).T # Lu'*Lu=I+V*diag(1/g_sn2)*V' - r = (y-m)/np.sqrt(g_sn2) - be = np.linalg.solve(Lu.T,np.dot(V,r/np.sqrt(g_sn2))) + Lu = jitchol(np.eye(nu) + np.dot(old_div(V,np.tile(g_sn2.T,(nu,1))),V.T)).T # Lu'*Lu=I+V*diag(1/g_sn2)*V' + r = old_div((y-m),np.sqrt(g_sn2)) + be = np.linalg.solve(Lu.T,np.dot(V,old_div(r,np.sqrt(g_sn2)))) iKuu = solve_chol(Luu,np.eye(nu)) # inv(Kuu + snu2*I) = iKuu post = postStruct() post.alpha = np.linalg.solve(Luu,np.linalg.solve(Lu,be)) # return the posterior parameters post.L = solve_chol(np.dot(Lu,Luu),np.eye(nu)) - iKuu # Sigma-inv(Kuu) - post.sW = np.ones((n,1))/np.sqrt(sn2) # unused for FITC prediction with gp.m + post.sW = old_div(np.ones((n,1)),np.sqrt(sn2)) # unused for FITC prediction with gp.m if nargout>1: # do we want the marginal likelihood - nlZ = np.log(np.diag(Lu)).sum() + (np.log(g_sn2).sum() + n*np.log(2*np.pi) + np.dot(r.T,r) - np.dot(be.T,be))/2. + nlZ = np.log(np.diag(Lu)).sum() + old_div((np.log(g_sn2).sum() + n*np.log(2*np.pi) + np.dot(r.T,r) - np.dot(be.T,be)),2.) if nargout>2: # do we want derivatives? dnlZ = dnlZStruct(meanfunc, covfunc, likfunc) # allocate space for derivatives - al = r/np.sqrt(g_sn2) - np.dot(V.T,np.linalg.solve(Lu,be))/g_sn2 # al = (Kt+sn2*eye(n))\y + al = old_div(r,np.sqrt(g_sn2)) - old_div(np.dot(V.T,np.linalg.solve(Lu,be)),g_sn2) # al = (Kt+sn2*eye(n))\y B = np.dot(iKuu,Ku) w = np.dot(B,al) - W = np.linalg.solve(Lu.T,V/np.tile(g_sn2.T,(nu,1))) + W = np.linalg.solve(Lu.T,old_div(V,np.tile(g_sn2.T,(nu,1)))) for ii in range(len(covfunc.hyp)): [ddiagKi,dKuui,dKui] = covfunc.getDerMatrix(x=x, mode='train', der=ii) # eval cov deriv R = 2.*dKui-np.dot(dKuui,B) v = ddiagKi - np.array([(R*B).sum(axis=0)]).T # diag part of cov deriv - dnlZ.cov[ii] = ( np.dot(ddiagKi.T,1./g_sn2) + np.dot(w.T,(np.dot(dKuui,w)-2.*np.dot(dKui,al))) \ - - np.dot(al.T,(v*al)) - np.dot(np.array([(W*W).sum(axis=0)]),v) - (np.dot(R,W.T)*np.dot(B,W.T)).sum() )/2. + dnlZ.cov[ii] = old_div(( np.dot(ddiagKi.T,old_div(1.,g_sn2)) + np.dot(w.T,(np.dot(dKuui,w)-2.*np.dot(dKui,al))) \ + - np.dot(al.T,(v*al)) - np.dot(np.array([(W*W).sum(axis=0)]),v) - (np.dot(R,W.T)*np.dot(B,W.T)).sum() ),2.) 
dnlZ.cov[ii] = dnlZ.cov[ii][0,0] - dnlZ.lik = sn2*((1./g_sn2).sum() - (np.array([(W*W).sum(axis=0)])).sum() - np.dot(al.T,al)) + dnlZ.lik = sn2*((old_div(1.,g_sn2)).sum() - (np.array([(W*W).sum(axis=0)])).sum() - np.dot(al.T,al)) dKuui = 2*snu2 R = -dKuui*B v = -np.array([(R*B).sum(axis=0)]).T # diag part of cov deriv - dnlZ.lik += (np.dot(w.T,np.dot(dKuui,w)) -np.dot(al.T,(v*al)) \ - - np.dot(np.array([(W*W).sum(axis=0)]),v) - (np.dot(R,W.T)*np.dot(B,W.T)).sum() )/2. + dnlZ.lik += old_div((np.dot(w.T,np.dot(dKuui,w)) -np.dot(al.T,(v*al)) \ + - np.dot(np.array([(W*W).sum(axis=0)]),v) - (np.dot(R,W.T)*np.dot(B,W.T)).sum() ),2.) dnlZ.lik = list(dnlZ.lik[0]) for ii in range(len(meanfunc.hyp)): dnlZ.mean[ii] = np.dot(-meanfunc.getDerMatrix(x, ii).T, al) @@ -469,7 +475,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): vargout = likfunc.evaluate(y, f, None, inffunc, None, 3) lp = vargout[0]; dlp = vargout[1]; d2lp = vargout[2] W= -d2lp - Psi_new = np.dot(alpha.T,(f-m))/2. - lp.sum() # objective for last alpha + Psi_new = old_div(np.dot(alpha.T,(f-m)),2.) - lp.sum() # objective for last alpha vargout = - likfunc.evaluate(y, m, None, inffunc, None, 1) Psi_def = vargout[0] # objective for default init f==m if Psi_def < Psi_new: # if default is better, we use it @@ -508,27 +514,27 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): post.sW = np.sqrt(np.abs(W))*np.sign(W) # preserve sign in case of negative if isWneg: [ldA,iA,post.L] = self._logdetA(K,W,3) - nlZ = np.dot(alpha.T,(f-m))/2. - lp.sum() + ldA/2. + nlZ = old_div(np.dot(alpha.T,(f-m)),2.) - lp.sum() + old_div(ldA,2.) nlZ = nlZ[0] else: sW = post.sW #post.L = np.linalg.cholesky(np.eye(n)+np.dot(sW,sW.T)*K).T post.L = jitchol(np.eye(n)+np.dot(sW,sW.T)*K).T - nlZ = np.dot(alpha.T,(f-m))/2. + (np.log(np.diag(post.L))-np.reshape(lp,(lp.shape[0],))).sum() + nlZ = old_div(np.dot(alpha.T,(f-m)),2.) + (np.log(np.diag(post.L))-np.reshape(lp,(lp.shape[0],))).sum() nlZ = nlZ[0] if nargout>2: # do we want derivatives? dnlZ = dnlZStruct(meanfunc, covfunc, likfunc) # allocate space for derivatives if isWneg: # switch between Cholesky and LU decomposition mode Z = -post.L # inv(K+inv(W)) - g = np.atleast_2d((iA*K).sum(axis=1)).T /2 # deriv. of ln|B| wrt W; g = diag(inv(inv(K)+diag(W)))/2 + g = old_div(np.atleast_2d((iA*K).sum(axis=1)).T,2) # deriv. of ln|B| wrt W; g = diag(inv(inv(K)+diag(W)))/2 else: Z = np.tile(sW,(1,n))*solve_chol(post.L,np.diag(np.reshape(sW,(sW.shape[0],)))) #sW*inv(B)*sW=inv(K+inv(W)) C = np.linalg.solve(post.L.T,np.tile(sW,(1,n))*K) # deriv. of ln|B| wrt W - g = np.atleast_2d((np.diag(K)-(C**2).sum(axis=0).T)).T /2. # g = diag(inv(inv(K)+W))/2 + g = old_div(np.atleast_2d((np.diag(K)-(C**2).sum(axis=0).T)).T,2.) # g = diag(inv(inv(K)+W))/2 dfhat = g* d3lp # deriv. of nlZ wrt. fhat: dfhat=diag(inv(inv(K)+W)).*d3lp/2 for ii in range(len(covfunc.hyp)): # covariance hypers dK = covfunc.getDerMatrix(x=x, mode='train', der=ii) - dnlZ.cov[ii] = (Z*dK).sum()/2. - np.dot(alpha.T,np.dot(dK,alpha))/2. # explicit part + dnlZ.cov[ii] = old_div((Z*dK).sum(),2.) - old_div(np.dot(alpha.T,np.dot(dK,alpha)),2.) 
# explicit part b = np.dot(dK,dlp) # b-K*(Z*b) = inv(eye(n)+K*diag(W))*b dnlZ.cov[ii] -= np.dot(dfhat.T,b-np.dot(K,np.dot(Z,b))) # implicit part dnlZ.cov[ii] = dnlZ.cov[ii][0,0] @@ -599,7 +605,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): vargout = likfunc.evaluate(y, f, None, inffunc, None, 3) lp = vargout[0]; dlp = vargout[1]; d2lp = vargout[2] W=-d2lp - Psi_new = np.dot(alpha.T,(f-m))/2. - lp.sum() # objective for last alpha + Psi_new = old_div(np.dot(alpha.T,(f-m)),2.) - lp.sum() # objective for last alpha vargout = - likfunc.evaluate(y, m, None, inffunc, None, 1) Psi_def = vargout[0] # objective for default init f==m if Psi_def < Psi_new: # if default is better, we use it @@ -618,7 +624,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): if isWneg: # stabilise the Newton direction in case W has negative values W = np.maximum(W,0) # stabilise the Hessian to guarantee postive definiteness tol = 1e-8 # increase accuracy to also get the derivatives right - b = W*(f-m) + dlp; dd = 1/(1+W*d0) + b = W*(f-m) + dlp; dd = old_div(1,(1+W*d0)) RV = np.dot( chol_inv( np.eye(nu) + np.dot(V*np.tile((W*dd).T,(nu,1)),V.T)),V ) dalpha = dd*b - (W*dd)*np.dot(RV.T,np.dot(RV,(dd*b))) - alpha # Newt dir + line search vargout = brentmin(0,smax,Nline,thr,self._Psi_lineFITC,4,dalpha,alpha,V,d0,m,likfunc,y,inffunc) @@ -634,7 +640,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): post = postStruct() post.alpha = np.dot(R0.T,np.dot(V,alpha)) # return the posterior parameters post.sW = np.sqrt(np.abs(W))*np.sign(W) # preserve sign in case of negative - dd = 1/(1+d0*W) # temporary variable O(n) + dd = old_div(1,(1+d0*W)) # temporary variable O(n) A = np.eye(nu) + np.dot(V*np.tile((W*dd).T,(nu,1)),V.T) # temporary variable O(n*nu^2) R0tV = np.dot(R0.T,V); B = R0tV*np.tile((W*dd).T,(nu,1)) # temporary variables O(n*nu^2) post.L = -np.dot(B,R0tV.T) # L = -R0'*V*inv(Kt+diag(1./ttau))*V'*R0, first part @@ -642,7 +648,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): raise Exception('W is too negative; nlZ and dnlZ cannot be computed.') #nlZ = np.dot(alpha.T,(f-m))/2. - lp.sum() - np.log(dd).sum()/2. + \ # np.log(np.diag(np.linalg.cholesky(A).T)).sum() - nlZ = np.dot(alpha.T,(f-m))/2. - lp.sum() - np.log(dd).sum()/2. + \ + nlZ = old_div(np.dot(alpha.T,(f-m)),2.) - lp.sum() - old_div(np.log(dd).sum(),2.) + \ np.log(np.diag(jitchol(A).T)).sum() RV = np.dot(chol_inv(A),V) RVdd = RV * np.tile((W*dd).T,(nu,1)) # RVdd needed for dnlZ @@ -652,8 +658,8 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): if nargout>2: # do we want derivatives? dnlZ = dnlZStruct(meanfunc, covfunc, likfunc) # allocate space for derivatives [d,P,R] = self._fitcRefresh(d0,Ku,R0,V,W) # g = diag(inv(inv(K)+W))/2 - g = d/2 + 0.5*np.atleast_2d((np.dot(np.dot(R,R0),P)**2).sum(axis=0)).T - t = W/(1+W*d0) + g = old_div(d,2) + 0.5*np.atleast_2d((np.dot(np.dot(R,R0),P)**2).sum(axis=0)).T + t = old_div(W,(1+W*d0)) dfhat = g*d3lp # deriv. of nlZ wrt. fhat: dfhat=diag(inv(inv(K)+W)).*d3lp/2 for ii in range(len(covfunc.hyp)): # covariance hypers @@ -663,7 +669,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): v = ddiagK-w # v = diag(dK)-diag(dQ); dnlZ.cov[ii] = np.dot(ddiagK.T,t) - np.dot((RVdd*RVdd).sum(axis=0),v) # explicit part dnlZ.cov[ii] -= (np.dot(RVdd,dA)*np.dot(RVdd,R0tV.T)).sum() # explicit part - dnlZ.cov[ii] = 0.5*dnlZ.cov[ii] - np.dot(alpha.T,np.dot(dA,np.dot(R0tV,alpha))+v*alpha)/2. 
# explicit + dnlZ.cov[ii] = 0.5*dnlZ.cov[ii] - old_div(np.dot(alpha.T,np.dot(dA,np.dot(R0tV,alpha))+v*alpha),2.) # explicit b = np.dot(dA,np.dot(R0tV,dlp)) + v*dlp # b-K*(Z*b) = inv(eye(n)+K*diag(W))*b KZb = self._mvmK(self._mvmZ(b,RVdd,t),V,d0) dnlZ.cov[ii] -= np.dot(dfhat.T,(b-KZb)) # implicit part @@ -684,7 +690,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): t = np.array([(T*T).sum(axis=0)]).T z = np.dot(alpha.T,np.dot(T.T,np.dot(T,alpha))-t*alpha) - np.dot(np.array([(RVdd*RVdd).sum(axis=0)]),t) z += (np.dot(RVdd,T.T)**2).sum() - b = (t*dlp-np.dot(T.T,np.dot(T,dlp)))/2. + b = old_div((t*dlp-np.dot(T.T,np.dot(T,dlp))),2.) KZb = self._mvmK(self._mvmZ(b,RVdd,t),V,d0) z -= np.dot(dfhat.T,b-KZb) dnlZ.lik[ii] += z @@ -737,16 +743,16 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): nlZ_old = np.inf; sweep = 0 # converged, max. sweeps or min. sweeps? while (np.abs(nlZ-nlZ_old) > tol and sweep < max_sweep) or (sweep < min_sweep): nlZ_old = nlZ; sweep += 1 - rperm = xrange(n) # randperm(n) + rperm = range(n) # randperm(n) for ii in rperm: # iterate EP updates (in random order) over examples - tau_ni = 1/Sigma[ii,ii] - ttau[ii]# first find the cavity distribution .. - nu_ni = mu[ii]/Sigma[ii,ii] + m[ii]*tau_ni - tnu[ii] # .. params tau_ni and nu_ni + tau_ni = old_div(1,Sigma[ii,ii]) - ttau[ii]# first find the cavity distribution .. + nu_ni = old_div(mu[ii],Sigma[ii,ii]) + m[ii]*tau_ni - tnu[ii] # .. params tau_ni and nu_ni # compute the desired derivatives of the indivdual log partition function - lZ,dlZ,d2lZ = likfunc.evaluate(y[ii], nu_ni/tau_ni, 1/tau_ni, inffunc, None, 3) + lZ,dlZ,d2lZ = likfunc.evaluate(y[ii], old_div(nu_ni,tau_ni), old_div(1,tau_ni), inffunc, None, 3) ttau_old = copy(ttau[ii]) # then find the new tilde parameters, keep copy of old - ttau[ii] = -d2lZ /(1.+d2lZ/tau_ni) + ttau[ii] = old_div(-d2lZ,(1.+old_div(d2lZ,tau_ni))) ttau[ii] = max(ttau[ii],0) # enforce positivity i.e. lower bound ttau by zero - tnu[ii] = ( dlZ + (m[ii]-nu_ni/tau_ni)*d2lZ )/(1.+d2lZ/tau_ni) + tnu[ii] = old_div(( dlZ + (m[ii]-old_div(nu_ni,tau_ni))*d2lZ ),(1.+old_div(d2lZ,tau_ni))) ds2 = ttau[ii] - ttau_old # finally rank-1 update Sigma .. si = np.reshape(Sigma[:,ii],(Sigma.shape[0],1)) Sigma = Sigma - ds2/(1.+ds2*si[ii])*np.dot(si,si.T) # takes 70# of total time @@ -769,17 +775,17 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): Sigma = K - np.dot(V.T,V) mu = np.dot(Sigma,tnu) Dsigma = np.reshape(np.diag(Sigma),(np.diag(Sigma).shape[0],1)) - tau_n = 1/Dsigma-ttau # compute the log marginal likelihood - nu_n = mu/Dsigma-tnu # vectors of cavity parameters + tau_n = old_div(1,Dsigma)-ttau # compute the log marginal likelihood + nu_n = old_div(mu,Dsigma)-tnu # vectors of cavity parameters F = np.dot(alpha,alpha.T) - np.tile(sW,(1,n))* \ solve_chol(L,np.diag(np.reshape(sW,(sW.shape[0],)))) # covariance hypers for jj in range(len(covfunc.hyp)): dK = covfunc.getDerMatrix(x=x, mode='train', der=jj) - dnlZ.cov[jj] = -(F*dK).sum()/2. + dnlZ.cov[jj] = old_div(-(F*dK).sum(),2.) 
for ii in range(len(likfunc.hyp)): - dlik = likfunc.evaluate(y, nu_n/tau_n, 1/tau_n, inffunc, ii) + dlik = likfunc.evaluate(y, old_div(nu_n,tau_n), old_div(1,tau_n), inffunc, ii) dnlZ.lik[ii] = -dlik.sum() - junk,dlZ = likfunc.evaluate(y, nu_n/tau_n, 1/tau_n, inffunc, None, 2) # mean hyps + junk,dlZ = likfunc.evaluate(y, old_div(nu_n,tau_n), old_div(1,tau_n), inffunc, None, 2) # mean hyps for ii in range(len(meanfunc.hyp)): dm = meanfunc.getDerMatrix(x, ii) dnlZ.mean[ii] = -np.dot(dlZ.T,dm) @@ -859,19 +865,19 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): while (np.abs(nlZ-nlZ_old) > tol and sweep < max_sweep) or (sweep < min_sweep): nlZ_old = nlZ sweep += 1 - rperm = range(n) # randperm(n) + rperm = list(range(n)) # randperm(n) for ii in rperm: # iterate EP updates (in random order) over examples p_i = np.reshape(P[:,ii],(P.shape[0],1)) t = np.dot(R,np.dot(R0,p_i)) # temporary variables sigma_i = d[ii] + np.dot(t.T,t); mu_i = nn[ii] + np.dot(p_i.T,gg) # post moments O(nu^2) - tau_ni = 1/sigma_i - ttau[ii] # first find the cavity distribution .. - nu_ni = mu_i/sigma_i + m[ii]*tau_ni - tnu[ii] # .. params tau_ni and nu_ni + tau_ni = old_div(1,sigma_i) - ttau[ii] # first find the cavity distribution .. + nu_ni = old_div(mu_i,sigma_i) + m[ii]*tau_ni - tnu[ii] # .. params tau_ni and nu_ni # compute the desired derivatives of the indivdual log partition function - vargout = likfunc.evaluate(y[ii], nu_ni/tau_ni, 1/tau_ni, inffunc, None, 3) + vargout = likfunc.evaluate(y[ii], old_div(nu_ni,tau_ni), old_div(1,tau_ni), inffunc, None, 3) lZ = vargout[0]; dlZ = vargout[1]; d2lZ = vargout[2] - ttau_i = -d2lZ /(1.+d2lZ/tau_ni) + ttau_i = old_div(-d2lZ,(1.+old_div(d2lZ,tau_ni))) ttau_i = max(ttau_i,0) # enforce positivity i.e. lower bound ttau by zero - tnu_i = ( dlZ + (m[ii]-nu_ni/tau_ni)*d2lZ )/(1.+d2lZ/tau_ni) + tnu_i = old_div(( dlZ + (m[ii]-old_div(nu_ni,tau_ni))*d2lZ ),(1.+old_div(d2lZ,tau_ni))) [d,P[:,ii],R,nn,gg,ttau,tnu] = self._epfitcUpdate(d,P[:,ii],R,nn,gg,ttau,tnu,ii,ttau_i,tnu_i,m,d0,Ku,R0)# update representation # recompute since repeated rank-one updates can destroy numerical precision @@ -885,7 +891,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): self.last_tnu = tnu # remember for next call post = postStruct() post.sW = np.sqrt(ttau) # unused for FITC_EP prediction with gp.m - dd = 1/(d0+1/ttau) + dd = old_div(1,(d0+old_div(1,ttau))) alpha = tnu/ttau*dd RV = np.dot(R,V) R0tV = np.dot(R0.T,V) @@ -904,10 +910,10 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): v = ddiagK - w # v = diag(dK)-diag(dQ) z = np.dot(dd.T,(v+w)) - np.dot(np.atleast_2d((RVdd*RVdd).sum(axis=0)), v) \ - (np.dot(RVdd,dA).T * np.dot(R0tV,RVdd.T)).sum() - dnlZ.cov[ii] = (z - np.dot(alpha.T,(alpha*v)) - np.dot(np.dot(alpha.T,dA),np.dot(R0tV,alpha)))/2. + dnlZ.cov[ii] = old_div((z - np.dot(alpha.T,(alpha*v)) - np.dot(np.dot(alpha.T,dA),np.dot(R0tV,alpha))),2.) 
dnlZ.cov[ii] = dnlZ.cov[ii][0,0] for ii in range(len(likfunc.hyp)): # likelihood hypers - dlik = likfunc.evaluate(y, nu_n/tau_n+m, 1/tau_n, inffunc, ii, 1) + dlik = likfunc.evaluate(y, old_div(nu_n,tau_n)+m, old_div(1,tau_n), inffunc, ii, 1) dnlZ.lik[ii] = -dlik.sum() if ii == len(likfunc.hyp)-1: # since snu2 is a fixed fraction of sn2, there is a covariance-like term @@ -917,7 +923,7 @@ def evaluate(self, meanfunc, covfunc, likfunc, x, y, nargout=1): z = z + np.dot(post.alpha.T,post.alpha) - np.dot(alpha.T,(v*alpha)) dnlZ.lik[ii] += snu2*z dnlZ.lik[ii] = dnlZ.lik[ii][0,0] - [junk,dlZ] = likfunc.evaluate(y, nu_n/tau_n, 1/tau_n, inffunc, None, 2) # mean hyps + [junk,dlZ] = likfunc.evaluate(y, old_div(nu_n,tau_n), old_div(1,tau_n), inffunc, None, 2) # mean hyps for ii in range(len(meanfunc.hyp)): dm = meanfunc.getDerMatrix(x, ii) dnlZ.mean[ii] = -np.dot(dlZ.T,dm) diff --git a/pyGPs/Core/lik.py b/pyGPs/Core/lik.py index 4f484f6..eb18f50 100644 --- a/pyGPs/Core/lik.py +++ b/pyGPs/Core/lik.py @@ -1,3 +1,7 @@ +from __future__ import division +from __future__ import absolute_import +from past.utils import old_div +from builtins import object # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] # Shan Huang [shan dot huang at iais dot fraunhofer dot de] @@ -39,12 +43,12 @@ import numpy as np from scipy.special import erf -import inf class Likelihood(object): """Base function for Likelihood function""" def __init__(self): self.hyp = [] + def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): ''' The likelihood functions have two possible modes, the mode being selected @@ -129,6 +133,7 @@ def __init__(self, log_sigma=np.log(0.1) ): self.hyp = [log_sigma] def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): + from . import inf sn2 = np.exp(2. * self.hyp[0]) if inffunc is None: # prediction mode if y is None: @@ -137,7 +142,7 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): if (not s2 is None) and np.linalg.norm(s2) > 0: s2zero = False if s2zero: # log probability - lp = -(y-mu)**2 /sn2/2 - np.log(2.*np.pi*sn2)/2. + lp = -(y-mu)**2 /sn2/2 - old_div(np.log(2.*np.pi*sn2),2.) s2 = np.zeros_like(s2) else: inf_func = inf.EP() # prediction @@ -154,29 +159,29 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): else: if isinstance(inffunc, inf.EP): if der is None: # no derivative mode - lZ = -(y-mu)**2/(sn2+s2)/2. - np.log(2*np.pi*(sn2+s2))/2. # log part function + lZ = -(y-mu)**2/(sn2+s2)/2. - old_div(np.log(2*np.pi*(sn2+s2)),2.) # log part function if nargout>1: - dlZ = (y-mu)/(sn2+s2) # 1st derivative w.r.t. mean + dlZ = old_div((y-mu),(sn2+s2)) # 1st derivative w.r.t. mean if nargout>2: - d2lZ = -1/(sn2+s2) # 2nd derivative w.r.t. mean + d2lZ = old_div(-1,(sn2+s2)) # 2nd derivative w.r.t. mean return lZ,dlZ,d2lZ else: return lZ,dlZ else: return lZ else: # derivative mode - dlZhyp = ((y-mu)**2/(sn2+s2)-1) / (1+s2/sn2) # deriv. w.r.t. hyp.lik + dlZhyp = old_div((old_div((y-mu)**2,(sn2+s2))-1), (1+old_div(s2,sn2))) # deriv. w.r.t. hyp.lik return dlZhyp elif isinstance(inffunc, inf.Laplace): if der is None: # no derivative mode if y is None: y=0 ymmu = y-mu - lp = -ymmu**2/(2*sn2) - np.log(2*np.pi*sn2)/2. + lp = old_div(-ymmu**2,(2*sn2)) - old_div(np.log(2*np.pi*sn2),2.) 
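Note (illustration, not part of the patch): in lik.py the module-level import inf is removed and each evaluate() now starts with from . import inf. Beyond making the import package-relative, moving it inside the method defers it to the first call, which presumably avoids the circular dependency between lik and inf, since inf.py itself does from . import lik, cov at import time. A small sketch of the pattern with hypothetical module names pkg/a.py and pkg/b.py:

    # pkg/a.py
    from . import b                  # safe: b is fully importable without a

    def helper():
        return 42

    # pkg/b.py
    def evaluate():
        from . import a              # deferred relative import: resolved at call time,
        return a.helper()            # so importing pkg.b never needs pkg.a mid-initialisation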
if nargout>1: - dlp = ymmu/sn2 # dlp, derivative of log likelihood + dlp = old_div(ymmu,sn2) # dlp, derivative of log likelihood if nargout>2: # d2lp, 2nd derivative of log likelihood - d2lp = -np.ones_like(ymmu)/sn2 + d2lp = old_div(-np.ones_like(ymmu),sn2) if nargout>3: # d3lp, 3rd derivative of log likelihood d3lp = np.zeros_like(ymmu) return lp,dlp,d2lp,d3lp @@ -187,7 +192,7 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): else: return lp else: # derivative mode - lp_dhyp = (y-mu)**2/sn2 - 1 # derivative of log likelihood w.r.t. hypers + lp_dhyp = old_div((y-mu)**2,sn2) - 1 # derivative of log likelihood w.r.t. hypers dlp_dhyp = 2*(mu-y)/sn2 # first derivative, d2lp_dhyp = 2*np.ones_like(mu)/sn2 # and also of the second mu derivative return lp_dhyp,dlp_dhyp,d2lp_dhyp @@ -239,6 +244,7 @@ def __init__(self): self.hyp = [] def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): + from . import inf if not y is None: y = np.sign(y) y[y==0] = 1 @@ -288,7 +294,7 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): elif isinstance(inffunc, inf.EP): if der is None: # no derivative mode - z = mu/np.sqrt(1+s2) + z = old_div(mu,np.sqrt(1+s2)) junk,lZ = self.cumGauss(y,z,2) # log part function if not y is None: z = z*y @@ -325,7 +331,7 @@ def cumGauss(self, y=None, f=None, nargout=1): yf = y*f else: yf = f - p = (1. + erf(yf/np.sqrt(2.)))/2. # likelihood + p = old_div((1. + erf(old_div(yf,np.sqrt(2.)))),2.) # likelihood if nargout>1: lp = self.logphi(yf,p) return p,lp @@ -336,13 +342,13 @@ def gauOverCumGauss(self,f,p): # return n_p = gauOverCumGauss(f,p) n_p = np.zeros_like(f) # safely compute Gaussian over cumulative Gaussian ok = f>-5 # naive evaluation for large values of f - n_p[ok] = (np.exp(-f[ok]**2/2)/np.sqrt(2*np.pi)) / p[ok] + n_p[ok] = old_div((old_div(np.exp(old_div(-f[ok]**2,2)),np.sqrt(2*np.pi))), p[ok]) bd = f<-6 # tight upper bound evaluation - n_p[bd] = np.sqrt(f[bd]**2/4+1)-f[bd]/2 + n_p[bd] = np.sqrt(old_div(f[bd]**2,4)+1)-old_div(f[bd],2) interp = np.logical_and(np.logical_not(ok),np.logical_not(bd)) # linearly interpolate between both of them tmp = f[interp] lam = -5. - f[interp] - n_p[interp] = (1-lam)*(np.exp(-tmp**2/2)/np.sqrt(2*np.pi))/p[interp] + lam *(np.sqrt(tmp**2/4+1)-tmp/2); + n_p[interp] = (1-lam)*(old_div(np.exp(old_div(-tmp**2,2)),np.sqrt(2*np.pi)))/p[interp] + lam *(np.sqrt(old_div(tmp**2,4)+1)-old_div(tmp,2)); return n_p def logphi(self,z,p): @@ -353,9 +359,9 @@ def logphi(self,z,p): bd = z0: s2zero = False # s2==0? if s2zero: # log probability evaluation - lp = -np.abs(y-mu)/b -np.log(2*b); s2 = 0 + lp = old_div(-np.abs(y-mu),b) -np.log(2*b); s2 = 0 else: # prediction lp = self.evaluate(y, mu, s2, inf.EP()) if nargout>1: @@ -403,9 +410,9 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): if y is None: y = np.zeros_like(mu) ymmu = y-mu - lp = np.abs(ymmu)/b - np.log(2*b) + lp = old_div(np.abs(ymmu),b) - np.log(2*b) if nargout>1: # derivative of log likelihood - dlp = np.sign(ymmu)/b + dlp = old_div(np.sign(ymmu),b) if nargout>2: # 2nd derivative of log likelihood d2lp = np.zeros_like(ymmu) if nargout>3: # 3rd derivative of log likelihood @@ -418,8 +425,8 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): else: return lp else: # derivative w.r.t. hypers - lp_dhyp = np.abs(y-mu)/b - 1 # derivative of log likelihood w.r.t. 
hypers - dlp_dhyp = np.sign(mu-y)/b # first derivative, + lp_dhyp = old_div(np.abs(y-mu),b) - 1 # derivative of log likelihood w.r.t. hypers + dlp_dhyp = old_div(np.sign(mu-y),b) # first derivative, d2lp_dhyp = np.zeros(mu.shape) # and also of the second mu derivative return lp_dhyp, dlp_dhyp, d2lp_dhyp elif isinstance(inffunc, inf.EP): @@ -439,7 +446,7 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): dlZ = np.zeros((n,1)) d2lZ = np.zeros((n,1)) if np.any(idlik): - l = Gauss(log_sigma=np.log(s2[idlik])/2) + l = Gauss(log_sigma=old_div(np.log(s2[idlik]),2)) a = l.evaluate(mu[idlik], y[idlik]) lZ[idlik] = a[0]; dlZ[idlik] = a[1]; d2lZ[idlik] = a[2] if np.any(idgau): @@ -448,11 +455,11 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): lZ[idgau] = a[0]; dlZ[idgau] = a[1]; d2lZ[idgau] = a[2] if np.any(id): # substitution to obtain unit variance, zero mean Laplacian - tvar = s2[id]/(sn[id]**2+1e-16) - tmu = (mu[id]-y[id])/(sn[id]+1e-16) + tvar = old_div(s2[id],(sn[id]**2+1e-16)) + tmu = old_div((mu[id]-y[id]),(sn[id]+1e-16)) # an implementation based on logphi(t) = log(normcdf(t)) - zp = (tmu+np.sqrt(2)*tvar)/np.sqrt(tvar) - zm = (tmu-np.sqrt(2)*tvar)/np.sqrt(tvar) + zp = old_div((tmu+np.sqrt(2)*tvar),np.sqrt(tvar)) + zm = old_div((tmu-np.sqrt(2)*tvar),np.sqrt(tvar)) ap = self._logphi(-zp)+np.sqrt(2)*tmu am = self._logphi( zm)-np.sqrt(2)*tmu apam = np.vstack((ap,am)).T @@ -461,16 +468,16 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): if nargout>1: lqp = -0.5*zp**2 - 0.5*np.log(2*np.pi) - self._logphi(-zp); # log( N(z)/Phi(z) ) lqm = -0.5*zm**2 - 0.5*np.log(2*np.pi) - self._logphi( zm); - dap = -np.exp(lqp-0.5*np.log(s2[id])) + np.sqrt(2)/sn[id] - dam = np.exp(lqm-0.5*np.log(s2[id])) - np.sqrt(2)/sn[id] + dap = -np.exp(lqp-0.5*np.log(s2[id])) + old_div(np.sqrt(2),sn[id]) + dam = np.exp(lqm-0.5*np.log(s2[id])) - old_div(np.sqrt(2),sn[id]) _z1 = np.vstack((ap,am)).T _z2 = np.vstack((dap,dam)).T _x = np.array([[1],[1]]) dlZ[id] = self._expABz_expAx(_z1, _x, _z2, _x) if nargout>2: a = np.sqrt(8.)/sn[id]/np.sqrt(s2[id]); - bp = 2./sn[id]**2 - (a - zp/s2[id])*np.exp(lqp) - bm = 2./sn[id]**2 - (a + zm/s2[id])*np.exp(lqm) + bp = old_div(2.,sn[id]**2) - (a - old_div(zp,s2[id]))*np.exp(lqp) + bm = old_div(2.,sn[id]**2) - (a + old_div(zm,s2[id]))*np.exp(lqm) _x = np.reshape(np.array([1,1]),(2,1)) _z1 = np.reshape(np.array([ap,am]),(1,2)) _z2 = np.reshape(np.array([bp,bm]),(1,2)) @@ -491,12 +498,12 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): if np.any(id): # substitution to obtain unit variance, zero mean Laplacian - tmu = (mu[id]-y[id])/(sn[id]+1e-16); tvar = s2[id]/(sn[id]**2+1e-16) - zp = (tvar+tmu/np.sqrt(2))/np.sqrt(tvar); vp = tvar+np.sqrt(2)*tmu - zm = (tvar-tmu/np.sqrt(2))/np.sqrt(tvar); vm = tvar-np.sqrt(2)*tmu - dzp = (-s2[id]/sn[id]+tmu*sn[id]/np.sqrt(2)) / np.sqrt(s2[id]) + tmu = old_div((mu[id]-y[id]),(sn[id]+1e-16)); tvar = old_div(s2[id],(sn[id]**2+1e-16)) + zp = old_div((tvar+old_div(tmu,np.sqrt(2))),np.sqrt(tvar)); vp = tvar+np.sqrt(2)*tmu + zm = old_div((tvar-old_div(tmu,np.sqrt(2))),np.sqrt(tvar)); vm = tvar-np.sqrt(2)*tmu + dzp = old_div((old_div(-s2[id],sn[id])+tmu*sn[id]/np.sqrt(2)), np.sqrt(s2[id])) dvp = -2*tvar - np.sqrt(2)*tmu - dzm = (-s2[id]/sn[id]-tmu*sn[id]/np.sqrt(2)) / np.sqrt(s2[id]) + dzm = old_div((old_div(-s2[id],sn[id])-tmu*sn[id]/np.sqrt(2)), np.sqrt(s2[id])) dvm = -2*tvar + np.sqrt(2)*tmu lezp = self._lerfc(zp); # ap = 
exp(vp).*ezp lezm = self._lerfc(zm); # am = exp(vm).*ezm @@ -505,7 +512,7 @@ def evaluate(self, y=None, mu=None, s2=None, inffunc=None, der=None, nargout=1): em = np.exp(vm+lezm-vmax) dap = ep*(dvp - 2/np.sqrt(np.pi)*np.exp(-zp**2-lezp)*dzp) dam = em*(dvm - 2/np.sqrt(np.pi)*np.exp(-zm**2-lezm)*dzm) - dlZhyp[id] = (dap+dam)/(ep+em) - 1; + dlZhyp[id] = old_div((dap+dam),(ep+em)) - 1; return dlZhyp # deriv. wrt hyp.lik elif isinstance(inffunc, inf.VB): n = len(s2.flatten()); b = np.zeros((n,1)); y = y*np.ones((n,1)); z = y @@ -520,8 +527,8 @@ def _lerfc(self,t): bd = t>tmax # evaluate tight bound nok = np.logical_not(ok) interp = np.logical_and(nok,np.logical_not(bd)) # interpolate between both of them - f[nok] = np.log(2/np.sqrt(np.pi)) -t[nok]**2 -np.log(t[nok]+np.sqrt( t[nok]**2+4/np.pi )) - lam = 1/(1+np.exp( 12*(0.5-(t[interp]-tmin)/(tmax-tmin)) )) # interp. weights + f[nok] = np.log(old_div(2,np.sqrt(np.pi))) -t[nok]**2 -np.log(t[nok]+np.sqrt( t[nok]**2+old_div(4,np.pi) )) + lam = old_div(1,(1+np.exp( 12*(0.5-old_div((t[interp]-tmin),(tmax-tmin))) ))) # interp. weights f[interp] = lam*f[interp] + (1-lam)*np.log(erfc( t[interp] )) f[ok] += np.log(erfc( t[ok] )) # safe eval return f @@ -536,7 +543,7 @@ def _expABz_expAx(self,A,x,B,z): maxA = np.max(A,axis=1) # number of columns, max over columns maxA = np.array([maxA]).T A = A - np.dot(maxA, np.ones((1,N))) # subtract maximum value - y = ( np.dot((np.exp(A)*B),z) ) / ( np.dot(np.exp(A),x) ) + y = old_div(( np.dot((np.exp(A)*B),z) ), ( np.dot(np.exp(A),x) )) return y[0] def _logphi(self,z): @@ -549,10 +556,10 @@ def _logphi(self,z): bd = z 0: return PowerOfMean(self,number) else: - print "only non-zero integers are supported for **" + print("only non-zero integers are supported for **") @@ -340,10 +344,10 @@ class Linear(Mean): ''' def __init__(self, D=None, alpha_list=None): if alpha_list is None: - if D is None: - self.hyp = [0.5] - else: - self.hyp = [0.5 for i in xrange(D)] + if D is None: + self.hyp = [0.5] + else: + self.hyp = [0.5 for i in range(D)] else: self.hyp = alpha_list diff --git a/pyGPs/Core/opt.py b/pyGPs/Core/opt.py index 73763b7..43a64b0 100644 --- a/pyGPs/Core/opt.py +++ b/pyGPs/Core/opt.py @@ -1,3 +1,9 @@ +from __future__ import division +from __future__ import print_function +from __future__ import absolute_import +from builtins import range +from past.utils import old_div +from builtins import object #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -20,7 +26,7 @@ import numpy as np import pyGPs -import gp + from scipy.optimize import fmin_bfgs as bfgs from scipy.optimize import fmin_cg as cg from pyGPs.Optimization import minimize, scg @@ -29,6 +35,7 @@ class Optimizer(object): def __init__(self, model=None, searchConfig = None): self.model = model + from . import gp def findMin(self, x, y, numIters): ''' @@ -103,9 +110,9 @@ def findMin(self, x, y, numIters = 100): funcValue = opt[1] warnFlag = opt[4] if warnFlag == 1: - print "Maximum number of iterations exceeded." + print("Maximum number of iterations exceeded.") elif warnFlag == 2: - print "Gradient and/or function calls not changing." 
+ print("Gradient and/or function calls not changing.") except: self.errorCounter += 1 if not self.searchConfig: @@ -118,7 +125,7 @@ def findMin(self, x, y, numIters = 100): raise Exception('Specify at least one of the stop conditions') while True: self.trailsCounter += 1 # increase counter - for i in xrange(hypInArray.shape[0]): # random init of hyp + for i in range(hypInArray.shape[0]): # random init of hyp hypInArray[i]= np.random.uniform(low=searchRange[i][0], high=searchRange[i][1]) # value this time is better than optiaml min value try: @@ -128,14 +135,14 @@ def findMin(self, x, y, numIters = 100): optimalHyp = thisopt[0] except: self.errorCounter += 1 - if self.searchConfig.num_restarts and self.errorCounter > self.searchConfig.num_restarts/2: - print "[CG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + if self.searchConfig.num_restarts and self.errorCounter > old_div(self.searchConfig.num_restarts,2): + print("[CG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) raise Exception("Over half of the trails failed for conjugate gradient") if self.searchConfig.num_restarts and self.trailsCounter > self.searchConfig.num_restarts-1: # if exceed num_restarts - print "[CG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + print("[CG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) return optimalHyp, funcValue if self.searchConfig.min_threshold and funcValue <= self.searchConfig.min_threshold: # reach provided mininal - print "[CG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + print("[CG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) return optimalHyp, funcValue return optimalHyp, funcValue @@ -163,9 +170,9 @@ def findMin(self, x, y, numIters = 100): funcValue = opt[1] warnFlag = opt[6] if warnFlag == 1: - print "Maximum number of iterations exceeded." + print("Maximum number of iterations exceeded.") elif warnFlag == 2: - print "Gradient and/or function calls not changing." 
+ print("Gradient and/or function calls not changing.") except: self.errorCounter += 1 if not self.searchConfig: @@ -179,7 +186,7 @@ def findMin(self, x, y, numIters = 100): raise Exception('Specify at least one of the stop conditions') while True: self.trailsCounter += 1 # increase counter - for i in xrange(hypInArray.shape[0]): # random init of hyp + for i in range(hypInArray.shape[0]): # random init of hyp hypInArray[i]= np.random.uniform(low=searchRange[i][0], high=searchRange[i][1]) # value this time is better than optiaml min value try: @@ -189,14 +196,14 @@ def findMin(self, x, y, numIters = 100): optimalHyp = thisopt[0] except: self.errorCounter += 1 - if self.searchConfig.num_restarts and self.errorCounter > self.searchConfig.num_restarts/2: - print "[BFGS] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + if self.searchConfig.num_restarts and self.errorCounter > old_div(self.searchConfig.num_restarts,2): + print("[BFGS] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) raise Exception("Over half of the trails failed for BFGS") if self.searchConfig.num_restarts and self.trailsCounter > self.searchConfig.num_restarts-1: # if exceed num_restarts - print "[BFGS] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + print("[BFGS] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) return optimalHyp, funcValue if self.searchConfig.min_threshold and funcValue <= self.searchConfig.min_threshold: # reach provided mininal - print "[BFGS] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + print("[BFGS] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) return optimalHyp, funcValue return optimalHyp, funcValue @@ -235,7 +242,7 @@ def findMin(self, x, y, numIters = 100): raise Exception('Specify at least one of the stop conditions') while True: self.trailsCounter += 1 # increase counter - for i in xrange(hypInArray.shape[0]): # random init of hyp + for i in range(hypInArray.shape[0]): # random init of hyp hypInArray[i]= np.random.uniform(low=searchRange[i][0], high=searchRange[i][1]) # value this time is better than optiaml min value try: @@ -245,14 +252,14 @@ def findMin(self, x, y, numIters = 100): optimalHyp = thisopt[0] except: self.errorCounter += 1 - if self.searchConfig.num_restarts and self.errorCounter > self.searchConfig.num_restarts/2: - print "[Minimize] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + if self.searchConfig.num_restarts and self.errorCounter > old_div(self.searchConfig.num_restarts,2): + print("[Minimize] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) raise Exception("Over half of the trails failed for minimize") if self.searchConfig.num_restarts and self.trailsCounter > self.searchConfig.num_restarts-1: # if exceed num_restarts - print "[Minimize] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + print("[Minimize] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) return optimalHyp, funcValue if self.searchConfig.min_threshold and funcValue <= self.searchConfig.min_threshold: # reach provided mininal - print "[Minimize] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + print("[Minimize] %d out of %d trails failed during 
optimization" % (self.errorCounter, self.trailsCounter)) return optimalHyp, funcValue return optimalHyp, funcValue @@ -289,7 +296,7 @@ def findMin(self, x, y, numIters = 100): raise Exception('Specify at least one of the stop conditions') while True: self.trailsCounter += 1 # increase counter - for i in xrange(hypInArray.shape[0]): # random init of hyp + for i in range(hypInArray.shape[0]): # random init of hyp hypInArray[i]= np.random.uniform(low=searchRange[i][0], high=searchRange[i][1]) # value this time is better than optiaml min value try: @@ -299,14 +306,14 @@ def findMin(self, x, y, numIters = 100): optimalHyp = thisopt[0] except: self.errorCounter += 1 - if self.searchConfig.num_restarts and self.errorCounter > self.searchConfig.num_restarts/2: - print "[SCG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + if self.searchConfig.num_restarts and self.errorCounter > old_div(self.searchConfig.num_restarts,2): + print("[SCG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) raise Exception("Over half of the trails failed for Scaled conjugate gradient") if self.searchConfig.num_restarts and self.trailsCounter > self.searchConfig.num_restarts-1: # if exceed num_restarts - print "[SCG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + print("[SCG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) return optimalHyp, funcValue if self.searchConfig.min_threshold and funcValue <= self.searchConfig.min_threshold: # reach provided mininal - print "[SCG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter) + print("[SCG] %d out of %d trails failed during optimization" % (self.errorCounter, self.trailsCounter)) return optimalHyp, funcValue return optimalHyp, funcValue diff --git a/pyGPs/Core/tools.py b/pyGPs/Core/tools.py index 3ad619d..bb9b360 100644 --- a/pyGPs/Core/tools.py +++ b/pyGPs/Core/tools.py @@ -1,3 +1,7 @@ +from __future__ import division +from __future__ import print_function +from past.builtins import cmp +from past.utils import old_div #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -60,17 +64,17 @@ def jitchol(A,maxtries=5): else: diagA = np.diag(A) if np.any(diagA <= 0.): - raise np.linalg.LinAlgError, "kernel matrix not positive definite: non-positive diagonal elements" + raise np.linalg.LinAlgError("kernel matrix not positive definite: non-positive diagonal elements") jitter = diagA.mean() * 1e-9 while maxtries > 0 and np.isfinite(jitter): - print 'Warning: adding jitter of {:.10e} to diagnol of kernel matrix for numerical stability'.format(jitter) + print('Warning: adding jitter of {:.10e} to diagnol of kernel matrix for numerical stability'.format(jitter)) try: return np.linalg.cholesky(A + np.eye(A.shape[0]).T * jitter, lower=True) except: jitter *= 10 finally: maxtries -= 1 - raise np.linalg.LinAlgError, "kernel matrix not positive definite, even with jitter." 
+ raise np.linalg.LinAlgError("kernel matrix not positive definite, even with jitter.") @@ -190,7 +194,7 @@ def brentmin(xlow,xupp,Nitmax,tol,f,nout=None,*args): funccount += 1 fv = fx; fw = fx xm = 0.5*(a+b) - tol1 = seps*abs(xf) + tol/3.0; + tol1 = seps*abs(xf) + old_div(tol,3.0); tol2 = 2.0*tol1 # Main loop while ( abs(xf-xm) > (tol2 - 0.5*(b-a)) ): @@ -210,7 +214,7 @@ def brentmin(xlow,xupp,Nitmax,tol,f,nout=None,*args): # Is the parabola acceptable if ( (abs(p)q*(a-xf)) and (p= Nitmax: # typically we should not get here # print 'Warning: Specified number of function evaluation reached (brentmin)' diff --git a/pyGPs/Demo/Airline/demo_Airline.py b/pyGPs/Demo/Airline/demo_Airline.py index 6c8a26a..f95476e 100755 --- a/pyGPs/Demo/Airline/demo_Airline.py +++ b/pyGPs/Demo/Airline/demo_Airline.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from builtins import range #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -51,7 +53,7 @@ # the deafult mean will be adapted to the average value of the training labels.. # ..if you do not specify mean function by your own. model.optimize(x, y) - print _, model.nlZ + print(_, model.nlZ) model.predict(xt) model.plot() diff --git a/pyGPs/Demo/Classification/demo_GPC.py b/pyGPs/Demo/Classification/demo_GPC.py index f08bbbd..d16d634 100644 --- a/pyGPs/Demo/Classification/demo_GPC.py +++ b/pyGPs/Demo/Classification/demo_GPC.py @@ -1,3 +1,4 @@ +from __future__ import print_function #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -18,8 +19,8 @@ # you may want to read demo_GPR, demo_kernel and demo_optimization first! # Here, the focus is on the difference of classification model. 
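The pyGPs/Core/tools.py hunk above converts the Python 2 statements `raise LinAlgError, "msg"` in `jitchol` to the Python 3 call form `raise LinAlgError("msg")`. For readers unfamiliar with that helper, here is a minimal sketch of the same jitter-and-retry Cholesky pattern written directly for Python 3; the function name and retry count are illustrative, not the library's exact API:

import numpy as np

def jittered_cholesky(A, maxtries=5):
    """Cholesky factorization that adds growing diagonal jitter on failure."""
    try:
        return np.linalg.cholesky(A)                 # lower-triangular factor
    except np.linalg.LinAlgError:
        pass
    diagA = np.diag(A)
    if np.any(diagA <= 0.0):
        raise np.linalg.LinAlgError("non-positive diagonal elements")
    jitter = diagA.mean() * 1e-9
    for _ in range(maxtries):
        try:
            return np.linalg.cholesky(A + np.eye(A.shape[0]) * jitter)
        except np.linalg.LinAlgError:
            jitter *= 10                             # escalate and retry
    raise np.linalg.LinAlgError("matrix not positive definite, even with jitter")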
-print '' -print '---------------------GPC DEMO-------------------------' +print('') +print('---------------------GPC DEMO-------------------------') #---------------------------------------------------------------------- # Load demo data (generated from Gaussians) @@ -44,20 +45,20 @@ #---------------------------------------------------------------------- # First example -> state default values #---------------------------------------------------------------------- -print 'Basic Example - Data' +print('Basic Example - Data') model = pyGPs.GPC() # binary classification (default inference method: EP) model.plotData_2d(x1,x2,t1,t2,p1,p2) model.getPosterior(x, y) # fit default model (mean zero & rbf kernel) with data model.optimize(x, y) # optimize hyperparamters (default optimizer: single run minimize) model.predict(z) # predict test cases -print 'Basic Example - Prediction' +print('Basic Example - Prediction') model.plot(x1,x2,t1,t2) #---------------------------------------------------------------------- # GP classification example #---------------------------------------------------------------------- -print 'More Advanced Example' +print('More Advanced Example') # Start from a new model model = pyGPs.GPC() @@ -66,9 +67,9 @@ model.setPrior(kernel=k) model.getPosterior(x, y) -print "Negative log marginal liklihood before:", round(model.nlZ,3) +print("Negative log marginal liklihood before:", round(model.nlZ,3)) model.optimize(x, y) -print "Negative log marginal liklihood optimized:", round(model.nlZ,3) +print("Negative log marginal liklihood optimized:", round(model.nlZ,3)) # Prediction n = z.shape[0] @@ -78,7 +79,7 @@ # plot log probability distribution for class +1 model.plot(x1,x2,t1,t2) -print '--------------------END OF DEMO-----------------------' +print('--------------------END OF DEMO-----------------------') diff --git a/pyGPs/Demo/Classification/demo_GPC_FITC.py b/pyGPs/Demo/Classification/demo_GPC_FITC.py index 841e7ed..f6533b3 100644 --- a/pyGPs/Demo/Classification/demo_GPC_FITC.py +++ b/pyGPs/Demo/Classification/demo_GPC_FITC.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from builtins import zip #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -18,8 +20,8 @@ # you may want to read demo_GPR, demo_kernel and demo_optimization first! # Here, the focus is on the difference of FITC classification. 
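A recurring change in these demos is wrapping `zip(...)` in `list(...)` before handing it to `np.array`, for example for the inducing-point grid in the GPC_FITC hunk below. The reason is that Python 3's `zip` returns a lazy iterator, so `np.array(zip(a, b))` yields a 0-d object array (or an error, depending on the NumPy version) instead of the n-by-2 matrix Python 2 produced. A small illustration with arbitrary values:

import numpy as np

a = np.linspace(-2.0, 2.0, 5)
b = np.linspace(0.0, 1.0, 5)

pairs = np.array(list(zip(a, b)))   # materialise the iterator first
print(pairs.shape)                  # (5, 2), matching the old Python 2 behaviour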
-print '' -print '-------------------GPC_FITC DEMO----------------------' +print('') +print('-------------------GPC_FITC DEMO----------------------') #---------------------------------------------------------------------- # Load demo data (generated from Gaussians) @@ -45,7 +47,7 @@ # Sparse GP classification (FITC) example #---------------------------------------------------------------------- -print "Example 1: default inducing points" +print("Example 1: default inducing points") # Start from a new model model = pyGPs.GPC_FITC() @@ -60,7 +62,7 @@ # model.setData(x, y, value_per_axis=10) model.optimize() -print "Negative log marginal likelihood optimized:", round(model.nlZ,3) +print("Negative log marginal likelihood optimized:", round(model.nlZ,3)) # Prediction n = z.shape[0] @@ -70,15 +72,15 @@ -print '------------------------------------------------------' -print "Example 2: user-defined inducing points" +print('------------------------------------------------------') +print("Example 2: user-defined inducing points") model = pyGPs.GPC_FITC() # You can define inducing points yourself. # u = np.array([]) u1,u2 = np.meshgrid(np.linspace(-2,2,5),np.linspace(-2,2,5)) -u = np.array(zip(np.reshape(u2,(np.prod(u2.shape),)),np.reshape(u1,(np.prod(u1.shape),)))) +u = np.array(list(zip(np.reshape(u2,(np.prod(u2.shape),)),np.reshape(u1,(np.prod(u1.shape),))))) # and specify inducing point when seting prior m = pyGPs.mean.Zero() @@ -88,9 +90,9 @@ # The rest is analogous to what we have done before. model.setData(x, y) model.getPosterior() -print "Negative log marginal likelihood before optimization:", round(model.nlZ,3) +print("Negative log marginal likelihood before optimization:", round(model.nlZ,3)) model.optimize() -print "Negative log marginal likelihood optimized:", round(model.nlZ,3) +print("Negative log marginal likelihood optimized:", round(model.nlZ,3)) # predict n = z.shape[0] @@ -100,7 +102,7 @@ -print '--------------------END OF DEMO-----------------------' +print('--------------------END OF DEMO-----------------------') diff --git a/pyGPs/Demo/GraphData/demo_GraphKernel.py b/pyGPs/Demo/GraphData/demo_GraphKernel.py index f80aba5..d5fb8ce 100644 --- a/pyGPs/Demo/GraphData/demo_GraphKernel.py +++ b/pyGPs/Demo/GraphData/demo_GraphKernel.py @@ -1,3 +1,6 @@ +from __future__ import print_function +from builtins import str +from builtins import range #! 
/usr/bin/env python #coding=utf-8 @@ -34,7 +37,7 @@ N = graph_label.shape[0] # number of graphs) graph_label = np.int8(graph_label) -for i in xrange(N): +for i in range(N): if graph_label[i,0] == 0: graph_label[i,0] -= 1 @@ -43,19 +46,19 @@ #=========================================================================== num_Iteration = 10 w = 1e-4 -dist = 'tv' # possible values: 'tv', 'hellinger' -np.random.seed(1) # set random seed to get reproducible kernel matrices (to account for randomness in kernel average resutls over several returns of the experiment) +dist = 'tv' # possible values: 'tv', 'hellinger' +np.random.seed(1) # set random seed to get reproducible kernel matrices (to account for randomness in kernel average resutls over several returns of the experiment) K = graphKernels.propagationKernel(A, node_label, gr_id, num_Iteration, w, dist, 'label_diffusion', SUM=True, VIS=False, showEachStep=False) #---------------------------------------------------------------------- # Cross Validation #---------------------------------------------------------------------- -print '...GP prediction (10-fold CV)' +print('...GP prediction (10-fold CV)') -for t in xrange(num_Iteration+1): +for t in range(num_Iteration+1): ACC = [] # accuracy - print 'number of kernel iterations =', t + print('number of kernel iterations =', t) Matrix = K[:,:,t] # normalize kernel matrix (not useful for MUTAG) # Matrix = graphUtil.normalizeKernel(Matrix) @@ -86,6 +89,6 @@ ACC.append(acc) - print 'Accuracy: ', np.round(np.mean(ACC),2), '('+str(np.round(np.std(ACC),2))+')' + print('Accuracy: ', np.round(np.mean(ACC),2), '('+str(np.round(np.std(ACC),2))+')') diff --git a/pyGPs/Demo/GraphData/graphData/toNumpyMatrix.py b/pyGPs/Demo/GraphData/graphData/toNumpyMatrix.py index 1f344d8..be55c29 100644 --- a/pyGPs/Demo/GraphData/graphData/toNumpyMatrix.py +++ b/pyGPs/Demo/GraphData/graphData/toNumpyMatrix.py @@ -1,3 +1,4 @@ +from __future__ import print_function import numpy as np from scipy.io import loadmat from scipy.sparse.csc import csc_matrix @@ -12,11 +13,11 @@ node_label = data['responses'] # n x 1 node label array graph_label = data['labels'] # N x 1 graph label array -print A.shape -print A.indices -print A.data -print A.indptr -print type(A) +print(A.shape) +print(A.indices) +print(A.data) +print(A.indptr) +print(type(A)) np.savez("MUTAG", graph_ind=gr_id, responses=node_label, labels=graph_label, adj_data=A.data,\ diff --git a/pyGPs/Demo/Housing/demo_Housing.py b/pyGPs/Demo/Housing/demo_Housing.py index 1167af2..5149058 100755 --- a/pyGPs/Demo/Housing/demo_Housing.py +++ b/pyGPs/Demo/Housing/demo_Housing.py @@ -1,3 +1,7 @@ +from __future__ import division +from __future__ import print_function +from builtins import range +from past.utils import old_div #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -24,20 +28,20 @@ N = 25 # Get all data (exclude the 4th column which is binary) except the last 50 points for training x = np.concatenate((data[:-N,:4],data[:-N,5:-1]),axis=1) - x = (x - np.mean(x,axis=0))/(np.std(x,axis=0)+1.e-16) + x = old_div((x - np.mean(x,axis=0)),(np.std(x,axis=0)+1.e-16)) # The function we will perform regression on: Median Value of owner occupied homes y = np.reshape(data[:-N,-1],(len(data[:-N,-1]),1)) - y = (y-np.mean(y))/(np.std(y)+1.e-16) + y = old_div((y-np.mean(y)),(np.std(y)+1.e-16)) # Test on the last 50 points xs = 
np.concatenate((data[-N:,:4],data[-N:,5:-1]),axis=1) - xs = (xs - np.mean(xs,axis=0))/(np.std(xs,axis=0)+1.e-16) + xs = old_div((xs - np.mean(xs,axis=0)),(np.std(xs,axis=0)+1.e-16)) ys = np.reshape(data[-N:,-1],(N,1)) - ys = (ys-np.mean(ys))/(np.std(ys)+1.e-16) + ys = old_div((ys-np.mean(ys)),(np.std(ys)+1.e-16)) N,D = x.shape model = pyGPs.GPR() model.getPosterior(x, y) - print 'Initial negative log marginal likelihood = ', round(model.nlZ,3) + print('Initial negative log marginal likelihood = ', round(model.nlZ,3)) # train and predict from time import clock @@ -46,17 +50,17 @@ t1 = clock() ym, ys2, fm, fs2, lp = model.predict(xs) xa = np.concatenate((data[:,:4],data[:,5:-1]),axis=1) - xa = (xa - np.mean(xa,axis=0))/(np.std(xa,axis=0)+1.e-16) + xa = old_div((xa - np.mean(xa,axis=0)),(np.std(xa,axis=0)+1.e-16)) ya, ys2a, fma, fs2a, lpa = model.predict(xa) - print 'Time to optimize = ', t1-t0 - print 'Optimized mean = ', model.meanfunc.hyp - print 'Optimized covariance = ', model.covfunc.hyp - print 'Optimized liklihood = ', model.likfunc.hyp - print 'Final negative log marginal likelihood = ', round(model.nlZ,3) + print('Time to optimize = ', t1-t0) + print('Optimized mean = ', model.meanfunc.hyp) + print('Optimized covariance = ', model.covfunc.hyp) + print('Optimized liklihood = ', model.likfunc.hyp) + print('Final negative log marginal likelihood = ', round(model.nlZ,3)) #HousingPlotter(range(len(y)),y,range(len(ym)),ym,ys2,range(len(y),len(y)+len(ys)),ys) - xm = np.array(range(len(y),len(y)+ym.shape[0])) + xm = np.array(list(range(len(y),len(y)+ym.shape[0]))) ym = np.reshape(ym,(ym.shape[0],)) zm = np.reshape(ys2,(ym.shape[0],)) diff --git a/pyGPs/Demo/JHUI/demo_Validation.py b/pyGPs/Demo/JHUI/demo_Validation.py index 2db1ca9..d1c2ac8 100644 --- a/pyGPs/Demo/JHUI/demo_Validation.py +++ b/pyGPs/Demo/JHUI/demo_Validation.py @@ -1,3 +1,6 @@ +from __future__ import print_function +from builtins import str +from builtins import range #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -20,8 +23,8 @@ # This example shows the process of cross-validation using real dataset. 
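Most of the arithmetic rewrites in this patch go through `past.utils.old_div`, which futurize inserts whenever it cannot prove the operands are floats: it floor-divides two integers (the Python 2 behaviour of `/`) and true-divides everything else. For float arrays, such as the standardised Housing features above, it is therefore equivalent to plain `/`. A quick check, assuming the `future` package is installed:

from __future__ import division
import numpy as np
from past.utils import old_div

print(old_div(7, 2))      # 3    -> both operands are ints, so floor division
print(old_div(7.0, 2))    # 3.5  -> at least one float, so true division

x = np.arange(6, dtype=float).reshape(3, 2)
z = old_div(x - x.mean(axis=0), x.std(axis=0) + 1e-16)
print(np.allclose(z, (x - x.mean(axis=0)) / (x.std(axis=0) + 1e-16)))   # True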
-print '' -print '---------------Cross-Validation DEMO------------------' +print('') +print('---------------Cross-Validation DEMO------------------') #---------------------------------------------------------------------- # Load raw data (ionosphere dataset from UCI) @@ -48,7 +51,7 @@ # Here we deal with label in ionosphere data, # change "b" to"-1", and "g" to "+1" n,D = x.shape -for i in xrange(n): +for i in range(n): if y[i,0][0] == 'g': y[i,0] = 1 else: @@ -65,7 +68,7 @@ cv_run = 0 for x_train, x_test, y_train, y_test in valid.k_fold_validation(x, y, K): - print 'Run:', cv_run + print('Run:', cv_run) # This is a binary classification problem model = pyGPs.GPC() # Since no prior knowldege, leave everything default @@ -79,17 +82,17 @@ # Evluation acc = valid.ACC(ymu_class, y_test) - print ' accuracy =', round(acc,2) + print(' accuracy =', round(acc,2)) rmse = valid.RMSE(ymu_class, y_test) - print ' rmse =', round(rmse,2) + print(' rmse =', round(rmse,2)) ACC.append(acc) RMSE.append(rmse) # Toward next run cv_run += 1 -print '\nAccuracy: ', np.round(np.mean(ACC),2), '('+str(np.round(np.std(ACC),2))+')' -print 'Root-Mean-Square Error: ', np.round(np.mean(RMSE),2) +print('\nAccuracy: ', np.round(np.mean(ACC),2), '('+str(np.round(np.std(ACC),2))+')') +print('Root-Mean-Square Error: ', np.round(np.mean(RMSE),2)) @@ -100,12 +103,12 @@ ''' We defined the following measures(@SEE pyGPs.Valid.valid): - RMSE - Root mean squared error - ACC - Classification accuracy - Prec - Precision for class +1 - Recall - Recall for class +1 - NLPD - Negative log predictive density in transformed observation space + RMSE - Root mean squared error + ACC - Classification accuracy + Prec - Precision for class +1 + Recall - Recall for class +1 + NLPD - Negative log predictive density in transformed observation space ''' -print '--------------------END OF DEMO-----------------------' +print('--------------------END OF DEMO-----------------------') diff --git a/pyGPs/Demo/MaunaLoa/demo_MaunaLoa.py b/pyGPs/Demo/MaunaLoa/demo_MaunaLoa.py index 36c3534..648b4de 100755 --- a/pyGPs/Demo/MaunaLoa/demo_MaunaLoa.py +++ b/pyGPs/Demo/MaunaLoa/demo_MaunaLoa.py @@ -1,3 +1,7 @@ +from __future__ import division +from __future__ import print_function +from builtins import zip +from past.utils import old_div #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -32,7 +36,7 @@ if __name__ == '__main__': # LOAD data - infile = 'mauna.txt' # Note: Samples with value -99.99 were dropped. + infile = 'mauna.txt' # Note: Samples with value -99.99 were dropped. f = open(infile,'r') year = [] co2 = [] @@ -54,18 +58,18 @@ n,D = x.shape # TEST POINTS - xs = np.arange(2004+1./24.,2024-1./24.,1./12.) 
+ xs = np.arange(2004+old_div(1.,24.),2024-old_div(1.,24.),old_div(1.,12.)) xs = xs.reshape(len(xs),1) # DEFINE parameterized covariance function k1 = pyGPs.cov.RBF(np.log(67.), np.log(66.)) k2 = pyGPs.cov.Periodic(np.log(1.3), np.log(1.0), np.log(2.4)) * pyGPs.cov.RBF(np.log(90.), np.log(2.4)) k3 = pyGPs.cov.RQ(np.log(1.2), np.log(0.66), np.log(0.78)) - k4 = pyGPs.cov.RBF(np.log(1.6/12.), np.log(0.18)) + pyGPs.cov.Noise(np.log(0.19)) + k4 = pyGPs.cov.RBF(np.log(old_div(1.6,12.)), np.log(0.18)) + pyGPs.cov.Noise(np.log(0.19)) k = k1 + k2 + k3 + k4 # STANDARD GP (prediction) - print 'Original CO2 Data:' + print('Original CO2 Data:') model = pyGPs.GPR() model.setData(x,y) model.plotData_1d() @@ -79,11 +83,11 @@ t1 = clock() model.predict(xs) - print 'Using Handcrafted Kernel from GPML Book:' - print 'Time to optimize = ', t1-t0 - print 'Optimized mean = ', model.meanfunc.hyp - print 'Optimized covariance = ', model.covfunc.hyp - print 'Optimized liklihood = ', model.likfunc.hyp - print 'Final negative log marginal likelihood = ', round(model.nlZ,3) + print('Using Handcrafted Kernel from GPML Book:') + print('Time to optimize = ', t1-t0) + print('Optimized mean = ', model.meanfunc.hyp) + print('Optimized covariance = ', model.covfunc.hyp) + print('Optimized liklihood = ', model.likfunc.hyp) + print('Final negative log marginal likelihood = ', round(model.nlZ,3)) model.plot() diff --git a/pyGPs/Demo/Regression/demo_GPR.py b/pyGPs/Demo/Regression/demo_GPR.py index 145b753..bc8baec 100644 --- a/pyGPs/Demo/Regression/demo_GPR.py +++ b/pyGPs/Demo/Regression/demo_GPR.py @@ -1,3 +1,4 @@ +from __future__ import print_function #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -27,8 +28,8 @@ -print '' -print '---------------------GPR DEMO-------------------------' +print('') +print('---------------------GPR DEMO-------------------------') #---------------------------------------------------------------------- # Load demo data (generated from Gaussians) @@ -41,21 +42,21 @@ #---------------------------------------------------------------------- # A five-line example #---------------------------------------------------------------------- -print 'Basic Example' +print('Basic Example') model = pyGPs.GPR() # model -print 'Before Optimization' +print('Before Optimization') model.setData(x,y) model.predict(z) # predict test cases (before optimization) model.plot() # and plot result model.optimize(x, y) # optimize hyperparamters (default optimizer: single run minimize) -print 'After Optimization' +print('After Optimization') model.predict(z) # predict test cases model.plot() # and plot result #---------------------------------------------------------------------- # Now lets do another example to get more insight to the toolbox #---------------------------------------------------------------------- -print 'More Advanced Example (using a non-zero mean and Matern7 kernel)' +print('More Advanced Example (using a non-zero mean and Matern7 kernel)') model = pyGPs.GPR() # start from a new model # Specify non-default mean and covariance functions @@ -95,7 +96,7 @@ # model.fm (predictive latent means) # model.fs2 (predictive latent variances) # model.lp (log predictive probability) -print 'Optimized negative log marginal likelihood:', round(model.nlZ,3) +print('Optimized negative log marginal likelihood:', round(model.nlZ,3)) # Predict test data @@ -125,5 +126,5 @@ # You 
don't need it if you optimize it later anyway model.setNoise( log_sigma=np.log(0.1) ) -print '--------------------END OF DEMO-----------------------' +print('--------------------END OF DEMO-----------------------') diff --git a/pyGPs/Demo/Regression/demo_GPR_FITC.py b/pyGPs/Demo/Regression/demo_GPR_FITC.py index 0bf68b1..6a2b86e 100644 --- a/pyGPs/Demo/Regression/demo_GPR_FITC.py +++ b/pyGPs/Demo/Regression/demo_GPR_FITC.py @@ -1,3 +1,6 @@ +from __future__ import division +from __future__ import print_function +from past.utils import old_div #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -18,8 +21,8 @@ # you may want to read demo_GPR, demo_kernel and demo_optimization first! # Here, the focus is on the difference of FITC model. -print '' -print '-------------------GPR_FITC DEMO----------------------' +print('') +print('-------------------GPR_FITC DEMO----------------------') #---------------------------------------------------------------------- # Load demo data (generated from Gaussians) @@ -35,7 +38,7 @@ # Sparse GP regression (FITC) example #---------------------------------------------------------------------- -print "Example 1: default inducing points" +print("Example 1: default inducing points") # Start from a new model model = pyGPs.GPR_FITC() @@ -50,7 +53,7 @@ # model.setData(x, y, value_per_axis=10) model.optimize() -print "Negative log marginal liklihood optimized:", round(model.nlZ,3) +print("Negative log marginal liklihood optimized:", round(model.nlZ,3)) # Prediction model.predict(z) @@ -59,8 +62,8 @@ -print '------------------------------------------------------' -print "Example 2: user-defined inducing points" +print('------------------------------------------------------') +print("Example 2: user-defined inducing points") # Start from a new model model = pyGPs.GPR_FITC() @@ -70,7 +73,7 @@ u = np.array([[-1], [-0.8], [-0.5], [0.3],[1.]]) # or equally-spaced inducing points -num_u = np.fix(x.shape[0]/2) +num_u = np.fix(old_div(x.shape[0],2)) u = np.linspace(-1.3,1.3,num_u).T u = np.reshape(u,(num_u,1)) @@ -83,9 +86,9 @@ # The rest is analogous to what we have done before model.setData(x, y) model.getPosterior() -print "Negative log marginal liklihood before optimization:", round(model.nlZ,3) +print("Negative log marginal liklihood before optimization:", round(model.nlZ,3)) model.optimize() -print "Negative log marginal liklihood optimized:", round(model.nlZ,3) +print("Negative log marginal liklihood optimized:", round(model.nlZ,3)) # Prediction ymu, ys2, fmu, fs2, lp = model.predict(z) @@ -93,7 +96,7 @@ model.plot() -print '--------------------END OF DEMO-----------------------' +print('--------------------END OF DEMO-----------------------') diff --git a/pyGPs/Demo/USPS/demo_GPMC.py b/pyGPs/Demo/USPS/demo_GPMC.py index 9730d16..b4a6ec0 100644 --- a/pyGPs/Demo/USPS/demo_GPMC.py +++ b/pyGPs/Demo/USPS/demo_GPMC.py @@ -1,3 +1,4 @@ +from __future__ import print_function #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -21,8 +22,8 @@ # you may want to read demo_GPR, demo_kernel and demo_optimization first! # Here, the focus is on multi-class classification. 
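One spot in the GPR_FITC demo above that the mechanical port does not fully modernise is `num_u = np.fix(old_div(x.shape[0], 2))`: `np.fix` returns a float, and newer NumPy releases reject non-integer sizes in `np.linspace` and `np.reshape`. A hedged sketch of the equally spaced inducing points with an explicit integer cast (the cast and the stand-in data are my additions; the -1.3/1.3 range is the demo's):

import numpy as np

x = np.random.randn(20, 1)          # stand-in for the demo's training inputs

num_u = int(x.shape[0] // 2)        # integer number of inducing points
u = np.linspace(-1.3, 1.3, num_u)   # equally spaced inducing inputs
u = u.reshape(num_u, 1)             # column vector, as GPR_FITC expects
print(u.shape)                      # (10, 1)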
-print '' -print '---------------------GPMC DEMO-----------------------' +print('') +print('---------------------GPMC DEMO-----------------------') # GPMC is NOT based on multi-class Laplace Approximation. # It works as a one vs. one classification wrapper @@ -82,7 +83,7 @@ # Accuracy of recognized digit acc = valid.ACC(predictive_class, ys) -print "Accuracy of recognizing hand-writen digits:", round(acc,2) +print("Accuracy of recognizing hand-writen digits:", round(acc,2)) #---------------------------------------------------------------------- @@ -99,5 +100,5 @@ # there is also an option to predict without optimization # model.fitAndPredict(xs) -print '--------------------END OF DEMO-----------------------' +print('--------------------END OF DEMO-----------------------') diff --git a/pyGPs/Demo/USPS/demo_NodeKernel.py b/pyGPs/Demo/USPS/demo_NodeKernel.py index 2d4c50e..cb13b06 100644 --- a/pyGPs/Demo/USPS/demo_NodeKernel.py +++ b/pyGPs/Demo/USPS/demo_NodeKernel.py @@ -1,3 +1,5 @@ +from __future__ import print_function +from builtins import range #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -28,7 +30,7 @@ def load_binary(D1,D2,reduce=False): D1_list = [] D2_list = [] n,D = x.shape - for i in xrange(n): + for i in range(n): if y[i,D1] == 1: D1_list.append(i) elif y[i,D2] == 1: @@ -76,12 +78,12 @@ def plotDigit(digit, title_str=''): exampleDigit7_2 = np.reshape(exampleDigit7_2,(16,16)) plotDigit(exampleDigit7_2, 'This is another 2.') - # true class 1 + # true class 1 exampleDigitBad = x[70] # digit that predicts wrong for rbf exampleDigitBad = np.reshape(exampleDigitBad,(16,16)) plotDigit(exampleDigitBad, 'This digit is an example where the rbf kernel predicts the wrong class (2). \nDiffusion kernel predicts correctly due to graph information!') - # true class 2 + # true class 2 exampleDigitBad = x[190] # digit that predicts wrong for diff exampleDigitBad = np.reshape(exampleDigitBad,(16,16)) plotDigit(exampleDigitBad, 'This digit is an example where the diff kernel predicts the wrong class (1). 
\nrbf kernel, however, predicts correctly!') @@ -95,7 +97,7 @@ def plotDigit(digit, title_str=''): N = Matrix.shape[0] # set training and test set - index_train = xrange(N) + index_train = range(N) index_test = [38, 81, 124, 129, 70, 190] index_train = np.setdiff1d(index_train, index_test) @@ -117,7 +119,7 @@ def plotDigit(digit, title_str=''): # evaluation predictive_class_rbf = np.sign(model.ym) - ACC_rbf = valid.ACC(predictive_class_rbf, y_test) + ACC_rbf = valid.ACC(predictive_class_rbf, y_test) ## DIFFUSION Kernel # compute kernel matrix and initalize GP with precomputed kernel @@ -159,10 +161,10 @@ def plotDigit(digit, title_str=''): ACC_sum = valid.ACC(predictive_class_sum, y_test) - print np.hstack((np.array(index_test, ndmin=2).T, y_test, predictive_class_rbf, predictive_class_diff, predictive_class_sum)) - print 'accuracy (RBF): ' , ACC_rbf - print 'accuracy (DIFF): ' , ACC_diff - print 'accuracy (SUM): ' , ACC_sum + print(np.hstack((np.array(index_test, ndmin=2).T, y_test, predictive_class_rbf, predictive_class_diff, predictive_class_sum))) + print('accuracy (RBF): ' , ACC_rbf) + print('accuracy (DIFF): ' , ACC_diff) + print('accuracy (SUM): ' , ACC_sum) diff --git a/pyGPs/Demo/USPS/loadBinaryUSPS.py b/pyGPs/Demo/USPS/loadBinaryUSPS.py index 7f18496..d3b9823 100644 --- a/pyGPs/Demo/USPS/loadBinaryUSPS.py +++ b/pyGPs/Demo/USPS/loadBinaryUSPS.py @@ -1,3 +1,4 @@ +from builtins import range import numpy as np from scipy.io import loadmat import os,sys @@ -14,7 +15,7 @@ def load_binary(D1,D2,reduce=False): D1_list = [] D2_list = [] n,D = x.shape - for i in xrange(n): + for i in range(n): if y[i,D1] == 1: D1_list.append(i) elif y[i,D2] == 1: diff --git a/pyGPs/Demo/generate_data_for_Rasmussen_examples.py b/pyGPs/Demo/generate_data_for_Rasmussen_examples.py index faa5c6b..e05e413 100644 --- a/pyGPs/Demo/generate_data_for_Rasmussen_examples.py +++ b/pyGPs/Demo/generate_data_for_Rasmussen_examples.py @@ -1,3 +1,5 @@ +from builtins import zip +from builtins import range import numpy as np def save_data_regresssion(): @@ -16,7 +18,7 @@ def save_data_regresssion(): # TEST points # test points evenly distributed in the interval [-2, 2.5] - xstar = np.array(range(-200,250,4), dtype=np.float64, ndmin=2).T + xstar = np.array(list(range(-200,250,4)), dtype=np.float64, ndmin=2).T xstar /= 100 np.savez('Regression/regression_data', x=x, y=y, xstar=xstar) @@ -155,7 +157,7 @@ def save_data_classification(): # For plotting, we superimpose the data points with the posterior equi-probability contour # lines for the probability of class two given complete information about the generating mechanism. t1,t2 = np.meshgrid(np.arange(-4,4.1,0.1),np.arange(-4,4.1,0.1)) - t = np.array(zip(np.reshape(t1,(np.prod(t1.shape),)),np.reshape(t2,(np.prod(t2.shape),)))) # these are the test inputs + t = np.array(list(zip(np.reshape(t1,(np.prod(t1.shape),)),np.reshape(t2,(np.prod(t2.shape),))))) # these are the test inputs n = t.shape[0] tmm = np.zeros_like(t) diff --git a/pyGPs/GraphExtensions/__init__.py b/pyGPs/GraphExtensions/__init__.py index 0771e01..7de7fda 100644 --- a/pyGPs/GraphExtensions/__init__.py +++ b/pyGPs/GraphExtensions/__init__.py @@ -1,3 +1,4 @@ -import graphKernels -import graphUtil -import nodeKernels +from __future__ import absolute_import +from . import graphKernels +from . import graphUtil +from . 
import nodeKernels diff --git a/pyGPs/GraphExtensions/graphKernels.py b/pyGPs/GraphExtensions/graphKernels.py index 7ba3213..6a16b26 100644 --- a/pyGPs/GraphExtensions/graphKernels.py +++ b/pyGPs/GraphExtensions/graphKernels.py @@ -1,3 +1,8 @@ +from __future__ import division +from __future__ import print_function +from builtins import str +from builtins import range +from past.utils import old_div #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -68,7 +73,7 @@ def propagationKernel(A, l, gr_id, h_max, w, p, ktype=None, SUM=True, VIS=False, ## INITIALIZE label probabilities of labeled nodes (num_nodes x num_labels) if l.shape[1]==1: lab_prob = np.zeros((num_nodes,num_labels),dtype=np.float64) - for i in xrange(num_nodes): + for i in range(num_nodes): if l[i,0] > 0: lab_prob[i,l[i,0]-1] = 1 else: @@ -83,7 +88,7 @@ def propagationKernel(A, l, gr_id, h_max, w, p, ktype=None, SUM=True, VIS=False, ## INITIALIZE unlabeled/uninitialized nodes UNIFORMLY idx_unif = np.where(np.sum(lab_prob, axis=1)==0)[0] if idx_unif.shape[0] != 0: - lab_prob[idx_unif,:] = 1./lab_prob.shape[1] + lab_prob[idx_unif,:] = old_div(1.,lab_prob.shape[1]) ## row normalize A -> transition matrix T if (ktype=='label_propagation') or (ktype=='label_diffusion'): row_sums = np.array(A.sum(axis=1))[:,0] @@ -93,12 +98,12 @@ def propagationKernel(A, l, gr_id, h_max, w, p, ktype=None, SUM=True, VIS=False, #=========================================================================== # ## PROPAGATION KERNEL ITERATIONS #=========================================================================== - for h in xrange(h_max+1): - print 'ITERATION: ', h + for h in range(h_max+1): + print('ITERATION: ', h) if h > 0: ## LABEL UPDATE if showEachStep: - print '...computing LABEL UPDATE' + print('...computing LABEL UPDATE') if ktype == 'label_propagation': lab_prob[idx,:] = lab_orig[idx,:] # PUSH BACK original LABELS @@ -112,7 +117,7 @@ def propagationKernel(A, l, gr_id, h_max, w, p, ktype=None, SUM=True, VIS=False, # y(t+1) = alpha*S*y(t)+(1-alpha)*y(0) alpha = 0.8 # compute S - diag = np.array(A.sum(axis=1)).T**(-1/2) + diag = np.array(A.sum(axis=1)).T**(old_div(-1,2)) D = spsp.lil_matrix((A.shape[0], A.shape[1]), dtype=float) # D.setdiag(diag[0,:]) D = D.tocsr() @@ -121,7 +126,7 @@ def propagationKernel(A, l, gr_id, h_max, w, p, ktype=None, SUM=True, VIS=False, ## COMPUTE hashvalues if showEachStep: - print '...computing hashvalues' + print('...computing hashvalues') # determine path to take depending on chosen distance use_cauchy = (p =='L1') or (p =='tv') take_sqrt = (p =='hellinger') @@ -136,18 +141,18 @@ def propagationKernel(A, l, gr_id, h_max, w, p, ktype=None, SUM=True, VIS=False, b = w * np.random.rand() # compute hashes # hashLabels is a Vector with length: number of nodes (hashvalue for respective node) - hashLabels = (np.dot(lab_prob,v) + b) / w + hashLabels = old_div((np.dot(lab_prob,v) + b), w) hashLabels = np.floor(hashLabels) # take floor uniqueHash, hashLabels = np.unique(hashLabels, return_inverse=True) # map to consecutive integer from 0 ## COMPUTE kernel contribution if showEachStep: - print '...computing KERNEL contribution' + print('...computing KERNEL contribution') # aggregate counts on graphs # counts is a matrix: number of graphs x number of hashlabels num_bins = len(uniqueHash) counts = np.zeros((num_graphs, num_bins)) # init counts matrix # accumulate counts of hash 
labels - for i in xrange(num_nodes): + for i in range(num_nodes): counts[ (gr_id[i,0]-1), hashLabels[i] ] +=1 # compute base kernel (here: LINEAR kernel) @@ -163,7 +168,7 @@ def propagationKernel(A, l, gr_id, h_max, w, p, ktype=None, SUM=True, VIS=False, K[:,:,h] = K_h if showEachStep: - print K[:,:,h] + print(K[:,:,h]) ## VISUALIZE KERNELS if VIS: diff --git a/pyGPs/GraphExtensions/graphUtil.py b/pyGPs/GraphExtensions/graphUtil.py index bf21087..72695fa 100644 --- a/pyGPs/GraphExtensions/graphUtil.py +++ b/pyGPs/GraphExtensions/graphUtil.py @@ -1,3 +1,5 @@ +from __future__ import division +from past.utils import old_div #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -74,7 +76,7 @@ def normalizeKernel(K): ''' Kdiag = np.atleast_2d(np.diag(K)) normalizers = np.sqrt(Kdiag*Kdiag.T) - K_norm = K/normalizers + K_norm = old_div(K,normalizers) return K_norm diff --git a/pyGPs/GraphExtensions/nodeKernels.py b/pyGPs/GraphExtensions/nodeKernels.py index 8cad516..ce9c212 100644 --- a/pyGPs/GraphExtensions/nodeKernels.py +++ b/pyGPs/GraphExtensions/nodeKernels.py @@ -1,3 +1,5 @@ +from __future__ import division +from past.utils import old_div #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] @@ -26,7 +28,7 @@ def normLap(A): ''' I = np.identity(A.shape[0]) d = A.sum(axis=0) - d = np.sqrt(1./d) + d = np.sqrt(old_div(1.,d)) D = np.diag(d) L = I - np.dot( np.dot(D,A),D ) return L @@ -91,7 +93,7 @@ def VNDKernel(A, alpha=0.5): ''' I = np.identity(A.shape[0]) d = A.sum(axis=0) - d = np.sqrt(1./d) + d = np.sqrt(old_div(1.,d)) D = np.diag(d) S = np.dot( np.dot(D,A),D ) K = np.linalg.inv( I - alpha*S ) @@ -110,14 +112,14 @@ def rwKernel(A, p=1, a=2): :return: kernel matrix ''' if type(p) != int: - p = int(p) + p = int(p) if p < 1: - raise Exception('Step parameter p needs to be larger than 0.') - if a <= 1: - a=1.0001 + raise Exception('Step parameter p needs to be larger than 0.') + if a <= 1: + a=1.0001 I = np.identity(A.shape[0]) L = normLap(A) - K = np.linalg.matrix_power( a*I - L, p) + K = np.linalg.matrix_power( a*I - L, p) return K diff --git a/pyGPs/Optimization/__init__.py b/pyGPs/Optimization/__init__.py index 47b198b..877e588 100644 --- a/pyGPs/Optimization/__init__.py +++ b/pyGPs/Optimization/__init__.py @@ -1,3 +1,4 @@ -import scg -import minimize -import conf \ No newline at end of file +from __future__ import absolute_import +from . import scg +from . import minimize +from . 
import conf \ No newline at end of file diff --git a/pyGPs/Optimization/conf.py b/pyGPs/Optimization/conf.py index 09ade68..7bc9479 100644 --- a/pyGPs/Optimization/conf.py +++ b/pyGPs/Optimization/conf.py @@ -1,3 +1,4 @@ +from builtins import object #================================================================================ # Marion Neumann [marion dot neumann at uni-bonn dot de] # Daniel Marthaler [dan dot marthaler at gmail dot com] diff --git a/pyGPs/Optimization/minimize.py b/pyGPs/Optimization/minimize.py index 24bdc73..7b1b13d 100644 --- a/pyGPs/Optimization/minimize.py +++ b/pyGPs/Optimization/minimize.py @@ -1,3 +1,6 @@ +from __future__ import division +from __future__ import print_function +from past.utils import old_div #=============================================================================== # This program is distributed WITHOUT ANY WARRANTY; without even the implied @@ -47,7 +50,7 @@ def run(f, X, args=(), length=None, red=1.0, verbose=False): EXT = 3.0; # extrapolate maximum 3 times the current step-size MAX = 20; # max 20 function evaluations per line search RATIO = 10; # maximum allowed slope ratio - SIG = 0.1;RHO = SIG/2;# SIG and RHO are the constants controlling the Wolfe- + SIG = 0.1;RHO = old_div(SIG,2);# SIG and RHO are the constants controlling the Wolfe- #Powell conditions. SIG is the maximum allowed absolute ratio between #previous and new slopes (derivatives in the search direction), thus setting #SIG to low (positive) values forces higher precision in the line-searches. @@ -66,7 +69,7 @@ def run(f, X, args=(), length=None, red=1.0, verbose=False): i = i + (length<0) # count epochs?! s = -df0 d0 = -dot(s,s) # initial search direction (steepest) and slope - x3 = red/(1.0-d0) # initial step is red/(|s|+1) + x3 = old_div(red,(1.0-d0)) # initial step is red/(|s|+1) while i < abs(length): # while not finished i = i + (length>0) # count iterations?! @@ -89,7 +92,7 @@ def run(f, X, args=(), length=None, red=1.0, verbose=False): return success = 1 except: # catch any error which occured in f - x3 = (x2+x3)/2 # bisect and try again + x3 = old_div((x2+x3),2) # bisect and try again if f3 < F0: X0 = X+x3*s; F0 = f3; dF0 = df3 # keep best values d3 = dot(df3,s) # new slope @@ -122,18 +125,18 @@ def run(f, X, args=(), length=None, red=1.0, verbose=False): else: x2 = x3; f2 = f3; d2 = d3 # move point 3 to point 2 if f4 > f0: - x3 = x2-(0.5*d2*(x4-x2)**2)/(f4-f2-d2*(x4-x2)) + x3 = x2-old_div((0.5*d2*(x4-x2)**2),(f4-f2-d2*(x4-x2))) # quadratic interpolation else: A = 6*(f2-f4)/(x4-x2)+3*(d4+d2) # cubic interpolation B = 3*(f4-f2)-(2*d2+d4)*(x4-x2) if A != 0: - x3=x2+(sqrt(B*B-A*d2*(x4-x2)**2)-B)/A + x3=x2+old_div((sqrt(B*B-A*d2*(x4-x2)**2)-B),A) # num. error possible, ok! 
else: x3 = inf if isnan(x3) or isinf(x3): - x3 = (x2+x4)/2 # if we had a numerical problem then bisect + x3 = old_div((x2+x4),2) # if we had a numerical problem then bisect x3 = max(min(x3, x4-INT*(x4-x2)),x2+INT*(x4-x2)) # don't accept too close result3 = f(X+x3*s, *args) @@ -152,16 +155,16 @@ def run(f, X, args=(), length=None, red=1.0, verbose=False): d3 = d0; d0 = dot(df0,s) if d0 > 0: # new slope must be negative s = -df0; d0 = -dot(s,s) # otherwise use steepest direction - x3 = x3 * min(RATIO, d3/(d0-SMALL)) # slope ratio but max RATIO + x3 = x3 * min(RATIO, old_div(d3,(d0-SMALL))) # slope ratio but max RATIO ls_failed = 0 # this line search did not fail else: X = X0; f0 = F0; df0 = dF0 # restore best point so far if ls_failed or (i>abs(length)): # line search failed twice in a row break # or we ran out of time, so we give up s = -df0; d0 = -dot(s,s) # try steepest - x3 = 1/(1-d0) + x3 = old_div(1,(1-d0)) ls_failed = 1 # this line search failed - if verbose: print "\n" + if verbose: print("\n") #print fX return X, fX, i diff --git a/pyGPs/Optimization/scg.py b/pyGPs/Optimization/scg.py index 8c72751..efd0158 100644 --- a/pyGPs/Optimization/scg.py +++ b/pyGPs/Optimization/scg.py @@ -1,3 +1,6 @@ +from __future__ import division +from __future__ import print_function +from past.utils import old_div #=============================================================================== # SCG Scaled conjugate gradient optimization. # @@ -22,7 +25,7 @@ def run(f, x, args=(), niters = 100, gradcheck = False, display = 0, flog = False, pointlog = False, scalelog = False, tolX = 1.0e-8, tolO = 1.0e-8, eval = None): '''Scaled conjugate gradient optimization. ''' - if display: print '\n***** starting optimization (SCG) *****\n' + if display: print('\n***** starting optimization (SCG) *****\n') nparams = len(x); # Check gradients if gradcheck: @@ -73,19 +76,19 @@ def run(f, x, args=(), niters = 100, gradcheck = False, display = 0, flog = Fals else: return x, listF - sigma = sigma0/sqrt(kappa) + sigma = old_div(sigma0,sqrt(kappa)) xplus = x + sigma*d gplus = f(xplus, *args)[1] gradCount += 1 - theta = (np.dot(d, (gplus - gradnew)))/sigma; + theta = old_div((np.dot(d, (gplus - gradnew))),sigma); # Increase effective curvature and evaluate step size alpha. delta = theta + beta*kappa if (delta <= 0): delta = beta*kappa - beta = beta - theta/kappa + beta = beta - old_div(theta,kappa) - alpha = - mu/delta + alpha = old_div(- mu,delta) # Calculate the comparison ratio. xnew = x + alpha*d @@ -158,7 +161,7 @@ def run(f, x, args=(), niters = 100, gradcheck = False, display = 0, flog = Fals nsuccess = 0; else: if (success == 1): - gamma = np.dot((gradold - gradnew), gradnew)/(mu) + gamma = old_div(np.dot((gradold - gradnew), gradnew),(mu)) d = gamma*d - gradnew; j += 1 @@ -167,7 +170,7 @@ def run(f, x, args=(), niters = 100, gradcheck = False, display = 0, flog = Fals # iterations. 
# options(8) = fold;
if (display):
- print "maximum number of iterations reached"
+ print("maximum number of iterations reached")
if eval is not None:
return x, listF, evalList, time
else:
diff --git a/pyGPs/Testing/unit_test_cov.py b/pyGPs/Testing/unit_test_cov.py
index f369362..6f75788 100644
--- a/pyGPs/Testing/unit_test_cov.py
+++ b/pyGPs/Testing/unit_test_cov.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+from builtins import range
#================================================================================
# Marion Neumann [marion dot neumann at uni-bonn dot de]
# Daniel Marthaler [dan dot marthaler at gmail dot com]
@@ -52,7 +54,7 @@ def checkCovOutput(self, train_train, train_test, self_test):
def is_positive_semi_definite(self, K):
v = np.linalg.eig(K)[0]
if any(v.real<=-1e-9):
- print v.real.min()
+ print(v.real.min())
return False
else:
return True
@@ -71,7 +73,7 @@ def checkCovariance(self, k):
k2 = k.getCovMatrix(x=self.x, z=self.z, mode='cross') # test train by test covariance
k3 = k.getCovMatrix(z=self.z, mode='self_test') # test test by test self covariance
self.checkCovOutput(k1,k2,k3)
- for der in xrange(len(k.hyp)): # checking derivatives for each hyperparameter
+ for der in range(len(k.hyp)): # checking derivatives for each hyperparameter
kd1 = k.getDerMatrix(x=self.x, mode='train',der=der) # test train by train derivative
kd2 = k.getDerMatrix(x=self.x, z=self.z, mode='cross',der=der) # test train by test derivative
kd3 = k.getDerMatrix(z=self.z, mode='self_test',der=der) # test test by test self derivative
@@ -79,75 +81,75 @@ def checkCovariance(self, k):
def test_covSM(self):
- print "testing covSM..."
+ print("testing covSM...")
k = pyGPs.cov.SM(Q=10,D=self.x.shape[1])
self.checkCovariance(k)
def test_covGabor(self):
- print "testing covGabor..."
+ print("testing covGabor...")
k = pyGPs.cov.Gabor()
self.checkCovariance(k)
def test_covRBF(self):
- print "testing covRBF..."
+ print("testing covRBF...")
k = pyGPs.cov.RBF()
self.checkCovariance(k)
def test_covRBFunit(self):
- print "testing covRBFunit..."
+ print("testing covRBFunit...")
k = pyGPs.cov.RBFunit()
self.checkCovariance(k)
def test_covRBFard(self):
- print "testing covRBFard..."
+ print("testing covRBFard...")
k = pyGPs.cov.RBFard(D=self.x.shape[1])
self.checkCovariance(k)
def test_covConst(self):
- print "testing covConst..."
+ print("testing covConst...")
k = pyGPs.cov.Const()
self.checkCovariance(k)
def test_covLinear(self):
- print "testing covLinear..."
+ print("testing covLinear...")
k = pyGPs.cov.Linear()
self.checkCovariance(k)
def test_covLINard(self):
- print "testing covLINard..."
+ print("testing covLINard...")
k = pyGPs.cov.LINard(D=self.x.shape[1])
self.checkCovariance(k)
def test_covMatern(self):
- print "testing covMatern..."
- print "...d = 1..."
+ print("testing covMatern...")
+ print("...d = 1...")
k = pyGPs.cov.Matern(d=1)
self.checkCovariance(k)
- print "testing covMatern..."
- print "...d = 3..."
+ print("testing covMatern...")
+ print("...d = 3...")
k = pyGPs.cov.Matern(d=3)
self.checkCovariance(k)
- print "testing covMatern..."
- print "...d = 5..."
+ print("testing covMatern...")
+ print("...d = 5...")
k = pyGPs.cov.Matern(d=5)
self.checkCovariance(k)
- print "testing covMatern..."
- print "...d = 7..."
+ print("testing covMatern...")
+ print("...d = 7...")
k = pyGPs.cov.Matern(d=7)
self.checkCovariance(k)
def test_covPeriodic(self):
- print "testing covPeriodic..."
+ print("testing covPeriodic...")
k = pyGPs.cov.Periodic()
n = 20 # number of training inputs
nn = 10 # number of test inputs
@@ -160,7 +162,7 @@ def test_covPeriodic(self):
self.assertTrue(k1.shape == (n,n))
self.assertTrue(k2.shape == (n,nn))
self.assertTrue(k3.shape == (nn,1))
- for der in xrange(len(k.hyp)): # checking derivatives for each hyperparameter
+ for der in range(len(k.hyp)): # checking derivatives for each hyperparameter
kd1 = k.getDerMatrix(x=x, mode='train',der=der) # test train by train derivative
kd2 = k.getDerMatrix(x=x, z=z, mode='cross',der=der) # test train by test derivative
kd3 = k.getDerMatrix(z=z, mode='self_test',der=der) # test test by test self derivative
@@ -169,25 +171,25 @@ def test_covPeriodic(self):
def test_covNoise(self):
- print "testing covNoise..."
+ print("testing covNoise...")
k = pyGPs.cov.Noise()
self.checkCovariance(k)
def test_covRQ(self):
- print "testing covRQ..."
+ print("testing covRQ...")
k = pyGPs.cov.RQ()
self.checkCovariance(k)
def test_covRQard(self):
- print "testing covRQard..."
+ print("testing covRQard...")
k = pyGPs.cov.RQard(D=self.x.shape[1])
self.checkCovariance(k)
def test_covPre(self):
- print "testing covPre..."
+ print("testing covPre...")
k = pyGPs.cov.Pre(self.M1, self.M2) # load precomputed kernel matrix
k1 = k.getCovMatrix(x=self.x, mode='train') # test train by train covariance
k2 = k.getCovMatrix(x=self.x, z=self.z, mode='cross') # test train by test covariance
@@ -197,7 +199,7 @@ def test_covPre(self):
self.assertTrue(k1.shape == (n,n))
self.assertTrue(k2.shape == (n,nn))
self.assertTrue(k3.shape == (nn,1))
- for der in xrange(len(k.hyp)): # checking derivatives for each hyperparameter
+ for der in range(len(k.hyp)): # checking derivatives for each hyperparameter
kd1 = k.getDerMatrix(x=self.x, mode='train',der=der) # test train by train derivative
kd2 = k.getDerMatrix(x=self.x, z=self.z, mode='cross',der=der) # test train by test derivative
kd3 = k.getDerMatrix(z=self.z, mode='self_test',der=der) # test test by test self derivative
@@ -205,37 +207,37 @@ def test_covPre(self):
def test_covPiecePoly(self):
- print "testing covPiecePoly..."
+ print("testing covPiecePoly...")
k = pyGPs.cov.PiecePoly()
self.checkCovariance(k)
def test_covPoly(self):
- print "testing covPoly..."
+ print("testing covPoly...")
k = pyGPs.cov.Poly()
self.checkCovariance(k)
def test_covScale(self):
- print "testing (composing kernel) muliply by a scalar..."
+ print("testing (composing kernel) muliply by a scalar...")
k = pyGPs.cov.RBF()*5
self.checkCovariance(k)
def test_covSum(self):
- print "testing (composing kernel) sum of two kernels..."
+ print("testing (composing kernel) sum of two kernels...")
k = pyGPs.cov.RBF() + pyGPs.cov.PiecePoly()
self.checkCovariance(k)
def test_covProduct(self):
- print "testing (composing kernel) product of two kernels..."
+ print("testing (composing kernel) product of two kernels...")
k = pyGPs.cov.RBF() * pyGPs.cov.PiecePoly()
self.checkCovariance(k)
def test_covFITC(self):
- print "testing FITC kernel to be used with sparse GP..."
+ print("testing FITC kernel to be used with sparse GP...")
n,D = self.x.shape
nn,D = self.z.shape
nu = 5 # number of inducing points
@@ -252,7 +254,7 @@
self.assertTrue(k2.shape == (nu,nn))
self.assertTrue(k3.shape == (nn,1))
- for der in xrange(len(k.hyp)):
+ for der in range(len(k.hyp)):
Kd, Kuud, Kud = k.getDerMatrix(x=self.x, mode='train',der=der) # test train by train derivative
kd2 = k.getDerMatrix(x=self.x, z=self.z, mode='cross',der=der) # test train by test derivative
kd3 = k.getDerMatrix(z=self.z, mode='self_test',der=der) # test test by test self derivative
@@ -271,6 +273,6 @@ def test_cov_new(self):
'''
if __name__ == "__main__":
- print "Running unit tests..."
+ print("Running unit tests...")
unittest.main()
diff --git a/pyGPs/Testing/unit_test_inf.py b/pyGPs/Testing/unit_test_inf.py
index eb71e53..d338c14 100644
--- a/pyGPs/Testing/unit_test_inf.py
+++ b/pyGPs/Testing/unit_test_inf.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
#================================================================================
# Marion Neumann [marion dot neumann at uni-bonn dot de]
# Daniel Marthaler [dan dot marthaler at gmail dot com]
@@ -56,7 +57,7 @@ def checkFITCOutput(self, post, nlZ, dnlZ):
def test_infExact(self):
- print "testing exact inference..."
+ print("testing exact inference...")
inffunc = pyGPs.inf.Exact()
meanfunc = pyGPs.mean.Zero()
covfunc = pyGPs.cov.RBF()
@@ -66,7 +67,7 @@ def test_infExact(self):
def test_infFITC_Exact(self):
- print "testing FITC inference..."
+ print("testing FITC inference...")
inffunc = pyGPs.inf.FITC_Exact()
meanfunc = pyGPs.mean.Zero()
covfunc = pyGPs.cov.RBF().fitc(self.u)
@@ -76,7 +77,7 @@ def test_infFITC_Exact(self):
def test_infEP(self):
- print "testing EP inference..."
+ print("testing EP inference...")
inffunc = pyGPs.inf.EP()
meanfunc = pyGPs.mean.Zero()
covfunc = pyGPs.cov.RBF()
@@ -86,7 +87,7 @@ def test_infEP(self):
def test_infFITC_EP(self):
- print "testing FITC EP inference..."
+ print("testing FITC EP inference...")
inffunc = pyGPs.inf.FITC_EP()
meanfunc = pyGPs.mean.Zero()
covfunc = pyGPs.cov.RBF().fitc(self.u)
@@ -96,7 +97,7 @@ def test_infFITC_EP(self):
def test_infLaplace(self):
- print "testing Laplace inference..."
+ print("testing Laplace inference...")
inffunc = pyGPs.inf.Laplace()
meanfunc = pyGPs.mean.Zero()
covfunc = pyGPs.cov.RBF()
@@ -106,7 +107,7 @@ def test_infLaplace(self):
def test_infFITC_Laplace(self):
- print "testing FITC EP inference..."
+ print("testing FITC EP inference...")
inffunc = pyGPs.inf.FITC_Laplace()
meanfunc = pyGPs.mean.Zero()
covfunc = pyGPs.cov.RBF().fitc(self.u)
@@ -126,6 +127,6 @@ def test_inf_new(self):
if __name__ == "__main__":
- print "Running unit tests..."
+ print("Running unit tests...")
unittest.main()
diff --git a/pyGPs/Testing/unit_test_lik.py b/pyGPs/Testing/unit_test_lik.py
index 1f007c7..3b91191 100644
--- a/pyGPs/Testing/unit_test_lik.py
+++ b/pyGPs/Testing/unit_test_lik.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+from builtins import range
#================================================================================
# Marion Neumann [marion dot neumann at uni-bonn dot de]
# Daniel Marthaler [marthaler at ge dot com]
@@ -58,40 +60,40 @@ def checkDerivativeEP(self, dlZhyp):
def checkLikelihood(self, likelihood):
- print "predictive mode"
+ print("predictive mode")
lp, ymu, ys2 = likelihood.evaluate(y=self.y, mu=self.mu, s2=self.s2, nargout=3)
self.checkPrediction(lp, ymu, ys2)
- print "inference mode(Laplace inference)"
+ print("inference mode(Laplace inference)")
lp,dlp,d2lp,d3lp = likelihood.evaluate(y=self.y, mu=self.mu, s2=self.s2, inffunc=pyGPs.inf.Laplace(), nargout=4)
self.checkInferenceLaplace(lp,dlp,d2lp,d3lp)
- for der in xrange(len(likelihood.hyp)):
+ for der in range(len(likelihood.hyp)):
lp_dhyp,dlp_dhyp,d2lp_dhyp = likelihood.evaluate(y=self.y, mu=self.mu, s2=self.s2, inffunc=pyGPs.inf.Laplace(), der=der, nargout=4)
self.checkDerivativeLaplace(lp_dhyp,dlp_dhyp,d2lp_dhyp)
- print "inference mode(EP inference)"
+ print("inference mode(EP inference)")
lp,dlp,d2lp = likelihood.evaluate(y=self.y, mu=self.mu, s2=self.s2, inffunc=pyGPs.inf.EP(), nargout=3)
self.checkInferenceEP(lp,dlp,d2lp)
- for der in xrange(len(likelihood.hyp)):
+ for der in range(len(likelihood.hyp)):
dlZhyp = likelihood.evaluate(y=self.y, mu=self.mu, s2=self.s2, inffunc=pyGPs.inf.EP(), der=der, nargout=3)
self.checkDerivativeEP(dlZhyp)
def test_likGauss(self):
- print "testing Gaussian likelihood..."
+ print("testing Gaussian likelihood...")
likelihood = pyGPs.lik.Gauss()
self.checkLikelihood(likelihood)
def test_likErf(self):
- print "testing error function(cumulative Gaussian) likelihood..."
+ print("testing error function(cumulative Gaussian) likelihood...")
likelihood = pyGPs.lik.Erf()
self.checkLikelihood(likelihood)
def test_likLaplace(self):
- print "testing Laplacian likelihood..."
+ print("testing Laplacian likelihood...")
likelihood = pyGPs.lik.Laplace()
self.checkLikelihood(likelihood)
@@ -105,6 +107,6 @@ def test_cov_new(self):
if __name__ == "__main__":
- print "Running unit tests..."
+ print("Running unit tests...")
unittest.main()
diff --git a/pyGPs/Testing/unit_test_mean.py b/pyGPs/Testing/unit_test_mean.py
index 595fcb9..6bdfda6 100644
--- a/pyGPs/Testing/unit_test_mean.py
+++ b/pyGPs/Testing/unit_test_mean.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+from builtins import range
#================================================================================
# Marion Neumann [marion dot neumann at uni-bonn dot de]
# Daniel Marthaler [dan dot marthaler at gmail dot com]
@@ -37,56 +39,56 @@ def checkDerOutput(self, derivative):
def checkMean(self, m):
mean = m.getMean(x=self.x) # get mean
self.checkMeanOutput(mean)
- for der in xrange(len(m.hyp)): # get derivatives
+ for der in range(len(m.hyp)): # get derivatives
derivative = m.getDerMatrix(x=self.x, der=der)
self.checkDerOutput(derivative)
def test_meanZero(self):
- print "testing meanZero..."
+ print("testing meanZero...")
m = pyGPs.mean.Zero()
self.checkMean(m)
def test_meanLinear(self):
- print "testing meanLinear..."
+ print("testing meanLinear...")
m = pyGPs.mean.Linear(D=self.x.shape[1])
self.checkMean(m)
def test_meanOne(self):
- print "testing meanOne..."
+ print("testing meanOne...")
m = pyGPs.mean.One()
self.checkMean(m)
def test_meanConst(self):
- print "testing meanConst..."
+ print("testing meanConst...")
m = pyGPs.mean.Const()
self.checkMean(m)
def test_meanScale(self):
- print "testing (compositing mean) muliply by a scalar..."
+ print("testing (compositing mean) muliply by a scalar...")
m = pyGPs.mean.One() * 6
self.checkMean(m)
def test_meanSum(self):
- print "testing (compositing mean) sum of two means..."
+ print("testing (compositing mean) sum of two means...")
m = pyGPs.mean.One() + pyGPs.mean.Const()
self.checkMean(m)
def test_meanProduct(self):
- print "testing (compositing mean) product of two means..."
+ print("testing (compositing mean) product of two means...")
m = pyGPs.mean.One() * pyGPs.mean.Const()
self.checkMean(m)
def test_meanPower(self):
- print "testing (compositing mean) power of a mean..."
+ print("testing (compositing mean) power of a mean...")
m = pyGPs.mean.Const() ** 2
self.checkMean(m)
@@ -102,6 +104,6 @@ def test_mean_new(self):
if __name__ == "__main__":
- print "Running unit tests..."
+ print("Running unit tests...")
unittest.main()
diff --git a/pyGPs/Testing/unit_test_model.py b/pyGPs/Testing/unit_test_model.py
index 7d45d3b..c54664c 100644
--- a/pyGPs/Testing/unit_test_model.py
+++ b/pyGPs/Testing/unit_test_model.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+from builtins import zip
#================================================================================
# Marion Neumann [marion dot neumann at uni-bonn dot de]
# Daniel Marthaler [dan dot marthaler at gmail dot com]
@@ -35,7 +37,7 @@ def setUp(self):
self.yc = clsData['y'] # training target
self.zc = clsData['xstar'] # test data
u1,u2 = np.meshgrid(np.linspace(-2,2,5),np.linspace(-2,2,5))
- self.uc = np.array(zip(np.reshape(u2,(np.prod(u2.shape),)),np.reshape(u1,(np.prod(u1.shape),)))) # inducing points
+ self.uc = np.array(list(zip(np.reshape(u2,(np.prod(u2.shape),)),np.reshape(u1,(np.prod(u1.shape),))))) # inducing points
def checkRegressionOutput(self, model):
@@ -55,7 +57,7 @@ def checkClassificationOutput(self, model):
def test_GPR(self):
- print "testing GP regression..."
+ print("testing GP regression...")
model = pyGPs.GPR()
m = pyGPs.mean.Zero()
k = pyGPs.cov.RBF()
@@ -67,7 +69,7 @@ def test_GPR(self):
def test_GPC(self):
- print "testing GP classification..."
+ print("testing GP classification...")
model = pyGPs.GPC()
m = pyGPs.mean.Zero()
k = pyGPs.cov.RBF()
@@ -88,7 +90,7 @@ def test_GPC(self):
def test_GPR_FITC(self):
- print "testing GP sparse regression..."
+ print("testing GP sparse regression...")
model = pyGPs.GPR_FITC()
m = pyGPs.mean.Zero()
k = pyGPs.cov.RBF()
@@ -100,7 +102,7 @@ def test_GPR_FITC(self):
def test_GPC_FITC(self):
- print "testing GP sparse classification..."
+ print("testing GP sparse classification...")
model = pyGPs.GPC_FITC()
m = pyGPs.mean.Zero()
k = pyGPs.cov.RBF()
@@ -123,6 +125,6 @@ def test_GPC_FITC(self):
if __name__ == "__main__":
- print "Running unit tests(about 5 min)..."
+ print("Running unit tests(about 5 min)...")
unittest.main()
diff --git a/pyGPs/Testing/unit_test_opt.py b/pyGPs/Testing/unit_test_opt.py
index a7cd325..219fab8 100644
--- a/pyGPs/Testing/unit_test_opt.py
+++ b/pyGPs/Testing/unit_test_opt.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
#================================================================================
# Marion Neumann [marion dot neumann at uni-bonn dot de]
# Daniel Marthaler [dan dot marthaler at gmail dot com]
@@ -37,33 +38,33 @@ def checkOptimizer(self, optimizer):
# and optimalHyp is a flattened numpy array
optimalHyp, funcValue = optimizer.findMin(self.x, self.y)
self.assertTrue(funcValue < self.nlZ_beforeOpt)
- print "optimal hyperparameters in flattened array:", optimalHyp
+ print("optimal hyperparameters in flattened array:", optimalHyp)
def test_CG(self):
- print "testing Conjugent gradient..."
+ print("testing Conjugent gradient...")
optimizer = pyGPs.Core.opt.CG(self.model)
self.checkOptimizer(optimizer)
def test_BFGS(self):
- print "testing quasi-Newton method of Broyden, Fletcher, Goldfarb, and Shanno (BFGS)..."
+ print("testing quasi-Newton method of Broyden, Fletcher, Goldfarb, and Shanno (BFGS)...")
optimizer = pyGPs.Core.opt.BFGS(self.model)
self.checkOptimizer(optimizer)
def test_SCG(self):
- print "testing Scaled conjugent gradient..."
+ print("testing Scaled conjugent gradient...")
optimizer = pyGPs.Core.opt.SCG(self.model)
self.checkOptimizer(optimizer)
def test_Minimize(self):
- print "testing minimize by Carl Rasmussen ..."
+ print("testing minimize by Carl Rasmussen ...")
optimizer = pyGPs.Core.opt.Minimize(self.model)
self.checkOptimizer(optimizer)
@@ -81,6 +82,6 @@ def test_MyOptimizer(self):
if __name__ == "__main__":
- print "Running unit tests..."
+ print("Running unit tests...")
unittest.main()
diff --git a/pyGPs/Validation/__init__.py b/pyGPs/Validation/__init__.py
index 0b19391..40b3f49 100644
--- a/pyGPs/Validation/__init__.py
+++ b/pyGPs/Validation/__init__.py
@@ -1,4 +1,5 @@
-import valid
+from __future__ import absolute_import
+from . import valid
diff --git a/pyGPs/Validation/valid.py b/pyGPs/Validation/valid.py
index 668eb44..5185b32 100644
--- a/pyGPs/Validation/valid.py
+++ b/pyGPs/Validation/valid.py
@@ -1,3 +1,6 @@
+from __future__ import division
+from builtins import range
+from past.utils import old_div
#================================================================================
# Marion Neumann [marion dot neumann at uni-bonn dot de]
# Daniel Marthaler [dan dot marthaler at gmail dot com]
@@ -36,7 +39,7 @@ def k_fold_validation(x, y, K=10, randomise=False):
n, D = x.shape
assert n > K
- for k in xrange(K):
+ for k in range(K):
x_train = np.array([e for i, e in enumerate(x) if i % K != k])
x_test = np.array([e for i, e in enumerate(x) if i % K == k])
y_train = np.array([e for i, e in enumerate(y) if i % K != k])
@@ -53,10 +56,10 @@ def k_fold_index(n, K=10):
:param K: number of folds
'''
- for k in xrange(K):
+ for k in range(K):
indice_train = []
indice_test = []
- for i in xrange(n):
+ for i in range(n):
if i % K == k:
indice_test.append(i)
else:
@@ -85,10 +88,10 @@ def ACC(predict,target):
'''
n,D = target.shape
count = 0.
- for i in xrange(n):
+ for i in range(n):
if predict[i,0] == target[i,0]:
count += 1
- return count/n
+ return old_div(count,n)
def Prec(predict,target):
@@ -102,12 +105,12 @@ def Prec(predict,target):
n,D = target.shape
count_1 = 0.
count_2 = 0.
- for i in xrange(n):
+ for i in range(n):
if predict[i,0] == 1:
count_1 += 1
if target[i,0] == 1:
count_2 += 1
- return count_2 / count_1
+ return old_div(count_2, count_1)
def Recall(predict,target):
@@ -121,12 +124,12 @@ def Recall(predict,target):
n,D = target.shape
count_1 = 0.
count_2 = 0.
- for i in xrange(n):
+ for i in range(n):
if target[i,0] == 1:
count_1 += 1
if predict[i,0] == 1:
count_2 += 1
- return count_2 / count_1
+ return old_div(count_2, count_1)
def NLPD(y, MU, S2):
diff --git a/pyGPs/__init__.py b/pyGPs/__init__.py
index 4dfa102..e6cda29 100644
--- a/pyGPs/__init__.py
+++ b/pyGPs/__init__.py
@@ -1,8 +1,9 @@
-import Optimization
-import Validation
-import GraphExtensions
-from Core import *
-from Core.gp import *
+from __future__ import absolute_import
+from . import Optimization
+from . import Validation
+from . import GraphExtensions
+from .Core import *
+from .Core.gp import *
__all__ = ['Optimization', 'Validation', 'GraphExtensions', 'Core']
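Note on the conversion pattern used throughout this patch: print statements become the print() function, xrange becomes range (via builtins), zip is wrapped in list() where an array is built from it, and divisions whose operands may be integers are routed through past.utils.old_div so the Python 2 result is preserved even under from __future__ import division. The snippet below is a minimal sketch of that behaviour and is not part of the patch; it assumes the future package (which provides builtins and past.utils) is installed, and the values are illustrative only.

from __future__ import division      # '/' now always means true division
from past.utils import old_div       # provided by the `future` package

# old_div reproduces Python 2 '/': floor division when both operands are
# integers, true division as soon as either operand is a float.
assert old_div(7, 2) == 3            # both ints -> floor, like Python 2
assert old_div(7.0, 2) == 3.5        # a float operand -> true division

# In valid.py the accumulators are initialised to 0. (a float), so e.g.
# old_div(count, n) in ACC already performs true division and the converted
# metrics (ACC, Prec, Recall) return the same values as before.
count, n = 3., 4
print(old_div(count, n))             # 0.75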