In [36]:
import pandas as pd
import numpy as np
%load_ext Cython

In [69]:
df = pd.DataFrame(zip(np.random.random(100), np.random.choice([1,2,3],100), np.random.random(100)),\
                  columns=['col1','col2','col3'])

In [24]:
char_to_int = dict({1:'a', 2:'b', 3:'c'})

def add_column(val1, val2):
    return char_to_int[val1] + '_' +  str(val2)

In [31]:
df['new_column'] = df[['col2','col3']].apply(lambda x: add_column(*x), axis=1)

def test_time():
    for i in range(100):
        df['new_column'] = df[['col2','col3']].apply(lambda x: add_column(*x), axis=1)

In [45]:
%timeit  test_time()

1 loop, best of 3: 385 ms per loop


In [73]:
df['new_column'] = df[['col2','col3']].apply(lambda x: add_column(*x), axis=1)

def test_time():
    for i in range(100):
        df['new_column'] = df.apply(lambda x: add_column(x['col2'], x['col3']), axis=1)
        
%timeit  test_time()

1 loop, best of 3: 343 ms per loop


In [81]:
%prun -l 4 test_time()

 

         925404 function calls (884304 primitive calls) in 0.676 seconds

   Ordered by: internal time
   List reduced from 164 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    20000    0.072    0.000    0.371    0.000 base.py:2454(get_value)
    20000    0.039    0.000    0.423    0.000 series.py:598(__getitem__)
    20000    0.035    0.000    0.084    0.000 base.py:1303(_convert_scalar_indexer)
      100    0.035    0.000    0.605    0.006 {pandas._libs.lib.reduce}

In [176]:
%%cython 
cimport numpy as np
import numpy as np
char_to_int = dict({1:1, 2:2, 3:3})
char_to_int = dict({1:'a', 2:'b', 3:'c'})

cpdef str add_column_cython(int val1, int val2):

    return char_to_int[val1]# + '_' +  str(val2)

cpdef np.ndarray[str] apply_add_column_cython(np.ndarray col_1, np.ndarray col_2):
    cdef Py_ssize_t n = len(col_1)
    #cdef np.ndarray[str] res = np.empty(n, dtype=str)
    cdef np.ndarray res = np.empty(n, object)
    for i in range(len(col_1)):
        res[i] = add_column_cython(col_1[i], col_2[i])
    return res
        

In [177]:
def test_time_cython():
    for i in range(100):
        #df['new_column'] = df[['col2','col3']].apply(lambda x: add_column_cython(*x), axis=1)
        #df['new_column'] = df.apply(lambda x: add_column_cython(x['col2'], x['col3']), axis=1)
        apply_add_column_cython(df['col2'].values, df['col3'].values)

In [178]:
%timeit  test_time_cython()

100 loops, best of 3: 2.91 ms per loop


In [179]:
%prun -l 4 test_time_cython()

 

         2704 function calls in 0.005 seconds

   Ordered by: internal time
   List reduced from 18 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      100    0.003    0.000    0.003    0.000 {_cython_magic_b91fdf54214ab0d05cf287eb3a897765.apply_add_column_cython}
      200    0.001    0.000    0.002    0.000 frame.py:1940(__getitem__)
        1    0.000    0.000    0.005    0.005 <ipython-input-177-07eca7346780>:1(test_time_cython)
      200    0.000    0.000    0.000    0.000 base.py:1638(__contains__)

https://pandas.pydata.org/pandas-docs/version/0.22.0/enhancingperf.html

In [142]:
%prun -l 4 test_time_cython()

 

         2704 function calls in 0.007 seconds

   Ordered by: internal time
   List reduced from 18 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      100    0.003    0.000    0.003    0.000 {_cython_magic_e925acb518b667df47a7fdb33883c308.apply_add_column_cython}
      200    0.001    0.000    0.002    0.000 frame.py:1940(__getitem__)
        1    0.001    0.001    0.007    0.007 <ipython-input-140-07eca7346780>:1(test_time_cython)
      200    0.000    0.000    0.000    0.000 base.py:1638(__contains__)

In [160]:
import cython
cython.__version__

'0.26.1'

In [172]:
import numpy as np
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [174]:
%%cython 

import numpy as np
cimport numpy as np

cdef func(str foo):
    cdef np.ndarray res = np.empty(1, object)
    res[0] = foo
    return res
    
def testing():
    print func('blah')

In [175]:
testing()

['blah']


In [52]:
df = pd.DataFrame({'a': np.random.randn(1000),
                      'b': np.random.randn(1000),
                      'N': np.random.randint(100, 1000, (1000)),
                       'x': 'x'})


In [54]:
def f(x):
    return x * (x - 1)


def integrate_f(a, b, N):
    s = 0
    dx = (b - a) / N
    for i in range(N):
        s += f(a + i * dx)
    return s * dx

In [55]:
%timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)

1 loop, best of 3: 302 ms per loop


In [56]:
%prun -l 4 df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)

 

         668979 function calls (663970 primitive calls) in 0.434 seconds

   Ordered by: internal time
   List reduced from 135 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1000    0.237    0.000    0.354    0.000 <ipython-input-54-2af0239c16c6>:5(integrate_f)
   549446    0.114    0.000    0.114    0.000 <ipython-input-54-2af0239c16c6>:1(f)
     3000    0.010    0.000    0.054    0.000 base.py:2454(get_value)
     3000    0.006    0.000    0.062    0.000 series.py:598(__getitem__)

In [58]:
%load_ext Cython


The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [60]:
%%cython
def f_plain(x):
    return x * (x - 1)
def integrate_f_plain(a, b, N):
    s = 0
    dx = (b - a) / N
    for i in range(N):
        s += f_plain(a + i * dx)
    return s * dx


In [61]:
%timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1)

10 loops, best of 3: 87.8 ms per loop


In [62]:
%%cython
cdef double f_typed(double x) except? -2:
    return x * (x - 1)
cpdef double integrate_f_typed(double a, double b, int N):
    cdef int i
    cdef double s, dx
    s = 0
    dx = (b - a) / N
    for i in range(N):
        s += f_typed(a + i * dx)
    return s * dx

In [63]:
%timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1)

10 loops, best of 3: 39.9 ms per loop


In [64]:
%prun -l 4 df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1)

 

         118533 function calls (113524 primitive calls) in 0.112 seconds

   Ordered by: internal time
   List reduced from 133 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     3000    0.015    0.000    0.077    0.000 base.py:2454(get_value)
     3000    0.008    0.000    0.087    0.000 series.py:598(__getitem__)
     3000    0.007    0.000    0.017    0.000 base.py:1303(_convert_scalar_indexer)
     3000    0.005    0.000    0.011    0.000 internals.py:4224(get_values)

In [65]:
%%cython
cimport numpy as np
import numpy as np
cdef double f_typed(double x) except? -2:
    return x * (x - 1)
cpdef double integrate_f_typed(double a, double b, int N):
    cdef int i
    cdef double s, dx
    s = 0
    dx = (b - a) / N
    for i in range(N):
        s += f_typed(a + i * dx)
    return s * dx
cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, np.ndarray col_N):
    assert (col_a.dtype == np.float and col_b.dtype == np.float and col_N.dtype == np.int)
    cdef Py_ssize_t i, n = len(col_N)
    assert (len(col_a) == len(col_b) == n)
    cdef np.ndarray[double] res = np.empty(n)
    for i in range(len(col_a)):
        res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i])
    return res

In [67]:
%timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1)

10 loops, best of 3: 39.1 ms per loop
