# 0312 Performance Eval and Query

## High-performance pandas: eval() and query() — Motivation: Compound Expressions

In [1]:
import numpy as np
import pandas as pd
from pandas import *  # NOTE: this also rebinds the builtin name `eval` to pandas.eval

In [2]:
# Dimensions (rows, columns) used for each random benchmark frame.
nrows = 10000
ncols = 100

In [3]:
# Seeded generator so the four frames are reproducible from a fresh kernel.
rng = np.random.RandomState(42)
# Four frames drawn one after another from the same generator.
df1, df2, df3, df4 = [DataFrame(rng.rand(nrows, ncols)) for _ in range(4)]

In [4]:
# Baseline timing: the four-frame sum evaluated as ordinary pandas arithmetic.
%timeit df1+df2+df3+df4

30.8 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%timeit eval('df1+df2+df3+df4')

15.8 ms ± 514 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Operations supported by pd.eval()

In [7]:
# Arithmetic operators: the same expression computed directly with pandas and
# as a string through pd.eval. pd.eval is named explicitly so the cell does not
# depend on the wildcard import shadowing Python's builtin eval.
result1 = -df1 * df2 / (df3 + df4)
result2 = pd.eval('-df1 * df2 / (df3 + df4)')

In [8]:
# Confirm the direct and string-evaluated results agree element-wise.
np.allclose(result1, result2)

True

In [9]:
# Comparison operators: element-wise `<` computed directly and through pd.eval
# (called explicitly rather than via the shadowed builtin name `eval`).
result1 = df1 < df2
result2 = pd.eval('df1<df2')
np.allclose(result1, result2)

True

In [11]:
# Boolean combination with the bitwise operators `&` and `|`, computed directly
# and through pd.eval (called explicitly rather than via the shadowed `eval`).
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)

True

In [12]:
# pd.eval accepts the literal `and` / `or` keywords inside the expression string.
# This must be pandas' evaluator: Python's builtin eval would apply `and`/`or`
# to whole DataFrames, whose truth value is ambiguous, and raise.
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
# result1 is the `&` / `|` version computed in the previous cell.
np.allclose(result1, result3)

True

In [17]:
# Row 0 of df2 as a Series. `.loc` reads the row by label directly instead of
# transposing the entire frame just to select one column of the transpose.
df2.loc[0]

0     0.595156
1     0.364717
2     0.005376
3     0.561088
4     0.896570
        ...   
95    0.636517
96    0.631943
97    0.800452
98    0.411491
99    0.590027
Name: 0, Length: 100, dtype: float64

In [20]:
# Column labelled 99 of df2, selected with explicit label-based indexing.
df2.loc[:, 99]

0       0.590027
1       0.600457
2       0.051971
3       0.088268
4       0.819564
          ...   
9995    0.373253
9996    0.183797
9997    0.357924
9998    0.944890
9999    0.691616
Name: 99, Length: 10000, dtype: float64

### Assignment in DataFrame.eval()

In [23]:
# Three-column frame of random values, drawn from the shared seeded generator,
# used by the DataFrame.eval() and DataFrame.query() examples.
df = DataFrame(
    data=rng.rand(1000, 3),
    columns=['A', 'B', 'C'],
)

In [25]:
# Assignment inside DataFrame.eval: create column D from columns A, B and C.
# Rebind `df` to the returned frame instead of mutating with inplace=True,
# so the cell reads as an explicit data-flow step.
df = df.eval('D=(A+B)/C')
df

Unnamed: 0,A,B,C,D
0,0.806987,0.221195,0.034941,29.426165
1,0.042237,0.869957,0.879758,1.036870
2,0.961407,0.777247,0.790136,2.200448
3,0.659825,0.469367,0.517064,2.183852
4,0.129864,0.750684,0.665182,1.323770
...,...,...,...,...
995,0.662551,0.903614,0.426714,3.670292
996,0.792661,0.447852,0.933819,1.328430
997,0.505293,0.732697,0.819745,1.510213
998,0.246425,0.633107,0.920428,0.955570


### Local variables in DataFrame.eval()

In [30]:
# Mean taken along axis=1, i.e. one value per row of df.
column_mean = df.mean(axis=1)
result1 = df.A + column_mean
# Inside the expression, `A` is a column and `@column_mean` is the Python variable.
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)

True

The @ character marks a variable name rather than a column name, and lets you efficiently evaluate expressions involving the two namespaces: the namespace of columns and the namespace of Python objects. The @ character is only supported by the df.eval() method, not by the pd.eval() function, because pd.eval() only has access to the one (Python) namespace.

### df.query() method

In [34]:
# Row filter written as an explicit boolean mask, directly and through pd.eval
# (called explicitly rather than via the shadowed builtin name `eval`).
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
# Only the last expression of a cell is displayed, so assert this intermediate
# check instead of computing it and silently discarding the result.
assert np.allclose(result1, result2)

# The mask spelling above is clumsy; DataFrame.query expresses the same filter
# using bare column names.
reslt3 = df.query('A<0.5 & B <0.5')
np.allclose(result1, reslt3)

True

In [35]:
# Scalar threshold: the mean of column C.
Cmean = df['C'].mean()
# Mask-based filter using the Python variable directly.
result1 = df.loc[(df['A'] < Cmean) & (df['B'] < Cmean)]
# Same filter as a query string; '@' refers to the local variable Cmean.
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)

True