In [1]:
import pandas as pd

In [5]:
import numpy as np

In [11]:
# Build four large frames of uniform random floats for the timing cells below.
# rng.random((nrows, ncols)) draws nrows * ncols floats, each >= 0 and < 1, as one 2-D array;
# pd.DataFrame(...) wraps that array as a frame with nrows rows and ncols columns.
# Each frame comes from its own rng.random call on the same seeded generator.
nrows, ncols = 100000, 100
rng = np.random.default_rng(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.random((nrows, ncols))) for _ in range(4))

In [19]:
# Tiny 3 x 2 version of the setup above, printed both as a raw array and as a frame
rng = np.random.default_rng(seed=42)
x = rng.random(size=(3, 2))
print(x)
df = pd.DataFrame(data=x)
print(df)

[[0.77395605 0.43887844]
 [0.85859792 0.69736803]
 [0.09417735 0.97562235]]
          0         1
0  0.773956  0.438878
1  0.858598  0.697368
2  0.094177  0.975622


In [20]:
# Baseline timing: sum the four frames with ordinary pandas operators
%timeit df1 + df2 + df3 + df4

18.6 ms ± 114 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
# Same sum, this time passed to pd.eval as a string expression
%timeit pd.eval('df1 + df2 + df3 + df4')
# Recorded run: 9.94 ms here vs 18.6 ms for the plain expression, i.e. ~47% less time (about 1.9x faster)

9.94 ms ± 59.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [23]:
# Sanity check: both ways of computing the sum agree element-wise (within np.allclose's tolerance)
np.allclose(
    a=df1 + df2 + df3 + df4,
    b=pd.eval('df1 + df2 + df3 + df4'),
)

True

In [33]:
# 2 x 2 array of random integers drawn from 1..5 (the upper bound 6 is excluded)
rng = np.random.default_rng(seed=42)
x = rng.integers(low=1, high=6, size=(2, 2))
x

array([[1, 4],
       [4, 3]])

In [39]:
# Two frames, each constructed from the same integer array x
df5 = pd.DataFrame(x)
df6 = pd.DataFrame(x)
print(df5, df6, sep='\n')

   0  1
0  1  4
1  4  3
   0  1
0  1  4
1  4  3


In [35]:
# An integer size gives a flat 1-D array (here of length 2)
rng = np.random.default_rng(seed=42)
y = rng.integers(low=1, high=6, size=2)
y

array([1, 4])

In [38]:
# A (2, 1) size gives a 2-D column: two rows, one value each
rng = np.random.default_rng(seed=42)
z = rng.integers(low=1, high=6, size=(2, 1))
z

array([[1],
       [4]])

In [42]:
# Column-wise operations: a 1000-row frame of random floats with named columns A, B, C
df = pd.DataFrame(
    data=rng.random(size=(1000, 3)),
    columns=['A', 'B', 'C'],
)
df.head(n=5)

Unnamed: 0,A,B,C
0,0.594096,0.244855,0.745675
1,0.084481,0.774418,0.589985
2,0.11075,0.079506,0.750836
3,0.292936,0.889302,0.762073
4,0.69819,0.278606,0.708945


In [44]:
# The same column arithmetic written with plain operators (res1) and as a pd.eval string (res2)
res1 = (df.A + df.B) / (df.C - 1)
res2 = pd.eval("(df.A + df.B)/ (df.C - 1)")
np.allclose(a=res1, b=res2)

True

In [45]:
# With DataFrame.eval the expression names the columns directly, without the df. prefix
res3 = df.eval(expr='(A + B)/ (C - 1)')
np.allclose(a=res1, b=res3)

True

In [46]:
# Assignment inside the expression: "D = ..." adds a new column D to df in place
df.eval(expr='D = (A + B) / C', inplace=True)
df.head(n=5)

Unnamed: 0,A,B,C,D
0,0.594096,0.244855,0.745675,1.125089
1,0.084481,0.774418,0.589985,1.455797
2,0.11075,0.079506,0.750836,0.253392
3,0.292936,0.889302,0.762073,1.551344
4,0.69819,0.278606,0.708945,1.377818


In [47]:
# Assigning to a column name that already exists overwrites it: D is recomputed here
df.eval(expr='D = (A - B) / C', inplace=True)
df.head(n=5)

Unnamed: 0,A,B,C,D
0,0.594096,0.244855,0.745675,0.468356
1,0.084481,0.774418,0.589985,-1.169414
2,0.11075,0.079506,0.750836,0.041613
3,0.292936,0.889302,0.762073,-0.782557
4,0.69819,0.278606,0.708945,0.591843


In [50]:
# Local Python variables are referenced inside the expression with the @ prefix.
# axis=1 takes the mean across the columns, so col_mean holds one value per row.
col_mean = df.mean(axis=1)
res1 = df.A + col_mean
res2 = df.eval(expr='A + @col_mean')
np.allclose(a=res1, b=res2)

True

In [48]:
# IPython help lookup: the trailing ? displays the documentation for df.mean
df.mean?

In [51]:
# Row filtering with a boolean mask, written directly (res1) and as a pd.eval string (res2)
res1 = df[(df['A'] < .5) & (df['B'] < .5)]
res2 = pd.eval('df[(df.A < .5) & (df.B < .5)]')
np.allclose(a=res1, b=res2)

True

In [53]:
# error - df.eval cannot be used here: it returns the boolean mask (a Series), not the filtered rows, so comparing it with res1 fails
# res3 = df.eval('(A < .5) & (B < .5)')
# np.allclose(res1, res3)

In [54]:
# DataFrame.query takes just the condition and returns the rows that satisfy it
res3 = df.query(expr='(A < .5) & (B < .5)')
np.allclose(a=res1, b=res3)

True

In [56]:
# Small integer frame used to contrast DataFrame.eval with DataFrame.query
df123 = pd.DataFrame(
    data={
        'A': range(1, 6),
        'B': range(10, 0, -2),
        'C': range(10, 5, -1),
    }
)
df123

Unnamed: 0,A,B,C
0,1,10,10
1,2,8,9
2,3,6,8
3,4,4,7
4,5,2,6


In [57]:
# query returns only the rows where the condition holds
df123.query(expr='A > B')

Unnamed: 0,A,B,C
4,5,2,6


In [58]:
# eval returns the condition itself: a boolean Series with one entry per row
df123.eval(expr='A > B')

0    False
1    False
2    False
3    False
4     True
dtype: bool