# Pandas - using 'apply'

In [3]:
import pandas as pd

### Apply on all elements of the DataFrame

In [4]:
# Build a small all-numeric frame to demonstrate apply on a whole DataFrame.
# The source mapping is called `data` (not `dict`) so the builtin `dict`
# type is not shadowed for the remaining cells.
data = {'A': [1, 2, 3, 4, 5], 'NUM': [100, 200, 300, 400, 500]}
df1 = pd.DataFrame(data)
df1

Unnamed: 0,A,NUM
0,1,100
1,2,200
2,3,300
3,4,400
4,5,500


In [5]:
def dubble(x):
    """Return ``x`` multiplied by two.

    Accepts anything that supports ``2 * x``: plain numbers as well as the
    pandas objects handed over by ``apply``.
    """
    doubled = 2 * x
    return doubled

In [9]:
# Run dubble over the whole frame and keep the result under a separate name.
df2 = df1.apply(dubble) # apply returns a new object; it does not change the original values in df1
df2

Unnamed: 0,A,NUM
0,2,200
1,4,400
2,6,600
3,8,800
4,10,1000


In [8]:
df1

Unnamed: 0,A,NUM
0,1,100
1,2,200
2,3,300
3,4,400
4,5,500


### Apply on selected columns

In [11]:
# Rebuild df1 with an extra string column 'B' so that apply can be shown on
# a subset of columns. The source mapping is called `data` (not `dict`) so
# the builtin `dict` type is not shadowed for the remaining cells.
data = {
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'NUM': [100, 200, 300, 400, 500],
}
df1 = pd.DataFrame(data)
df1

Unnamed: 0,A,B,NUM
0,1,a,100
1,2,b,200
2,3,c,300
3,4,d,400
4,5,e,500


In [17]:
def devide_10(x):
    """Return ``x`` divided by 10 using true division (``/``)."""
    tenth = x / 10
    return tenth

In [14]:
# Select the single column 'NUM' and run devide_10 on it; the other columns are not involved.
df2 = df1['NUM'].apply(devide_10)
df2

0    10.0
1    20.0
2    30.0
3    40.0
4    50.0
Name: NUM, dtype: float64

In [15]:
# Inspect the result type: applying to one selected column gives back a Series rather than a DataFrame.
type(df2)

pandas.core.series.Series

In [18]:
# Store the result of devide_10 as a new column 'NUM_10' in df1 (this modifies df1 in place).
df1['NUM_10'] = df1['NUM'].apply(devide_10)
df1

Unnamed: 0,A,B,NUM,NUM_10
0,1,a,100,10.0
1,2,b,200,20.0
2,3,c,300,30.0
3,4,d,400,40.0
4,5,e,500,50.0


In [19]:
# Double only the columns 'A' and 'NUM'; the result is displayed but not stored, so df1 is unchanged.
df1[['A', 'NUM']].apply(dubble)

Unnamed: 0,A,NUM
0,2,200
1,4,400
2,6,600
3,8,800
4,10,1000


### Calculate aggregates of multiple columns

In [20]:
def sum(a, b):
    """Return ``a + b``.

    Note: defining this name at notebook level replaces the builtin ``sum``
    for every cell that runs afterwards.
    """
    total = a + b
    return total

In [27]:
# axis=1 hands each row to the lambda, which picks 'A' and 'NUM' by label.
# `sum` here is the two-argument helper defined in this notebook, not the builtin.
df1['SUM1'] = df1.apply(lambda x: sum(x['A'], x['NUM']), axis=1) # operates on the whole DataFrame
df1

Unnamed: 0,A,B,NUM,NUM_10,SUM,SUM2,SUM1
0,1,a,100,10.0,101,101,101
1,2,b,200,20.0,202,202,202
2,3,c,300,30.0,303,303,303
3,4,d,400,40.0,404,404,404
4,5,e,500,50.0,505,505,505


In [26]:
# Same result as selecting by label, but only the two needed columns are passed in:
# `*x` unpacks the row's values positionally into sum(a, b), so the column order ['A', 'NUM'] matters.
df1['SUM2'] = df1[['A', 'NUM']].apply(lambda x: sum(*x), axis=1) # operates on the needed columns only
df1

Unnamed: 0,A,B,NUM,NUM_10,SUM,SUM2
0,1,a,100,10.0,101,101
1,2,b,200,20.0,202,202
2,3,c,300,30.0,303,303
3,4,d,400,40.0,404,404
4,5,e,500,50.0,505,505


### Generate multiple new columns in one run

In [28]:
def dubble2(x, y):
    """Double both arguments and return them together as the tuple ``(x * 2, y * 2)``."""
    doubled_x = x * 2
    doubled_y = y * 2
    return doubled_x, doubled_y

In [31]:
 # Each row's (A, NUM_10) pair is unpacked into dubble2. As written, the result is a single
 # Series of tuples (dtype object), not separate columns.
 # NOTE(review): DataFrame.apply accepts result_type='expand' to spread such tuples into columns — confirm whether that is intended here.
 df1[['A', 'NUM_10']].apply(lambda row: dubble2(*row), axis=1)


0      (2.0, 20.0)
1      (4.0, 40.0)
2      (6.0, 60.0)
3      (8.0, 80.0)
4    (10.0, 100.0)
dtype: object