In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
# Seed NumPy's global RNG so this demo frame is identical on every
# Restart & Run All instead of changing with each kernel session.
np.random.seed(42)
# 1000 rows x 5 columns of uniform samples from [0, 1).
df1 = pd.DataFrame(np.random.rand(1000, 5), columns=['A', 'B', 'C', 'D', 'E'])
df1.head(2)

Unnamed: 0,A,B,C,D,E
0,0.420997,0.562956,0.482889,0.464087,0.706939
1,0.98882,0.313047,0.342954,0.370695,0.343492


In [4]:
# Load the training CSV; the 'Date' column is parsed into datetimes on read.
df2 = pd.read_csv(
    '../data/P87-S5-Grouping-Resources/train.csv',
    low_memory=False,
    parse_dates=['Date'],
)
df2.head(2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1


### Use apply() COLUMN-WISE to DATA

In [5]:
def max_min(columns):
    """Return the spread of ``columns``: its largest value minus its smallest.

    ``columns`` is any object exposing ``.max()`` and ``.min()``; in this
    notebook it is one DataFrame column at a time, supplied by
    ``DataFrame.apply``.
    """
    highest = columns.max()
    lowest = columns.min()
    return highest - lowest

In [6]:
# axis=0 (the default) hands each column to max_min, giving one value per column.
df1.apply(max_min, axis=0)

A    0.997238
B    0.995051
C    0.998363
D    0.998469
E    0.998731
dtype: float64

### Use apply() ROW-WISE to DATA

In [7]:
def sums(row):
    """Add up the 'A', 'B', 'C', 'D' and 'E' entries of ``row``, left to right.

    ``row`` only needs to support lookup by those five labels; any other
    entries it carries are ignored.
    """
    total = row['A']
    for label in ('B', 'C', 'D', 'E'):
        total = total + row[label]
    return total

In [10]:
# axis='columns' (same as axis=1) passes each row to sums; the per-row totals
# are stored in a new 'SUM' column.
df1['SUM'] = df1.apply(sums, axis='columns')
df1.head(2)

Unnamed: 0,A,B,C,D,E,SUM
0,0.420997,0.562956,0.482889,0.464087,0.706939,2.637869
1,0.98882,0.313047,0.342954,0.370695,0.343492,2.359007


### Use apply() with a more sophisticated function

In [11]:
# Preview the first five rows of the sales data before labelling them.
df2.head(5)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [25]:
def best_2015_store(row, min_customers=837, min_sales=78560, year=2015):
    """Label one record by how busy and how profitable it was in ``year``.

    Parameters
    ----------
    row : object with ``Date``, ``Customers`` and ``Sales`` attributes
        In this notebook, one DataFrame row supplied by
        ``DataFrame.apply(..., axis=1)``.
    min_customers : number, default 837
        ``Customers`` must be strictly greater than this to count as busy.
    min_sales : number, default 78560
        ``Sales`` must be strictly greater than this to count as profitable.
    year : int, default 2015
        Records dated in any other year are always labelled "NORMAL".

    Returns
    -------
    str
        'BUSY & PROFITABLE', 'BUSY', 'PROFITABLE' or "NORMAL".

    The thresholds are keyword parameters so they can be tuned from the call
    site, e.g. ``df.apply(best_2015_store, axis=1, min_sales=8000)``; calling
    with ``row`` alone keeps the original hard-coded behaviour.
    """
    # NOTE(review): the default min_sales of 78560 is far above the Sales
    # values visible in the notebook preview (roughly 5k-14k) — possibly a
    # typo for a smaller number; confirm before relying on 'PROFITABLE'.

    # pd.to_datetime accepts both already-parsed timestamps and date strings.
    date = pd.to_datetime(row.Date)
    if date.year != year:
        return "NORMAL"

    busy = row.Customers > min_customers
    profitable = row.Sales > min_sales
    if busy and profitable:
        return 'BUSY & PROFITABLE'
    if busy:
        return 'BUSY'
    if profitable:
        return 'PROFITABLE'
    return "NORMAL"
        

In [26]:
# Row-wise apply: each row of df2 is passed to best_2015_store and the
# returned labels are collected into a Series.
df2.apply(best_2015_store, axis='columns')

0          NORMAL
1          NORMAL
2          NORMAL
3            BUSY
4          NORMAL
            ...  
1017204    NORMAL
1017205    NORMAL
1017206    NORMAL
1017207    NORMAL
1017208    NORMAL
Length: 1017209, dtype: object

In [37]:
# Three 1000-sample columns drawn from different distributions:
# A ~ normal(loc=10, scale=20), B ~ exponential(scale=5), C ~ integers in [0, 10).
# Built in one expression so df3 is only ever bound to a DataFrame.
df3 = pd.DataFrame({
    'A': np.random.normal(loc=10, scale=20, size=1000),
    'B': np.random.exponential(scale=5, size=1000),
    'C': np.random.randint(10, size=1000),
})
df3.head()

Unnamed: 0,A,B,C
0,28.303752,13.960288,3
1,2.971212,1.063798,8
2,3.467566,4.092073,5
3,2.556651,3.719919,6
4,31.019526,3.528092,7


In [54]:
def normalizer(row):
    """Standardise ``row``: subtract its mean, then divide by its ``.std()``.

    Despite the parameter name, the notebook passes whole columns to this
    function (``df3.transform`` / ``df3.apply`` with the default axis).
    No guard is applied for a zero standard deviation.
    """
    centre = row.mean()
    spread = row.std()
    return (row - centre) / spread

In [55]:
# transform with axis=0 (the default) standardises each column of df3.
df3.transform(normalizer, axis=0)

Unnamed: 0,A,B,C
0,0.944423,1.968793,-0.544014
1,-0.334108,-0.804361,1.207480
2,-0.309057,-0.153186,0.156584
3,-0.355031,-0.233211,0.506883
4,1.081488,-0.274460,0.857181
...,...,...,...
995,0.736427,0.744793,1.207480
996,-1.641676,-0.642889,0.857181
997,-0.103061,0.482836,0.506883
998,2.028254,8.192235,1.207480


In [56]:
# Same column-wise standardisation via apply, for comparison with transform.
df3.apply(normalizer, axis=0)

Unnamed: 0,A,B,C
0,0.944423,1.968793,-0.544014
1,-0.334108,-0.804361,1.207480
2,-0.309057,-0.153186,0.156584
3,-0.355031,-0.233211,0.506883
4,1.081488,-0.274460,0.857181
...,...,...,...
995,0.736427,0.744793,1.207480
996,-1.641676,-0.642889,0.857181
997,-0.103061,0.482836,0.506883
998,2.028254,8.192235,1.207480


In [50]:
# Column totals of df3 (axis=0 is the default direction).
df3.sum(axis=0)

A    9591.158883
B    4804.459718
C    4553.000000
dtype: float64