## Using functions to modify columns

In [1]:
import pandas as pd 
walt = pd.read_csv('../datasets/WALMART_SALES_DATA.csv')
cars = pd.read_csv('../datasets/cars.csv')

In [2]:
walt.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [3]:
cars.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


### Built-in Functions

In [4]:
# give the unnamed car-name column a meaningful label
cars = cars.rename(columns={'Unnamed: 0': 'model'})

In [5]:
cars.tail()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
27,Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
28,Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4
29,Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6
30,Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8
31,Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2


In [6]:
# split one string column into multiple columns
# each space-separated token of 'model' lands in its own column of brand_df
brand_df = cars['model'].str.split(pat=" ", expand=True)
# keep only the first token and attach it to the dataframe as 'brand'
cars['brand'] = brand_df[0]
cars.tail()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,brand
27,Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2,Lotus
28,Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4,Ford
29,Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6,Ferrari
30,Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8,Maserati
31,Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2,Volvo


### Named Functions

In [7]:
walt.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [8]:
walt['Weekly_Sales'].describe()

count    6.435000e+03
mean     1.046965e+06
std      5.643666e+05
min      2.099862e+05
25%      5.533501e+05
50%      9.607460e+05
75%      1.420159e+06
max      3.818686e+06
Name: Weekly_Sales, dtype: float64

In [9]:
# 1. define a function
# set standard
def get_std_sales(sale):
    """Map a weekly sales amount to a class label.

    Returns 'Gold' when sale is at least 1.6M, 'Platinum' when it is at
    least 1.5M (but below 1.6M), and 'Average' for anything else.
    """
    gold_floor = 1.6*(10**6)
    platinum_floor = 1.5*(10**6)
    # highest tier first so the lower threshold only sees the remainder
    if sale >= gold_floor:
        return 'Gold'
    return 'Platinum' if sale >= platinum_floor else 'Average'

In [10]:
# 2. validate function
print(get_std_sales(sale=1000000))

Average


In [11]:
# 3. apply function
walt['Sales_Class'] = walt['Weekly_Sales'].apply(get_std_sales)
walt['Sales_Class'].unique()

array(['Gold', 'Average', 'Platinum'], dtype=object)

### Anonymous Functions - lambda

In [12]:
x = 100
def mult(x):
    """Return x multiplied by itself."""
    squared = x * x
    return squared

print(mult(x))

10000


In [13]:
(lambda x:x*x)(100)

10000

In [14]:
walt['Weekly_Sales'].apply(get_std_sales)

0           Gold
1           Gold
2           Gold
3        Average
4       Platinum
          ...   
6430     Average
6431     Average
6432     Average
6433     Average
6434     Average
Name: Weekly_Sales, Length: 6435, dtype: object

In [15]:
# lambda function
walt['Weekly_Sales'].apply(lambda s: 'Gold' if s >= 1.6*(10**6) else ('Platinum' if s >= 1.5*(10**6) else 'Average'))

0           Gold
1           Gold
2           Gold
3        Average
4       Platinum
          ...   
6430     Average
6431     Average
6432     Average
6433     Average
6434     Average
Name: Weekly_Sales, Length: 6435, dtype: object

In [16]:
walt['Weekly_Sales'].apply(lambda s:s/(10**6))

0       1.643691
1       1.641957
2       1.611968
3       1.409728
4       1.554807
          ...   
6430    0.713174
6431    0.733455
6432    0.734464
6433    0.718126
6434    0.760281
Name: Weekly_Sales, Length: 6435, dtype: float64

### Test the difference between these two using callable()

In [17]:
# a named function object is callable
print(callable(mult))
# careful: this *calls* the lambda first, so callable() receives its return
# value (the int 10000) rather than the lambda itself -- hence False
print(callable((lambda x:x*x)(100)))
# testing the lambda object itself (without calling it) gives True:
# anonymous functions are just as callable as named ones
print(callable(lambda x:x*x))

True
False


### A challenging example: Apply a function using inputs from two columns

In [18]:
walt.columns

Index(['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature',
       'Fuel_Price', 'CPI', 'Unemployment', 'Sales_Class'],
      dtype='object')

In [19]:
# 1. define a function
# set standard
def get_std_sales_h(data):
    """Classify a row's weekly sales, only awarding a tier to non-holiday weeks.

    data is indexed by 'Weekly_Sales' and 'Holiday_Flag' (for example a
    dataframe row passed in by apply(..., axis=1)).

    Returns 'Gold' when sales are at least 1.6M and Holiday_Flag is 0,
    'Platinum' when sales are at least 1.5M and Holiday_Flag is 0,
    and 'Average' in every other case.
    """
    weekly_sales = data['Weekly_Sales']
    # below the lowest threshold the holiday flag does not matter
    if not (weekly_sales >= 1.5*(10**6)):
        return 'Average'
    # a tier is only awarded when the week is not a holiday
    if data['Holiday_Flag']==0:
        return 'Gold' if weekly_sales >= 1.6*(10**6) else 'Platinum'
    return 'Average'

In [20]:
# 2. Validate function
# take an independent copy of the first rows so the trial column is not added to `walt`
walt_tst = walt.head().copy()
walt_tst['Class_No_Holiday'] = walt_tst.apply(get_std_sales_h, axis=1)

In [21]:
walt_tst

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Sales_Class,Class_No_Holiday
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106,Gold,Gold
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106,Gold,Average
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106,Gold,Gold
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106,Average,Average
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106,Platinum,Platinum


In [22]:
# 3. apply to dataframe 
walt['Class_No_Holiday'] = walt.apply(get_std_sales_h, axis=1)

In [23]:
walt.head(7)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Sales_Class,Class_No_Holiday
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106,Gold,Gold
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106,Gold,Average
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106,Gold,Gold
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106,Average,Average
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106,Platinum,Platinum
5,1,12-03-2010,1439541.59,0,57.79,2.667,211.380643,8.106,Average,Average
6,1,19-03-2010,1472515.79,0,54.58,2.72,211.215635,8.106,Average,Average
