In [27]:
import pandas as pd
import numpy as np

`DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), by_row='compat', engine='python', engine_kwargs=None, **kwargs)`

- `apply()` - applies a function row-wise or column-wise
- `pipe()` - applies a function to the whole table (table-wise)
* Useful links:
      - https://www.youtube.com/watch?v=DsjvCKxOdgI

In [30]:
import seaborn as sns
# List the names of the example datasets that seaborn can load by name
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

- To demonstrate how to load one of these datasets, we will pick '`diamonds`'. It contains records of diamond prices and characteristics such as colour, cut, clarity and depth
* We will load this dataset using `sns.load_dataset()`. The argument will be the dataset name

In [33]:
# Load the diamonds example dataset and keep only its first 100 rows
df = sns.load_dataset('diamonds').head(100)
# Preview the first five rows
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [34]:
# Load the titanic example dataset and keep only its first 100 rows
df1 = sns.load_dataset('titanic').head(100)
# Preview the first five rows
df1.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Apply

We can apply a function (custom or built-in) along an axis of the DataFrame with `.apply()`. You will broadcast the function to the DataFrame. The documentation is [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html)
  * The first argument is `func`
  * The second argument is `axis`; `axis = 1` means that the function is applied to each row (using the needed columns), while the default `axis = 0` applies it to each column

* We will consider two custom functions. They have no particular business meaning; they exist only to explain the concept
  * Sum the values of x, y, and z
  * Multiply the price by 4

* Each function is applied to the DataFrame (or to one of its columns), and the resulting Series is assigned to a new column

In [39]:
#### first function
def sum_of_xyz(df):
    '''
    Add together the entries stored under the keys 'x', 'y' and 'z'.

    df: any object that supports lookup by those three keys. In this
        notebook the function is passed to DataFrame.apply with axis=1,
        so it is called once per row.

    Returns the result of df['x'] + df['y'] + df['z'].
    '''
    x_part, y_part, z_part = df['x'], df['y'], df['z']
    return x_part + y_part + z_part

# Call sum_of_xyz once per row (axis = 1) and store the results in a new column named xyzSum
df['xyzSum'] = df.apply(sum_of_xyz, axis = 1)


#### second function
def multiply_price_by_4(price):
    '''
    Multiply price by 4 and return the result.

    price: the value to scale. In this notebook the function is passed to
           Series.apply on the price column, so it is called with one
           value at a time.
    '''
    factor = 4
    return price * factor

# Call multiply_price_by_4 on each value of the price column and store the
# results in a new column named price_times_4.
# Note: df['price'] returns a Series (a one-dimensional object with the values of the column),
# whereas df[['price']] returns a DataFrame (a two-dimensional object with one column).
df['price_times_4'] = df['price'].apply(multiply_price_by_4)

df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,xyzSum,price_times_4
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,10.36,1304
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,10.04,1304
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,10.43,1308
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,11.06,1336
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,11.44,1340
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.70,Good,E,VS2,57.5,58.0,2759,5.85,5.90,3.38,15.13,11036
96,0.70,Good,F,VS1,59.4,62.0,2759,5.71,5.76,3.40,14.87,11036
97,0.96,Fair,F,SI2,66.3,62.0,2759,6.27,5.95,4.07,16.29,11036
98,0.73,Very Good,E,SI1,61.6,59.0,2760,5.77,5.78,3.56,15.11,11040


In [41]:
# Load the tips example dataset
df_practice = sns.load_dataset('tips')
# Report the shape of the full dataset before it is truncated
print(f"DataFrame shape: {df_practice.shape}")
# Keep only the first 100 rows for the practice exercises, then preview ten of them
df_practice = df_practice.head(100)
df_practice.head(10)

DataFrame shape: (244, 7)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [43]:
def proportion_tip_total_bill(df_practice):
    '''
    Return the tip divided by the total bill.

    df_practice: any object that supports lookup by the keys 'tip' and
                 'total_bill'. In this notebook the function is passed to
                 DataFrame.apply with axis=1, so it is called once per row.
    '''
    tip_amount = df_practice['tip']
    bill_amount = df_practice['total_bill']
    return tip_amount / bill_amount
    
# Call proportion_tip_total_bill once per row (axis=1) and store the results
# in a new column named tips_proportion, then preview the first five rows
df_practice['tips_proportion'] = df_practice.apply(proportion_tip_total_bill, axis=1)
df_practice.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_proportion
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [45]:
def high_tips_day(df_practice):
    '''
    Label a record according to the size of its tip.

    df_practice: any object that supports lookup by the keys 'tip' and
                 'day'. In this notebook the function is passed to
                 DataFrame.apply with axis=1, so it is called once per row.

    Returns 'High tips day is <day>' when the tip is greater than 2.0,
    otherwise 'Low tips day'.
    '''
    if df_practice['tip'] > 2.0:
        # Double quotes inside the single-quoted f-string: reusing the same
        # quote character is a SyntaxError on Python versions before 3.12.
        return f'High tips day is {df_practice["day"]}'
    else:
        # Plain string: there is nothing to interpolate here
        return 'Low tips day'

# Call high_tips_day once per row (axis=1) and store the labels in a new column named high_tips_day
df_practice['high_tips_day'] = df_practice.apply(high_tips_day, axis=1)
df_practice

# Alternative: pd.DataFrame(df_practice.apply(high_tips_day, axis=1)) creates a DataFrame containing only the output

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_proportion,high_tips_day
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,Low tips day
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,Low tips day
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587,High tips day is Sun
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780,High tips day is Sun
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,High tips day is Sun
...,...,...,...,...,...,...,...,...,...
95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.117750,High tips day is Fri
96,27.28,4.00,Male,Yes,Fri,Dinner,2,0.146628,High tips day is Fri
97,12.03,1.50,Male,Yes,Fri,Dinner,2,0.124688,Low tips day
98,21.01,3.00,Male,Yes,Fri,Dinner,2,0.142789,High tips day is Fri


In [25]:
# NOTE(review): the saved output below is an unnamed Series of labels, not a
# DataFrame preview, so it appears to come from a different expression; re-run to confirm.
df_practice.head(5)

0            Low tips day
1            Low tips day
2    High tips day is Sun
3    High tips day is Sun
4    High tips day is Sun
dtype: object

In [14]:
# The converted Series is only displayed here; it is not assigned back to df_practice
df_practice['total_bill'].astype(float)

0     16.99
1     10.34
2     21.01
3     23.68
4     24.59
      ...  
95    40.17
96    27.28
97    12.03
98    21.01
99    12.46
Name: total_bill, Length: 100, dtype: float64

In [15]:
def bonus_on_bill(bonus):
    '''
    Return bonus * 1.5 when bonus is greater than 20.00; otherwise return
    bonus formatted as a string.

    Note that the two branches return different types (a number versus a str).
    '''
    return bonus * 1.5 if bonus > 20.00 else f'{bonus}'
        
# Call bonus_on_bill on every total_bill value; the result is only displayed, not stored.
# The saved output below has dtype object because the function returns a mix of numbers and strings.
df_practice['total_bill'].apply(bonus_on_bill)

0      16.99
1      10.34
2     31.515
3      35.52
4     36.885
       ...  
95    60.255
96     40.92
97     12.03
98    31.515
99     12.46
Name: total_bill, Length: 100, dtype: object

In [16]:
def bonus_on_bill(total_bill):
    '''
    Return total_bill * 1.5 when total_bill is greater than 20.00; otherwise
    return total_bill formatted as a string with a leading '-'.

    Note that the two branches return different types (a number versus a str).
    This definition reuses the name of the earlier bonus_on_bill in this
    notebook, so it replaces that one once this cell has run.
    '''
    if total_bill > 20.00:
        return total_bill * 1.5
    return f'-{total_bill}'
        
# Store the result of bonus_on_bill for each total_bill value in a new column named Bonus_pay
df_practice['Bonus_pay'] = df_practice['total_bill'].apply(bonus_on_bill)
df_practice

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_proportion,Bonus_pay
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,-16.99
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,-10.34
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587,31.515
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780,35.52
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,36.885
...,...,...,...,...,...,...,...,...,...
95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.117750,60.255
96,27.28,4.00,Male,Yes,Fri,Dinner,2,0.146628,40.92
97,12.03,1.50,Male,Yes,Fri,Dinner,2,0.124688,-12.03
98,21.01,3.00,Male,Yes,Fri,Dinner,2,0.142789,31.515


In [17]:
# Small two-column numeric frame used for the categorize example
df = pd.DataFrame(
    {
        'A': [1, 6, 3],
        'B': [9, 2, 7],
    }
)

# Custom function to categorize numbers
def categorize(value):
    '''
    Return 'High' when value is greater than 5, otherwise 'Low'.
    '''
    return 'High' if value > 5 else 'Low'

# Apply the custom function to each element of column 'A'
df['A'].apply(categorize)


0     Low
1    High
2     Low
Name: A, dtype: object

In [18]:
# Example frame: one string per person in column A and one in column B
df = pd.DataFrame(
    {
        'A': ['John', 'Jane', 'Alice'],
        'B': ['Smith', 'Doe', 'Brown'],
    }
)

# Row by row (axis=1), join the A and B values with a single space
df['Full_Name'] = df.apply(lambda person: ' '.join((person['A'], person['B'])), axis=1)
df

Unnamed: 0,A,B,Full_Name
0,John,Smith,John Smith
1,Jane,Doe,Jane Doe
2,Alice,Brown,Alice Brown
