In [None]:
import pandas as pd
import numpy as np

In [2]:
# Pandas diff()
# diff() subtracts an earlier row (or column) from the current one, so it
# shows how values change from one position to the next.

# Single-column frame of view counts used by the first three examples.
df_YouTube = pd.DataFrame(data={'YouTube Views': [125, 800, 335, 400, 1500]})


In [None]:
# Example 1 - Default periods=1

# With the default periods=1, diff() subtracts the previous row's value from
# the current row's value. Index 1: 800 - 125 = 675. Index 2: 335 - 800 = -465.
# Index 0 has no previous row, so its change is NaN.

df_YouTube['Views_Change'] = df_YouTube['YouTube Views'].diff(periods=1)
df_YouTube


Unnamed: 0,YouTube Views,Views_Change
0,125,
1,800,675.0
2,335,-465.0
3,400,65.0
4,1500,1100.0


In [5]:
# Example 2 - periods=2
# Passing periods=2 compares each row with the row two positions above it.
# Rows at index 0 and 1 have no row two positions back, so they are null.
#
# Index 2: 335 - 125 = 210
# Index 3: 400 - 800 = -400

youtube_views = df_YouTube['YouTube Views']
df_YouTube['Views_Change_2_Periods'] = youtube_views.diff(periods=2)

df_YouTube

Unnamed: 0,YouTube Views,Views_Change,Views_Change_2_Periods
0,125,,
1,800,675.0,
2,335,-465.0,210.0
3,400,65.0,-400.0
4,1500,1100.0,1165.0


In [6]:
# Example 3 - Abs difference
# Sometimes only the size of the change matters, not its direction. Chaining
# .abs() onto diff() takes the absolute value, so the result is never negative
# (the first row is still NaN because it has no previous row).

row_to_row_change = df_YouTube['YouTube Views'].diff()
df_YouTube['Views_Absolute_Difference'] = row_to_row_change.abs()
df_YouTube

Unnamed: 0,YouTube Views,Views_Change,Views_Change_2_Periods,Views_Absolute_Difference
0,125,,,
1,800,675.0,,675.0
2,335,-465.0,210.0,465.0
3,400,65.0,-400.0,65.0
4,1500,1100.0,1165.0,1100.0


In [7]:
# Example 4 - Diff multiple
# Calling diff() on the whole DataFrame computes the row-to-row difference
# for every column at once.

card_sales_by_player = {
    'Donald Bradman': [28, 46, 33],
    'Nolan Ryan': [511, 702, 611],
}
monthly_card_sales = pd.DataFrame(data=card_sales_by_player)
monthly_card_sales.diff()

Unnamed: 0,Donald Bradman,Nolan Ryan
0,,
1,18.0,191.0
2,-13.0,-91.0


In [11]:
# Example 5 - Difference across columns
# One row per year, one column per quarter; the index is labelled 'Year'.
df_merchants = pd.DataFrame(
    [
        [182, 211, 250],
        [270, 220, 230],
        [330, 380, 390],
    ],
    index=[2023, 2024, 2025],
    columns=['Q1', 'Q2', 'Q3'],
)
df_merchants.index.name = 'Year'
df_merchants

Unnamed: 0_level_0,Q1,Q2,Q3
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023,182,211,250
2024,270,220,230
2025,330,380,390


In [None]:
# By default diff() works down the rows (axis=0), comparing each row with
# the row above it.
#
# To compare each column with the column to its left instead, switch the
# axis to the columns (axis=1, spelled here by name).

df_merchants.diff(axis='columns')

Unnamed: 0_level_0,Q1,Q2,Q3
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023,,29,39
2024,,-50,10
2025,,50,10


In [None]:
# Example 6 - Time series analysis with date index
# Seven consecutive daily temperature readings, indexed by date.
dates = pd.date_range(start='2025-04-19', periods=7, freq='D')
temps = [30, 32, 31, 35, 36, 34, 33]
# set_index is chained rather than called with inplace=True, so the frame is
# built in a single expression and never mutated after creation.
df_temps = pd.DataFrame({
    'Date': dates,
    'Temperature': temps
}).set_index('Date')
df_temps

Unnamed: 0_level_0,Temperature
Date,Unnamed: 1_level_1
2025-04-19,30
2025-04-20,32
2025-04-21,31
2025-04-22,35
2025-04-23,36
2025-04-24,34
2025-04-25,33


In [15]:
# Day-over-day temperature change; the first date has no prior day, so it is NaN.
daily_temperature = df_temps['Temperature']
df_temps['Temp_Change'] = daily_temperature.diff()
df_temps

Unnamed: 0_level_0,Temperature,Temp_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-04-19,30,
2025-04-20,32,2.0
2025-04-21,31,-1.0
2025-04-22,35,4.0
2025-04-23,36,1.0
2025-04-24,34,-2.0
2025-04-25,33,-1.0


In [None]:
# Example 7 - Dealing with null values prior to diff
# Passenger counts at 15-minute marks, with two readings missing (NaN).
data = dict(
    Time=['08:00', '08:15', '08:30', '08:45', '09:00', '09:15'],
    Passengers=[120, 125, np.nan, 130, 128, np.nan],
)
nan_df = pd.DataFrame(data=data)
nan_df

Unnamed: 0,Time,Passengers
0,08:00,120.0
1,08:15,125.0
2,08:30,
3,08:45,130.0
4,09:00,128.0
5,09:15,


In [17]:
# Fill the missing passenger counts with a placeholder (100) before calling
# diff(), so every row after the first gets a numeric result.
# NOTE(review): 100 looks like an arbitrary illustration value; it directly
# shapes the result (e.g. 100 - 125 = -25 at index 2), so choose it deliberately.
nan_df['diff'] = nan_df['Passengers'].fillna(value=100).diff()
nan_df

Unnamed: 0,Time,Passengers,diff
0,08:00,120.0,
1,08:15,125.0,5.0
2,08:30,,-25.0
3,08:45,130.0,30.0
4,09:00,128.0,-2.0
5,09:15,,-28.0


In [18]:
# For comparison: diff() on the unfilled column. A row is NaN whenever its own
# value or the previous row's value is missing.
passengers_with_gaps = nan_df['Passengers']
nan_df['diff_not_filled_na'] = passengers_with_gaps.diff()
nan_df

Unnamed: 0,Time,Passengers,diff,diff_not_filled_na
0,08:00,120.0,,
1,08:15,125.0,5.0,5.0
2,08:30,,-25.0,
3,08:45,130.0,30.0,
4,09:00,128.0,-2.0,-2.0
5,09:15,,-28.0,


In [19]:
# Example 8 - Different ways to fill in the first value

# Throughout the examples, the first value after diff() has been null.
# Here are 3 common approaches to fill it, starting with a backward fill,
# which copies the next valid change into the gap.

backfilled_change = df_temps['Temp_Change'].bfill()
df_temps['Filled_bfill'] = backfilled_change
df_temps

Unnamed: 0_level_0,Temperature,Temp_Change,Filled_bfill
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-04-19,30,,2.0
2025-04-20,32,2.0,2.0
2025-04-21,31,-1.0,-1.0
2025-04-22,35,4.0,4.0
2025-04-23,36,1.0,1.0
2025-04-24,34,-2.0,-2.0
2025-04-25,33,-1.0,-1.0


In [20]:
# Second approach: treat the missing first change as "no change" (0).
df_temps['Filled_zero'] = df_temps['Temp_Change'].fillna(value=0)
df_temps

Unnamed: 0_level_0,Temperature,Temp_Change,Filled_bfill,Filled_zero
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-04-19,30,,2.0,0.0
2025-04-20,32,2.0,2.0,2.0
2025-04-21,31,-1.0,-1.0,-1.0
2025-04-22,35,4.0,4.0,4.0
2025-04-23,36,1.0,1.0,1.0
2025-04-24,34,-2.0,-2.0,-2.0
2025-04-25,33,-1.0,-1.0,-1.0


In [21]:
# Third approach: fill the missing first change with the average of the
# observed changes (mean() skips NaN by default).
temp_change = df_temps['Temp_Change']
mean_change = temp_change.mean()

df_temps['Filled_mean'] = temp_change.fillna(value=mean_change)

df_temps

Unnamed: 0_level_0,Temperature,Temp_Change,Filled_bfill,Filled_zero,Filled_mean
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-04-19,30,,2.0,0.0,0.5
2025-04-20,32,2.0,2.0,2.0,2.0
2025-04-21,31,-1.0,-1.0,-1.0,-1.0
2025-04-22,35,4.0,4.0,4.0,4.0
2025-04-23,36,1.0,1.0,1.0,1.0
2025-04-24,34,-2.0,-2.0,-2.0,-2.0
2025-04-25,33,-1.0,-1.0,-1.0,-1.0


In [22]:
# Example 9 - Groupby
# Twelve month-end race results, cycling through four event types.
data = {
    # pd.offsets.MonthEnd() is used instead of the 'ME' string alias: 'ME' only
    # exists in pandas >= 2.2, while the offset object produces the same
    # month-end dates on older and newer versions alike.
    'Date': pd.date_range(start='2025-01-01', periods=12, freq=pd.offsets.MonthEnd()),
    'Event': ['5K', '10K', 'Half', 'Marathon'] * 3,
    'Time': [25, 55, 110, 240, 24, 54, 108, 238, 23, 52, 107, 237]
}
# Sort so each event's rows are contiguous and in date order, then renumber
# the index from 0. Chained instead of inplace=True so the frame is built in
# one expression and re-running the cell cannot act on a half-mutated frame.
df_running = (
    pd.DataFrame(data)
    .sort_values(by=['Event', 'Date'])
    .reset_index(drop=True)
)
df_running

Unnamed: 0,Date,Event,Time
0,2025-02-28,10K,55
1,2025-06-30,10K,54
2,2025-10-31,10K,52
3,2025-01-31,5K,25
4,2025-05-31,5K,24
5,2025-09-30,5K,23
6,2025-03-31,Half,110
7,2025-07-31,Half,108
8,2025-11-30,Half,107
9,2025-04-30,Marathon,240


In [23]:
# diff() within each event group: the first row of every event has no earlier
# row in its own group, so it is NaN rather than a difference against the
# previous event's last row.
times_by_event = df_running.groupby('Event')['Time']
df_running['Time_Change'] = times_by_event.diff()
df_running

Unnamed: 0,Date,Event,Time,Time_Change
0,2025-02-28,10K,55,
1,2025-06-30,10K,54,-1.0
2,2025-10-31,10K,52,-2.0
3,2025-01-31,5K,25,
4,2025-05-31,5K,24,-1.0
5,2025-09-30,5K,23,-1.0
6,2025-03-31,Half,110,
7,2025-07-31,Half,108,-2.0
8,2025-11-30,Half,107,-1.0
9,2025-04-30,Marathon,240,
