In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

In [3]:
data = {
    "Date": [
        "1965-05-08",
        "1975-08-02",
        "1981-10-25",
        "1994-11-27",
        "2002-10-22",
        "2015-06-12",
        "2019-08-30",
        "2021-11-23"
    ],
    "City": [
        "Jacksonville",
        "Jacksonville",
        "Orlando",
        "Gainesville",
        "Sunrise",
        "Orlando",
        "Miami Gardens",
        "Hollywood"
    ],
    "Venue": [
        "Jacksonville Coliseum",
        "Gator Bowl Stadium",
        "Orlando Stadium (Tangerine Bowl)",
        "Florida Field (Ben Hill Griffin Stadium)",
        "Office Depot Center",
        "Orlando Citrus Bowl (Camping World Stadium)",
        "Hard Rock Stadium",
        "Hard Rock Live at Seminole Hard Rock Hotel & Casino"
    ],
    "Estimated Capacity": [
        10000,  # Approximate capacity for Jacksonville Coliseum
        72000,  # Approximate capacity for Gator Bowl Stadium
        60000,
        88000,
        20000,
        47225,
        65326,
        7000
    ]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Date,City,Venue,Estimated Capacity
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000
4,2002-10-22,Sunrise,Office Depot Center,20000
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000


In [4]:
# Converting Strings to Datetime

# pd.to_datetime() handles the conversion.

# errors=’coerce’ makes invalid or unparseable dates become NaT (Not a Time) instead of throwing an error.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [5]:
df2 = df.copy()

In [11]:
# Extracting Date Components
df['year'] = df['Date'].dt.year


In [7]:
df['month'] = df['Date'].dt.month

In [8]:
df['dayofweek_int'] = df['Date'].dt.dayofweek

In [12]:
df['dayofweek_name'] = df['Date'].dt.day_name()
df

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,1965,5,5,Saturday
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,1975,8,5,Saturday
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,1981,10,6,Sunday
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,1994,11,6,Sunday
4,2002-10-22,Sunrise,Office Depot Center,20000,2002,10,1,Tuesday
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,2015,6,4,Friday
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,2019,8,4,Friday
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday


In [13]:
# Finding Minimum and Maximum Dates
df['Date'].min()

Timestamp('1965-05-08 00:00:00')

In [14]:
df['Date'].max()

Timestamp('2021-11-23 00:00:00')

In [15]:
# Filtering Rows by Date Conditions
df.loc[df['Date'] > '2020-01-01']

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday


In [16]:
df.loc[(df['Date'] > '2020-01-01') & (df['Date'].dt.month == 11)]

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday


In [None]:
# Shifting Dates by Months
# shfiting one month back
df['month_before_show'] = df['Date'] - pd.DateOffset(months=1)
df

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name,month_before_show
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,1965,5,5,Saturday,1965-04-08
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,1975,8,5,Saturday,1975-07-02
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,1981,10,6,Sunday,1981-09-25
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,1994,11,6,Sunday,1994-10-27
4,2002-10-22,Sunrise,Office Depot Center,20000,2002,10,1,Tuesday,2002-09-22
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,2015,6,4,Friday,2015-05-12
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,2019,8,4,Friday,2019-07-30
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday,2021-10-23


In [None]:
# Calculating Days Since an Event
# calculating days from a specific date
df['days_since_event'] = (pd.Timestamp('2025-04-17') - df['Date']).dt.days
df

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name,month_before_show,days_since_event
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,1965,5,5,Saturday,1965-04-08,21894
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,1975,8,5,Saturday,1975-07-02,18156
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,1981,10,6,Sunday,1981-09-25,15880
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,1994,11,6,Sunday,1994-10-27,11099
4,2002-10-22,Sunrise,Office Depot Center,20000,2002,10,1,Tuesday,2002-09-22,8213
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,2015,6,4,Friday,2015-05-12,3597
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,2019,8,4,Friday,2019-07-30,2057
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday,2021-10-23,1241


In [21]:
# Sorting by Date
df.sort_values('Date')

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name,month_before_show,days_since_event
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,1965,5,5,Saturday,1965-04-08,21894
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,1975,8,5,Saturday,1975-07-02,18156
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,1981,10,6,Sunday,1981-09-25,15880
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,1994,11,6,Sunday,1994-10-27,11099
4,2002-10-22,Sunrise,Office Depot Center,20000,2002,10,1,Tuesday,2002-09-22,8213
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,2015,6,4,Friday,2015-05-12,3597
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,2019,8,4,Friday,2019-07-30,2057
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday,2021-10-23,1241


In [22]:
df.sort_values('Date', ascending=False)

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name,month_before_show,days_since_event
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday,2021-10-23,1241
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,2019,8,4,Friday,2019-07-30,2057
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,2015,6,4,Friday,2015-05-12,3597
4,2002-10-22,Sunrise,Office Depot Center,20000,2002,10,1,Tuesday,2002-09-22,8213
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,1994,11,6,Sunday,1994-10-27,11099
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,1981,10,6,Sunday,1981-09-25,15880
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,1975,8,5,Saturday,1975-07-02,18156
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,1965,5,5,Saturday,1965-04-08,21894


In [23]:
# Appending New Rows to a DataFrame

# we append the new row to df2 using pd.concat()

# ignore_index=True resets the index so the DataFrame has a continuous index after appending.

new_row = pd.DataFrame([{
    'Date': pd.NaT,
    'City': 'Jacksonville',
    'Venue': 'Metlife',
    'Estimated Capacity': 70000
}])

df2 = pd.concat([df2, new_row], ignore_index=True)

In [24]:
# Handling Missing Dates
df2['Date'].isna()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
Name: Date, dtype: bool

In [25]:
df2['Date'] = df2['Date'].fillna(pd.Timestamp('2019-07-19'))

In [26]:
new_row_2 = pd.DataFrame([{
    'Date': pd.NaT,
    'City': 'Plant City',
    'Venue': 'Strawberry Festival Stadium',
    'Estimated Capacity': 3000
}])

df2 = pd.concat([df2, new_row_2], ignore_index=True)

In [27]:
df2.tail()

Unnamed: 0,Date,City,Venue,Estimated Capacity
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000
8,2019-07-19,Jacksonville,Metlife,70000
9,NaT,Plant City,Strawberry Festival Stadium,3000


In [28]:
df2.dropna(subset=['Date'], inplace=True)

In [29]:
df2.tail()

Unnamed: 0,Date,City,Venue,Estimated Capacity
4,2002-10-22,Sunrise,Office Depot Center,20000
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000
8,2019-07-19,Jacksonville,Metlife,70000


In [31]:
# Finding Previous and Next Events

# This creates a new column ‘previous_concert’ by shifting the ‘Date’ column down by 1 row

df['previous_concert'] = df['Date'].shift(1)
df

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name,month_before_show,days_since_event,previous_concert
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,1965,5,5,Saturday,1965-04-08,21894,NaT
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,1975,8,5,Saturday,1975-07-02,18156,1965-05-08
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,1981,10,6,Sunday,1981-09-25,15880,1975-08-02
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,1994,11,6,Sunday,1994-10-27,11099,1981-10-25
4,2002-10-22,Sunrise,Office Depot Center,20000,2002,10,1,Tuesday,2002-09-22,8213,1994-11-27
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,2015,6,4,Friday,2015-05-12,3597,2002-10-22
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,2019,8,4,Friday,2019-07-30,2057,2015-06-12
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday,2021-10-23,1241,2019-08-30


In [None]:
# to get the next date value using -1
df['next_concert'] = df['Date'].shift(-1)
df

Unnamed: 0,Date,City,Venue,Estimated Capacity,year,month,dayofweek_int,dayofweek_name,month_before_show,days_since_event,previous_concert,next_concert
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,1965,5,5,Saturday,1965-04-08,21894,NaT,1975-08-02
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,1975,8,5,Saturday,1975-07-02,18156,1965-05-08,1981-10-25
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,1981,10,6,Sunday,1981-09-25,15880,1975-08-02,1994-11-27
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,1994,11,6,Sunday,1994-10-27,11099,1981-10-25,2002-10-22
4,2002-10-22,Sunrise,Office Depot Center,20000,2002,10,1,Tuesday,2002-09-22,8213,1994-11-27,2015-06-12
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,2015,6,4,Friday,2015-05-12,3597,2002-10-22,2019-08-30
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,2019,8,4,Friday,2019-07-30,2057,2015-06-12,2021-11-23
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,2021,11,1,Tuesday,2021-10-23,1241,2019-08-30,NaT


In [35]:
# Rolling and Expanding Calculations

# Here, we create a new column ‘Rolling_3_Event_Mean’ in df2:

# it calculates the moving average of ‘Estimated Capacity’ over a window of 3 rows.

df2['Rolling_3_Event_Mean'] = df2['Estimated Capacity'].rolling(window=3).mean()
df2

Unnamed: 0,Date,City,Venue,Estimated Capacity,Rolling_3_Event_Mean
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,47333.333333
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,73333.333333
4,2002-10-22,Sunrise,Office Depot Center,20000,56000.0
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,51741.666667
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,44183.666667
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,39850.333333
8,2019-07-19,Jacksonville,Metlife,70000,47442.0


In [None]:
# it calculates the cumulative mean of ‘Estimated Capacity’ from the first row up to the current row.

# Unlike rolling,  when you use expanding, it uses all previous rows, not just a fixed window.

df2['Expanding_Mean'] = df2['Estimated Capacity'].expanding().mean()
df2

Unnamed: 0,Date,City,Venue,Estimated Capacity,Rolling_3_Event_Mean,Expanding_Mean
0,1965-05-08,Jacksonville,Jacksonville Coliseum,10000,,10000.0
1,1975-08-02,Jacksonville,Gator Bowl Stadium,72000,,41000.0
2,1981-10-25,Orlando,Orlando Stadium (Tangerine Bowl),60000,47333.333333,47333.333333
3,1994-11-27,Gainesville,Florida Field (Ben Hill Griffin Stadium),88000,73333.333333,57500.0
4,2002-10-22,Sunrise,Office Depot Center,20000,56000.0,50000.0
5,2015-06-12,Orlando,Orlando Citrus Bowl (Camping World Stadium),47225,51741.666667,49537.5
6,2019-08-30,Miami Gardens,Hard Rock Stadium,65326,44183.666667,51793.0
7,2021-11-23,Hollywood,Hard Rock Live at Seminole Hard Rock Hotel & C...,7000,39850.333333,46193.875
8,2019-07-19,Jacksonville,Metlife,70000,47442.0,48839.0


In [40]:
# Resampling Time Series Data

# Here, we resample the ‘Estimated Capacity’ column by 10 year intervals, then calculates the mean for each period.

df2['Estimated Capacity'].resample('10YS').mean()  # monthly average


Date
1965-01-01    10000.00
1975-01-01    66000.00
1985-01-01    88000.00
1995-01-01    20000.00
2005-01-01         NaN
2015-01-01    47387.75
Freq: 10YS-JAN, Name: Estimated Capacity, dtype: float64

In [41]:
df2['Estimated Capacity'].resample('10YS').agg({'mean', 'max', 'min','sum'})


Unnamed: 0_level_0,max,sum,min,mean
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1965-01-01,10000.0,10000,10000.0,10000.0
1975-01-01,72000.0,132000,60000.0,66000.0
1985-01-01,88000.0,88000,88000.0,88000.0
1995-01-01,20000.0,20000,20000.0,20000.0
2005-01-01,,0,,
2015-01-01,70000.0,189551,7000.0,47387.75


In [42]:
# Selecting Data by Date Index Ranges
df2.loc['2019']

Unnamed: 0_level_0,City,Venue,Estimated Capacity,Rolling_3_Event_Mean,Expanding_Mean
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-08-30,Miami Gardens,Hard Rock Stadium,65326,44183.666667,51793.0
2019-07-19,Jacksonville,Metlife,70000,47442.0,48839.0


In [43]:
df2.loc['2019-07']

Unnamed: 0_level_0,City,Venue,Estimated Capacity,Rolling_3_Event_Mean,Expanding_Mean
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-07-19,Jacksonville,Metlife,70000,47442.0,48839.0


In [44]:
df2.loc['2019-07-19']

City                    Jacksonville
Venue                        Metlife
Estimated Capacity             70000
Rolling_3_Event_Mean         47442.0
Expanding_Mean               48839.0
Name: 2019-07-19 00:00:00, dtype: object

In [47]:
# Creating a Train Schedule Dataset
data2 = {
    "Train Number": ["T123", "T456", "T789", "T101", "T202", "T303"],
    "Departure Time": ["08:00", "09:30", "11:00", "13:00", "15:30", "18:00"],
    "Destination": ["Zermatt", "Paris", "Zermatt", "Paris", "Zermatt", "Paris"],
    "Platform": [3, 5, 2, 6, 1, 4],
    "Status": ["On Time", "Delayed", "On Time", "On Time", "Delayed", "On Time"]
}
df_train_schedule = pd.DataFrame(data2)
df_train_schedule

Unnamed: 0,Train Number,Departure Time,Destination,Platform,Status
0,T123,08:00,Zermatt,3,On Time
1,T456,09:30,Paris,5,Delayed
2,T789,11:00,Zermatt,2,On Time
3,T101,13:00,Paris,6,On Time
4,T202,15:30,Zermatt,1,Delayed
5,T303,18:00,Paris,4,On Time


In [48]:
# Combining Date and Time
today = pd.Timestamp.now().date()

In [50]:
df_train_schedule["Departure Time"] = pd.to_datetime(df_train_schedule["Departure Time"].apply(lambda t: f"{today} {t}"))
df_train_schedule

  df_train_schedule["Departure Time"] = pd.to_datetime(df_train_schedule["Departure Time"].apply(lambda t: f"{today} {t}"))


Unnamed: 0,Train Number,Departure Time,Destination,Platform,Status
0,T123,2025-09-22 08:00:00-22:00,Zermatt,3,On Time
1,T456,2025-09-22 09:30:00-22:00,Paris,5,Delayed
2,T789,2025-09-22 11:00:00-22:00,Zermatt,2,On Time
3,T101,2025-09-22 13:00:00-22:00,Paris,6,On Time
4,T202,2025-09-22 15:30:00-22:00,Zermatt,1,Delayed
5,T303,2025-09-22 18:00:00-22:00,Paris,4,On Time


In [52]:
# # Setting and Converting Timezones
# df_train_schedule["Departure Time"] = df_train_schedule["Departure Time"].dt.tz_localize("Europe/Zurich")
# df_train_schedule

In [54]:
# Finding Next Departure per Destination
df_train_schedule['Departure Time EST'] = df_train_schedule['Departure Time'].dt.tz_convert('US/Eastern')
df_train_schedule


Unnamed: 0,Train Number,Departure Time,Destination,Platform,Status,Departure Time EST
0,T123,2025-09-22 08:00:00-22:00,Zermatt,3,On Time,2025-09-23 02:00:00-04:00
1,T456,2025-09-22 09:30:00-22:00,Paris,5,Delayed,2025-09-23 03:30:00-04:00
2,T789,2025-09-22 11:00:00-22:00,Zermatt,2,On Time,2025-09-23 05:00:00-04:00
3,T101,2025-09-22 13:00:00-22:00,Paris,6,On Time,2025-09-23 07:00:00-04:00
4,T202,2025-09-22 15:30:00-22:00,Zermatt,1,Delayed,2025-09-23 09:30:00-04:00
5,T303,2025-09-22 18:00:00-22:00,Paris,4,On Time,2025-09-23 12:00:00-04:00


In [56]:
df_train_schedule["Next Departure"] = df_train_schedule.sort_values("Departure Time").groupby("Destination")["Departure Time"].shift(-1)
df_train_schedule

Unnamed: 0,Train Number,Departure Time,Destination,Platform,Status,Departure Time EST,Next Departure
0,T123,2025-09-22 08:00:00-22:00,Zermatt,3,On Time,2025-09-23 02:00:00-04:00,2025-09-22 11:00:00-22:00
1,T456,2025-09-22 09:30:00-22:00,Paris,5,Delayed,2025-09-23 03:30:00-04:00,2025-09-22 13:00:00-22:00
2,T789,2025-09-22 11:00:00-22:00,Zermatt,2,On Time,2025-09-23 05:00:00-04:00,2025-09-22 15:30:00-22:00
3,T101,2025-09-22 13:00:00-22:00,Paris,6,On Time,2025-09-23 07:00:00-04:00,2025-09-22 18:00:00-22:00
4,T202,2025-09-22 15:30:00-22:00,Zermatt,1,Delayed,2025-09-23 09:30:00-04:00,NaT
5,T303,2025-09-22 18:00:00-22:00,Paris,4,On Time,2025-09-23 12:00:00-04:00,NaT


In [57]:
# Calculating Time Differences
df_train_schedule['next_train_time_diff'] = df_train_schedule['Next Departure'] - df_train_schedule['Departure Time']
df_train_schedule

Unnamed: 0,Train Number,Departure Time,Destination,Platform,Status,Departure Time EST,Next Departure,next_train_time_diff
0,T123,2025-09-22 08:00:00-22:00,Zermatt,3,On Time,2025-09-23 02:00:00-04:00,2025-09-22 11:00:00-22:00,0 days 03:00:00
1,T456,2025-09-22 09:30:00-22:00,Paris,5,Delayed,2025-09-23 03:30:00-04:00,2025-09-22 13:00:00-22:00,0 days 03:30:00
2,T789,2025-09-22 11:00:00-22:00,Zermatt,2,On Time,2025-09-23 05:00:00-04:00,2025-09-22 15:30:00-22:00,0 days 04:30:00
3,T101,2025-09-22 13:00:00-22:00,Paris,6,On Time,2025-09-23 07:00:00-04:00,2025-09-22 18:00:00-22:00,0 days 05:00:00
4,T202,2025-09-22 15:30:00-22:00,Zermatt,1,Delayed,2025-09-23 09:30:00-04:00,NaT,NaT
5,T303,2025-09-22 18:00:00-22:00,Paris,4,On Time,2025-09-23 12:00:00-04:00,NaT,NaT


In [59]:
# Adding Hours to Datetimes
df_train_schedule["Train Arrival Time"] = df_train_schedule["Departure Time"] + timedelta(hours=3)
df_train_schedule

Unnamed: 0,Train Number,Departure Time,Destination,Platform,Status,Departure Time EST,Next Departure,next_train_time_diff,Train Arrival Time
0,T123,2025-09-22 08:00:00-22:00,Zermatt,3,On Time,2025-09-23 02:00:00-04:00,2025-09-22 11:00:00-22:00,0 days 03:00:00,2025-09-22 11:00:00-22:00
1,T456,2025-09-22 09:30:00-22:00,Paris,5,Delayed,2025-09-23 03:30:00-04:00,2025-09-22 13:00:00-22:00,0 days 03:30:00,2025-09-22 12:30:00-22:00
2,T789,2025-09-22 11:00:00-22:00,Zermatt,2,On Time,2025-09-23 05:00:00-04:00,2025-09-22 15:30:00-22:00,0 days 04:30:00,2025-09-22 14:00:00-22:00
3,T101,2025-09-22 13:00:00-22:00,Paris,6,On Time,2025-09-23 07:00:00-04:00,2025-09-22 18:00:00-22:00,0 days 05:00:00,2025-09-22 16:00:00-22:00
4,T202,2025-09-22 15:30:00-22:00,Zermatt,1,Delayed,2025-09-23 09:30:00-04:00,NaT,NaT,2025-09-22 18:30:00-22:00
5,T303,2025-09-22 18:00:00-22:00,Paris,4,On Time,2025-09-23 12:00:00-04:00,NaT,NaT,2025-09-22 21:00:00-22:00


In [60]:
# Filtering by Time Ranges
# Here, we filter the rows in the DataFrame df_train_schedule where the train arrival time is between 12:00PM and 5:00PM

filtered_train_times = df_train_schedule[
    (df_train_schedule["Train Arrival Time"].dt.time >= pd.to_datetime("12:00").time()) &
    (df_train_schedule["Train Arrival Time"].dt.time <= pd.to_datetime("17:00").time())
]
filtered_train_times

Unnamed: 0,Train Number,Departure Time,Destination,Platform,Status,Departure Time EST,Next Departure,next_train_time_diff,Train Arrival Time
1,T456,2025-09-22 09:30:00-22:00,Paris,5,Delayed,2025-09-23 03:30:00-04:00,2025-09-22 13:00:00-22:00,0 days 03:30:00,2025-09-22 12:30:00-22:00
2,T789,2025-09-22 11:00:00-22:00,Zermatt,2,On Time,2025-09-23 05:00:00-04:00,2025-09-22 15:30:00-22:00,0 days 04:30:00,2025-09-22 14:00:00-22:00
3,T101,2025-09-22 13:00:00-22:00,Paris,6,On Time,2025-09-23 07:00:00-04:00,2025-09-22 18:00:00-22:00,0 days 05:00:00,2025-09-22 16:00:00-22:00


In [61]:
# Creating Date Ranges with pd.date_range()
pd.date_range(start='2024-01-01', end='2024-01-10')

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10'],
              dtype='datetime64[ns]', freq='D')

In [62]:
# Here, we start from January 1st, 2024,

# periods=6: we want 6 dates in total.

# freq=’ME’: ‘ME’ stands for Month End, so it will return the last day of each month.

pd.date_range(start='2024-01-01', periods=6, freq='ME')

DatetimeIndex(['2024-01-31', '2024-02-29', '2024-03-31', '2024-04-30',
               '2024-05-31', '2024-06-30'],
              dtype='datetime64[ns]', freq='ME')

In [65]:
# Handling Duplicate Dates
data = {
    'date': [
        '2025-04-01', '2025-04-02', '2025-04-02',  # duplicate on 2nd
        '2025-04-04', '2025-04-06'                 # missing 3rd and 5th
    ],
    'train': ['TGV 101', 'TGV 102', 'TGV 103', 'TGV 104', 'TGV 105']
}
df3 = pd.DataFrame(data)
df3['date'] = pd.to_datetime(df3['date'])
duplicates = df3[df3['date'].duplicated()]
duplicates

Unnamed: 0,date,train
2,2025-04-02,TGV 103


In [68]:
# Handling Missing Dates in a Schedule
data = {
    'date': [
        '2025-04-01',
        '2025-04-02', '2025-04-04',
        '2025-04-06',                             # missing 3rd, 5th
        '2025-04-07', '2025-04-09',               # missing 8th
        '2025-04-10', '2025-04-11', '2025-04-12'
    ],
    'train': [
        'TGV 101',
        'TGV 103', 'TGV 104',
        'TGV 105',
        'TGV 106', 'TGV 107',
        'TGV 108', 'TGV 109', 'TGV 110'
    ]
}
df4 = pd.DataFrame(data)
df4['date'] = pd.to_datetime(df4['date'])
df4 = df4.set_index(['date'])
df_full = df4.asfreq('D', fill_value='No departures')
df_full



Unnamed: 0_level_0,train
date,Unnamed: 1_level_1
2025-04-01,TGV 101
2025-04-02,TGV 103
2025-04-03,No departures
2025-04-04,TGV 104
2025-04-05,No departures
2025-04-06,TGV 105
2025-04-07,TGV 106
2025-04-08,No departures
2025-04-09,TGV 107
2025-04-10,TGV 108


In [69]:
idx = pd.date_range('2023-01-01', periods=4, freq='ME')
ts = pd.Series([10, 20, 30, 40], index=idx)

ts_daily = ts.asfreq('D')

ts_daily.head()

2023-01-31    10.0
2023-02-01     NaN
2023-02-02     NaN
2023-02-03     NaN
2023-02-04     NaN
Freq: D, dtype: float64

In [70]:
data = {
    'date': [
        '2025-04-01', '2025-04-02', '2025-04-03',
        '2025-04-04', '2025-04-06'                 # missing  5th
    ],
    'train': ['TGV 101', 'TGV 102', np.nan, 'TGV 104', 'TGV 105']
}
df7 = pd.DataFrame(data)
df7

Unnamed: 0,date,train
0,2025-04-01,TGV 101
1,2025-04-02,TGV 102
2,2025-04-03,
3,2025-04-04,TGV 104
4,2025-04-06,TGV 105


In [71]:
# Forward and Backward Filling Missing Values
df_filled_forward = df7.ffill()
df_filled_forward

Unnamed: 0,date,train
0,2025-04-01,TGV 101
1,2025-04-02,TGV 102
2,2025-04-03,TGV 102
3,2025-04-04,TGV 104
4,2025-04-06,TGV 105


In [72]:
df_filled_backward = df7.bfill()
df_filled_backward

Unnamed: 0,date,train
0,2025-04-01,TGV 101
1,2025-04-02,TGV 102
2,2025-04-03,TGV 104
3,2025-04-04,TGV 104
4,2025-04-06,TGV 105


In [73]:
# Finding Missing Dates in a Range
full_range = pd.date_range(start=df3['date'].min(), end=df3['date'].max())
missing = full_range.difference(df3['date'])
missing

DatetimeIndex(['2025-04-03', '2025-04-05'], dtype='datetime64[ns]', freq=None)

In [74]:
# Parsing Different Date Formats with pd.to_datetime()
df = pd.DataFrame({'date_str': ['01/04/2025', '02/04/2025', '03/04/2025']})
df['date'] = pd.to_datetime(df['date_str'], format='%d/%m/%Y')
df.head()

Unnamed: 0,date_str,date
0,01/04/2025,2025-04-01
1,02/04/2025,2025-04-02
2,03/04/2025,2025-04-03


In [75]:
df = pd.DataFrame({'date_str': ['04-01-2025', '04-02-2025']})
df['date'] = pd.to_datetime(df['date_str'], format='%m-%d-%Y')
df.head()

Unnamed: 0,date_str,date
0,04-01-2025,2025-04-01
1,04-02-2025,2025-04-02


In [76]:
df = pd.DataFrame({'timestamp': ['01/04/2025 14:30:00', '02/04/2025 08:45:00']})
df['datetime'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y %H:%M:%S')
df.head()


Unnamed: 0,timestamp,datetime
0,01/04/2025 14:30:00,2025-04-01 14:30:00
1,02/04/2025 08:45:00,2025-04-02 08:45:00


In [77]:
df = pd.DataFrame({'date_str': ['01-Apr-2025', '02-Apr-2025']})
df['date'] = pd.to_datetime(df['date_str'], format='%d-%b-%Y')
df.head()

Unnamed: 0,date_str,date
0,01-Apr-2025,2025-04-01
1,02-Apr-2025,2025-04-02


In [79]:
df = pd.DataFrame({'date_str': ['01 April 2025', '02 April 2025']})
df['date'] = pd.to_datetime(df['date_str'], format='%d %B %Y')
df

Unnamed: 0,date_str,date
0,01 April 2025,2025-04-01
1,02 April 2025,2025-04-02
