In [7]:
# 🔄 Section 1: Full Extraction
import pandas as pd

# Full extraction: load every row of the source CSV in one pass, parsing the
# `last_updated` column as datetimes.
df_full = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Report how much was loaded and which columns came back.
row_count = df_full.shape[0]
print(f"Extracted {row_count} rows fully.")
print("Columns:", list(df_full.columns))

# Preview the first rows as the cell's rich output.
df_full.head()


Extracted 212 rows fully.
Columns: ['order_id', 'customer', 'amount', 'date', 'last_updated']


Unnamed: 0,order_id,customer,amount,date,last_updated
0,2018,Costco,938,2025-04-01,2025-04-01 05:00:00
1,9235,Costco,1436,2025-04-01,2025-04-01 02:26:00
2,6736,Amazon,956,2025-04-01,2025-04-01 07:22:00
3,5108,Costco,1243,2025-04-01,2025-04-01 08:45:00
4,9094,Costco,1114,2025-04-01,2025-04-01 17:46:00


In [6]:
import os
import shutil

# One-time setup: bring the dataset into the working directory, where the
# extraction cells read it by the relative name "custom_data.csv". On a fresh
# kernel this cell has to run before those cells.
# Use a raw string (r"") to prevent backslash issues
# NOTE(review): machine-specific absolute path - adjust it when running elsewhere.
source_path = r"C:\Users\PC\OneDrive\Desktop\ETL_Extract_Paul Mbuvi_669984\custom_data.csv"
dest_path = 'custom_data.csv'

if os.path.exists(source_path):
    # This moves the file into the current working directory
    # (normally the same folder as your notebook)
    shutil.move(source_path, dest_path)
elif not os.path.exists(dest_path):
    # The file is in neither location: fail with a message naming both paths
    # instead of the bare error shutil.move would raise for a missing source.
    raise FileNotFoundError(
        f"{source_path!r} not found and {dest_path!r} is not in the working directory"
    )
# Otherwise the file was already moved by an earlier run, so re-running this
# cell (e.g. Restart & Run All) is a harmless no-op.

# Show where the data file now lives.
dest_path

'custom_data.csv'

In [9]:
# ⏩ Section 2: Incremental Extraction
# Read the checkpoint text stored in last_extraction.txt and convert it to a
# timestamp that can be compared against the `last_updated` column.
with open("last_extraction.txt", "r") as checkpoint_file:
    last_extraction = checkpoint_file.read().strip()
last_time = pd.to_datetime(last_extraction)

# Reload the source data, then keep only rows changed strictly after the checkpoint.
df = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])
changed_since_checkpoint = df["last_updated"] > last_time
df_incremental = df[changed_since_checkpoint]

row_count = len(df_incremental)
print(f"Extracted {row_count} rows incrementally since {last_extraction}.")

# Preview the first extracted rows as the cell's rich output.
df_incremental.head()


Extracted 133 rows incrementally since 2025-04-25 12:00:00.


Unnamed: 0,order_id,customer,amount,date,last_updated
77,6561,Costco,71,2025-04-25,2025-04-25 14:56:00
80,5874,BestBuy,235,2025-04-26,2025-04-26 10:33:00
81,4641,Costco,1245,2025-04-26,2025-04-26 10:32:00
82,2342,Target,424,2025-04-26,2025-04-26 05:01:00
83,2544,Walmart,1024,2025-04-27,2025-04-27 12:30:00


In [10]:
# 💾 Section 3: Save New Timestamp
# Advance the checkpoint to the newest `last_updated` value in the loaded data,
# so the next incremental run only picks up rows changed after it.
new_checkpoint = df['last_updated'].max()

if pd.isna(new_checkpoint):
    # An empty or all-missing column makes .max() return NaT. Writing "NaT" to
    # the checkpoint file would make every later `last_updated > checkpoint`
    # comparison False, silently extracting nothing - so keep the old checkpoint.
    print("No valid last_updated values found; last_extraction.txt left unchanged.")
else:
    with open("last_extraction.txt", "w") as f:
        # ISO 8601 text, which pd.to_datetime parses back on the next run.
        f.write(new_checkpoint.isoformat())

    print(f"Updated last_extraction.txt to {new_checkpoint}")


Updated last_extraction.txt to 2025-05-30 18:00:00
