In [None]:
# Section 3: persist the extraction checkpoint

# The dataset used in this notebook has the columns id, customer, date, amount
# and last_updated -- there is no `last_login` column, so the checkpoint is
# derived from `last_updated`, the same column the incremental filter compares.
# NOTE(review): assumes df['last_updated'] is already datetime-typed (i.e. df was
# loaded with parse_dates=["last_updated"]) -- confirm which cell defines `df`.
#
# The value is computed BEFORE the file is opened: opening with "w" truncates
# the file immediately, so a failure while computing the timestamp would
# otherwise leave an empty checkpoint behind.
#
# The full ISO timestamp is written (not just the date). A date-only checkpoint
# parses back as midnight, which would make the next `>` comparison re-extract
# every row from the last day. This also matches the format written by the
# "Save New Timestamp" cell.
latest_timestamp = df['last_updated'].max().isoformat()

with open("last_extraction.txt", "w") as f:
    f.write(latest_timestamp)


In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Simulate 60 days of sales data and save it as the notebook's source CSV.

# Tunable knobs for the simulation.
SEED = 42                                    # fixed so Restart & Run All regenerates identical data
N_DAYS = 60                                  # number of simulated days
MIN_SALES_PER_DAY, MAX_SALES_PER_DAY = 3, 6  # inclusive bounds on sales per day

customers = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay']
start_date = datetime(2025, 4, 1)

# Private generator: reproducible output without reseeding the global `random` state.
rng = random.Random(SEED)

# Draw the ids up front WITHOUT replacement so that `id` is really unique.
# Independent randint() draws over 9000 values collide almost surely for a few
# hundred rows. N_DAYS * MAX_SALES_PER_DAY is the largest possible row count.
unique_ids = iter(rng.sample(range(1000, 10000), N_DAYS * MAX_SALES_PER_DAY))

data = []
for i in range(1, N_DAYS + 1):
    # Offsets start at 1, so the first simulated day is the day after start_date.
    date = start_date + timedelta(days=i)
    for _ in range(rng.randint(MIN_SALES_PER_DAY, MAX_SALES_PER_DAY)):
        # Random time of day (minute resolution) at which the record was last touched.
        updated_at = date + timedelta(hours=rng.randint(0, 23),
                                      minutes=rng.randint(0, 59))
        data.append({
            'id': next(unique_ids),
            'customer': rng.choice(customers),
            'date': date.date().isoformat(),
            'amount': rng.randint(100, 2000),
            'last_updated': updated_at.isoformat()
        })

# Save to CSV in the local directory
df = pd.DataFrame(data)
csv_path = "custom_data.csv"
df.to_csv(csv_path, index=False)

# Preview
df.head()


Unnamed: 0,id,customer,date,amount,last_updated
0,1428,Costco,2025-04-02,1051,2025-04-02T11:53:00
1,3594,Costco,2025-04-02,1345,2025-04-02T01:23:00
2,4143,Costco,2025-04-02,1850,2025-04-02T02:55:00
3,5990,Costco,2025-04-03,601,2025-04-03T18:17:00
4,4979,Target,2025-04-03,1623,2025-04-03T15:13:00


In [None]:
#  Section 1: Full Extraction 

# Read the whole CSV in one go; `last_updated` is parsed into datetimes on load.
df_full = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Report the size of what was loaded.
num_rows = len(df_full)
num_cols = len(df_full.columns)
print(f"Number of rows: {num_rows}\nNumber of columns: {num_cols}")

# Show the first few records, preceded by a blank line and a label.
print()
print("Sample data:")
display(df_full.head())

# Closing summary line for the full extraction.
print()
print(f"✅ Extracted {num_rows} rows fully.")


Number of rows: 267
Number of columns: 5

Sample data:


Unnamed: 0,id,customer,date,amount,last_updated
0,1428,Costco,2025-04-02,1051,2025-04-02 11:53:00
1,3594,Costco,2025-04-02,1345,2025-04-02 01:23:00
2,4143,Costco,2025-04-02,1850,2025-04-02 02:55:00
3,5990,Costco,2025-04-03,601,2025-04-03 18:17:00
4,4979,Target,2025-04-03,1623,2025-04-03 15:13:00



✅ Extracted 267 rows fully.


In [8]:
from datetime import datetime

# Incremental extraction against a simulated checkpoint.

# Seed the checkpoint file with a fixed timestamp. Every run of this cell
# overwrites the file; edit the literal to simulate a different previous run.
with open("last_extraction.txt", "w") as checkpoint_out:
    checkpoint_out.write("2025-04-25 12:00:00")

# Read the checkpoint back and turn it into a pandas timestamp.
with open("last_extraction.txt", "r") as checkpoint_in:
    last_extraction = checkpoint_in.read().strip()
last_extraction_time = pd.to_datetime(last_extraction)

# Reload the source data with `last_updated` parsed as datetimes.
df = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Keep only the rows touched strictly after the checkpoint.
is_newer = df['last_updated'].gt(last_extraction_time)
df_incremental = df.loc[is_newer]

# Report how many rows qualified and preview them.
print(f"✅ Extracted {len(df_incremental)} rows incrementally since {last_extraction}.")
display(df_incremental.head())


✅ Extracted 158 rows incrementally since 2025-04-25 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
107,8751,BestBuy,2025-04-25,1511,2025-04-25 21:25:00
110,1542,Costco,2025-04-26,915,2025-04-26 04:14:00
111,4432,Costco,2025-04-26,1442,2025-04-26 02:33:00
112,7771,BestBuy,2025-04-26,107,2025-04-26 22:46:00
113,2789,Walmart,2025-04-26,1291,2025-04-26 01:08:00


In [1]:
## Section 1: Full Extraction

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Announce the load, then read the complete CSV (`last_updated` parsed as datetimes).
print("🔄 Loading full dataset...")
df_full = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Summarise the extraction: completion banner followed by row and column counts.
total_rows, total_cols = df_full.shape
print(
    "✅ Full Extraction Completed",
    f"Total Rows Extracted: {total_rows}",
    f"Total Columns: {total_cols}",
    sep="\n",
)

# The last expression is rendered by the notebook: the first few records.
df_full.head()

🔄 Loading full dataset...
✅ Full Extraction Completed
Total Rows Extracted: 267
Total Columns: 5


Unnamed: 0,id,customer,date,amount,last_updated
0,1428,Costco,2025-04-02,1051,2025-04-02 11:53:00
1,3594,Costco,2025-04-02,1345,2025-04-02 01:23:00
2,4143,Costco,2025-04-02,1850,2025-04-02 02:55:00
3,5990,Costco,2025-04-03,601,2025-04-03 18:17:00
4,4979,Target,2025-04-03,1623,2025-04-03 15:13:00


In [2]:
## Section 2: Incremental Extraction

# Step 1: Load the timestamp of the last successful extraction.
# A missing checkpoint file means nothing has been extracted yet (first run),
# so it is treated like an empty checkpoint instead of crashing the cell.
print("🔄 Reading last extraction timestamp...")
try:
    with open("last_extraction.txt", "r") as f:
        last_extraction = f.read().strip()
except FileNotFoundError:
    last_extraction = ""

# Step 2: Convert timestamp string to datetime object.
# An empty checkpoint becomes NaT. A non-empty but malformed value still raises
# from pd.to_datetime, so a corrupted checkpoint is not silently ignored.
last_extraction_time = pd.to_datetime(last_extraction) if last_extraction else pd.NaT

# Step 3: Load the full dataset again for comparison
df = pd.read_csv("custom_data.csv", parse_dates=["last_updated"])

# Step 4: Filter records that have been added or updated since the last extraction
if pd.isna(last_extraction_time):
    # Every comparison against NaT is False, so filtering would silently select
    # zero rows and the checkpoint would never advance. With no usable
    # checkpoint, extract everything instead.
    df_incremental = df.copy()
    print(f"✅ No previous checkpoint found: extracted all {len(df_incremental)} rows.")
else:
    df_incremental = df[df["last_updated"] > last_extraction_time]
    print(f"✅ Incremental Extraction Completed: {len(df_incremental)} new rows since {last_extraction}.")

# Step 5: Preview the new records
df_incremental.head()

🔄 Reading last extraction timestamp...
✅ Incremental Extraction Completed: 158 new rows since 2025-04-25 12:00:00.


Unnamed: 0,id,customer,date,amount,last_updated
107,8751,BestBuy,2025-04-25,1511,2025-04-25 21:25:00
110,1542,Costco,2025-04-26,915,2025-04-26 04:14:00
111,4432,Costco,2025-04-26,1442,2025-04-26 02:33:00
112,7771,BestBuy,2025-04-26,107,2025-04-26 22:46:00
113,2789,Walmart,2025-04-26,1291,2025-04-26 01:08:00


In [3]:
## Section 3: Save New Timestamp

# Advance the checkpoint only when the incremental run produced rows;
# otherwise leave the stored timestamp as it is.
if df_incremental.empty:
    print("⚠️ No new records found. Timestamp remains unchanged.")
else:
    # The newest `last_updated` among the extracted rows becomes the next checkpoint.
    new_checkpoint = df_incremental["last_updated"].max()
    with open("last_extraction.txt", "w") as checkpoint_file:
        checkpoint_file.write(new_checkpoint.isoformat())
    print(f"✅ Extraction timestamp updated to {new_checkpoint}")

✅ Extraction timestamp updated to 2025-05-31 23:02:00
