In [1]:
# ETL: Extract the Data 
# Extract Phase
# 1. We begin by importing the required libraries. `pandas` is used for handling the datasets and
#    `os` is used to create folders if they don't exist.
import pandas as pd
import os

# 2. Ensure the `data` folder exists; it holds the input CSV files
#    (`raw_data.csv` and `incremental_data.csv`).
#    `exist_ok=True` makes this a no-op when the folder is already present.
os.makedirs("data", exist_ok=True)

# 3. Load both the raw and incremental datasets with `pandas.read_csv()` and
#    preview their structure with `.head()` and `.info()`.
raw_df = pd.read_csv("data/raw_data.csv")
incremental_df = pd.read_csv("data/incremental_data.csv")

# Preview both datasets.
# `DataFrame.info()` writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
print("🗃️ Raw Data:")
print(raw_df.head())
raw_df.info()

print("\n🗃️ Incremental Data:")
print(incremental_df.head())
incremental_df.info()

# 4. Profile both datasets: size, fully duplicated rows, and per-column
#    missing-value counts. These numbers guide the later transformations.

# Gather the metrics first so the reporting section below stays purely print statements.
raw_row_count, raw_col_count = len(raw_df), raw_df.shape[1]
inc_row_count, inc_col_count = len(incremental_df), incremental_df.shape[1]

raw_duplicate_count = raw_df.duplicated().sum()
inc_duplicate_count = incremental_df.duplicated().sum()

raw_missing_per_column = raw_df.isna().sum()
inc_missing_per_column = incremental_df.isna().sum()

# Report the observations.
print("\n✅ Observations:")
print(f"- Raw data rows: {raw_row_count}, columns: {raw_col_count}")
print(f"- Incremental data rows: {inc_row_count}, columns: {inc_col_count}")

print("- Checking for duplicates...")
print("Raw duplicates:", raw_duplicate_count)
print("Incremental duplicates:", inc_duplicate_count)

print("- Checking for missing values...")
print("Raw missing values:\n", raw_missing_per_column)
print("Incremental missing values:\n", inc_missing_per_column)

# Finally, write each dataset back to the /data folder without the index column.
# NOTE(review): these are the same paths the datasets were loaded from, so the
# original input files are overwritten with pandas' re-serialized version —
# confirm this is intended rather than writing to a separate location.
for frame, target_path in (
    (raw_df, "data/raw_data.csv"),
    (incremental_df, "data/incremental_data.csv"),
):
    frame.to_csv(target_path, index=False)

print("\n✅ Raw files saved in /data folder.")


🗃️ Raw Data:
   order_id customer_name product  quantity  unit_price  order_date region
0         1         Diana  Tablet       NaN       500.0  2024-01-20  South
1         2           Eve  Laptop       NaN         NaN  2024-04-29  North
2         3       Charlie  Laptop       2.0       250.0  2024-01-08    NaN
3         4           Eve  Laptop       2.0       750.0  2024-01-07   West
4         5           Eve  Tablet       3.0         NaN  2024-03-07  South
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   order_id       100 non-null    int64  
 1   customer_name  99 non-null     object 
 2   product        100 non-null    object 
 3   quantity       74 non-null     float64
 4   unit_price     65 non-null     float64
 5   order_date     99 non-null     object 
 6   region         75 non-null     object 
dtypes: float64(2), int64(1), object(4