In [None]:
import pandas as pd
from pathlib import Path

# 1. Load the raw and incremental CSV extracts from the project's data/ folder.
#    Both paths are relative, so they resolve against the kernel's working directory.
df_raw, df_inc = (
    pd.read_csv(csv_path)
    for csv_path in ("data/raw_data.csv", "data/incremental_data.csv")
)

In [None]:
# 2. Preview both tables: first rows, schema summary and descriptive statistics.
#    Every step runs on the raw table first and the incremental table second.
_tables = (("RAW DATA", df_raw), ("INCREMENTAL DATA", df_inc))

# a. Head - the leading rows of each table
for _label, _frame in _tables:
    print(f"{_label} - head():")
    display(_frame.head())

# b. Info - .info() prints its own report, so it is not wrapped in display()
for _label, _frame in _tables:
    print(f"{_label} - info():")
    _frame.info()

# c. Describe - include='all' summarises non-numeric columns as well
for _label, _frame in _tables:
    print(f"{_label} - describe():")
    display(_frame.describe(include='all'))

In [None]:
# 3. Observations - data-quality checks, run on the raw table and then the incremental one
_tables = (("RAW DATA", df_raw), ("INCREMENTAL DATA", df_inc))

# Null values: number of missing entries per column
for _label, _frame in _tables:
    print(f"Missing values in {_label}:")
    display(_frame.isna().sum())

# Duplicates: number of rows that repeat an earlier row in every column
for _label, _frame in _tables:
    print(f"Duplicate rows in {_label}:")
    print(_frame.duplicated().sum())

# Data types: the dtype pandas assigned to each column
for _label, _frame in _tables:
    print(f"Data types in {_label}:")
    display(_frame.dtypes)

I identified the following data quality issues:
- Missing values in both datasets.
- Duplicate rows in both datasets.
- Incorrect data types in some columns.

In [None]:
# Merge datasets: stack the raw rows on top of the incremental rows with a fresh
# 0..n-1 index, then drop rows that exactly repeat an earlier row (first copy is kept).
df_combined = pd.concat([df_raw, df_inc], ignore_index=True).drop_duplicates()

# Save the merged table under the relative data/ folder; index=False keeps the
# row index out of the CSV.
df_combined.to_csv("data/merged_data.csv", index=False)

The incremental dataset was merged with the raw dataset, and exact duplicate rows were dropped, so that the analysis includes both existing and newly added records. This step is vital for maintaining data completeness and supporting future transformations.