### This notebook loads the raw data files, checks their structure, and performs initial setup for the rest of the project.

In [1]:
import numpy as np
import pandas as pd
import os


### Load Raw Data

In [28]:
# Load the raw CSV files from the sibling data/ directory.
# Every path uses forward slashes: they resolve on Windows, macOS and Linux,
# whereas a backslash-separated literal only resolves on Windows and relies on
# invalid escape sequences (e.g. "\d") that Python emits warnings for.
df = pd.read_csv('../data/olist_dataset.csv')
orders = pd.read_csv('../data/olist_orders_dataset.csv')
customers = pd.read_csv('../data/olist_customers_dataset.csv')
# Loaded from the product-category name translation file.
product = pd.read_csv('../data/product_category_name_translation.csv')

### Create the data quality function

In [14]:
def check_data_quality(df):
    """Print a six-part data quality report for ``df`` and return two summaries.

    The printed sections are, in order: missing values, column dtypes,
    duplicate rows, ``describe()`` statistics, per-column distinct counts and
    deep memory usage in MB.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Only read-only pandas methods are called on it.

    Returns
    -------
    tuple
        ``(missing_info, unique_counts)``. ``missing_info`` is a DataFrame
        indexed by column name with 'Missing Values' and 'Percentage' columns;
        it contains every column, including those without nulls (only the
        printed view is filtered). ``unique_counts`` is the Series returned by
        ``df.nunique()``.
    """
    print("=== DATA QUALITY REPORT ===")

    # Section 1: null count per column, absolute and as a share of all rows.
    print("\n1. MISSING VALUES ANALYSIS:")
    null_counts = df.isna().sum()
    row_total = len(df)
    missing_info = pd.DataFrame({
        'Missing Values': null_counts,
        'Percentage': (null_counts / row_total) * 100,
    })
    # Show only the columns that actually contain nulls, worst offenders first.
    has_nulls = missing_info['Missing Values'].gt(0)
    print(missing_info.loc[has_nulls].sort_values(by='Missing Values', ascending=False))

    # Section 2: dtype of every column.
    print("\n2. DATA TYPES:")
    print(df.dtypes)

    # Section 3: number of rows that repeat an earlier row exactly.
    print("\n3. DUPLICATE CHECK:")
    duplicate_rows = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicate_rows}")

    # Section 4: pandas' default describe() summary.
    print("\n4. BASIC STATISTICS FOR NUMERICAL COLUMNS:")
    print(df.describe())

    # Section 5: distinct-value count per column.
    print("\n5. UNIQUE VALUES COUNT:")
    unique_counts = df.nunique()
    print(unique_counts)

    # Section 6: deep memory footprint, converted from bytes to MB.
    print("\n6. MEMORY USAGE:")
    megabytes = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Total memory usage: {megabytes:.2f} MB")

    return missing_info, unique_counts

    

### Check data quality for each DataFrame


In [24]:
# Run the quality report on the orders table. The call is the cell's last
# expression, so the returned (missing_info, unique_counts) tuple is echoed
# after the printed report.
check_data_quality(orders)

=== DATA QUALITY REPORT ===

1. MISSING VALUES ANALYSIS:
                               Missing Values  Percentage
order_delivered_customer_date            2965    2.981668
order_delivered_carrier_date             1783    1.793023
order_approved_at                         160    0.160899

2. DATA TYPES:
order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

3. DUPLICATE CHECK:
Number of duplicate rows: 0

4. BASIC STATISTICS FOR NUMERICAL COLUMNS:
                                order_id                       customer_id  \
count                              99441                             99441   
unique                             99441                             99441   
top     e481f51cbdc54678b7cc49136f

(                               Missing Values  Percentage
 order_id                                    0    0.000000
 customer_id                                 0    0.000000
 order_status                                0    0.000000
 order_purchase_timestamp                    0    0.000000
 order_approved_at                         160    0.160899
 order_delivered_carrier_date             1783    1.793023
 order_delivered_customer_date            2965    2.981668
 order_estimated_delivery_date               0    0.000000,
 order_id                         99441
 customer_id                      99441
 order_status                         8
 order_purchase_timestamp         98875
 order_approved_at                90733
 order_delivered_carrier_date     81018
 order_delivered_customer_date    95664
 order_estimated_delivery_date      459
 dtype: int64)

In [27]:
# Run the quality report on the product-category translation table. The call
# is the cell's last expression, so the returned (missing_info, unique_counts)
# tuple is echoed after the printed report.
check_data_quality(product)

=== DATA QUALITY REPORT ===

1. MISSING VALUES ANALYSIS:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []

2. DATA TYPES:
product_category_name            object
product_category_name_english    object
dtype: object

3. DUPLICATE CHECK:
Number of duplicate rows: 0

4. BASIC STATISTICS FOR NUMERICAL COLUMNS:
       product_category_name product_category_name_english
count                     71                            71
unique                    71                            71
top             beleza_saude                 health_beauty
freq                       1                             1

5. UNIQUE VALUES COUNT:
product_category_name            71
product_category_name_english    71
dtype: int64

6. MEMORY USAGE:
Total memory usage: 0.01 MB


(                               Missing Values  Percentage
 product_category_name                       0         0.0
 product_category_name_english               0         0.0,
 product_category_name            71
 product_category_name_english    71
 dtype: int64)

In [21]:
# Run the quality report on the customers table. The call is the cell's last
# expression, so the returned (missing_info, unique_counts) tuple is echoed
# after the printed report.
check_data_quality(customers)

=== DATA QUALITY REPORT ===

1. MISSING VALUES ANALYSIS:
Empty DataFrame
Columns: [Missing Values, Percentage]
Index: []

2. DATA TYPES:
customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

3. DUPLICATE CHECK:
Number of duplicate rows: 0

4. BASIC STATISTICS FOR NUMERICAL COLUMNS:
       customer_zip_code_prefix
count              99441.000000
mean               35137.474583
std                29797.938996
min                 1003.000000
25%                11347.000000
50%                24416.000000
75%                58900.000000
max                99990.000000

5. UNIQUE VALUES COUNT:
customer_id                 99441
customer_unique_id          96096
customer_zip_code_prefix    14994
customer_city                4119
customer_state                 27
dtype: int64

6. MEMORY USAGE:
Total memory usage: 29.62 MB


(                          Missing Values  Percentage
 customer_id                            0         0.0
 customer_unique_id                     0         0.0
 customer_zip_code_prefix               0         0.0
 customer_city                          0         0.0
 customer_state                         0         0.0,
 customer_id                 99441
 customer_unique_id          96096
 customer_zip_code_prefix    14994
 customer_city                4119
 customer_state                 27
 dtype: int64)