In [5]:
import os
from getpass import getpass
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Read data from the Brazilian E-Commerce (Olist) dataset
Source: https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce?resource=download

In [1]:
# Folder with the Olist CSV files. Defaults to the original local path; set the
# OLIST_DATA_DIR environment variable to run the notebook on another machine.
folder_path = os.environ.get(
    "OLIST_DATA_DIR",
    "/Users/peter/Documents/Work/DataAnalysisPortfolio/brazilianecomdata",
)

table_to_path = {}  # PostgreSQL table name -> path of the CSV file that feeds it

# Preview every CSV in the folder. sorted() keeps the iteration order (and so the
# cell output) reproducible; os.listdir() returns entries in arbitrary order.
for file in sorted(os.listdir(folder_path)):
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(folder_path, file)

    # Drop only the trailing extension, then the dataset-specific prefix/suffix,
    # e.g. "olist_sellers_dataset.csv" -> "sellers".
    table_name = file[: -len(".csv")].replace("olist_", "").replace("_dataset", "")
    table_to_path[table_name] = file_path

    df = pd.read_csv(file_path)

    print(f" {file} — {df.shape[0]} rows, {df.shape[1]} columns")
    print(df.head())
    print("\n" + "="*50 + "\n")

 olist_sellers_dataset.csv — 3095 rows, 4 columns
                          seller_id  seller_zip_code_prefix  \
0  3442f8959a84dea7ee197c632cb2df15                   13023   
1  d1b65fc7debc3361ea86b5f14c68d2e2                   13844   
2  ce3ad9de960102d0677a81f5d0bb7b2d                   20031   
3  c0f3eea2e14555b6faeea3dd58c1b1c3                    4195   
4  51a04a8a6bdcb23deccc82b0b80742cf                   12914   

         seller_city seller_state  
0           campinas           SP  
1         mogi guacu           SP  
2     rio de janeiro           RJ  
3          sao paulo           SP  
4  braganca paulista           SP  


 product_category_name_translation.csv — 71 rows, 2 columns
    product_category_name product_category_name_english
0            beleza_saude                 health_beauty
1  informatica_acessorios         computers_accessories
2              automotivo                          auto
3         cama_mesa_banho                bed_bath_table
4        move

In [3]:
# Upload order for PostgreSQL. Tables are loaded in list order, so a table that
# other tables depend on has to appear before its dependents.
sorted_tables = [
    'customers',
    'sellers',
    'products',
    'orders',
    'order_items',
    'order_payments',
    'order_reviews',
    'geolocation',
    'product_category_name_translation',
]

# Upload CSV data to PostgreSQL

In [6]:
# Connection parameters for PostgreSQL
DB_USER = "postgres"
DB_PASSWORD = getpass()  # prompted at run time, so the password is never stored in the notebook
DB_HOST = "localhost"
DB_PORT = "5432"
DB_NAME = "ecommerce"

# Create the connection. The URL is assembled from its components instead of an
# f-string: interpolating the raw password breaks URL parsing as soon as it
# contains a character such as "@", ":" or "/". URL.create() takes each part
# verbatim, so no manual escaping is needed.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        port=int(DB_PORT),
        database=DB_NAME,
    )
)

 ········


In [45]:
# Create tables in the PostgreSQL DB

# Read the DDL script from <parent of the working directory>/sql_queries/create_tables.sql
ddl_path = Path.cwd().parent / "sql_queries" / "create_tables.sql"
ddl_query = ddl_path.read_text(encoding="utf-8")

# Execute the DDL. text() must be imported from sqlalchemy together with
# create_engine, otherwise this cell raises NameError on a fresh kernel.
# engine.begin() opens a transaction that is committed when the block exits
# normally and rolled back if the script raises, so no explicit commit() is needed.
with engine.begin() as connection:
    connection.execute(text(ddl_query))

print("DDL executed successfully!")


 ········


DDL executed successfully!


In [46]:
# Write csv to postgres
#
# NOTE: rows are appended (if_exists="append"), so running this cell a second
# time inserts every row again; whether that duplicates data or fails depends on
# the constraints declared in the DDL script (not visible here).

UPLOAD_CHUNK_SIZE = 10_000  # rows per INSERT batch; keeps memory bounded for the large tables

# Fail before touching the database when a table has no csv file, instead of
# discovering it halfway through and leaving a partially loaded schema.
missing_tables = [name for name in sorted_tables if name not in table_to_path]
if missing_tables:
    raise KeyError(f"No csv file found for tables: {missing_tables}")

for table_name in sorted_tables:
    file_path = table_to_path[table_name]

    print(f"Table {table_name}...")

    df = pd.read_csv(file_path)

    # Normalize column names: lower case, spaces replaced by underscores
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # Append to the existing PostgreSQL table, written in batches
    df.to_sql(
        table_name,
        engine,
        if_exists="append",
        index=False,
        chunksize=UPLOAD_CHUNK_SIZE,
    )

    print(f"Uploaded: {table_name}")

print("Upload sucessfull!")


 ········


Table customers...
Uploaded: customers
Table sellers...
Uploaded: sellers
Table products...
Uploaded: products
Table orders...
Uploaded: orders
Table order_items...
Uploaded: order_items
Table order_payments...
Uploaded: order_payments
Table order_reviews...
Uploaded: order_reviews
Table geolocation...
Uploaded: geolocation
Table product_category_name_translation...
Uploaded: product_category_name_translation
Upload sucessfull!


# Let's check the data in Postgres

### Check the row counts of the uploaded PostgreSQL tables

In [53]:
# List the ordinary tables of the public schema. pg_class.reltuples is
# deliberately NOT used for the counts: it is a planner estimate that is only
# refreshed by VACUUM / ANALYZE / autovacuum, so right after a bulk load it can
# be stale and make this check report false mismatches (or false passes).
tables = pd.read_sql('''
SELECT relname AS table_name
FROM pg_class
JOIN pg_namespace ON pg_class.relnamespace = pg_namespace.oid
WHERE nspname = 'public' AND relkind = 'r';
''', engine)


def _exact_row_count(table_name):
    """Return the exact number of rows of `table_name` in the public schema."""
    # Quote the identifier (doubling embedded quotes) so any table name is valid SQL.
    quoted_name = table_name.replace('"', '""')
    result = pd.read_sql(f'SELECT COUNT(*) AS row_count FROM public."{quoted_name}"', engine)
    return int(result.loc[0, "row_count"])


# Same shape as before: one row per table, largest table first.
df = (
    tables
    .assign(row_count=[_exact_row_count(name) for name in tables["table_name"]])
    .sort_values("row_count", ascending=False, ignore_index=True)
)

error_cnt = 0 # Number of tables whose PostgreSQL row count differs from the csv

for i in df.index:
    print(df.loc[i])

    table_name = df.loc[i, 'table_name']
    row_count = df.loc[i, 'row_count']

    # A table that was not loaded from one of the csv files has nothing to compare with.
    if table_name not in table_to_path:
        print(f"Skipped: no csv file known for table {table_name}")
        continue

    csv_rows_cnt = pd.read_csv(table_to_path[table_name]).shape[0]

    if row_count != csv_rows_cnt:
        print(f"Error, rows number mismatch {csv_rows_cnt} in csv, {row_count} in postgre")
        error_cnt += 1

print(f"\n{50*'='}\nTotal mismatches number: {error_cnt}")

table_name    geolocation
row_count         1000163
Name: 0, dtype: object
table_name    order_items
row_count          112650
Name: 1, dtype: object
table_name    order_payments
row_count             103886
Name: 2, dtype: object
table_name    orders
row_count      99441
Name: 3, dtype: object
table_name    customers
row_count         99441
Name: 4, dtype: object
table_name    order_reviews
row_count             99224
Name: 5, dtype: object
table_name    products
row_count        32951
Name: 6, dtype: object
table_name    sellers
row_count        3095
Name: 7, dtype: object
table_name    product_category_name_translation
row_count                                    71
Name: 8, dtype: object

Total mismatches number: 0


### Are there any orders with nonexistent customers?

In [49]:
# Anti-join: keep only the orders whose customer_id has no match in customers.
# An empty result means every order points at an existing customer.
df = pd.read_sql(
    sql='''
SELECT o.*
FROM orders o
LEFT JOIN customers c ON o.customer_id = c.customer_id
WHERE c.customer_id IS NULL;
''',
    con=engine,
)

df

 ········


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date


### Are there any orders with nonexistent products?

In [7]:
# Anti-join: keep only the order items whose product_id has no match in products.
# An empty result means every order item points at an existing product.
df = pd.read_sql(
    sql='''
SELECT oi.*
FROM order_items oi
LEFT JOIN products p ON oi.product_id = p.product_id
WHERE p.product_id IS NULL;
''',
    con=engine,
)

df

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
