In [1]:
# Connect to PostgreSQL database
import os
import psycopg2
import pandas as pd
from dotenv import load_dotenv

# Run python-dotenv's loader first so the environment lookups below happen after it.
load_dotenv()

# Connection settings come from the environment; every key except the
# password falls back to a local-development default.
_conn_kwargs = {
    "host": os.environ.get("DB_HOST", "localhost"),
    "port": os.environ.get("DB_PORT", "5441"),
    "user": os.environ.get("DB_USER", "postgres"),
    "password": os.environ.get("DB_PASS"),
    "database": os.environ.get("DB_NAME", "superstore_db"),
}

# Single shared connection used by run_query() for the rest of the notebook.
conn = psycopg2.connect(**_conn_kwargs)

def run_query(sql: str, params=None) -> pd.DataFrame:
    """Run a SQL statement on the notebook's shared connection.

    Args:
        sql: SQL text to execute.
        params: Optional bind parameters, forwarded unchanged to
            ``pd.read_sql``. Prefer these over formatting values into ``sql``.

    Returns:
        The DataFrame that ``pd.read_sql`` builds from the result set.

    Raises:
        Exception: whatever ``pd.read_sql`` raises is re-raised after the
            connection has been rolled back.
    """
    import warnings

    try:
        with warnings.catch_warnings():
            # pandas warns on every call when handed a raw DBAPI connection
            # that is not SQLAlchemy/sqlite3; silence only that message so the
            # cell outputs stay readable.
            warnings.filterwarnings(
                "ignore",
                message="pandas only supports SQLAlchemy",
                category=UserWarning,
            )
            return pd.read_sql(sql, conn, params=params)
    except Exception:
        # A failed statement leaves the open transaction aborted on the
        # server; without a rollback every later query on `conn` would fail.
        conn.rollback()
        raise

# Smoke test: announce the connection, then list the tables in the public schema.
print("Successfully connected to SuperStore database!")
tables = run_query("SELECT table_name FROM information_schema.tables WHERE table_schema='public'")
print("Tables:", ", ".join(tables["table_name"].tolist()))


Successfully connected to SuperStore database!
Tables: orders, products, people, returned_orders
Tables: orders, products, people, returned_orders


  return pd.read_sql(sql, conn)


# Cleaning a PostgreSQL Database
![Clean PostgreSQL Database](Project_Image.jpeg)

In this project, you will work with data from a hypothetical Super Store to challenge and enhance your SQL skills in data cleaning. This project will engage you in identifying top categories based on the highest profit margins and detecting missing values, utilizing your comprehensive knowledge of SQL concepts.

## Data Dictionary:

### `orders`:
| Column | Definition | Data type | Comments |
|--------|------------|-----------|----------|
| `row_id` | Unique Record ID | `INTEGER` | |
| `order_id` | Identifier for each order in table | `TEXT` | Connects to `order_id` in `returned_orders` table |
| `order_date` | Date when order was placed | `TEXT` | |
| `market` | Market order_id belongs to | `TEXT` | |
| `region` | Region Customer belongs to | `TEXT` | Connects to `region` in `people` table |
| `product_id` | Identifier of Product bought | `TEXT` | Connects to `product_id` in `products` table |
| `sales` | Total Sales Amount for the Line Item | `DOUBLE PRECISION` | |
| `quantity` | Total Quantity for the Line Item | `DOUBLE PRECISION` | |
| `discount` | Discount applied for the Line Item | `DOUBLE PRECISION` | |
| `profit` | Total Profit earned on the Line Item | `DOUBLE PRECISION` | |

### `returned_orders`:
| Column | Definition | Data type |
|--------|------------|-----------|
| `returned` | `Yes` when the Order / Line Item was returned | `TEXT` |
| `order_id` | Identifier for each order in table | `TEXT` |
| `market` | Market order_id belongs to | `TEXT` |

### `people`:
| Column | Definition | Data type |
|--------|------------|-----------|
| `person`| Name of Salesperson credited with Order | `TEXT` |
| `region` | Region the Salesperson is operating in | `TEXT` |

### `products`:
| Column | Definition | Data type |
|--------|------------|-----------|
| `product_id`| Unique Identifier for the Product | `TEXT` |
| `category` | Category Product belongs to | `TEXT` |
| `sub_category` | Sub Category Product belongs to | `TEXT` |
| `product_name` | Detailed Name of the Product | `TEXT` |

As you can see in the Data Dictionary above, date fields have been written to the `orders` table as `TEXT`, and numeric fields such as `sales` and `profit` have been written to the `orders` table as `DOUBLE PRECISION`. You will need to account for these types (for example, by casting) in some of the queries. This project is an excellent opportunity to apply your SQL skills in a practical setting and gain valuable experience in data cleaning and analysis. Good luck, and happy querying!

In [2]:
# Top 5 products in each category by total sales.
#
# product_totals: sums sales and profit per (category, product_name).
# ranked:         rounds the totals to 2 decimals and ranks products inside
#                 each category on the unrounded sales total, using
#                 product_name as the tie-breaker.
# Final SELECT:   keeps ranks 1-5 and orders rows by category, then by rank.
#                 Ordering by rank (rather than by the rounded sales value)
#                 keeps the displayed order consistent with product_rank and
#                 stable between runs when rounded totals tie.
query = """
WITH product_totals AS (
    SELECT
        p.category,
        p.product_name,
        SUM(o.sales)   AS sum_sales,
        SUM(o.profit)  AS sum_profit
    FROM orders o
    JOIN products p
      ON p.product_id = o.product_id
    GROUP BY p.category, p.product_name
),
ranked AS (
    SELECT
        category,
        product_name,
        ROUND(sum_sales::numeric,  2) AS product_total_sales,
        ROUND(sum_profit::numeric, 2) AS product_total_profit,
        DENSE_RANK() OVER (
            PARTITION BY category
            ORDER BY sum_sales DESC, product_name ASC
        ) AS product_rank
    FROM product_totals
)
SELECT
    category,
    product_name,
    product_total_sales,
    product_total_profit,
    product_rank
FROM ranked
WHERE product_rank <= 5
ORDER BY category ASC, product_rank ASC;
"""

df_top5 = run_query(query)
print(f"Top 5 products in each category (Total: {len(df_top5)} rows)")
display(df_top5)


  return pd.read_sql(sql, conn)


Top 5 products in each category (Total: 15 rows)


Unnamed: 0,category,product_name,product_total_sales,product_total_profit,product_rank
0,Furniture,"Hon Executive Leather Armchair, Adjustable",58193.48,5997.25,1
1,Furniture,"Office Star Executive Leather Armchair, Adjust...",51449.8,4925.8,2
2,Furniture,"Harbour Creations Executive Leather Armchair, ...",50121.52,10427.33,3
3,Furniture,"SAFCO Executive Leather Armchair, Black",41923.53,7154.28,4
4,Furniture,"Novimex Executive Leather Armchair, Adjustable",40585.13,5562.35,5
5,Office Supplies,"Eldon File Cart, Single Width",39873.23,5571.26,1
6,Office Supplies,"Hoover Stove, White",32842.6,-2180.63,2
7,Office Supplies,"Hoover Stove, Red",32644.13,11651.68,3
8,Office Supplies,"Rogers File Cart, Single Width",29558.82,2368.82,4
9,Office Supplies,"Smead Lockers, Industrial",28991.66,3630.44,5


In [3]:
# Impute missing quantity values using the median unit price per product.
#
# missing:     order rows whose quantity is NULL (a NULL discount is shown as 0.0).
# unit_prices: per product_id, the median of sales / quantity over rows that
#              have a non-NULL, non-zero quantity and a non-NULL sales value.
# Final SELECT: calculated_quantity = sales / median unit price, rounded to a
#              whole number; it stays NULL when the product has no usable
#              median (LEFT JOIN miss, zero median) or sales is NULL.
#
# The result is explicitly ordered so that head(10) shows the same rows on
# every run; without ORDER BY the row order is up to the database.
#
# NOTE(review): the median is taken across all of a product's orders regardless
# of discount, market, or region. If `sales` is recorded net of discount, rows
# with a discount may get a skewed quantity -- confirm against the data.
query = """
WITH missing AS (
    SELECT
        product_id,
        COALESCE(discount, 0.0) AS discount,
        market,
        region,
        sales,
        quantity
    FROM orders
    WHERE quantity IS NULL
),
unit_prices AS (
    SELECT
        product_id,
        percentile_cont(0.5) WITHIN GROUP (ORDER BY sales / NULLIF(quantity, 0)) AS median_uprice
    FROM orders
    WHERE quantity IS NOT NULL
      AND quantity <> 0
      AND sales IS NOT NULL
    GROUP BY product_id
)
SELECT
    m.product_id,
    m.discount,
    m.market,
    m.region,
    m.sales,
    m.quantity,
    CASE
        WHEN up.median_uprice IS NOT NULL AND up.median_uprice <> 0 AND m.sales IS NOT NULL
            THEN ROUND((m.sales / up.median_uprice)::numeric, 0)
        ELSE NULL
    END AS calculated_quantity
FROM missing m
LEFT JOIN unit_prices up
  ON m.product_id = up.product_id
ORDER BY m.product_id, m.market, m.region, m.sales;
"""

df_imputed = run_query(query)
print(f"Orders with missing quantity values: {len(df_imputed)}")
print(f"Successfully imputed: {df_imputed['calculated_quantity'].notna().sum()}")
display(df_imputed.head(10))


  return pd.read_sql(sql, conn)


Orders with missing quantity values: 5
Successfully imputed: 5


Unnamed: 0,product_id,discount,market,region,sales,quantity,calculated_quantity
0,FUR-ADV-10000571,0.0,EMEA,EMEA,438.96,,4.0
1,FUR-ADV-10004395,0.0,EMEA,EMEA,84.12,,5.0
2,FUR-BO-10001337,0.15,US,West,308.499,,3.0
3,TEC-STA-10003330,0.0,Africa,Africa,506.64,,2.0
4,TEC-STA-10004542,0.0,Africa,Africa,160.32,,4.0
