In [1]:
from google.colab import files
import os, pandas as pd
from sqlalchemy import create_engine, inspect
from pathlib import Path

DATA_DIR = "/content/olist_manual"
DB_PATH  = "/content/olist.sqlite"
os.makedirs(DATA_DIR, exist_ok=True)

# Upload all 9 CSVs in one go
uploads = files.upload()  # multi-select the 9 CSVs on your computer
for name, data in uploads.items():
    with open(os.path.join(DATA_DIR, name), "wb") as f:
        f.write(data)

# Verify
required = [
    "olist_customers_dataset.csv","olist_sellers_dataset.csv","olist_order_reviews_dataset.csv",
    "olist_order_items_dataset.csv","olist_products_dataset.csv","olist_geolocation_dataset.csv",
    "product_category_name_translation.csv","olist_orders_dataset.csv","olist_order_payments_dataset.csv",
]
missing = [fn for fn in required if not (Path(DATA_DIR)/fn).exists()]
assert not missing, f"Missing files: {missing}"

# Create SQLite
engine = create_engine(f"sqlite:///{DB_PATH}", echo=False)
mapping = {
    "olist_customers_dataset.csv": "olist_customers",
    "olist_sellers_dataset.csv": "olist_sellers",
    "olist_order_reviews_dataset.csv": "olist_order_reviews",
    "olist_order_items_dataset.csv": "olist_order_items",
    "olist_products_dataset.csv": "olist_products",
    "olist_geolocation_dataset.csv": "olist_geolocation",
    "product_category_name_translation.csv": "product_category_name_translation",
    "olist_orders_dataset.csv": "olist_orders",
    "olist_order_payments_dataset.csv": "olist_order_payments",
}
for fname, tname in mapping.items():
    pd.read_csv(Path(DATA_DIR)/fname).to_sql(tname, con=engine, if_exists="replace", index=False)

print("Tables:", inspect(engine).get_table_names())
pd.read_sql_query("SELECT * FROM olist_customers LIMIT 5;", con=engine)


Saving olist_customers_dataset.csv to olist_customers_dataset.csv
Saving olist_geolocation_dataset.csv to olist_geolocation_dataset.csv
Saving olist_order_items_dataset.csv to olist_order_items_dataset.csv
Saving olist_order_payments_dataset.csv to olist_order_payments_dataset.csv
Saving olist_order_reviews_dataset.csv to olist_order_reviews_dataset.csv
Saving olist_orders_dataset.csv to olist_orders_dataset.csv
Saving olist_products_dataset.csv to olist_products_dataset.csv
Saving olist_sellers_dataset.csv to olist_sellers_dataset.csv
Saving product_category_name_translation.csv to product_category_name_translation.csv
Tables: ['olist_customers', 'olist_geolocation', 'olist_order_items', 'olist_order_payments', 'olist_order_reviews', 'olist_orders', 'olist_products', 'olist_sellers', 'product_category_name_translation']


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


**Query 1: Count and Percentage of Orders Purchased in Jan 2018 with 5 Review Score**

Write and execute a SQL query to count the number of orders purchased in January 2018 that have a review score of 5 and calculate the percentage of such orders.

In [15]:
q1 = '''
SELECT
    SUM(review_score_5)                           AS orders_with_review_5,
    COUNT(*)                                      AS total_orders,
    ROUND(100.0 * SUM(review_score_5) / NULLIF(COUNT(*), 0), 2) AS percentage_5_star

FROM (
    SELECT
        o.order_id,
        CASE WHEN MAX(CASE WHEN r.review_score = 5 THEN 1 ELSE 0 END) = 1
             THEN 1 ELSE 0 END AS review_score_5
    FROM olist_orders o
    LEFT JOIN olist_order_reviews r
           ON r.order_id = o.order_id
    WHERE date(o.order_purchase_timestamp) >= date('2018-01-01')
      AND date(o.order_purchase_timestamp) <  date('2018-02-01')
    GROUP BY o.order_id
) per_order;
'''
df_q1 = pd.read_sql_query(q1, con=engine)
df_q1.head()

Unnamed: 0,orders_with_review_5,total_orders,percentage_5_star
0,4077,7269,56.09


**Query 2: Customer Purchase Trend Year-on-Year**

Write and execute a SQL query to analyze the customer purchase trend year-on-year.

In [3]:
q2 = """
WITH yearly AS (
    SELECT
        strftime('%Y', o.order_purchase_timestamp) AS yr,
        COUNT(*) AS orders,
        COUNT(DISTINCT o.customer_id) AS unique_customers
    FROM olist_orders o
    GROUP BY strftime('%Y', o.order_purchase_timestamp)
)
SELECT
    yr,
    orders,
    unique_customers,
    ROUND(100 * (orders - LAG(orders) OVER (ORDER BY yr))
          / NULLIF(LAG(orders) OVER (ORDER BY yr), 0), 2) AS orders_yoy_pct,

  /* NULLIF(value, 0) turns the divisor into NULL if it’s zero, so we avoid division-by-zero errors.
  If the previous year’s orders = 0, the result of the division will be NULL instead of crashing. */

    ROUND(100 * (unique_customers - LAG(unique_customers) OVER (ORDER BY yr))
          / NULLIF(LAG(unique_customers) OVER (ORDER BY yr), 0), 2) AS customers_yoy_pct
FROM yearly
ORDER BY yr;
"""
df_q2 = pd.read_sql_query(q2, engine)
df_q2


Unnamed: 0,yr,orders,unique_customers,orders_yoy_pct,customers_yoy_pct
0,2016,329,329,,
1,2017,45101,45101,13608.0,13608.0
2,2018,54011,54011,19.0,19.0


In [4]:
sql = '''
SELECT
    month AS month_no,
    CASE
        WHEN a.month = '01' THEN 'Jan'
        WHEN a.month = '02' THEN 'Feb'
        WHEN a.month = '03' THEN 'Mar'
        WHEN a.month = '04' THEN 'Apr'
        WHEN a.month = '05' THEN 'May'
        WHEN a.month = '06' THEN 'Jun'
        WHEN a.month = '07' THEN 'Jul'
        WHEN a.month = '08' THEN 'Aug'
        WHEN a.month = '09' THEN 'Sep'
        WHEN a.month = '10' THEN 'Oct'
        WHEN a.month = '11' THEN 'Nov'
        WHEN a.month = '12' THEN 'Dec'
        ELSE 0
    END AS month,
    SUM(CASE WHEN a.year = '2016' THEN 1 ELSE 0 END) AS Year2016,
    SUM(CASE WHEN a.year = '2017' THEN 1 ELSE 0 END) AS Year2017,
    SUM(CASE WHEN a.year = '2018' THEN 1 ELSE 0 END) AS Year2018
FROM (
    SELECT
        customer_id,
        order_id,
        order_delivered_customer_date,
        order_status,
        strftime('%Y', order_delivered_customer_date) AS Year,
        strftime('%m', order_delivered_customer_date) AS Month
    FROM olist_orders
    WHERE order_status = 'delivered' AND order_delivered_customer_date IS NOT NULL
    GROUP BY customer_id, order_id, order_delivered_customer_date
    ORDER BY order_delivered_customer_date ASC
) a
GROUP BY month
ORDER BY month_no ASC
'''

df_sql = pd.read_sql_query(sql, con=engine)
print(df_sql.head(12))

   month_no month  Year2016  Year2017  Year2018
0        01   Jan         0       283      6597
1        02   Feb         0      1351      5850
2        03   Mar         0      2382      6824
3        04   Apr         0      1849      7850
4        05   May         0      3751      7111
5        06   Jun         0      3223      6829
6        07   Jul         0      3455      5839
7        08   Aug         0      4302      8314
8        09   Sep         0      3965        56
9        10   Oct       205      4494         3
10       11   Nov        58      4670         0
11       12   Dec         4      7205         0


**Query 3: Average Order Values of Customers**

Write and execute a SQL query to calculate the average order values of customers.

In [None]:
q3 = """
WITH order_values AS (
    SELECT
        oi.order_id,
        SUM(oi.price + oi.freight_value) AS order_value
    FROM olist_order_items oi
    GROUP BY oi.order_id
),
customer_orders AS (
    SELECT
        o.customer_id,
        ov.order_value
    FROM olist_orders o
    JOIN order_values ov ON ov.order_id = o.order_id
)
SELECT
    c.customer_id,
    COUNT(*) AS orders_count,
    ROUND(AVG(order_value), 2) AS avg_order_value,
    ROUND(SUM(order_value), 2) AS total_spent
FROM customer_orders c
GROUP BY c.customer_id
ORDER BY total_spent DESC;  -- optional: see top spenders first
"""
df_q3 = pd.read_sql_query(q3, engine)
df_q3.head(20)  # show a sample


Unnamed: 0,customer_id,orders_count,avg_order_value,total_spent
0,1617b1357756262bfa56ab541c47bc16,1,13664.08,13664.08
1,ec5b2ba62e574342386871631fafd3fc,1,7274.88,7274.88
2,c6e2731c5b391845f6800c97401a43a9,1,6929.31,6929.31
3,f48d464a0baaea338cb25f816991ab1f,1,6922.21,6922.21
4,3fd6777bbce08a352fddd04e4a7cc8f6,1,6726.66,6726.66
5,05455dfa7cd02f13d132aa7a6a9729c6,1,6081.54,6081.54
6,df55c14d1476a9a3467f131269c2477f,1,4950.34,4950.34
7,e0a2412720e9ea4f26c1ac985f6a7358,1,4809.44,4809.44
8,24bbf5fd2f2e1b359ee7de94defc4a15,1,4764.34,4764.34
9,3d979689f636322c62418b6346b1c6d2,1,4681.78,4681.78


**Query 4: Top 5 Cities with Highest Revenue from 2016 to 2018**

Write and execute a SQL query to find the top 5 cities with the highest revenue from 2016 to 2018.

In [7]:
q4 = """
WITH order_revenue AS (
    SELECT
        oi.order_id,
        SUM(oi.price + oi.freight_value) AS revenue
    FROM olist_order_items oi
    GROUP BY oi.order_id
)
SELECT
    oc.customer_city,
    ROUND(SUM(orv.revenue), 2) AS total_revenue
FROM olist_orders o
JOIN olist_customers oc ON o.customer_id = oc.customer_id
JOIN order_revenue orv ON o.order_id = orv.order_id
WHERE strftime('%Y', o.order_purchase_timestamp) BETWEEN '2016' AND '2018'
GROUP BY oc.customer_city
ORDER BY total_revenue DESC
LIMIT 5;
"""
df_q4 = pd.read_sql_query(q4, engine)
df_q4.head() #showing the top 5 cities

Unnamed: 0,customer_city,total_revenue
0,sao paulo,2170227.12
1,rio de janeiro,1154234.02
2,belo horizonte,416733.39
3,brasilia,352305.14
4,curitiba,244739.87


**Query 5: State Wise Revenue Table Between 2016 to 2018**

Write and execute a SQL query to create a state-wise revenue table between 2016 to 2018.

In [8]:
q5 = """
WITH order_revenue AS (
    SELECT
        oi.order_id,
        SUM(oi.price + oi.freight_value) AS revenue
    FROM olist_order_items oi
    GROUP BY oi.order_id
)
SELECT
    oc.customer_state,
    ROUND(SUM(orv.revenue), 2) AS total_revenue
FROM olist_orders o
JOIN olist_customers oc ON o.customer_id = oc.customer_id
JOIN order_revenue orv ON o.order_id = orv.order_id
WHERE strftime('%Y', o.order_purchase_timestamp) BETWEEN '2016' AND '2018'
GROUP BY oc.customer_state
ORDER BY total_revenue DESC;
"""
df_q5 = pd.read_sql_query(q5, engine)
df_q5.head() #showing the revenue according to state

Unnamed: 0,customer_state,total_revenue
0,SP,5921678.12
1,RJ,2129681.98
2,MG,1856161.49
3,RS,885826.76
4,PR,800935.44


**Query 6: Top Successful Sellers in Terms of Goods Sold, Revenue, and Customer Count**

Write and execute a SQL query to identify the top successful sellers in terms of the number of goods sold, total revenue, customer count, and sellers with the highest 5-star ratings.

In [9]:
q6 = """
WITH seller_sales AS (
    SELECT
        oi.seller_id,
        COUNT(oi.order_item_id) AS goods_sold,
        SUM(oi.price + oi.freight_value) AS total_revenue,
        COUNT(DISTINCT o.customer_id) AS num_customers
    FROM olist_order_items oi
    JOIN olist_orders o ON oi.order_id = o.order_id
    GROUP BY oi.seller_id
),
seller_reviews AS (
    SELECT
        oi.seller_id,
        COUNT(CASE WHEN r.review_score = 5 THEN 1 END) AS five_star_reviews,
        COUNT(r.review_score) AS total_reviews,
        ROUND(100.0 * COUNT(CASE WHEN r.review_score = 5 THEN 1 END) / NULLIF(COUNT(r.review_score), 0), 2) AS five_star_percentage
    FROM olist_order_items oi
    JOIN olist_order_reviews r ON oi.order_id = r.order_id
    GROUP BY oi.seller_id
)
SELECT
    ss.seller_id,
    ss.goods_sold,
    ss.total_revenue,
    ss.num_customers,
    COALESCE(sr.five_star_reviews, 0) AS five_star_reviews,
    COALESCE(sr.total_reviews, 0) AS total_reviews,
    COALESCE(sr.five_star_percentage, 0.0) AS five_star_percentage
FROM seller_sales ss
LEFT JOIN seller_reviews sr ON ss.seller_id = sr.seller_id
ORDER BY ss.total_revenue DESC, COALESCE(sr.five_star_percentage, 0.0) DESC, ss.goods_sold DESC
LIMIT 10;
"""
df_q6 = pd.read_sql_query(q6, engine)
df_q6.head(10) #showing the top sellers according to number of goods sold, total revenue, customer count and 5 star ratings

Unnamed: 0,seller_id,goods_sold,total_revenue,num_customers,five_star_reviews,total_reviews,five_star_percentage
0,4869f7a5dfa277a7dca6462dcf3b52b2,1156,249640.7,1132,683,1148,59.49
1,7c67e1448b00f6e969d365cea6b010ab,1364,239536.44,982,437,1367,31.97
2,53243585a1d6dc2643021fd1853d8905,410,235856.68,358,210,408,51.47
3,4a3ca9315b744ce9f8e9374361493884,1987,235539.96,1806,947,1984,47.73
4,fa1c13f2614d7b5c4749cbc52fecda94,586,204084.73,585,394,582,67.7
5,da8622b14eb17ae2831f4ac5b9dab84a,1551,185192.32,1314,893,1568,56.95
6,7e93a43ef30c4f03f38b393420bc753a,340,182754.05,336,213,339,62.83
7,1025f0e2d44d7041d6cf58b6550e0bfa,1428,172860.69,915,729,1431,50.94
8,7a67c85e85bb2ce8582c35f2203ad736,1171,162648.38,1160,717,1166,61.49
9,955fee9216a65b617aa5c0531780ce60,1499,160602.68,1287,804,1489,54.0


**Query 7: Delivery Success Rate Across States**

Write and execute a SQL query to calculate the delivery success rate across different states.

In [10]:
q7 = """
SELECT
    oc.customer_state,
    COUNT(o.order_id) AS total_orders,
    SUM(CASE WHEN o.order_status = 'delivered' AND o.order_delivered_customer_date IS NOT NULL THEN 1 ELSE 0 END) AS delivered_orders,
    ROUND(100.0 * SUM(CASE WHEN o.order_status = 'delivered' AND o.order_delivered_customer_date IS NOT NULL THEN 1 ELSE 0 END) / COUNT(o.order_id), 2) AS delivery_success_rate
FROM olist_orders o
JOIN olist_customers oc ON o.customer_id = oc.customer_id
GROUP BY oc.customer_state
ORDER BY delivery_success_rate DESC, total_orders DESC;
"""
df_q7 = pd.read_sql_query(q7, engine)
df_q7.head(10) #showing the top states with high delivery success rate

Unnamed: 0,customer_state,total_orders,delivered_orders,delivery_success_rate
0,AC,81,80,98.77
1,AP,68,67,98.53
2,ES,2033,1995,98.13
3,MS,715,701,98.04
4,AM,148,145,97.97
5,TO,280,274,97.86
6,RS,5466,5344,97.77
7,RN,485,474,97.73
8,MT,907,886,97.68
9,MG,11635,11354,97.58


**Query 8: Preferred Form of Payment for Different Categories**

Write and execute a SQL query to find the preferred form of payment for different product categories.

In [13]:
q8 = """
SELECT
    p.product_category_name,
    pt.product_category_name_english,
    op.payment_type,
    COUNT(op.payment_type) AS payment_count
FROM olist_order_items oi
JOIN olist_products p ON oi.product_id = p.product_id
LEFT JOIN product_category_name_translation pt ON p.product_category_name = pt.product_category_name
JOIN olist_orders o ON oi.order_id = o.order_id
JOIN olist_order_payments op ON o.order_id = op.order_id
WHERE pt.product_category_name_english IS NOT NULL
GROUP BY
    p.product_category_name,
    pt.product_category_name_english,
    op.payment_type
ORDER BY
    pt.product_category_name_english ASC,
    payment_count DESC;

"""
df_q8 = pd.read_sql_query(q8, engine)
df_q8.head(10) #showing the preferred form of payment for products, excluding the null values

Unnamed: 0,product_category_name,product_category_name_english,payment_type,payment_count
0,agro_industria_e_comercio,agro_industry_and_commerce,credit_card,145
1,agro_industria_e_comercio,agro_industry_and_commerce,boleto,60
2,agro_industria_e_comercio,agro_industry_and_commerce,voucher,42
3,agro_industria_e_comercio,agro_industry_and_commerce,debit_card,5
4,climatizacao,air_conditioning,credit_card,222
5,climatizacao,air_conditioning,boleto,69
6,climatizacao,air_conditioning,voucher,8
7,climatizacao,air_conditioning,debit_card,3
8,artes,art,credit_card,153
9,artes,art,boleto,47


**Query 9: Distance Between Cities**

Write and execute a SQL query to calculate the distance between cities.

In [14]:
q9 = """
WITH CityCoordinates AS (
    SELECT
        LOWER(geolocation_city) AS city,
        AVG(geolocation_lat) AS avg_lat,
        AVG(geolocation_lng) AS avg_lng
    FROM olist_geolocation
    GROUP BY LOWER(geolocation_city)
),
Distances AS (
    SELECT
        c1.city AS city1,
        c2.city AS city2,
        6371 * 2 * ASIN(
            SQRT(
                POWER(SIN(RADIANS((c2.avg_lat - c1.avg_lat) / 2)), 2) +
                COS(RADIANS(c1.avg_lat)) * COS(RADIANS(c2.avg_lat)) *
                POWER(SIN(RADIANS((c2.avg_lng - c1.avg_lng) / 2)), 2)
            )
        ) AS distance_km
    FROM CityCoordinates c1, CityCoordinates c2
    WHERE c1.city < c2.city
)
SELECT
    city1,
    city2,
    ROUND(distance_km, 2) AS distance_km
FROM Distances
ORDER BY distance_km DESC
LIMIT 10;
"""
df_q9 = pd.read_sql_query(q9, engine)
df_q9.head(10) #showing the distances between cities

Unnamed: 0,city1,city2,distance_km
0,conquista d'oeste,santa lucia do piai,19946.34
1,nova lacerda,santa lucia do piai,19939.57
2,santa lucia do piai,vale de são domingos,19934.5
3,reserva do cabacal,santa lucia do piai,19934.2
4,santa lucia do piai,vale de sao domingos,19934.16
5,jauru,santa lucia do piai,19931.14
6,pontes e lacerda,santa lucia do piai,19928.74
7,humaitá,santa lucia do piai,19921.56
8,figueirópolis d'oeste,santa lucia do piai,19917.93
9,figueiropolis doeste,santa lucia do piai,19917.81
