In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the raw geolocation table from the raw-data folder and preview its first rows.
# The path is relative, so this cell depends on the kernel's working directory.
geo = pd.read_csv("data/01_raw/olist_geolocation_dataset.csv")
geo.head()

In [None]:
# Column dtypes and non-null counts of the raw geolocation table.
geo.info()

In [None]:
# Summary statistics of the raw table (describe() defaults), useful for spotting
# implausible coordinate extremes before any filtering is applied.
geo.describe()

In [None]:
import matplotlib.pyplot as plt

# Raw point cloud: longitude on x, latitude on y. Tiny, semi-transparent markers
# keep dense areas readable.
fig, ax = plt.subplots()
ax.scatter(
    geo["geolocation_lng"],
    geo["geolocation_lat"],
    s=0.01,
    alpha=0.5,
)
ax.set_title("Geolocation Points Across Brazil")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.show()

In [None]:
# Per-prefix spread of the raw coordinates: min / max / std of latitude and
# longitude, the number of distinct city names, and the derived min-to-max range.
# Named aggregation ties every output column to its source column and statistic,
# so the labels cannot drift out of sync with the aggregation spec.
geo_range = (
    geo.groupby("geolocation_zip_code_prefix")
    .agg(
        lat_min=("geolocation_lat", "min"),
        lat_max=("geolocation_lat", "max"),
        lat_std=("geolocation_lat", "std"),
        lng_min=("geolocation_lng", "min"),
        lng_max=("geolocation_lng", "max"),
        lng_std=("geolocation_lng", "std"),
        n_cities=("geolocation_city", "nunique"),
    )
    .assign(
        lat_range=lambda df: df["lat_max"] - df["lat_min"],
        lng_range=lambda df: df["lng_max"] - df["lng_min"],
    )
    .reset_index()
)

# Only the 0-5 degree window is displayed. Passing the same window as `range`
# makes the 100 bins divide the visible interval instead of the full data range,
# which would otherwise leave only a few coarse bars in view when a handful of
# prefixes have very large ranges. Values above the window are not drawn, exactly
# as they were outside the x-limits before.
view_window_deg = (0, 5)

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
panels = (("lat_range", "Latitude"), ("lng_range", "Longitude"))
for ax, (column, label) in zip(axes, panels):
    ax.hist(geo_range[column], bins=100, range=view_window_deg)
    ax.set_title(f"{label} Range per Zip Code Prefix")
    ax.set_xlabel(f"{label} Range (degrees)")
    ax.set_ylabel("Frequency")
    ax.set_xlim(*view_window_deg)

fig.tight_layout()
plt.show()

In [None]:
# A prefix counts as "noisy" when its points span more than half a degree
# in latitude or in longitude.
spread_threshold_deg = 0.5
exceeds_threshold = (
    geo_range[["lat_range", "lng_range"]]
    .gt(spread_threshold_deg)
    .any(axis=1)
)
noisy_zips = geo_range[exceeds_threshold]
print(f"{len(noisy_zips)} zip prefixes have high spatial variance.")

In [None]:
# Drop every row belonging to a prefix flagged as noisy.
# NOTE(review): the ranges were computed on the raw points and the bounding-box
# clip only happens in a later cell, so a prefix with a single stray point loses
# all of its rows here - consider clipping before measuring the spread.
noisy_prefixes = noisy_zips["geolocation_zip_code_prefix"]
is_noisy = geo["geolocation_zip_code_prefix"].isin(noisy_prefixes)
clean_geo = geo.loc[~is_noisy]

In [None]:
# Brazil's bounding box as (min, max) pairs in decimal degrees
valid_lat_range = (-33.75116944, 5.27438888)
valid_lng_range = (-73.98283055, -34.79314722)

south, north = valid_lat_range
west, east = valid_lng_range

# Keep only points inside the box (bounds inclusive)
point_lat = clean_geo["geolocation_lat"]
point_lng = clean_geo["geolocation_lng"]
inside_box = (
    point_lat.ge(south) & point_lat.le(north)
    & point_lng.ge(west) & point_lng.le(east)
)
geo_clipped = clean_geo[inside_box]

n_before = len(clean_geo)
n_after = len(geo_clipped)
print(f"Original: {n_before:,} | Clipped: {n_after:,} | Removed: {n_before - n_after:,}")

In [None]:
# Number of rows that exactly repeat an earlier row (all columns equal) after filtering.
geo_clipped.duplicated().sum()

In [None]:
# Load the intermediate ("clean_*") order, customer, order-item and seller tables.
# Paths are relative to the kernel's working directory.
orders = pd.read_parquet("data/02_intermediate/clean_orders.parquet")
customers = pd.read_parquet("data/02_intermediate/clean_customers.parquet")
order_items = pd.read_parquet("data/02_intermediate/clean_items.parquet")
sellers = pd.read_parquet("data/02_intermediate/clean_sellers.parquet")

In [None]:
# One centroid (mean latitude / longitude) per zip prefix from the filtered points.
geo_centroids = (
    geo_clipped
    .groupby("geolocation_zip_code_prefix")[["geolocation_lat", "geolocation_lng"]]
    .mean()
    .reset_index()
)

# Same string casts as before, so these frames look unchanged to other cells.
customers["customer_zip_code_prefix"] = customers["customer_zip_code_prefix"].astype(str)
sellers["seller_zip_code_prefix"] = sellers["seller_zip_code_prefix"].astype(str)
geo_centroids["geolocation_zip_code_prefix"] = geo_centroids["geolocation_zip_code_prefix"].astype(str)


def _zip_key(prefix, width=5):
    """Return zip prefixes as zero-padded strings suitable for use as merge keys.

    A bare ``astype(str)`` gives "1037" for an integer column but leaves "01037"
    untouched in a string column, so the same prefix stored with different dtypes
    never matches. Padding both sides to a common width removes that mismatch.

    NOTE(review): ``width=5`` assumes five-digit prefixes - confirm against the
    dataset documentation.
    """
    return prefix.astype(str).str.zfill(width)


# Centroid lookup keyed by the normalised prefix, renamed once per role.
centroid_lookup = geo_centroids.assign(
    geolocation_zip_code_prefix=lambda df: _zip_key(df["geolocation_zip_code_prefix"])
)

customer_coords = centroid_lookup.rename(columns={
    "geolocation_zip_code_prefix": "customer_zip_code_prefix",
    "geolocation_lat": "customer_lat",
    "geolocation_lng": "customer_lng"
})

seller_coords = centroid_lookup.rename(columns={
    "geolocation_zip_code_prefix": "seller_zip_code_prefix",
    "geolocation_lat": "seller_lat",
    "geolocation_lng": "seller_lng"
})

# id -> normalised prefix lookups (built on copies; `customers` / `sellers` keep
# the plain string cast applied above).
customer_zip = customers[["customer_id", "customer_zip_code_prefix"]].assign(
    customer_zip_code_prefix=lambda df: _zip_key(df["customer_zip_code_prefix"])
)
seller_zip = sellers[["seller_id", "seller_zip_code_prefix"]].assign(
    seller_zip_code_prefix=lambda df: _zip_key(df["seller_zip_code_prefix"])
)

# Step 1: Merge orders with customers to get customer zip prefix
orders_customers = orders.merge(customer_zip, on="customer_id", how="left")

# Step 2: Merge order_items with sellers to get seller zip prefix
order_items_sellers = order_items.merge(seller_zip, on="seller_id", how="left")

# Step 3: Merge the two on order_id to get both customer and seller zip prefixes
transactions = order_items_sellers.merge(
    orders_customers[["order_id", "customer_zip_code_prefix"]],
    on="order_id", how="left"
)

# The keys are already normalised strings. They are deliberately not re-cast
# here: casting after the left joins would turn a missing prefix into the
# literal string "nan" instead of leaving it missing.
merged = (
    transactions
    .merge(customer_coords, on="customer_zip_code_prefix", how="left")
    .merge(seller_coords, on="seller_zip_code_prefix", how="left")
)
    
def haversine_distance(lat1, lng1, lat2, lng2, radius_km=6371):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lng1 : scalar or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lng2 : scalar or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
    radius_km : float, optional
        Sphere radius in kilometres. Defaults to 6371 (Earth radius in km).

    Returns
    -------
    Distance in kilometres, with the same shape as the (broadcast) inputs.
    NaN coordinates yield NaN distances.
    """
    lat1, lng1, lat2, lng2 = map(np.radians, [lat1, lng1, lat2, lng2])
    dlat = lat2 - lat1
    dlng = lng2 - lng1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng / 2) ** 2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make arcsin return NaN; clamp to the valid domain first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Attach the customer-to-seller distance, then keep only the rows where it could
# be computed (both centroids were found).
merged = (
    merged
    .assign(
        distance_km=lambda df: haversine_distance(
            df["customer_lat"],
            df["customer_lng"],
            df["seller_lat"],
            df["seller_lng"],
        )
    )
    .dropna(subset=["distance_km"])
)

merged.head()

In [None]:
import importlib
import os
import sys

# Make the project's `src` directory importable from the notebook. The path is
# resolved against the current working directory, so it works the same way on
# Linux and Windows without spelling out a full path.
src_path = os.path.abspath("src")
if src_path not in sys.path:
    sys.path.insert(0, src_path)

from the_team.utils import etl, viz

# Rerun this cell after editing the utils modules: reloading picks up the changes
# without restarting the kernel.
for utils_module in (etl, viz):
    importlib.reload(utils_module)

# Apply the shared plot style for consistency
viz.set_plot_style()

viz.plot_numeric_distribution(merged[['distance_km']])