In [None]:
import pandas as pd
from haversine import haversine, Unit
# Base table for this notebook (carries `is_repeat_buyer`, `order_id`,
# `seller_id` and `customer_unique_id`, all of which are used further down).
df = pd.read_parquet("data/03_primary/mega_id_labels.parquet")

# Cleaned intermediate tables, read in the order of the names on the left.
# NOTE(review): `products` is not referenced in the cells visible here.
customers, sellers, orders, items, geolocation, products = map(
    pd.read_parquet,
    (
        "data/02_intermediate/clean_customers.parquet",
        "data/02_intermediate/clean_sellers.parquet",
        "data/02_intermediate/clean_orders.parquet",
        "data/02_intermediate/clean_items.parquet",
        "data/02_intermediate/clean_geolocation.parquet",
        "data/02_intermediate/clean_products.parquet",
    ),
)

# NOTE(review): `cs` is only previewed below and not used by later visible
# cells — confirm whether this feature file is an input or a previous output.
cs = pd.read_parquet("data/04_feature/distance_seller_stats.parquet")
cs.head()

In [None]:
# Print column names, dtypes and non-null counts of the base table.
df.info()

In [None]:
# Number of distinct customers in each `is_repeat_buyer` group.
df.groupby("is_repeat_buyer")[["customer_unique_id"]].nunique()

In [None]:
# Attach coordinates to every customer row by matching zip-code prefixes.
# NOTE(review): if `geolocation` holds more than one row per prefix, this left
# join repeats the customer once per matching row — confirm the cleaned
# geolocation table is unique on `geolocation_zip_code_prefix`.
customers_loc = (
    customers
    .merge(
        geolocation,
        how="left",
        left_on="customer_zip_code_prefix",
        right_on="geolocation_zip_code_prefix",
    )
    .rename(columns={"geolocation_lat": "customer_lat", "geolocation_lng": "customer_lng"})
    .loc[:, ["customer_unique_id", "customer_id", "customer_lat", "customer_lng"]]
)

customers_loc.head()

In [None]:
# Attach coordinates to every seller row by matching zip-code prefixes.
# NOTE(review): if `geolocation` holds more than one row per prefix, this left
# join repeats the seller once per matching row — confirm the cleaned
# geolocation table is unique on `geolocation_zip_code_prefix`.
sellers_loc = (
    sellers
    .merge(
        geolocation,
        how="left",
        left_on="seller_zip_code_prefix",
        right_on="geolocation_zip_code_prefix",
    )
    .rename(columns={"geolocation_lat": "seller_lat", "geolocation_lng": "seller_lng"})
    .loc[:, ["seller_id", "seller_lat", "seller_lng"]]
)

sellers_loc.head()

In [None]:
# Print dtypes and non-null counts for both location tables (the coordinate
# columns show how many rows failed to match a geolocation entry), with an
# empty line between the two reports.
customers_loc.info()
print('')
sellers_loc.info()

In [None]:
# Keep only customers for which both latitude and longitude are present.
customers_loc = customers_loc.loc[
    lambda t: t[["customer_lat", "customer_lng"]].notna().all(axis=1)
]

# Same filter for sellers.
sellers_loc = sellers_loc.loc[
    lambda t: t[["seller_lat", "seller_lng"]].notna().all(axis=1)
]

# Re-check the non-null counts after filtering.
customers_loc.info()
print('')
sellers_loc.info()

In [None]:
# One row per order item with the ids of the order, its customer and its
# seller. `items` is the left side, so items without a matching order keep a
# missing `customer_id`.
order_details = (
    items
    .merge(orders, how="left", on="order_id")
    .loc[:, ["order_id", "customer_id", "seller_id"]]
)

order_details.info()

In [None]:
# Discard item rows that did not pick up a customer id from `orders`.
order_details = order_details.loc[order_details["customer_id"].notna()]
order_details.info()

In [None]:
# Rebuild `order_details` from scratch as a single left-join chain:
#   1. items  -> orders         (adds customer_id)
#   2.        -> customers_loc  (adds customer coordinates)
#   3.        -> sellers_loc    (adds seller coordinates)
# and keep only the id and coordinate columns needed for the distance.
order_details = (
    items
    .merge(orders, how="left", on="order_id")
    .merge(customers_loc, how="left", on="customer_id")
    .merge(sellers_loc, how="left", on="seller_id")
    .loc[
        :,
        [
            "order_id", "customer_id", "seller_id",
            "customer_lat", "customer_lng",
            "seller_lat", "seller_lng",
        ],
    ]
)

order_details.info()


In [None]:
# A distance needs all four coordinates; drop rows where any one is missing.
order_details = order_details.loc[
    lambda t: t[["customer_lat", "customer_lng", "seller_lat", "seller_lng"]]
    .notna()
    .all(axis=1)
]
order_details.info()

In [None]:
def compute_distance(row):
    """Return the haversine distance, in kilometres, for one row.

    Parameters
    ----------
    row
        Any object indexable by the keys ``customer_lat``, ``customer_lng``,
        ``seller_lat`` and ``seller_lng`` (e.g. a DataFrame row).

    Returns
    -------
    The value returned by ``haversine`` for the (lat, lng) pair of the
    customer and the (lat, lng) pair of the seller, with kilometres as unit.
    """
    customer_point = (row["customer_lat"], row["customer_lng"])
    seller_point = (row["seller_lat"], row["seller_lng"])
    return haversine(customer_point, seller_point, unit=Unit.KILOMETERS)

# Compute the customer-seller distance row by row, then keep only the ids and
# the new `distance_km` column (the raw coordinates are no longer needed).
order_details = order_details.assign(
    distance_km=order_details.apply(compute_distance, axis=1)
)[["order_id", "customer_id", "seller_id", "distance_km"]]
order_details.head()

In [None]:
# Step 7: collapse to a single row per order.
# `drop_duplicates` keeps the first row it encounters for each `order_id`, so
# when an order has several rows the retained distance is the one of that
# first row.
order_distances = (
    order_details
    .loc[:, ["order_id", "distance_km"]]
    .drop_duplicates(subset="order_id")
)
order_distances.info()

In [None]:
# Step 8: attach the customer-seller distance to the base dataset.
#
# The join key is (order_id, seller_id), not order_id alone. `df` carries a
# `seller_id` per row, and an order may contain items from more than one
# seller; the per-order `order_distances` table keeps only the first row of
# each order, so joining on `order_id` would give every row of such an order
# the distance of whichever seller happened to come first.
pair_distances = order_details.drop_duplicates(subset=["order_id", "seller_id"])[
    ["order_id", "seller_id", "distance_km"]
]

df = pd.merge(
    # Dropping a pre-existing `distance_km` keeps this cell safe to re-run:
    # otherwise a second run would produce `distance_km_x` / `distance_km_y`.
    df.drop(columns=["distance_km"], errors="ignore"),
    pair_distances,
    on=["order_id", "seller_id"],
    how="left",
)

# Rows without a distance (no coordinates for the customer or the seller) are
# removed here, so the result does NOT contain every row of the base dataset.
df = df.dropna(subset=["distance_km"])
df.info()


In [None]:
# Preview the base table now that `distance_km` has been attached.
df.head()

In [None]:
# Step 1: per seller, the number of distinct customers among the rows flagged
# as repeat buyers. Sellers with no such rows are absent from this table.
repeat_buyers = (
    df.loc[df["is_repeat_buyer"].eq(True)]
    .groupby("seller_id")
    .agg(num_repeat_buyers=("customer_unique_id", "nunique"))
    .reset_index()
)


In [None]:
# Step 2: per seller, the number of distinct customers over all rows.
total_buyers = (
    df.groupby("seller_id")
    .agg(num_unique_buyers=("customer_unique_id", "nunique"))
    .reset_index()
)


In [None]:
# Step 3: combine both counts and derive the share of repeat buyers per seller.
# The left join keeps every seller of `total_buyers`; sellers missing from
# `repeat_buyers` get a count of 0 and therefore a rate of 0.
# NOTE(review): this rate is derived from `is_repeat_buyer`; if that column is
# later used as a prediction target, the feature leaks the label — confirm.
seller_repeat_stats = (
    total_buyers
    .merge(repeat_buyers, how="left", on="seller_id")
    .fillna({"num_repeat_buyers": 0})
    .assign(
        seller_repeat_buyer_rate=lambda s: s["num_repeat_buyers"] / s["num_unique_buyers"]
    )
)

In [None]:
# Preview the per-seller buyer counts and repeat-buyer rate.
seller_repeat_stats.head(10)

In [None]:
# Step 4: bring the per-seller rate back onto every row of the base table.
# Only the key and the rate are taken from the stats table so the raw buyer
# counts do not end up in `df`.
df = df.merge(
    seller_repeat_stats.loc[:, ["seller_id", "seller_repeat_buyer_rate"]],
    how="left",
    on="seller_id",
)
df.head(10)

In [None]:
# The five most frequent values of `customer_city`, as a plain list.
top_cities = list(customers["customer_city"].value_counts().nlargest(5).index)

# 1 when the customer's city is one of those five, 0 otherwise.
# This adds the column to `customers` in place.
is_top_city = customers["customer_city"].isin(top_cities)
customers["high_density_customer_area"] = is_top_city.astype(int)

customers.head(10)