In [21]:
import pandas as pd
from requests import get
import numpy as np
from sklearn.model_selection import train_test_split

In [29]:
customers = pd.read_csv("data/customers_dataset.csv")
sellers = pd.read_csv("data/sellers_dataset.csv")
order_items = pd.read_csv("data/order_items_dataset.csv")
order_payments = pd.read_csv("data/order_payments_dataset.csv")
order_reviews = pd.read_csv("data/order_reviews_dataset.csv")
orders = pd.read_csv("data/orders_dataset.csv")
product_translation = pd.read_csv("data/product_category_name_translation.csv")
products = pd.read_csv("data/products_dataset.csv")
geolocations = pd.read_csv("data/geolocation_dataset.csv")

In [8]:
state = pd.read_html(get("https://brazil-help.com/brazilian_states.htm").content)[2]
state.columns = state.iloc[1]
state = state.iloc[2:]
state = state[["Common Two Letter Abbreviation", "State"]]
state.sample(5, random_state=42)

1,Common Two Letter Abbreviation,State
10,GO,Goiás
15,PA,Pará
11,MA,Maranhão
23,RO,Rondônia
2,AC,Acre


In [38]:
products = pd.merge(
    products, product_translation, how="inner", on="product_category_name"
)
products.drop(columns="product_category_name", inplace=True)
products.rename(
    columns={"product_category_name_english": "product_category_name"}, inplace=True
)
products.sample(5, random_state=42)

Unnamed: 0,product_id,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name
408,068ceaa8a3d9d385cbf53d5335f89f80,42.0,395.0,1.0,1200.0,27.0,18.0,23.0,housewares
32182,e6d39d7cdf7c19c0359c348a89a4a191,38.0,367.0,1.0,100.0,17.0,3.0,12.0,fashion_bags_accessories
12865,ea3da91e6ba865972f9344cb9265296b,19.0,262.0,1.0,10500.0,36.0,47.0,36.0,furniture_decor
15287,0b0f32c8bd5426698d01d72f155b75f5,36.0,1099.0,1.0,2400.0,45.0,15.0,40.0,baby
26537,d3e08a8d7b87b86014638167087a8eaa,60.0,668.0,5.0,300.0,17.0,4.0,12.0,telephony


In [None]:
for df, file in (
    [customers, "customers"],
    [sellers, "sellers"],
    [order_items, "order_items"],
    [order_payments, "order_payments"],
    [order_reviews, "order_reviews"],
    [orders, "orders"],
    [product_translation, "product_translation"],
    [products, "products"],
    [geolocations, "geolocations"],
):
    print(
        f"""
file : {file}
{customers.isna().sum()}
{'*'*100}
          """
    )


file : customers
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
****************************************************************************************************
          

file : sellers
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
****************************************************************************************************
          

file : order_items
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
****************************************************************************************************
          

file : order_payments
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer

In [67]:
customers.replace(dict(state.values), inplace=True)
sellers.replace(dict(state.values), inplace=True)
geolocations.replace(dict(state.values), inplace=True)

sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800,venancio aires,Rio Grande do Sul
1309,6e1862e15f33d9994bc25922a85e1efc,13505,rio claro,São Paulo
1606,e628d4a53c109f09ca88098338b3a3f5,30170,belo horizonte,Minas Gerais
2626,0249d282d911d23cb8b869ab49c99f53,5676,sao paulo,São Paulo
2440,bdae679a9b282249bc23b9b69dae9a99,72210,brasilia,Distrito Federal


In [219]:
def getCoordinate(zip_code: int, geolocation_data: pd.DataFrame):
    result = geolocation_data[geolocation_data.geolocation_zip_code_prefix == zip_code][
        ["geolocation_lat", "geolocation_lng"]
    ]
    if len(result) == 0:
        closest_zip_code = (
            geolocation_data.geolocation_zip_code_prefix - zip_code
        ).abs()
        closest_zip_code_index = closest_zip_code.idxmin()
        result = (
            geolocation_data.iloc[[closest_zip_code_index]][
                ["geolocation_lat", "geolocation_lng"]
            ]
            .sample(frac=0.5)
            .mean()
            .to_frame()
            .T
        )
    else:
        result = result.sample(1)
    return result.iloc[0]


sellers[["seller_geolocation_lat", "seller_geolocation_lng"]] = (
    sellers.seller_zip_code_prefix.apply(lambda x: getCoordinate(x, geolocations))
)
sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_geolocation_lat,seller_geolocation_lng
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800,venancio aires,Rio Grande do Sul,-29.620771,-52.205269
1309,6e1862e15f33d9994bc25922a85e1efc,13505,rio claro,São Paulo,-22.380127,-47.577583
1606,e628d4a53c109f09ca88098338b3a3f5,30170,belo horizonte,Minas Gerais,-19.933596,-43.951448
2626,0249d282d911d23cb8b869ab49c99f53,5676,sao paulo,São Paulo,-23.59432,-46.696418
2440,bdae679a9b282249bc23b9b69dae9a99,72210,brasilia,Distrito Federal,-15.802253,-48.115932


In [220]:
customers[["customer_geolocation_lat", "customer_geolocation_lng"]] = (
    customers.customer_zip_code_prefix.apply(lambda x: getCoordinate(x, geolocations))
)
customers.sample(5, random_state=42)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_geolocation_lat,customer_geolocation_lng
52263,c7432c6d237ffd6aa36a007b4237ec38,9a897ea48bf988012c00b802f1104a92,2971,sao paulo,São Paulo,-23.486544,-46.711094
46645,7f399d641e2e2064470145178c9e8778,90436a67885a57f147fb79e6d0e4bc1c,38610,unai,Minas Gerais,-16.358528,-46.897901
37546,ba5642b730704dc0f74b7cf715b41ed5,4d8056f71519ae1069e6747c63c676f7,88820,icara,Santa Catarina,-28.71412,-49.304704
94756,0f346a2cc84ebb2d52f0759d0acfd030,6117c9ef3251089693a6abb90c195eba,25250,duque de caxias,Rio de Janeiro,-22.601489,-43.300188
14771,d393b9491df482cf448e60aa9955b7f2,5caf3a2a5d1ef808e3dd182e79baa392,36955,mutum,Minas Gerais,-19.819012,-41.436745


In [221]:
import os

save_dir = "data/updated"
os.makedirs(save_dir, exist_ok=True)

sellers.to_csv(os.path.join(save_dir, "sellers_updated_dataset.csv"), index=False)
customers.to_csv(os.path.join(save_dir, "customers_updated_dataset.csv"), index=False)