In [1]:
import pandas as pd
from requests import get
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
customers = pd.read_csv("data/customers_dataset.csv")
sellers = pd.read_csv("data/sellers_dataset.csv")
order_items = pd.read_csv("data/order_items_dataset.csv")
order_payments = pd.read_csv("data/order_payments_dataset.csv")
order_reviews = pd.read_csv("data/order_reviews_dataset.csv")
orders = pd.read_csv("data/orders_dataset.csv")
product_translation = pd.read_csv("data/product_category_name_translation.csv")
products = pd.read_csv("data/products_dataset.csv")
geolocations = pd.read_csv("data/geolocation_dataset.csv")

In [None]:
state = pd.read_html(get("https://brazil-help.com/brazilian_states.htm").content)[2]
state.columns = state.iloc[1]
state = state.iloc[2:]
state = state[["Common Two Letter Abbreviation", "State"]]
state.sample(5, random_state=42)

1,Common Two Letter Abbreviation,State
10,GO,Goiás
15,PA,Pará
11,MA,Maranhão
23,RO,Rondônia
2,AC,Acre


In [None]:
products.replace(dict(product_translation.values), inplace=True)
products.sample(5, random_state=42)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
24880,f819f0c84a64f02d3a5606ca95edd272,watches_gifts,59.0,452.0,1.0,710.0,19.0,13.0,14.0
6366,b9de40e43fccb6ba53b7eadbd5c49077,housewares,41.0,1188.0,3.0,700.0,70.0,10.0,15.0
4989,26afe4ed5890d941fcba14205863eec7,furniture_decor,47.0,1092.0,1.0,3850.0,30.0,30.0,30.0
8135,986870b9985d95ef6205bae1953a61a5,furniture_decor,50.0,646.0,1.0,300.0,16.0,30.0,20.0
19482,28ce57ecf4afba85e6020ec0209cada9,cool_stuff,31.0,248.0,2.0,5650.0,73.0,73.0,20.0


In [5]:
datetime_columns = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]
for i in datetime_columns:
    orders[i] = pd.to_datetime(orders[i])

orders.dtypes

order_id                                 object
customer_id                              object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
dtype: object

In [6]:
customers.replace(dict(state.values), inplace=True)
sellers.replace(dict(state.values), inplace=True)
geolocations.replace(dict(state.values), inplace=True)

sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800,venancio aires,Rio Grande do Sul
1309,6e1862e15f33d9994bc25922a85e1efc,13505,rio claro,São Paulo
1606,e628d4a53c109f09ca88098338b3a3f5,30170,belo horizonte,Minas Gerais
2626,0249d282d911d23cb8b869ab49c99f53,5676,sao paulo,São Paulo
2440,bdae679a9b282249bc23b9b69dae9a99,72210,brasilia,Distrito Federal


In [7]:
for df, file in (
    [customers, "customers"],
    [sellers, "sellers"],
    [order_items, "order_items"],
    [order_payments, "order_payments"],
    [order_reviews, "order_reviews"],
    [orders, "orders"],
    [product_translation, "product_translation"],
    [products, "products"],
    [geolocations, "geolocations"],
):
    print(
        f"""
file : {file}
{df.isna().sum()}
{'*'*100}
        """
    )


file : customers
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
****************************************************************************************************
        

file : sellers
seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64
****************************************************************************************************
        

file : order_items
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64
****************************************************************************************************
        

file : order_payments
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value

In [8]:
order_items = order_items.interpolate("linear")
order_payments = order_payments.interpolate("linear")
order_reviews = order_reviews.interpolate("linear")
orders = orders.interpolate("linear")
products = products.interpolate("linear")
geolocations = geolocations.interpolate("linear")

  order_items = order_items.interpolate("linear")
  order_payments = order_payments.interpolate("linear")
  order_reviews = order_reviews.interpolate("linear")
  orders = orders.interpolate("linear")
  products = products.interpolate("linear")
  geolocations = geolocations.interpolate("linear")


In [9]:
for df, file in (
    [customers, "customers"],
    [sellers, "sellers"],
    [order_items, "order_items"],
    [order_payments, "order_payments"],
    [order_reviews, "order_reviews"],
    [orders, "orders"],
    [product_translation, "product_translation"],
    [products, "products"],
    [geolocations, "geolocations"],
):
    print(
        f"""
file : {file}
{df.isna().sum()}
{'*'*100}
        """
    )


file : customers
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
****************************************************************************************************
        

file : sellers
seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64
****************************************************************************************************
        

file : order_items
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64
****************************************************************************************************
        

file : order_payments
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value

In [10]:
def getCoordinate(zip_code: int, geolocation_data: pd.DataFrame):
    result = geolocation_data[geolocation_data.geolocation_zip_code_prefix == zip_code][
        ["geolocation_lat", "geolocation_lng"]
    ]
    if len(result) == 0:
        closest_zip_code = (
            geolocation_data.geolocation_zip_code_prefix - 82040
        ).abs()
        closest_zip_code_index = closest_zip_code.idxmin()
        closest_zip_code = geolocation_data.iloc[[closest_zip_code_index]].geolocation_zip_code_prefix.values[0]
        result = (
            geolocation_data[geolocation_data.geolocation_zip_code_prefix == closest_zip_code][
                ["geolocation_lat", "geolocation_lng"]
            ]
            .sample(frac=0.5)
            .mean()
            .to_frame()
            .T
        )
    else:
        result = result.sample(1)
    return result.iloc[0]


sellers[["seller_geolocation_lat", "seller_geolocation_lng"]] = (
    sellers.seller_zip_code_prefix.apply(lambda x: getCoordinate(x, geolocations))
)
sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_geolocation_lat,seller_geolocation_lng
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800,venancio aires,Rio Grande do Sul,-29.612031,-52.195957
1309,6e1862e15f33d9994bc25922a85e1efc,13505,rio claro,São Paulo,-22.382049,-47.574681
1606,e628d4a53c109f09ca88098338b3a3f5,30170,belo horizonte,Minas Gerais,-19.930587,-43.940176
2626,0249d282d911d23cb8b869ab49c99f53,5676,sao paulo,São Paulo,-23.599065,-46.697073
2440,bdae679a9b282249bc23b9b69dae9a99,72210,brasilia,Distrito Federal,-15.804566,-48.107983


In [11]:
customers[["customer_geolocation_lat", "customer_geolocation_lng"]] = (
    customers.customer_zip_code_prefix.apply(lambda x: getCoordinate(x, geolocations))
)
customers.sample(5, random_state=42)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_geolocation_lat,customer_geolocation_lng
52263,c7432c6d237ffd6aa36a007b4237ec38,9a897ea48bf988012c00b802f1104a92,2971,sao paulo,São Paulo,-23.48481,-46.710731
46645,7f399d641e2e2064470145178c9e8778,90436a67885a57f147fb79e6d0e4bc1c,38610,unai,Minas Gerais,-16.361967,-46.892062
37546,ba5642b730704dc0f74b7cf715b41ed5,4d8056f71519ae1069e6747c63c676f7,88820,icara,Santa Catarina,-28.682176,-49.326034
94756,0f346a2cc84ebb2d52f0759d0acfd030,6117c9ef3251089693a6abb90c195eba,25250,duque de caxias,Rio de Janeiro,-22.603181,-43.301169
14771,d393b9491df482cf448e60aa9955b7f2,5caf3a2a5d1ef808e3dd182e79baa392,36955,mutum,Minas Gerais,-19.82312,-41.44235


In [None]:
detailed_orders_data = order_items.merge(orders, on='order_id') \
                    .merge(order_payments, on='order_id') \
                    .merge(products, on='product_id') \
                    .merge(customers, on='customer_id') \
                    .merge(sellers, on='seller_id')

detailed_orders_data.sample(5, random_state=42)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,...,customer_zip_code_prefix,customer_city,customer_state,customer_geolocation_lat,customer_geolocation_lng,seller_zip_code_prefix,seller_city,seller_state,seller_geolocation_lat,seller_geolocation_lng
55284,783c795ca45c01ac1ebe459d8c26751d,3,73258e01fadbaf7648577beb152e1320,87142160b41353c4e5fca2360caf6f92,2017-08-18 06:43:31,30.0,17.92,17d3670bca39b5c4cf9b10007975cbe3,delivered,2017-08-10 04:58:12,...,13503,rio claro,São Paulo,-22.422145,-47.571172,90230,porto alegre,Rio Grande do Sul,-30.003408,-51.200978
77991,aa0a58e418df41f6352802feb9999599,1,4ac50dbde931c0a5ed8c33d3dc047351,e5a38146df062edaf55c38afa99e42dc,2017-05-03 22:30:17,61.8,10.61,217cca4dac52c6eacf31baa6b6ce05eb,delivered,2017-04-26 22:18:44,...,4545,sao paulo,São Paulo,-23.595956,-46.675071,1233,sao paulo,São Paulo,-23.532868,-46.660336
97315,d34b91927899aaba97d258d038884053,1,b0e9f0cc1a936b07b4fc11b3a35ad519,04308b1ee57b6625f47df1d56f00eedf,2018-03-16 18:09:37,529.9,16.97,cf50af49e1f2f2757e157b3ff3506bcb,delivered,2018-03-12 17:38:17,...,88064,florianopolis,Santa Catarina,-27.706829,-48.52681,88215,bombinhas,Santa Catarina,-27.199593,-48.495753
20419,2ccc4454c10457bb1cd9fb7465167f2f,1,183c95ad186f48c320bbac4643829d3f,cab85505710c7cb9b720bceb52b01cee,2018-08-21 15:04:07,49.9,7.61,75913254f909c2b5c2a0bd9713c483e4,delivered,2018-08-16 14:43:12,...,4272,sao paulo,São Paulo,-23.60308,-46.614821,2252,sao paulo,São Paulo,-23.47784,-46.58824
80844,b02f16483f20ca4c7d1d5b147eca4c79,1,5a6e53c3b4e8684b13388d6aa4afdf12,7299e27ed73d2ad986de7f7c77d919fa,2017-03-30 11:45:16,14.99,14.52,8fbb6b597798c39667b1c353bac76098,delivered,2017-03-24 11:32:35,...,14810,araraquara,São Paulo,-21.779907,-48.159169,38440,araguari,Minas Gerais,-18.645851,-48.19019


In [30]:
from math import radians, sin, cos, sqrt, atan2


def haversine(data:pd.Series):
    R = 6371.0

    lat1, lon1, lat2, lon2 = map(radians, data.values)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return round(distance, 2)

detailed_orders_data["seller_customer_distance_km"] = detailed_orders_data[
    [
        "customer_geolocation_lat",
        "customer_geolocation_lng",
        "seller_geolocation_lat",
        "seller_geolocation_lng",
    ]
].agg(haversine, axis=1)

detailed_orders_data.sample(5, random_state=42)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,...,customer_city,customer_state,customer_geolocation_lat,customer_geolocation_lng,seller_zip_code_prefix,seller_city,seller_state,seller_geolocation_lat,seller_geolocation_lng,seller_customer_distance_km
49672,6bd41909d90c9f91e22ad236ea395623,1,c5e9b73e5f4c644885862ba3e1137b57,46dc3b2cc0980fb8ec44634e21d2718e,2017-10-09 16:49:50,199.99,16.16,f45ac6dfd34f67fc16e581fe9ae0028e,delivered,2017-10-03 16:41:47,...,brasilia,Distrito Federal,-15.836123,-48.040185,22240,rio de janeiro,Rio de Janeiro,-22.934772,-43.187988,938.94
23948,344c86e9cc82d497519e09dc351d28ff,1,b5e13c9a353102f79c6206ff5cb61a50,a49928bcdf77c55c6d6e05e09a9b4ca5,2017-11-17 11:55:30,89.9,11.83,d5b301a2087b017c4a9ca9730a2b18cf,delivered,2017-11-12 11:43:17,...,sao paulo,São Paulo,-23.541889,-46.454047,3017,sao paulo,São Paulo,-23.539434,-46.612197,16.12
105553,e55e4860fd8e46e38f351f994f9e06cb,1,3a7c9b0413d7b9cc7f4a18318fc5afb3,7a67c85e85bb2ce8582c35f2203ad736,2017-03-16 01:19:21,129.99,9.28,676ee955ce60a3b6eaa8c816cfb7fb04,delivered,2017-03-12 01:19:21,...,sao paulo,São Paulo,-23.491627,-46.380396,3426,sao paulo,São Paulo,-23.554792,-46.531428,16.92
68942,96bac00ebfd18ba64dabd7cb27471282,1,5411e9269501a870cabf632f05655131,3d871de0142ce09b7081e2b9d1733cb1,2018-01-19 10:32:21,129.0,38.45,2f7319232811fe9879212e93efb30147,delivered,2018-01-09 19:34:58,...,joao pessoa,Paraíba,-7.143258,-34.813604,13232,campo limpo paulista,São Paulo,-23.210456,-46.770739,2196.56
80847,b02fefc3800702d58c988da39c132902,1,144266e1585320d057797a369596bb8b,8cc6a0e5738e61a87b03c78b2ba9db4b,2017-03-28 10:44:34,739.9,21.92,374d1a370cc4fc67244345f046bbdff6,delivered,2017-03-22 10:44:34,...,cotia,São Paulo,-23.598478,-46.845742,37795,andradas,Minas Gerais,-22.068318,-46.574375,172.4


In [33]:
detailed_orders_data.columns

Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value', 'customer_id',
       'order_status', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'customer_geolocation_lat', 'customer_geolocation_lng',
       'seller_zip_code_prefix', 'seller_city', 'seller_state',
       'seller_geolocation_lat', 'seller_geolocation_lng',
       'seller_customer_distance_km'],
      dtype='object')