In [101]:
!pip install pandas requests numpy scikit-learn



In [71]:
import pandas as pd
from requests import get
import numpy as np
import re
import unicodedata
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [2]:
customers = pd.read_csv("data/customers_dataset.csv")
sellers = pd.read_csv("data/sellers_dataset.csv")
order_items = pd.read_csv("data/order_items_dataset.csv")
order_payments = pd.read_csv("data/order_payments_dataset.csv")
order_reviews = pd.read_csv("data/order_reviews_dataset.csv")
orders = pd.read_csv("data/orders_dataset.csv")
product_translation = pd.read_csv("data/product_category_name_translation.csv")
products = pd.read_csv("data/products_dataset.csv")
geolocations = pd.read_csv("data/geolocation_dataset.csv")

In [3]:
def save_encoder_data(label_encoder:LabelEncoder, save_path:str):
    data = dict(enumerate(label_encoder.classes_))
    dir = os.path.split(save_path)[0]
    if not os.path.isdir(dir):
        os.makedirs(dir, exist_ok=True)
    np.save(save_path, data)

In [4]:
def clean_text(input:str, only_normalize:bool):
    input = unicodedata.normalize("NFKD", input)
    input = "".join([c for c in input if not unicodedata.combining(c)])

    if not only_normalize:
        input = re.sub(r"^[0-9\*\.]+", "", input)
        input = re.sub(r"[^\w\s]", "", input)
        input = re.sub(r"\s+", " ", input)
        input = input.replace('-', '').strip().lower()

    return input

In [5]:
def getCoordinate(zip_code: int, geolocation_data: pd.DataFrame):
    result = geolocation_data[geolocation_data.geolocation_zip_code_prefix == zip_code][
        ["geolocation_lat", "geolocation_lng"]
    ]
    if len(result) == 0:
        closest_zip_code = (geolocation_data.geolocation_zip_code_prefix - 82040).abs()
        closest_zip_code_index = closest_zip_code.idxmin()
        closest_zip_code = geolocation_data.iloc[
            [closest_zip_code_index]
        ].geolocation_zip_code_prefix.values[0]
        result = (
            geolocation_data[
                geolocation_data.geolocation_zip_code_prefix == closest_zip_code
            ][["geolocation_lat", "geolocation_lng"]]
            .sample(frac=0.5)
            .mean()
            .to_frame()
            .T
        )
    else:
        result = result.sample(1)
    return result.iloc[0]

## State's Abbreviation

In [6]:
state = pd.read_html(get("https://brazil-help.com/brazilian_states.htm").content)[2]
state.columns = state.iloc[1]
state = state.iloc[2:]
state = state[["Common Two Letter Abbreviation", "State"]]

state.State = state.State.apply(lambda x: clean_text(x, True))
state.sample(5, random_state=42)

1,Common Two Letter Abbreviation,State
10,GO,Goias
15,PA,Para
11,MA,Maranhao
23,RO,Rondonia
2,AC,Acre


In [7]:
customers.replace(dict(state.values), inplace=True)
sellers.replace(dict(state.values), inplace=True)
geolocations.replace(dict(state.values), inplace=True)

geolocations.sample(5, random_state=42)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
162456,6412,-23.497836,-46.882982,barueri,Sao Paulo
65543,3707,-23.507041,-46.543324,sao paulo,Sao Paulo
129566,5406,-23.555752,-46.674463,são paulo,Sao Paulo
999061,99709,-27.661007,-52.285391,erechim,Rio Grande do Sul
564558,31310,-19.869566,-43.986149,belo horizonte,Minas Gerais


## Geolocation

In [8]:
geolocations.sample(5, random_state=42)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
162456,6412,-23.497836,-46.882982,barueri,Sao Paulo
65543,3707,-23.507041,-46.543324,sao paulo,Sao Paulo
129566,5406,-23.555752,-46.674463,são paulo,Sao Paulo
999061,99709,-27.661007,-52.285391,erechim,Rio Grande do Sul
564558,31310,-19.869566,-43.986149,belo horizonte,Minas Gerais


In [9]:
geolocations.dtypes

geolocation_zip_code_prefix      int64
geolocation_lat                float64
geolocation_lng                float64
geolocation_city                object
geolocation_state               object
dtype: object

In [10]:
geolocations.isna().sum()

geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64

In [11]:
geolocations.geolocation_city = geolocations.geolocation_city.apply(lambda x: clean_text(x, False))
geolocations.sample(5, random_state=42)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
162456,6412,-23.497836,-46.882982,barueri,Sao Paulo
65543,3707,-23.507041,-46.543324,sao paulo,Sao Paulo
129566,5406,-23.555752,-46.674463,sao paulo,Sao Paulo
999061,99709,-27.661007,-52.285391,erechim,Rio Grande do Sul
564558,31310,-19.869566,-43.986149,belo horizonte,Minas Gerais


In [12]:
city_encoder = LabelEncoder()
geolocations.geolocation_city = city_encoder.fit_transform(geolocations.geolocation_city)

save_encoder_data(city_encoder, "data/fix/city_encoder.npy")
dict(enumerate(city_encoder.classes_))

{0: 'abadia de goias',
 1: 'abadia dos dourados',
 2: 'abadiania',
 3: 'abaete',
 4: 'abaetetuba',
 5: 'abaiara',
 6: 'abaira',
 7: 'abare',
 8: 'abatia',
 9: 'abdon batista',
 10: 'abel figueiredo',
 11: 'abelardo luz',
 12: 'abrantes',
 13: 'abre campo',
 14: 'abreu e lima',
 15: 'abreulandia',
 16: 'abreus',
 17: 'acaiaca',
 18: 'acailandia',
 19: 'acajutiba',
 20: 'acara',
 21: 'acarape',
 22: 'acarau',
 23: 'acari',
 24: 'acaua',
 25: 'acegua',
 26: 'acioli',
 27: 'acopiara',
 28: 'acorizal',
 29: 'acrelandia',
 30: 'acreuna',
 31: 'acu',
 32: 'acu da torre',
 33: 'acucena',
 34: 'acupe',
 35: 'adamantina',
 36: 'adao colares',
 37: 'adelandia',
 38: 'adhemar de barros',
 39: 'adolfo',
 40: 'adrianopolis',
 41: 'adustina',
 42: 'afogados da ingazeira',
 43: 'afonso arinos',
 44: 'afonso bezerra',
 45: 'afonso claudio',
 46: 'afonso cunha',
 47: 'afranio',
 48: 'afua',
 49: 'agisse',
 50: 'agrestina',
 51: 'agricolandia',
 52: 'agrolandia',
 53: 'agronomica',
 54: 'agua azul do nor

In [13]:
state_encoder = LabelEncoder()
geolocations.geolocation_state = state_encoder.fit_transform(geolocations.geolocation_state)

save_encoder_data(state_encoder, "data/fix/state_encoder.npy")
dict(enumerate(state_encoder.classes_))

{0: 'Acre',
 1: 'Alagoas',
 2: 'Amapa',
 3: 'Amazonas',
 4: 'Bahia',
 5: 'Ceara',
 6: 'Distrito Federal',
 7: 'Espirito Santo',
 8: 'Goias',
 9: 'Maranhao',
 10: 'MatoGrosso',
 11: 'MatoGrosso do Sul',
 12: 'Minas Gerais',
 13: 'Para',
 14: 'Paraiba',
 15: 'Parana',
 16: 'Pernambuco',
 17: 'Piaui',
 18: 'Rio Grande do Norte',
 19: 'Rio Grande do Sul',
 20: 'Rio de Janeiro',
 21: 'Rondonia',
 22: 'Roraima',
 23: 'Santa Catarina',
 24: 'Sao Paulo',
 25: 'Sergipe',
 26: 'Tocantins'}

## Product

In [14]:
products.sample(5, random_state=42)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
24880,f819f0c84a64f02d3a5606ca95edd272,relogios_presentes,59.0,452.0,1.0,710.0,19.0,13.0,14.0
6366,b9de40e43fccb6ba53b7eadbd5c49077,utilidades_domesticas,41.0,1188.0,3.0,700.0,70.0,10.0,15.0
4989,26afe4ed5890d941fcba14205863eec7,moveis_decoracao,47.0,1092.0,1.0,3850.0,30.0,30.0,30.0
8135,986870b9985d95ef6205bae1953a61a5,moveis_decoracao,50.0,646.0,1.0,300.0,16.0,30.0,20.0
19482,28ce57ecf4afba85e6020ec0209cada9,cool_stuff,31.0,248.0,2.0,5650.0,73.0,73.0,20.0


In [15]:
products.dtypes

product_id                     object
product_category_name          object
product_name_lenght           float64
product_description_lenght    float64
product_photos_qty            float64
product_weight_g              float64
product_length_cm             float64
product_height_cm             float64
product_width_cm              float64
dtype: object

In [16]:
products.product_category_name = products.product_category_name.replace(dict(product_translation.values))
products.sample(5, random_state=42)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
24880,f819f0c84a64f02d3a5606ca95edd272,watches_gifts,59.0,452.0,1.0,710.0,19.0,13.0,14.0
6366,b9de40e43fccb6ba53b7eadbd5c49077,housewares,41.0,1188.0,3.0,700.0,70.0,10.0,15.0
4989,26afe4ed5890d941fcba14205863eec7,furniture_decor,47.0,1092.0,1.0,3850.0,30.0,30.0,30.0
8135,986870b9985d95ef6205bae1953a61a5,furniture_decor,50.0,646.0,1.0,300.0,16.0,30.0,20.0
19482,28ce57ecf4afba85e6020ec0209cada9,cool_stuff,31.0,248.0,2.0,5650.0,73.0,73.0,20.0


In [17]:
products.isna().sum()

product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

In [18]:
product_category_name_encoder = LabelEncoder()
products.product_category_name = product_category_name_encoder.fit_transform(products.product_category_name)

save_encoder_data(product_category_name_encoder, "data/fix/product_category_name_encoder.npy")
dict(enumerate(product_category_name_encoder.classes_))

{0: 'agro_industry_and_commerce',
 1: 'air_conditioning',
 2: 'art',
 3: 'arts_and_craftmanship',
 4: 'audio',
 5: 'auto',
 6: 'baby',
 7: 'bed_bath_table',
 8: 'books_general_interest',
 9: 'books_imported',
 10: 'books_technical',
 11: 'cds_dvds_musicals',
 12: 'christmas_supplies',
 13: 'cine_photo',
 14: 'computers',
 15: 'computers_accessories',
 16: 'consoles_games',
 17: 'construction_tools_construction',
 18: 'construction_tools_lights',
 19: 'construction_tools_safety',
 20: 'cool_stuff',
 21: 'costruction_tools_garden',
 22: 'costruction_tools_tools',
 23: 'diapers_and_hygiene',
 24: 'drinks',
 25: 'dvds_blu_ray',
 26: 'electronics',
 27: 'fashio_female_clothing',
 28: 'fashion_bags_accessories',
 29: 'fashion_childrens_clothes',
 30: 'fashion_male_clothing',
 31: 'fashion_shoes',
 32: 'fashion_sport',
 33: 'fashion_underwear_beach',
 34: 'fixed_telephony',
 35: 'flowers',
 36: 'food',
 37: 'food_drink',
 38: 'furniture_bedroom',
 39: 'furniture_decor',
 40: 'furniture_living

In [19]:
products.product_category_name = products.product_category_name.replace({73: np.nan})
products = products.interpolate("linear")

products[
    [
        "product_category_name",
        "product_name_lenght",
        "product_description_lenght",
        "product_photos_qty",
    ]
] = products[
    [
        "product_category_name",
        "product_name_lenght",
        "product_description_lenght",
        "product_photos_qty",
    ]
].astype(
    int
)

products.sample(5, random_state=42)

  products = products.interpolate("linear")


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
24880,f819f0c84a64f02d3a5606ca95edd272,72,59,452,1,710.0,19.0,13.0,14.0
6366,b9de40e43fccb6ba53b7eadbd5c49077,49,41,1188,3,700.0,70.0,10.0,15.0
4989,26afe4ed5890d941fcba14205863eec7,39,47,1092,1,3850.0,30.0,30.0,30.0
8135,986870b9985d95ef6205bae1953a61a5,39,50,646,1,300.0,16.0,30.0,20.0
19482,28ce57ecf4afba85e6020ec0209cada9,20,31,248,2,5650.0,73.0,73.0,20.0


In [20]:
products.isna().sum()

product_id                    0
product_category_name         0
product_name_lenght           0
product_description_lenght    0
product_photos_qty            0
product_weight_g              0
product_length_cm             0
product_height_cm             0
product_width_cm              0
dtype: int64

## Orders

In [21]:
orders.sample(5, random_state=42)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
52263,b9a6c5f5df52c7226ac85aee7524c27f,f160aaf480efdfa7268f0fa535f73e76,delivered,2018-06-12 20:07:44,2018-06-12 20:44:26,2018-06-13 13:09:00,2018-06-19 12:44:08,2018-07-17 00:00:00
46645,261e71d2349c713eafa9f3df5972b95d,d6708bbbd2d419475869a84e41f620a1,delivered,2018-01-20 12:15:57,2018-01-20 12:37:13,2018-01-25 21:42:52,2018-01-30 11:32:35,2018-02-15 00:00:00
37546,67b50899f52995848c427e361e10dde3,1b353c00c71689afba44554e43cc5a76,delivered,2018-06-16 21:24:10,2018-06-16 21:36:59,2018-06-21 13:55:00,2018-06-27 13:17:27,2018-07-16 00:00:00
94756,32733fc014b67ef70fa6039dd8c6ba82,ad66699beab381d0747605f108cc02b4,delivered,2017-08-30 21:12:28,2017-08-31 02:50:24,2017-09-12 20:16:46,2017-09-25 17:53:23,2017-09-22 00:00:00
14771,39a70e9e9b729b11dee34ac12478597f,20c678bfce3c8252288f70dd2bee5e51,delivered,2017-08-10 21:26:25,2017-08-10 21:44:13,2017-08-11 19:11:16,2017-08-22 16:45:00,2017-09-12 00:00:00


In [22]:
orders.dtypes

order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object

In [23]:
datetime_columns = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]
for i in datetime_columns:
    orders[i] = pd.to_datetime(orders[i])

orders.dtypes

order_id                                 object
customer_id                              object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
dtype: object

In [24]:
orders.isna().sum()

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

In [25]:
order_status_encoder = LabelEncoder()
orders.order_status = order_status_encoder.fit_transform(orders.order_status)

save_encoder_data(order_status_encoder, "data/fix/order_status_encoder.npy")
dict(enumerate(order_status_encoder.classes_))

{0: 'approved',
 1: 'canceled',
 2: 'created',
 3: 'delivered',
 4: 'invoiced',
 5: 'processing',
 6: 'shipped',
 7: 'unavailable'}

In [26]:
orders.interpolate("linear", inplace=True)

orders.isna().sum()

  orders.interpolate("linear", inplace=True)


order_id                         0
customer_id                      0
order_status                     0
order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
dtype: int64

## Order Items

In [27]:
order_items.sample(5, random_state=42)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
107777,f4ee4273538924bda6212f5948e80fde,1,69455f41626a745aea9ee9164cb9eafd,7d13fca15225358621be4086e1eb0964,2018-06-19 09:18:15,180.0,20.45
2391,056349f85a73d794119c4286c95a52de,1,af35be35db4ad0dc288b571453337376,d20b021d3efdf267a402c402a48ea64b,2017-03-09 14:35:09,10.99,16.05
77829,b124967afcc82ef17ec41020fe2a9136,1,12e6d0f655986ceff00c74658dec97b1,3be634553519fb6536a03e1358e9fdc7,2018-06-06 17:18:09,49.99,8.88
99819,e257ae8610fb4fb68a1f459c3a4b1f51,1,a50acd33ba7a8da8e9db65094fa990a4,8581055ce74af1daba164fdbd55a40de,2017-05-18 08:02:19,117.3,14.43
41297,5e114d8e3840661abc3d9c4820f427b3,1,5cca3efb9521cc1d7099d610d4a12017,3d871de0142ce09b7081e2b9d1733cb1,2018-05-03 18:15:09,58.9,13.77


In [28]:
order_items.dtypes

order_id                object
order_item_id            int64
product_id              object
seller_id               object
shipping_limit_date     object
price                  float64
freight_value          float64
dtype: object

In [29]:
datetime_columns = [
    "shipping_limit_date"
]
for i in datetime_columns:
    order_items[i] = pd.to_datetime(order_items[i])

order_items.dtypes

order_id                       object
order_item_id                   int64
product_id                     object
seller_id                      object
shipping_limit_date    datetime64[ns]
price                         float64
freight_value                 float64
dtype: object

In [30]:
order_items.isna().sum()

order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

## Order Payments

In [31]:
order_payments.sample(5, random_state=42)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
21856,61c7b1cf28b58654fe411aaf87fe249d,1,debit_card,1,39.6
82913,a01f943f4a95572159f0603bd160d67a,1,credit_card,1,111.42
81375,267968a4442f2e55c2904cac31c26660,1,credit_card,1,37.37
9274,303c1a4cb953c69bf9ad12194dc3d44d,1,credit_card,1,47.96
77826,c5f5087eaff455f93653900411e34432,1,boleto,1,141.88


In [32]:
order_payments.dtypes

order_id                 object
payment_sequential        int64
payment_type             object
payment_installments      int64
payment_value           float64
dtype: object

In [33]:
order_payments.isna().sum()

order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

In [34]:
payment_type_encoder = LabelEncoder()
order_payments.payment_type = payment_type_encoder.fit_transform(order_payments.payment_type)

save_encoder_data(payment_type_encoder, "data/fix/payment_type_encoder.npy")
dict(enumerate(payment_type_encoder.classes_))

{0: 'boleto',
 1: 'credit_card',
 2: 'debit_card',
 3: 'not_defined',
 4: 'voucher'}

## Order Reviews

In [35]:
order_reviews.sample(5, random_state=42)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
90252,406e32984dd5273582105460a79571af,e00ed9d20c3479f9f0e9727ca9d60946,5,,,2017-12-22 00:00:00,2017-12-22 21:10:19
24436,3bca8d9922bed47eb96f23d121945290,3b5351f5f99b46339212291661a9d226,5,,Cumpriu o acordado!,2018-04-10 00:00:00,2018-04-13 01:57:05
11313,65895b807ac5dfe062c82400b3f210b8,f395e98fb5c1c6ce1306e80de2fe125b,4,,,2017-04-20 00:00:00,2017-04-21 11:31:32
75442,2a6faa65a6e893105c60b1018d40e14a,57899333b5e286632bd2599d3f7864ce,5,,,2018-04-06 00:00:00,2018-04-07 00:34:14
7217,a738aa683a09dc5979abc7d9c2cc8029,0cc76fbe09687fda664178e9fc6c404f,5,,,2018-01-07 00:00:00,2018-01-08 13:48:20


In [36]:
order_reviews.dtypes

review_id                  object
order_id                   object
review_score                int64
review_comment_title       object
review_comment_message     object
review_creation_date       object
review_answer_timestamp    object
dtype: object

In [37]:
datetime_columns = [
    "review_creation_date",
    "review_answer_timestamp",
]
for i in datetime_columns:
    order_reviews[i] = pd.to_datetime(order_reviews[i])

order_reviews.dtypes

review_id                          object
order_id                           object
review_score                        int64
review_comment_title               object
review_comment_message             object
review_creation_date       datetime64[ns]
review_answer_timestamp    datetime64[ns]
dtype: object

In [38]:
order_reviews.isna().sum()

review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

## Customers

In [39]:
customers.sample(5, random_state=42)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
52263,c7432c6d237ffd6aa36a007b4237ec38,9a897ea48bf988012c00b802f1104a92,2971,sao paulo,Sao Paulo
46645,7f399d641e2e2064470145178c9e8778,90436a67885a57f147fb79e6d0e4bc1c,38610,unai,Minas Gerais
37546,ba5642b730704dc0f74b7cf715b41ed5,4d8056f71519ae1069e6747c63c676f7,88820,icara,Santa Catarina
94756,0f346a2cc84ebb2d52f0759d0acfd030,6117c9ef3251089693a6abb90c195eba,25250,duque de caxias,Rio de Janeiro
14771,d393b9491df482cf448e60aa9955b7f2,5caf3a2a5d1ef808e3dd182e79baa392,36955,mutum,Minas Gerais


In [40]:
customers.dtypes

customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

In [41]:
customers.isna().sum()

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64

In [42]:
customers.drop(columns=["customer_city","customer_state"], inplace=True)
customers.sample(5, random_state=42)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix
52263,c7432c6d237ffd6aa36a007b4237ec38,9a897ea48bf988012c00b802f1104a92,2971
46645,7f399d641e2e2064470145178c9e8778,90436a67885a57f147fb79e6d0e4bc1c,38610
37546,ba5642b730704dc0f74b7cf715b41ed5,4d8056f71519ae1069e6747c63c676f7,88820
94756,0f346a2cc84ebb2d52f0759d0acfd030,6117c9ef3251089693a6abb90c195eba,25250
14771,d393b9491df482cf448e60aa9955b7f2,5caf3a2a5d1ef808e3dd182e79baa392,36955


In [43]:
customers[["customer_geolocation_lat", "customer_geolocation_lng"]] = (
    customers.customer_zip_code_prefix.apply(lambda x: getCoordinate(x, geolocations))
)
customers.sample(5, random_state=42)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_geolocation_lat,customer_geolocation_lng
52263,c7432c6d237ffd6aa36a007b4237ec38,9a897ea48bf988012c00b802f1104a92,2971,-23.481629,-46.712144
46645,7f399d641e2e2064470145178c9e8778,90436a67885a57f147fb79e6d0e4bc1c,38610,-16.368648,-46.898484
37546,ba5642b730704dc0f74b7cf715b41ed5,4d8056f71519ae1069e6747c63c676f7,88820,-28.699661,-49.304257
94756,0f346a2cc84ebb2d52f0759d0acfd030,6117c9ef3251089693a6abb90c195eba,25250,-22.607244,-43.314948
14771,d393b9491df482cf448e60aa9955b7f2,5caf3a2a5d1ef808e3dd182e79baa392,36955,-19.821216,-41.445674


In [44]:
customers.isna().sum()

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_geolocation_lat    0
customer_geolocation_lng    0
dtype: int64

# Seller

In [45]:
sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800,venancio aires,Rio Grande do Sul
1309,6e1862e15f33d9994bc25922a85e1efc,13505,rio claro,Sao Paulo
1606,e628d4a53c109f09ca88098338b3a3f5,30170,belo horizonte,Minas Gerais
2626,0249d282d911d23cb8b869ab49c99f53,5676,sao paulo,Sao Paulo
2440,bdae679a9b282249bc23b9b69dae9a99,72210,brasilia,Distrito Federal


In [46]:
sellers.dtypes

seller_id                 object
seller_zip_code_prefix     int64
seller_city               object
seller_state              object
dtype: object

In [47]:
sellers.isna().sum()

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64

In [48]:
sellers.drop(columns=["seller_city","seller_state"], inplace=True)
sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800
1309,6e1862e15f33d9994bc25922a85e1efc,13505
1606,e628d4a53c109f09ca88098338b3a3f5,30170
2626,0249d282d911d23cb8b869ab49c99f53,5676
2440,bdae679a9b282249bc23b9b69dae9a99,72210


In [49]:
sellers[["seller_geolocation_lat", "seller_geolocation_lng"]] = (
    sellers.seller_zip_code_prefix.apply(lambda x: getCoordinate(x, geolocations))
)
sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_geolocation_lat,seller_geolocation_lng
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800,-29.572522,-52.158738
1309,6e1862e15f33d9994bc25922a85e1efc,13505,-22.380729,-47.556773
1606,e628d4a53c109f09ca88098338b3a3f5,30170,-19.93398,-43.944004
2626,0249d282d911d23cb8b869ab49c99f53,5676,-23.596003,-46.698785
2440,bdae679a9b282249bc23b9b69dae9a99,72210,-15.804566,-48.107983


In [50]:
sellers.isna().sum()

seller_id                 0
seller_zip_code_prefix    0
seller_geolocation_lat    0
seller_geolocation_lng    0
dtype: int64

# Assambling Data

In [51]:
detailed_orders_data = order_items.merge(orders, on='order_id') \
                    .merge(order_payments, on='order_id') \
                    .merge(products, on='product_id') \
                    .merge(customers, on='customer_id') \
                    .merge(sellers, on='seller_id')

detailed_orders_data.sample(5, random_state=42)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,...,product_length_cm,product_height_cm,product_width_cm,customer_unique_id,customer_zip_code_prefix,customer_geolocation_lat,customer_geolocation_lng,seller_zip_code_prefix,seller_geolocation_lat,seller_geolocation_lng
49672,6bd41909d90c9f91e22ad236ea395623,1,c5e9b73e5f4c644885862ba3e1137b57,46dc3b2cc0980fb8ec44634e21d2718e,2017-10-09 16:49:50,199.99,16.16,f45ac6dfd34f67fc16e581fe9ae0028e,3,2017-10-03 16:41:47,...,29.0,11.0,28.0,98407dc3f7283a8a678d1d97e139bce9,71931,-15.836123,-48.040185,22240,-22.932957,-43.187386
23948,344c86e9cc82d497519e09dc351d28ff,1,b5e13c9a353102f79c6206ff5cb61a50,a49928bcdf77c55c6d6e05e09a9b4ca5,2017-11-17 11:55:30,89.9,11.83,d5b301a2087b017c4a9ca9730a2b18cf,3,2017-11-12 11:43:17,...,40.0,25.0,20.0,45167dabf29a055f7c1208a69dc95586,8210,-23.532594,-46.442789,3017,-23.533704,-46.61253
105553,e55e4860fd8e46e38f351f994f9e06cb,1,3a7c9b0413d7b9cc7f4a18318fc5afb3,7a67c85e85bb2ce8582c35f2203ad736,2017-03-16 01:19:21,129.99,9.28,676ee955ce60a3b6eaa8c816cfb7fb04,3,2017-03-12 01:19:21,...,50.0,40.0,30.0,b21b70ecf581fa5addd602adc787d832,8111,-23.488042,-46.376701,3426,-23.554792,-46.531428
68942,96bac00ebfd18ba64dabd7cb27471282,1,5411e9269501a870cabf632f05655131,3d871de0142ce09b7081e2b9d1733cb1,2018-01-19 10:32:21,129.0,38.45,2f7319232811fe9879212e93efb30147,3,2018-01-09 19:34:58,...,38.0,30.0,28.0,adc2b0515dd6bea5e28161bbfeeade0e,58045,-7.123182,-34.828699,13232,-23.209964,-46.776231
80847,b02fefc3800702d58c988da39c132902,1,144266e1585320d057797a369596bb8b,8cc6a0e5738e61a87b03c78b2ba9db4b,2017-03-28 10:44:34,739.9,21.92,374d1a370cc4fc67244345f046bbdff6,3,2017-03-22 10:44:34,...,30.0,32.0,27.0,b4891a8e1c6726d478568eb2422dcce9,6708,-23.585222,-46.849074,37795,-22.087304,-46.581748


In [52]:
from math import radians, sin, cos, sqrt, atan2


def haversine(data:pd.Series):
    R = 6371.0

    lat1, lon1, lat2, lon2 = map(radians, data.values)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return round(distance, 2)

detailed_orders_data["seller_customer_distance_km"] = detailed_orders_data[
    [
        "customer_geolocation_lat",
        "customer_geolocation_lng",
        "seller_geolocation_lat",
        "seller_geolocation_lng",
    ]
].agg(haversine, axis=1)

detailed_orders_data.sample(5, random_state=42)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,...,product_height_cm,product_width_cm,customer_unique_id,customer_zip_code_prefix,customer_geolocation_lat,customer_geolocation_lng,seller_zip_code_prefix,seller_geolocation_lat,seller_geolocation_lng,seller_customer_distance_km
49672,6bd41909d90c9f91e22ad236ea395623,1,c5e9b73e5f4c644885862ba3e1137b57,46dc3b2cc0980fb8ec44634e21d2718e,2017-10-09 16:49:50,199.99,16.16,f45ac6dfd34f67fc16e581fe9ae0028e,3,2017-10-03 16:41:47,...,11.0,28.0,98407dc3f7283a8a678d1d97e139bce9,71931,-15.836123,-48.040185,22240,-22.932957,-43.187386,938.81
23948,344c86e9cc82d497519e09dc351d28ff,1,b5e13c9a353102f79c6206ff5cb61a50,a49928bcdf77c55c6d6e05e09a9b4ca5,2017-11-17 11:55:30,89.9,11.83,d5b301a2087b017c4a9ca9730a2b18cf,3,2017-11-12 11:43:17,...,25.0,20.0,45167dabf29a055f7c1208a69dc95586,8210,-23.532594,-46.442789,3017,-23.533704,-46.61253,17.3
105553,e55e4860fd8e46e38f351f994f9e06cb,1,3a7c9b0413d7b9cc7f4a18318fc5afb3,7a67c85e85bb2ce8582c35f2203ad736,2017-03-16 01:19:21,129.99,9.28,676ee955ce60a3b6eaa8c816cfb7fb04,3,2017-03-12 01:19:21,...,40.0,30.0,b21b70ecf581fa5addd602adc787d832,8111,-23.488042,-46.376701,3426,-23.554792,-46.531428,17.43
68942,96bac00ebfd18ba64dabd7cb27471282,1,5411e9269501a870cabf632f05655131,3d871de0142ce09b7081e2b9d1733cb1,2018-01-19 10:32:21,129.0,38.45,2f7319232811fe9879212e93efb30147,3,2018-01-09 19:34:58,...,30.0,28.0,adc2b0515dd6bea5e28161bbfeeade0e,58045,-7.123182,-34.828699,13232,-23.209964,-46.776231,2197.77
80847,b02fefc3800702d58c988da39c132902,1,144266e1585320d057797a369596bb8b,8cc6a0e5738e61a87b03c78b2ba9db4b,2017-03-28 10:44:34,739.9,21.92,374d1a370cc4fc67244345f046bbdff6,3,2017-03-22 10:44:34,...,32.0,27.0,b4891a8e1c6726d478568eb2422dcce9,6708,-23.585222,-46.849074,37795,-22.087304,-46.581748,168.8


In [53]:
detailed_orders_data.isna().sum()

order_id                         0
order_item_id                    0
product_id                       0
seller_id                        0
shipping_limit_date              0
price                            0
freight_value                    0
customer_id                      0
order_status                     0
order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
payment_sequential               0
payment_type                     0
payment_installments             0
payment_value                    0
product_category_name            0
product_name_lenght              0
product_description_lenght       0
product_photos_qty               0
product_weight_g                 0
product_length_cm                0
product_height_cm                0
product_width_cm                 0
customer_unique_id               0
customer_zip_code_prefix         0
customer_geolocation

In [54]:
detailed_orders_data.columns

Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value', 'customer_id',
       'order_status', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_geolocation_lat',
       'customer_geolocation_lng', 'seller_zip_code_prefix',
       'seller_geolocation_lat', 'seller_geolocation_lng',
       'seller_customer_distance_km'],
      dtype='object')

In [86]:
detailed_orders_data.to_csv("data/fix/detailed_orders_dataset.csv", index=False)

# Seleksi Fitur

In [57]:
indices = np.arange(len(detailed_orders_data))

train_indices, temp_indices = train_test_split(indices, random_state=42, train_size=0.8)
test_indices, val_indices = train_test_split(temp_indices, random_state=42, train_size=0.5)

In [58]:
print(f"""
      train : {len(train_indices)} data
      test  : {len(test_indices)} data
      val   : {len(val_indices)} data
      """)


      train : 94080 data
      test  : 11760 data
      val   : 11761 data
      


In [93]:
def split_dataset(data: pd.DataFrame, y_columns: str):
    x_train_data = data.iloc[train_indices].drop(columns=[y_columns])
    x_test_data = data.iloc[test_indices].drop(columns=[y_columns])
    x_val_data = data.iloc[val_indices].drop(columns=[y_columns])

    y_train_data = data.iloc[train_indices][y_columns]
    y_test_data = data.iloc[test_indices][y_columns]
    y_val_data = data.iloc[val_indices][y_columns]

    return {"x":x_train_data, "y":y_train_data}, {"x":x_test_data, "y":y_test_data}, {"x":x_val_data, "y":y_val_data}

## Prediksi Harga Barang

In [60]:
price_prediction = detailed_orders_data[['product_category_name','product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'price']]
price_prediction.sample(5, random_state=42)

Unnamed: 0,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm,price
49672,71,733.0,29.0,11.0,28.0,199.99
23948,71,1500.0,40.0,25.0,20.0,89.9
105553,20,1500.0,50.0,40.0,30.0,129.99
68942,68,6550.0,38.0,30.0,28.0,129.0
80847,20,1925.0,30.0,32.0,27.0,739.9


In [94]:
train_price_prediction, test_price_prediction, val_price_prediction = split_dataset(price_prediction, 'price')

## Prediksi Barang Tiba

In [81]:
delivery_prediction = detailed_orders_data[['seller_customer_distance_km','price', 'freight_value','order_purchase_timestamp', 'order_delivered_customer_date','product_weight_g',"product_length_cm","product_height_cm", "product_width_cm"]]
delivery_prediction.sample(5, random_state=5)

Unnamed: 0,seller_customer_distance_km,price,freight_value,order_purchase_timestamp,order_delivered_customer_date,product_weight_g,product_length_cm,product_height_cm,product_width_cm
109165,330.41,56.5,14.15,2018-01-19 08:02:06,2018-02-02 15:42:37,488.0,17.0,16.0,17.0
30066,358.12,250.0,15.5,2018-02-08 13:47:24,2018-02-28 19:22:49,7800.0,33.0,44.0,33.0
62798,517.69,159.9,15.87,2018-01-18 22:48:53,2018-02-06 18:52:40,500.0,35.0,45.0,15.0
23260,458.2,59.0,13.07,2018-07-04 16:49:21,2018-07-07 14:41:18,300.0,18.0,9.0,12.0
19194,861.83,109.0,18.33,2017-11-02 11:10:32,2017-11-16 21:18:42,800.0,24.0,6.0,24.0


In [82]:
delivery_prediction.dtypes

seller_customer_distance_km             float64
price                                   float64
freight_value                           float64
order_purchase_timestamp         datetime64[ns]
order_delivered_customer_date    datetime64[ns]
product_weight_g                        float64
product_length_cm                       float64
product_height_cm                       float64
product_width_cm                        float64
dtype: object

In [83]:
delivery_prediction = delivery_prediction[~delivery_prediction['order_delivered_customer_date'].isna()]

delivery_prediction['order_estimated_delivery_day'] = (delivery_prediction["order_delivered_customer_date"] - delivery_prediction["order_purchase_timestamp"]).dt.days
delivery_prediction.drop(columns=["order_delivered_customer_date", "order_purchase_timestamp"], inplace=True)
delivery_prediction.sample(5, random_state=5)

Unnamed: 0,seller_customer_distance_km,price,freight_value,product_weight_g,product_length_cm,product_height_cm,product_width_cm,order_estimated_delivery_day
109165,330.41,56.5,14.15,488.0,17.0,16.0,17.0,14
30066,358.12,250.0,15.5,7800.0,33.0,44.0,33.0,20
62798,517.69,159.9,15.87,500.0,35.0,45.0,15.0,18
23260,458.2,59.0,13.07,300.0,18.0,9.0,12.0,2
19194,861.83,109.0,18.33,800.0,24.0,6.0,24.0,14


In [95]:
train_delivery_prediction, test_delivery_prediction, val_delivery_prediction = split_dataset(delivery_prediction, 'order_estimated_delivery_day')

# Save Dataset

In [96]:
np.savez_compressed("data/fix/price_prediction.npz", train=train_price_prediction, test=test_price_prediction, val=val_price_prediction)
np.savez_compressed("data/fix/delivery_prediction.npz", train=train_delivery_prediction, test=test_delivery_prediction, val=val_delivery_prediction)