In [1]:
import pandas as pd
from requests import get
import numpy as np
from sklearn.model_selection import train_test_split

In [24]:
customers = pd.read_csv("data/customers_dataset.csv")
sellers = pd.read_csv("data/sellers_dataset.csv")
order_items = pd.read_csv("data/order_items_dataset.csv")
order_payments = pd.read_csv("data/order_payments_dataset.csv")
order_reviews = pd.read_csv("data/order_reviews_dataset.csv")
orders = pd.read_csv("data/orders_dataset.csv")
product_translation = pd.read_csv("data/product_category_name_translation.csv")
products = pd.read_csv("data/products_dataset.csv")
geolocations = pd.read_csv("data/geolocation_dataset.csv")

In [3]:
state = pd.read_html(get("https://brazil-help.com/brazilian_states.htm").content)[2]
state.columns = state.iloc[1]
state = state.iloc[2:]
state = state[["Common Two Letter Abbreviation", "State"]]
state.sample(5, random_state=42)

1,Common Two Letter Abbreviation,State
10,GO,Goiás
15,PA,Pará
11,MA,Maranhão
23,RO,Rondônia
2,AC,Acre


In [20]:
products.product_category_name = products.product_category_name.replace(dict(product_translation.values))
products.sample(5, random_state=42)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
24880,f819f0c84a64f02d3a5606ca95edd272,watches_gifts,59.0,452.0,1.0,710.0,19.0,13.0,14.0
6366,b9de40e43fccb6ba53b7eadbd5c49077,housewares,41.0,1188.0,3.0,700.0,70.0,10.0,15.0
4989,26afe4ed5890d941fcba14205863eec7,furniture_decor,47.0,1092.0,1.0,3850.0,30.0,30.0,30.0
8135,986870b9985d95ef6205bae1953a61a5,furniture_decor,50.0,646.0,1.0,300.0,16.0,30.0,20.0
19482,28ce57ecf4afba85e6020ec0209cada9,cool_stuff,31.0,248.0,2.0,5650.0,73.0,73.0,20.0


In [5]:
datetime_columns = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]
for i in datetime_columns:
    orders[i] = pd.to_datetime(orders[i])

orders.dtypes

order_id                                 object
customer_id                              object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
dtype: object

In [6]:
customers.replace(dict(state.values), inplace=True)
sellers.replace(dict(state.values), inplace=True)
geolocations.replace(dict(state.values), inplace=True)

sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800,venancio aires,Rio Grande do Sul
1309,6e1862e15f33d9994bc25922a85e1efc,13505,rio claro,São Paulo
1606,e628d4a53c109f09ca88098338b3a3f5,30170,belo horizonte,Minas Gerais
2626,0249d282d911d23cb8b869ab49c99f53,5676,sao paulo,São Paulo
2440,bdae679a9b282249bc23b9b69dae9a99,72210,brasilia,Distrito Federal


In [23]:
for df, file in (
    [customers, "customers"],
    [sellers, "sellers"],
    [order_items, "order_items"],
    [order_payments, "order_payments"],
    [order_reviews, "order_reviews"],
    [orders, "orders"],
    [product_translation, "product_translation"],
    [products, "products"],
    [geolocations, "geolocations"],
):
    print(
        f"""
file : {file}
{df.isna().sum()}
{'*'*100}
        """
    )


file : customers
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
****************************************************************************************************
        

file : sellers
seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64
****************************************************************************************************
        

file : order_items
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64
****************************************************************************************************
        

file : order_payments
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value

In [8]:
order_items = order_items.interpolate("linear")
order_payments = order_payments.interpolate("linear")
order_reviews = order_reviews.interpolate("linear")
orders = orders.interpolate("linear")
products = products.interpolate("linear")
geolocations = geolocations.interpolate("linear")

  order_items = order_items.interpolate("linear")
  order_payments = order_payments.interpolate("linear")
  order_reviews = order_reviews.interpolate("linear")
  orders = orders.interpolate("linear")
  products = products.interpolate("linear")
  geolocations = geolocations.interpolate("linear")


In [9]:
for df, file in (
    [customers, "customers"],
    [sellers, "sellers"],
    [order_items, "order_items"],
    [order_payments, "order_payments"],
    [order_reviews, "order_reviews"],
    [orders, "orders"],
    [product_translation, "product_translation"],
    [products, "products"],
    [geolocations, "geolocations"],
):
    print(
        f"""
file : {file}
{df.isna().sum()}
{'*'*100}
        """
    )


file : customers
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
****************************************************************************************************
        

file : sellers
seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64
****************************************************************************************************
        

file : order_items
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64
****************************************************************************************************
        

file : order_payments
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value

In [10]:
def getCoordinate(zip_code: int, geolocation_data: pd.DataFrame):
    result = geolocation_data[geolocation_data.geolocation_zip_code_prefix == zip_code][
        ["geolocation_lat", "geolocation_lng"]
    ]
    if len(result) == 0:
        closest_zip_code = (
            geolocation_data.geolocation_zip_code_prefix - 82040
        ).abs()
        closest_zip_code_index = closest_zip_code.idxmin()
        closest_zip_code = geolocation_data.iloc[[closest_zip_code_index]].geolocation_zip_code_prefix.values[0]
        result = (
            geolocation_data[geolocation_data.geolocation_zip_code_prefix == closest_zip_code][
                ["geolocation_lat", "geolocation_lng"]
            ]
            .sample(frac=0.5)
            .mean()
            .to_frame()
            .T
        )
    else:
        result = result.sample(1)
    return result.iloc[0]


sellers[["seller_geolocation_lat", "seller_geolocation_lng"]] = (
    sellers.seller_zip_code_prefix.apply(lambda x: getCoordinate(x, geolocations))
)
sellers.sample(5, random_state=42)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,seller_geolocation_lat,seller_geolocation_lng
1947,1f7fd2a6fcd5a6fa5d8a4dabc72aaae0,95800,venancio aires,Rio Grande do Sul,-29.608005,-52.18638
1309,6e1862e15f33d9994bc25922a85e1efc,13505,rio claro,São Paulo,-22.38006,-47.558051
1606,e628d4a53c109f09ca88098338b3a3f5,30170,belo horizonte,Minas Gerais,-19.92308,-43.941042
2626,0249d282d911d23cb8b869ab49c99f53,5676,sao paulo,São Paulo,-23.601384,-46.697738
2440,bdae679a9b282249bc23b9b69dae9a99,72210,brasilia,Distrito Federal,-15.800327,-48.109042


In [11]:
customers[["customer_geolocation_lat", "customer_geolocation_lng"]] = (
    customers.customer_zip_code_prefix.apply(lambda x: getCoordinate(x, geolocations))
)
customers.sample(5, random_state=42)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_geolocation_lat,customer_geolocation_lng
52263,c7432c6d237ffd6aa36a007b4237ec38,9a897ea48bf988012c00b802f1104a92,2971,sao paulo,São Paulo,-23.48823,-46.709673
46645,7f399d641e2e2064470145178c9e8778,90436a67885a57f147fb79e6d0e4bc1c,38610,unai,Minas Gerais,-16.355948,-46.903254
37546,ba5642b730704dc0f74b7cf715b41ed5,4d8056f71519ae1069e6747c63c676f7,88820,icara,Santa Catarina,-28.682609,-49.320716
94756,0f346a2cc84ebb2d52f0759d0acfd030,6117c9ef3251089693a6abb90c195eba,25250,duque de caxias,Rio de Janeiro,-22.630879,-43.210062
14771,d393b9491df482cf448e60aa9955b7f2,5caf3a2a5d1ef808e3dd182e79baa392,36955,mutum,Minas Gerais,-19.819012,-41.436745


In [12]:
detailed_orders_data = order_items.merge(orders, on='order_id') \
                    .merge(order_payments, on='order_id') \
                    .merge(products, on='product_id') \
                    .merge(customers, on='customer_id') \
                    .merge(sellers, on='seller_id')

detailed_orders_data.sample(5, random_state=42)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,...,customer_zip_code_prefix,customer_city,customer_state,customer_geolocation_lat,customer_geolocation_lng,seller_zip_code_prefix,seller_city,seller_state,seller_geolocation_lat,seller_geolocation_lng
49672,6bd41909d90c9f91e22ad236ea395623,1,c5e9b73e5f4c644885862ba3e1137b57,46dc3b2cc0980fb8ec44634e21d2718e,2017-10-09 16:49:50,199.99,16.16,f45ac6dfd34f67fc16e581fe9ae0028e,delivered,2017-10-03 16:41:47,...,71931,brasilia,Distrito Federal,-15.837411,-48.038526,22240,rio de janeiro,Rio de Janeiro,-22.938488,-43.192775
23948,344c86e9cc82d497519e09dc351d28ff,1,b5e13c9a353102f79c6206ff5cb61a50,a49928bcdf77c55c6d6e05e09a9b4ca5,2017-11-17 11:55:30,89.9,11.83,d5b301a2087b017c4a9ca9730a2b18cf,delivered,2017-11-12 11:43:17,...,8210,sao paulo,São Paulo,-23.531855,-46.443385,3017,sao paulo,São Paulo,-23.534877,-46.611154
105553,e55e4860fd8e46e38f351f994f9e06cb,1,3a7c9b0413d7b9cc7f4a18318fc5afb3,7a67c85e85bb2ce8582c35f2203ad736,2017-03-16 01:19:21,129.99,9.28,676ee955ce60a3b6eaa8c816cfb7fb04,delivered,2017-03-12 01:19:21,...,8111,sao paulo,São Paulo,-23.486433,-46.386914,3426,sao paulo,São Paulo,-23.551734,-46.536955
68942,96bac00ebfd18ba64dabd7cb27471282,1,5411e9269501a870cabf632f05655131,3d871de0142ce09b7081e2b9d1733cb1,2018-01-19 10:32:21,129.0,38.45,2f7319232811fe9879212e93efb30147,delivered,2018-01-09 19:34:58,...,58045,joao pessoa,Paraíba,-7.124926,-34.825205,13232,campo limpo paulista,São Paulo,-23.209985,-46.762413
80847,b02fefc3800702d58c988da39c132902,1,144266e1585320d057797a369596bb8b,8cc6a0e5738e61a87b03c78b2ba9db4b,2017-03-28 10:44:34,739.9,21.92,374d1a370cc4fc67244345f046bbdff6,delivered,2017-03-22 10:44:34,...,6708,cotia,São Paulo,-23.580812,-46.857335,37795,andradas,Minas Gerais,-22.078807,-46.572333


In [13]:
from math import radians, sin, cos, sqrt, atan2


def haversine(data:pd.Series):
    R = 6371.0

    lat1, lon1, lat2, lon2 = map(radians, data.values)

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return round(distance, 2)

detailed_orders_data["seller_customer_distance_km"] = detailed_orders_data[
    [
        "customer_geolocation_lat",
        "customer_geolocation_lng",
        "seller_geolocation_lat",
        "seller_geolocation_lng",
    ]
].agg(haversine, axis=1)

detailed_orders_data.sample(5, random_state=42)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,customer_id,order_status,order_purchase_timestamp,...,customer_city,customer_state,customer_geolocation_lat,customer_geolocation_lng,seller_zip_code_prefix,seller_city,seller_state,seller_geolocation_lat,seller_geolocation_lng,seller_customer_distance_km
49672,6bd41909d90c9f91e22ad236ea395623,1,c5e9b73e5f4c644885862ba3e1137b57,46dc3b2cc0980fb8ec44634e21d2718e,2017-10-09 16:49:50,199.99,16.16,f45ac6dfd34f67fc16e581fe9ae0028e,delivered,2017-10-03 16:41:47,...,brasilia,Distrito Federal,-15.837411,-48.038526,22240,rio de janeiro,Rio de Janeiro,-22.938488,-43.192775,938.8
23948,344c86e9cc82d497519e09dc351d28ff,1,b5e13c9a353102f79c6206ff5cb61a50,a49928bcdf77c55c6d6e05e09a9b4ca5,2017-11-17 11:55:30,89.9,11.83,d5b301a2087b017c4a9ca9730a2b18cf,delivered,2017-11-12 11:43:17,...,sao paulo,São Paulo,-23.531855,-46.443385,3017,sao paulo,São Paulo,-23.534877,-46.611154,17.11
105553,e55e4860fd8e46e38f351f994f9e06cb,1,3a7c9b0413d7b9cc7f4a18318fc5afb3,7a67c85e85bb2ce8582c35f2203ad736,2017-03-16 01:19:21,129.99,9.28,676ee955ce60a3b6eaa8c816cfb7fb04,delivered,2017-03-12 01:19:21,...,sao paulo,São Paulo,-23.486433,-46.386914,3426,sao paulo,São Paulo,-23.551734,-46.536955,16.93
68942,96bac00ebfd18ba64dabd7cb27471282,1,5411e9269501a870cabf632f05655131,3d871de0142ce09b7081e2b9d1733cb1,2018-01-19 10:32:21,129.0,38.45,2f7319232811fe9879212e93efb30147,delivered,2018-01-09 19:34:58,...,joao pessoa,Paraíba,-7.124926,-34.825205,13232,campo limpo paulista,São Paulo,-23.209985,-46.762413,2196.97
80847,b02fefc3800702d58c988da39c132902,1,144266e1585320d057797a369596bb8b,8cc6a0e5738e61a87b03c78b2ba9db4b,2017-03-28 10:44:34,739.9,21.92,374d1a370cc4fc67244345f046bbdff6,delivered,2017-03-22 10:44:34,...,cotia,São Paulo,-23.580812,-46.857335,37795,andradas,Minas Gerais,-22.078807,-46.572333,169.55


In [18]:
detailed_orders_data.isna().sum()

order_id                            0
order_item_id                       0
product_id                          0
seller_id                           0
shipping_limit_date                 0
price                               0
freight_value                       0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                   0
order_delivered_carrier_date        0
order_delivered_customer_date       0
order_estimated_delivery_date       0
payment_sequential                  0
payment_type                        0
payment_installments                0
payment_value                       0
product_category_name            1698
product_name_lenght                 0
product_description_lenght          0
product_photos_qty                  0
product_weight_g                    0
product_length_cm                   0
product_height_cm                   0
product_width_cm                    0
customer_uni

In [15]:
detailed_orders_data.columns

Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value', 'customer_id',
       'order_status', 'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'customer_geolocation_lat', 'customer_geolocation_lng',
       'seller_zip_code_prefix', 'seller_city', 'seller_state',
       'seller_geolocation_lat', 'seller_geolocation_lng',
       'seller_customer_distance_km'],
      dtype='object')

In [16]:
aaaaaa

NameError: name 'aaaaaa' is not defined

# Prediksi Harga Barang

In [87]:
detailed_orders_data.isna().sum()

order_id                            0
order_item_id                       0
product_id                          0
seller_id                           0
shipping_limit_date                 0
price                               0
freight_value                       0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                   0
order_delivered_carrier_date        0
order_delivered_customer_date       0
order_estimated_delivery_date       0
payment_sequential                  0
payment_type                        0
payment_installments                0
payment_value                       0
product_category_name            1698
product_name_lenght                 0
product_description_lenght          0
product_photos_qty                  0
product_weight_g                    0
product_length_cm                   0
product_height_cm                   0
product_width_cm                    0
customer_uni

In [78]:
price_prediction = detailed_orders_data[['product_category_name','product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'price']]
price_prediction.sample(5, random_state=42)

Unnamed: 0,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm,price
49672,toys,733.0,29.0,11.0,28.0,199.99
23948,toys,1500.0,40.0,25.0,20.0,89.9
105553,cool_stuff,1500.0,50.0,40.0,30.0,129.99
68942,stationery,6550.0,38.0,30.0,28.0,129.0
80847,cool_stuff,1925.0,30.0,32.0,27.0,739.9


In [80]:
price_prediction.iloc[125]

product_category_name      NaN
product_weight_g         200.0
product_length_cm         16.0
product_height_cm          5.0
product_width_cm          12.0
price                     7.79
Name: 125, dtype: object

In [None]:
from sklearn.preprocessing import LabelEncoder

product_category_name_encoder = LabelEncoder()
price_prediction['product_category_name'] = product_category_name_encoder.fit_transform(price_prediction.product_category_name)

dict(enumerate(product_category_name_encoder.classes_))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_prediction['product_category_name'] = product_category_name_encoder.fit_transform(price_prediction.product_category_name)


{0: 'agro_industry_and_commerce',
 1: 'air_conditioning',
 2: 'art',
 3: 'arts_and_craftmanship',
 4: 'audio',
 5: 'auto',
 6: 'baby',
 7: 'bed_bath_table',
 8: 'books_general_interest',
 9: 'books_imported',
 10: 'books_technical',
 11: 'cds_dvds_musicals',
 12: 'christmas_supplies',
 13: 'cine_photo',
 14: 'computers',
 15: 'computers_accessories',
 16: 'consoles_games',
 17: 'construction_tools_construction',
 18: 'construction_tools_lights',
 19: 'construction_tools_safety',
 20: 'cool_stuff',
 21: 'costruction_tools_garden',
 22: 'costruction_tools_tools',
 23: 'diapers_and_hygiene',
 24: 'drinks',
 25: 'dvds_blu_ray',
 26: 'electronics',
 27: 'fashio_female_clothing',
 28: 'fashion_bags_accessories',
 29: 'fashion_childrens_clothes',
 30: 'fashion_male_clothing',
 31: 'fashion_shoes',
 32: 'fashion_sport',
 33: 'fashion_underwear_beach',
 34: 'fixed_telephony',
 35: 'flowers',
 36: 'food',
 37: 'food_drink',
 38: 'furniture_bedroom',
 39: 'furniture_decor',
 40: 'furniture_living

In [None]:
price_prediction[price_prediction.product_category_name==73]

Unnamed: 0,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm,price
125,73,200.0,16.0,5.0,12.0,7.79
127,73,700.0,35.0,14.0,11.0,7.60
134,73,400.0,20.0,12.0,15.0,122.99
144,73,200.0,16.0,2.0,11.0,20.30
173,73,2200.0,16.0,2.0,11.0,56.00
...,...,...,...,...,...,...
117250,73,400.0,20.0,12.0,15.0,122.99
117277,73,400.0,32.0,15.0,15.0,39.90
117294,73,350.0,16.0,6.0,11.0,139.00
117383,73,475.0,21.0,15.0,21.0,49.90
