TODO: add context and problem statement etc.

## Import modules and data

In [None]:
import os
import pickle

import warnings
from datetime import datetime, timedelta
from time import time

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

warnings.filterwarnings("ignore")
pd.set_option(
    "display.max_columns", None
)  # displays all columns (wrap-around) in pandas statistics

In [None]:
# Root directory of the e-commerce dataset. It defaults to the original
# location; set the ECOMMERCE_DATA_DIR environment variable to run the
# notebook against a copy stored elsewhere.
DATA_DIR = os.environ.get(
    "ECOMMERCE_DATA_DIR",
    "/data/sandcastle/boxes/configerator/source/parin_course/Ecommerce_Order_Dataset",
)

# CSV file name of each table; the same names are used under train/ and test/.
TABLE_FILES = {
    "orders": "df_Orders.csv",
    "customers": "df_Customers.csv",
    "products": "df_Products.csv",
    "payments": "df_Payments.csv",
    "order_items": "df_OrderItems.csv",
}

# Keys follow the "<table>_<split>" pattern, e.g. "orders_train".
file_paths = {
    f"{table}_{split}": os.path.join(DATA_DIR, split, file_name)
    for split in ("train", "test")
    for table, file_name in TABLE_FILES.items()
}

# Load datasets
df_orders_train = pd.read_csv(file_paths["orders_train"])
df_customers_train = pd.read_csv(file_paths["customers_train"])
df_products_train = pd.read_csv(file_paths["products_train"])
df_payments_train = pd.read_csv(file_paths["payments_train"])
df_order_items_train = pd.read_csv(file_paths["order_items_train"])

df_orders_test = pd.read_csv(file_paths["orders_test"])
df_customers_test = pd.read_csv(file_paths["customers_test"])
df_products_test = pd.read_csv(file_paths["products_test"])
df_payments_test = pd.read_csv(file_paths["payments_test"])
df_order_items_test = pd.read_csv(file_paths["order_items_test"])

# Display the first few rows of each train dataframe to understand the structure
for label, frame in (
    ("Orders (Train):", df_orders_train),
    ("Customers (Train):", df_customers_train),
    ("Products (Train):", df_products_train),
    ("Payments (Train):", df_payments_train),
    ("Order Items (Train):", df_order_items_train),
):
    print(label)
    display(frame.head())

## Train data inspection and data cleaning

In [None]:
# null values
def print_null_duplicates(df):
    """Print a two-part data-quality report for ``df``.

    First the heading "Null values:" followed by the per-column null counts,
    then the heading "Duplicate values:" followed by the number of rows that
    duplicate an earlier row. Nothing is returned.
    """
    report = (
        ("Null values:", lambda: df.isna().sum()),
        ("Duplicate values:", lambda: df.duplicated().sum()),
    )
    # Each figure is computed only after its heading has been printed.
    for heading, compute in report:
        print(heading)
        print(compute())


# Run the null/duplicate report on every train table, one labelled section each.
for heading, frame in (
    ("\nOrders (Train):", df_orders_train),
    ("\nCustomers (Train):", df_customers_train),
    ("\nProducts (Train):", df_products_train),
    ("\nPayments (Train):", df_payments_train),
    ("\nOrder Items (Train):", df_order_items_train),
):
    print(heading)
    print_null_duplicates(frame)

### Orders dataset

In [None]:
df_orders_train.info()
# For each key column, count rows that repeat an earlier value (True)
# versus rows that do not (False).
for key_column in ("order_id", "customer_id"):
    print(df_orders_train.duplicated(subset=[key_column]).value_counts())

Insights:
* This can be merged with the customers dataset
* The following columns are unnecessary
  * order_purchase_timestamp
  * order_approved_at
  * order_delivered_timestamp
  * order_estimated_delivery_date
* After the merge, the ID columns can be dropped from the training datasets

### Customers dataset

In [None]:
df_customers_train.info()
# Rows whose customer_id repeats an earlier row (True) versus the rest (False).
print(df_customers_train.duplicated(subset=["customer_id"]).value_counts())
# Number of distinct values in each location column.
for location_column in ("customer_city", "customer_state"):
    print(df_customers_train[location_column].unique().size)
# Ten randomly chosen zip-code prefixes, to see what the values look like.
print(df_customers_train["customer_zip_code_prefix"].sample(10))

Insights:
* customer_zip_code_prefix is unnecessary and can be dropped
* This can be merged with orders dataset

### Products dataset

In [None]:
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df_products_train.info()
print_null_duplicates(df_products_train)
# Distinct product categories: how many there are, then the values themselves.
product_categories = df_products_train["product_category_name"].unique()
print(product_categories.size)
print(product_categories)

Insights:
* We can remove the duplicate rows from the dataset
* A marginal number of rows have NaN as their product_category_name
  * We still keep them in case some of those products are very popular

### Order items dataset

In [None]:
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df_order_items_train.info()
# For each ID column, count rows that repeat an earlier value (True)
# versus rows that do not (False).
for id_column in ("order_id", "product_id", "seller_id"):
    print(df_order_items_train.duplicated(subset=[id_column]).value_counts())

Insights:
* Seller id is unnecessary
* This dataset can be merged with orders dataset
* Once merged, the id columns can be dropped

### Payments dataset

In [None]:
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line.
df_payments_train.info()
# Rows whose order_id repeats an earlier row (True) versus the rest (False).
print(df_payments_train.duplicated(subset=["order_id"]).value_counts())
# Distinct values of the low-cardinality payment columns.
for payment_column in ("payment_type", "payment_sequential", "payment_installments"):
    print(df_payments_train[payment_column].unique())

Insights:
* Since all orders are unique, this can be merged with the orders table as well

#### Clean datasets

In [None]:
def clean_orders_dataset(df):
    """Clean an orders frame in place.

    Drops fully duplicated rows, then removes the four timestamp/date columns
    that the analysis does not use. Columns that are already absent are
    skipped, so calling this again on an already-cleaned frame (for example
    when the notebook cell is re-run) does not raise ``KeyError``.

    Args:
        df: Orders DataFrame; it is modified in place.

    Returns:
        None.
    """
    df.drop_duplicates(inplace=True)
    df.drop(
        columns=[
            "order_purchase_timestamp",
            "order_approved_at",
            "order_delivered_timestamp",
            "order_estimated_delivery_date",
        ],
        errors="ignore",  # tolerate columns removed by an earlier run
        inplace=True,
    )


def clean_customers_dataset(df):
    """Clean a customers frame in place.

    Drops fully duplicated rows, then removes ``customer_zip_code_prefix``.
    If that column is already absent it is skipped, so calling this again on
    an already-cleaned frame does not raise ``KeyError``.

    Args:
        df: Customers DataFrame; it is modified in place.

    Returns:
        None.
    """
    df.drop_duplicates(inplace=True)
    # errors="ignore" tolerates the column having been removed by an earlier run.
    df.drop(columns=["customer_zip_code_prefix"], errors="ignore", inplace=True)


def clean_products_dataset(df):
    """Drop fully duplicated rows from a products frame, in place.

    The first occurrence of each duplicated row is kept. Returns None.
    """
    df.drop_duplicates(keep="first", inplace=True)


def clean_order_items_dataset(df):
    """Clean an order-items frame in place.

    Drops fully duplicated rows, then removes ``seller_id``. If that column is
    already absent it is skipped, so calling this again on an already-cleaned
    frame does not raise ``KeyError``.

    Args:
        df: Order-items DataFrame; it is modified in place.

    Returns:
        None.
    """
    df.drop_duplicates(inplace=True)
    # errors="ignore" tolerates the column having been removed by an earlier run.
    df.drop(columns=["seller_id"], errors="ignore", inplace=True)


def clean_payments_dataset(df):
    """Drop fully duplicated rows from a payments frame, in place.

    The first occurrence of each duplicated row is kept. Returns None.
    """
    df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Clean every train table in place with its matching cleaner.
for clean, frame in (
    (clean_orders_dataset, df_orders_train),
    (clean_customers_dataset, df_customers_train),
    (clean_products_dataset, df_products_train),
    (clean_order_items_dataset, df_order_items_train),
    (clean_payments_dataset, df_payments_train),
):
    clean(frame)

### Merge datasets

In [None]:
def merge_datasets(df_orders, df_order_items, df_customers, df_products, df_payments):
    """Join the five tables into one frame using outer merges.

    Starting from ``df_orders``, the other tables are merged in one at a time:
    order items on ``order_id``, customers on ``customer_id``, products on
    ``product_id`` and payments on ``order_id``. Every merge uses
    ``how="outer"``, so rows without a match on either side are kept.

    Returns:
        The merged DataFrame.
    """
    merge_plan = (
        (df_order_items, "order_id"),
        (df_customers, "customer_id"),
        (df_products, "product_id"),
        (df_payments, "order_id"),
    )
    merged = df_orders
    for right, key in merge_plan:
        merged = merged.merge(right, on=key, how="outer")
    return merged


# Build the single training frame from the five cleaned train tables.
df = merge_datasets(
    df_orders=df_orders_train,
    df_order_items=df_order_items_train,
    df_customers=df_customers_train,
    df_products=df_products_train,
    df_payments=df_payments_train,
)
df.info()
print_null_duplicates(df)

In [None]:
# Drop the join keys now that the tables are merged. errors="ignore" keeps this
# cell re-runnable: on a second run the columns are already gone and the call
# does nothing instead of raising KeyError.
df.drop(
    columns=["order_id", "customer_id", "product_id"],
    errors="ignore",
    inplace=True,
)

In [None]:
# Re-run the null/duplicate report on the merged frame.
print_null_duplicates(df)

In [None]:
# Removing the ID columns appears to have left many rows identical to one
# another. Drop those duplicates in place, then re-run the report.
df.drop_duplicates(inplace=True)
print_null_duplicates(df)

In [None]:
# Summary of the merged frame at this point.
df.info()

In [None]:
# Resetting the index in place; drop=True discards the old index instead of
# keeping it as a column.
df.reset_index(drop=True, inplace=True)
df.info()

In [None]:
# Preview the first rows of the merged frame.
df.head()

### Exploratory data analysis

#### Univariate analysis

##### Numeric features

In [None]:
# Histograms of the merged frame's columns on a 15x15 figure, without grid
# lines and with 1000 bins each.
# NOTE(review): presumably only numeric columns are plotted — confirm.
df.hist(figsize=(15, 15), grid=False, bins=1000)

##### Categorical features

In [None]:
# Categorical columns to plot: one bar chart of value counts per column.
categorical_columns = [
    "order_status",
    "product_category_name",
    "payment_type",
    "customer_state",
    "customer_city",
]
cat = df[categorical_columns]

# The grid size is derived from the column list so the two cannot drift apart.
n_categories = len(categorical_columns)
n_cols = 2
n_rows = (n_categories + n_cols - 1) // n_cols  # ceiling division

# squeeze=False always returns a 2-D array of axes, so flatten() is safe for
# any grid shape.
fig, axs = plt.subplots(
    n_rows, n_cols, figsize=(n_cols * 17, n_rows * 24), squeeze=False
)
axs = axs.flatten()
for i, column in enumerate(cat.columns):
    # Count once and reuse the result for both the x and y values.
    counts = cat[column].value_counts()
    sns.barplot(x=counts.index, y=counts, ax=axs[i])
    axs[i].set_title(column)
    # Rotate the category labels through the tick parameters rather than by
    # re-setting the label objects.
    axs[i].tick_params(axis="x", labelrotation=90)

# Hide the unused trailing cells of the grid.
for i in range(n_categories, len(axs)):
    axs[i].set_visible(False)
plt.tight_layout()
plt.show()