# Data Preparation

In [45]:
# IPython "magic commands": load the autoreload extension and set it to mode 2,
# so that edits to imported packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
import os
import pandas as pd 

## Explore the data folder

The data was provided by Olist and downloaded from Kaggle.

Inspect the data structure from the schema:

![Schema](schema.png)

In [2]:
!tree

[01;34m.[00m
├── 0-Project_Setup.ipynb
├── 01-data_preparation.ipynb
├── [01;34mdata[00m
│   ├── README.md
│   └── [01;34mcsv[00m
│       ├── [01;31mbrazilian-ecommerce.zip[00m
│       ├── olist_customers_dataset.csv
│       ├── olist_geolocation_dataset.csv
│       ├── olist_order_items_dataset.csv
│       ├── olist_order_payments_dataset.csv
│       ├── olist_order_reviews_dataset.csv
│       ├── olist_orders_dataset.csv
│       ├── olist_products_dataset.csv
│       ├── olist_sellers_dataset.csv
│       └── product_category_name_translation.csv
├── [01;34mpkg[00m
│   └── __init__.py
└── schema.png

3 directories, 15 files


## Construct the dictionary `data`

In [24]:
# Folder holding the raw Olist CSV exports, resolved from the notebook's
# current working directory.
csv_path = os.path.join(os.getcwd(), 'data', 'csv')

# os.listdir() returns entries in arbitrary order, so sort the names: the key
# order of `data` is then identical on every machine and every run.
file_names = sorted(f for f in os.listdir(csv_path) if f.endswith(".csv"))

# Short dictionary keys: drop the "olist_" prefix, the ".csv" extension and the
# "_dataset" suffix, e.g. "olist_order_items_dataset.csv" -> "order_items".
key_names = [
    name.replace("olist_", "")
    .replace(".csv", "")
    .replace("_dataset", "")
    for name in file_names
]

# One DataFrame per CSV file, in the same order as `file_names` / `key_names`.
values = [pd.read_csv(os.path.join(csv_path, file_name)) for file_name in file_names]

# Map each short key to its DataFrame.
data = dict(zip(key_names, values))

In [50]:
# Smoke test of the packaged loader in pkg/data.py.
# NOTE: this rebinds `data`, replacing any dictionary of the same name built
# earlier in the notebook with whatever Data().get_data() returns
# (presumably the same kind of mapping, since .keys() is called on it — confirm).

from pkg.data import Data

data = Data().get_data()

data.keys()

# EDA

## EDA with pandas profiling

In [64]:
# `pandas_profiling` is never referenced by name below; the import is presumably
# what attaches `.profile_report()` to DataFrames, so it must stay.
import pandas_profiling
import IPython


# Datasets that get an HTML profiling report (keys of the `data` dictionary).
datasets_to_profile = ['sellers', 'orders', 
                      'order_items', 'customers',
                      'geolocation',
                      'order_payments',
                      'order_reviews',
                      'products']

# Make sure the output folder exists before any report is written; without it
# the export fails on a fresh checkout of the project.
os.makedirs("data/reports", exist_ok=True)

# Generate one report per dataset and write it to data/reports/<name>.html.
for d in datasets_to_profile:
    print('Exporting: ', d)
    profile = data[d].profile_report(title="pd-" + d)
    profile.to_file(output_file="data/reports/" + d + ".html")

# Render one of the generated reports inline as the cell's output.
IPython.display.HTML(filename = 'data/reports/sellers.html')

dict_keys(['sellers', 'product_category_name_translation', 'orders', 'order_items', 'customers', 'geolocation', 'order_payments', 'order_reviews', 'products'])

## EDA with sweetviz

In [73]:
import sweetviz as sv
# Load project-specific sweetviz settings from Override.ini (looked up relative
# to the working directory).
sv.config_parser.read("Override.ini")

# Make sure the output folder exists before any report is written; this cell
# must not rely on another cell having created it.
os.makedirs("data/reports", exist_ok=True)

# Generate one sweetviz report per dataset and write it to
# data/reports/<name>-sv.html without opening a browser tab for each one.
for d in datasets_to_profile:
    print('Exporting: ', d)
    profile = sv.analyze(data[d])
    profile.show_html(
        filepath="data/reports/" + d + "-sv.html",
        layout='widescreen',
        open_browser=False)

['Override.ini']