# Exploratory Analysis

In [1]:
import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2
import os

In [29]:
!pwd

/Users/bingobango/code/lewagon/data-exploratory-analysis


In [30]:
from olist.data import Olist
data = Olist().get_data()

In [41]:
data

TypeError: BlockManager.__init__() got multiple values for argument 'verify_integrity'

Each transaction on the Olist ecommerce platform is characterized by:
- a `customer_id`, that would buy...
- various`product_id`...
- to a `seller_id`...
- and leaves a `rewiew_id`...
- all this belonging to an `order_id`

## 1 - Run an automated exploratory analysis with [pandas profiling](https://github.com/pandas-profiling/pandas-profiling)

In [4]:
# First, let's install the pandas-profiling package
! pip install --quiet pandas==1.4.4 pandas-profiling==3.3.0


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [13]:
# Then create a "reports" directory
# !mkdir reports

In [14]:
import pandas_profiling
datasets_to_profile = ['orders', 'products', 'sellers',
                  'customers', 'order_reviews',
                  'order_items']

👉 Create and save one `html report` per dataset to profile 

⏳ (It usually takes a few minutes)

In [18]:
for d in datasets_to_profile:
    print('exporting: '+ d)
    profile = data[d].profile_report(title='Report for '+ d)
    profile.to_file(output_file=os.path.join('reports', d, '.html'))
    

exporting: orders


AttributeError: 'DataFrame' object has no attribute 'profile_report'

## 2 - Investigate the cardinalities of your various DataFrames

❓ **How many unique `orders`, `reviews`, `sellers`, `products` and `customer` is there ?**  
(You can use pandas profiling or pandas methods on your notebook if you prefer)

In [34]:
aggregates = np.array([
    [
        data["orders"].order_id.nunique(),
        data["order_reviews"].review_id.nunique(),
        data["sellers"].seller_id.nunique(),
        data["products"].product_id.nunique(),
        data["customers"].customer_id.nunique(),
    ],
    [
        data["orders"].order_id.count(),
        data["order_reviews"].review_id.count(),
        data["sellers"].seller_id.count(),
        data["products"].product_id.count(),
        data["customers"].customer_id.count(),
    ],
])

In [36]:
pd.DataFrame(data=np.vstack((aggregates, aggregates[0,:] / aggregates[1,:])).T,
            index=['orders', 'reviews', 'sellers', 'products', 'customers'],
            columns=['nuinque', 'count', 'ratio'])

Unnamed: 0,nuinque,count,ratio
orders,99441.0,99441.0,1.0
reviews,98410.0,99224.0,0.991796
sellers,3095.0,3095.0,1.0
products,32951.0,32951.0,1.0
customers,99441.0,99441.0,1.0


❓ **How many reviews is there per order? Do we have reviews for all orders ?**
<details>
    <summary markdown='span'>Hints</summary>

This info is not directly accessible in your individual csv. You'll need to proceed to merge
</details>

In [39]:
orders = data['orders'][['customer_id', 'order_id']]
reviews = data['order_reviews'][['order_id', 'review_id']]
items = data['order_items'][['order_id', 'product_id','seller_id']]

In [40]:
orders_reviews = orders.merge(reviews, on='order_id', how='outer')
orders_reviews.head(2)

TypeError: BlockManager.__init__() got multiple values for argument 'verify_integrity'

🧪 **Test your code below**

Store the number of orders with missing reviews as `int` in a variable named `n_missing_reviews`

In [None]:
n_missing_review

In [None]:
from nbresult import ChallengeResult

result = ChallengeResult('exploratory',
    n=n_missing_reviews
)
result.write()
print(result.check())