In [5]:
# Quick Dataset Overview - Run this in Jupyter or Python
# Create as: notebooks/quick_overview.ipynb

import pandas as pd
import os

print("OLIST DATASET QUICK OVERVIEW")
print("=" * 60)

# Data folder path (adjust if needed)
DATA_PATH = "../data/"

# Collect the CSV files found in the data folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the report order stable between runs and machines.
try:
    csv_files = sorted(f for f in os.listdir(DATA_PATH) if f.endswith('.csv'))
    print(f"Found {len(csv_files)} CSV files\n")
except OSError as exc:
    # Only filesystem failures (missing folder, not a directory, permission
    # denied) are handled here. A bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and misreport them as a missing folder.
    print("Data folder not found. Make sure you have a 'data' folder with CSV files")
    print(f"Details: {exc}")
    csv_files = []

# Quick overview function
def quick_overview(filename):
    """Print a short report for one CSV file in the data folder.

    The report shows the row/column counts, the in-memory size, every column
    name and the first three rows (at most 6 columns, values cut at 20 chars).

    Args:
        filename: Name of a CSV file located inside ``DATA_PATH``.

    Returns:
        A ``(shape, columns)`` tuple, where ``shape`` is ``df.shape`` and
        ``columns`` is the list of column names. If anything fails while
        loading or printing, the error is printed instead of raised and
        ``(None, None)`` is returned, so one bad file does not stop the
        overview of the remaining ones.
    """
    try:
        # os.path.join keeps the path valid whether or not DATA_PATH ends
        # with a separator; plain string concatenation silently produced a
        # wrong path as soon as the trailing "/" was dropped.
        df = pd.read_csv(os.path.join(DATA_PATH, filename))

        # Clean filename for display
        display_name = filename.replace('olist_', '').replace('_dataset.csv', '').replace('.csv', '').upper()

        print(display_name)
        print("-" * 50)
        print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
        # deep=True also counts the memory held by object (string) columns
        print(f"Size: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        print(f"\n Columns ({len(df.columns)}):")
        for i, col in enumerate(df.columns, 1):
            print(f"  {i:2d}. {col}")

        print("\nFirst 3 rows:")
        print(df.head(3).to_string(max_cols=6, max_colwidth=20))

        print("\n" + "="*60 + "\n")

        return df.shape, list(df.columns)

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip
        # this file.
        print(f"Error loading {filename}: {e}")
        return None, None

# Run the overview for every discovered file, keeping only those that loaded
dataset_info = {}

for csv_name in csv_files:
    shape, columns = quick_overview(csv_name)
    if not shape:
        # quick_overview already printed the error for this file
        continue
    dataset_info[csv_name] = dict(shape=shape, columns=columns)

# Final table: one line per dataset followed by the grand totals
if not dataset_info:
    print("No datasets loaded. Check your data folder!")
else:
    print("SUMMARY OF ALL DATASETS:")
    print("=" * 60)

    shapes = [meta['shape'] for meta in dataset_info.values()]
    total_rows = sum(dims[0] for dims in shapes)
    total_columns = sum(dims[1] for dims in shapes)

    for csv_name, meta in dataset_info.items():
        n_rows, n_cols = meta['shape'][0], meta['shape'][1]
        # Strip the common prefix/suffix so only the table name is shown
        label = csv_name.replace('olist_', '').replace('_dataset.csv', '').replace('.csv', '')
        print(f"{label:<25} {n_rows:>8,} rows × {n_cols:>2} cols")

    print("-" * 60)
    print(f"{'TOTAL':<25} {total_rows:>8,} rows × {total_columns:>2} cols")


OLIST DATASET QUICK OVERVIEW
Found 9 CSV files

SELLERS
--------------------------------------------------
Shape: 3,095 rows × 4 columns
Size: 0.6 MB

 Columns (4):
   1. seller_id
   2. seller_zip_code_prefix
   3. seller_city
   4. seller_state

First 3 rows:
             seller_id  seller_zip_code_prefix     seller_city seller_state
0  3442f8959a84dea7...                13023           campinas           SP
1  d1b65fc7debc3361...                13844         mogi guacu           SP
2  ce3ad9de960102d0...                20031     rio de janeiro           RJ


PRODUCT_CATEGORY_NAME_TRANSLATION
--------------------------------------------------
Shape: 71 rows × 2 columns
Size: 0.0 MB

 Columns (2):
   1. product_category_name
   2. product_category_name_english

First 3 rows:
  product_category_name product_category_name_english
0         beleza_saude         health_beauty         
1  informatica_aces...   computers_access...         
2           automotivo                  auto       

In [6]:
# Simple Time Range Analysis for Olist Orders
import pandas as pd

# Read the raw orders table
df = pd.read_csv("../data/olist_orders_dataset.csv")

# Parse the purchase timestamps so datetime arithmetic and the .dt accessor work
purchase_ts = pd.to_datetime(df['order_purchase_timestamp'])
df['order_purchase_timestamp'] = purchase_ts

# Overall size of the table
print(f"Dataset shape: {df.shape}")
print(f"Total orders: {len(df)}")

# Period covered by the purchases (first to last order)
start_date, end_date = purchase_ts.min(), purchase_ts.max()
total_days = (end_date - start_date).days

print("\nTime Range:")
for label, value in (
    ("Start date:", start_date),
    ("End date:", end_date),
    ("Total days:", total_days),
    ("Total years:", round(total_days/365.25, 1)),
):
    print(label, value)

# Show a few parsed values to confirm the timestamp format
print("\nTimestamp format:")
print("Sample timestamps:")
print(purchase_ts.head(3).tolist())

# Number of orders per calendar year
print("\nOrders by year:")
df['year'] = purchase_ts.dt.year
yearly_counts = df['year'].value_counts().sort_index()
for year, n_orders in yearly_counts.items():
    print(f"{year}: {n_orders:,} orders")

# Number of orders per calendar month (all years pooled together)
print("\nOrders by month:")
df['month'] = purchase_ts.dt.month
monthly_counts = df['month'].value_counts().sort_index()
for month, n_orders in monthly_counts.items():
    print(f"Month {month}: {n_orders:,} orders")

Dataset shape: (99441, 8)
Total orders: 99441

Time Range:
Start date: 2016-09-04 21:15:19
End date: 2018-10-17 17:30:18
Total days: 772
Total years: 2.1

Timestamp format:
Sample timestamps:
[Timestamp('2017-10-02 10:56:33'), Timestamp('2018-07-24 20:41:37'), Timestamp('2018-08-08 08:38:49')]

Orders by year:
2016: 329 orders
2017: 45,101 orders
2018: 54,011 orders

Orders by month:
Month 1: 8,069 orders
Month 2: 8,508 orders
Month 3: 9,893 orders
Month 4: 9,343 orders
Month 5: 10,573 orders
Month 6: 9,412 orders
Month 7: 10,318 orders
Month 8: 10,843 orders
Month 9: 4,305 orders
Month 10: 4,959 orders
Month 11: 7,544 orders
Month 12: 5,674 orders
