In [5]:
# Quick Dataset Overview - Run this in Jupyter or Python
# Create as: notebooks/quick_overview.ipynb

import pandas as pd
import os

print("OLIST DATASET QUICK OVERVIEW")
print("=" * 60)

# Data folder path (adjust if needed)
DATA_PATH = "../data/"

# Collect the names of all CSV files in the data folder.
# Only filesystem problems (missing folder, not a directory, no permission)
# are handled here. Anything else is a genuine bug and should surface as a
# traceback rather than be reported as "folder not found"; a bare `except:`
# would also swallow KeyboardInterrupt.
try:
    csv_files = [f for f in os.listdir(DATA_PATH) if f.endswith('.csv')]
    print(f"Found {len(csv_files)} CSV files\n")
except OSError as exc:
    print("Data folder not found. Make sure you have a 'data' folder with CSV files")
    # Show the underlying cause so a permission problem is not mistaken for a missing folder
    print(f"  Reason: {exc}")
    csv_files = []

# Quick overview function
def quick_overview(filename):
    """Print a short profile of one CSV file and return its structure.

    The file is looked up inside the module-level DATA_PATH folder. The
    printed profile contains the row/column counts, the in-memory size, the
    numbered column names and the first three rows.

    Args:
        filename: Name of the CSV file, relative to DATA_PATH.

    Returns:
        (shape, columns) where shape is the DataFrame's (rows, cols) tuple and
        columns is the list of column names, or (None, None) if the file
        could not be loaded or displayed (the error is printed, not raised).
    """
    try:
        # os.path.join keeps this working when DATA_PATH is edited to a value
        # without a trailing separator; plain string concatenation would
        # silently build a wrong path in that case.
        df = pd.read_csv(os.path.join(DATA_PATH, filename))

        # Clean filename for display
        display_name = filename.replace('olist_', '').replace('_dataset.csv', '').replace('.csv', '').upper()

        print(display_name)
        print("-" * 50)
        print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
        # deep=True also counts the memory held by string (object) values
        print(f"Size: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        print(f"\n Columns ({len(df.columns)}):")
        for i, col in enumerate(df.columns, 1):
            print(f"  {i:2d}. {col}")

        print("\nFirst 3 rows:")
        print(df.head(3).to_string(max_cols=6, max_colwidth=20))

        print("\n" + "="*60 + "\n")

        return df.shape, list(df.columns)

    except Exception as e:
        # Best-effort overview: one unreadable file must not stop the others
        print(f"Error loading {filename}: {e}")
        return None, None

# Run the overview for every discovered CSV and remember the structure of
# each one that loaded successfully
dataset_info = {}

for csv_name in csv_files:
    dims, col_names = quick_overview(csv_name)
    if not dims:
        # No shape came back, so there is nothing to record for this file
        continue
    dataset_info[csv_name] = {'shape': dims, 'columns': col_names}

# Summary table: one line per dataset followed by a totals row
if not dataset_info:
    print("No datasets loaded. Check your data folder!")
else:
    print("SUMMARY OF ALL DATASETS:")
    print("=" * 60)

    shapes = [entry['shape'] for entry in dataset_info.values()]
    total_rows = sum(shape[0] for shape in shapes)
    total_columns = sum(shape[1] for shape in shapes)

    for csv_name, entry in dataset_info.items():
        n_rows, n_cols = entry['shape']
        # Strip the common Olist prefix/suffix so the name fits the table
        clean_name = csv_name
        for fragment in ('olist_', '_dataset.csv', '.csv'):
            clean_name = clean_name.replace(fragment, '')
        print(f"{clean_name:<25} {n_rows:>8,} rows × {n_cols:>2} cols")

    print("-" * 60)
    print(f"{'TOTAL':<25} {total_rows:>8,} rows × {total_columns:>2} cols")


OLIST DATASET QUICK OVERVIEW
Found 9 CSV files

SELLERS
--------------------------------------------------
Shape: 3,095 rows × 4 columns
Size: 0.6 MB

 Columns (4):
   1. seller_id
   2. seller_zip_code_prefix
   3. seller_city
   4. seller_state

First 3 rows:
             seller_id  seller_zip_code_prefix     seller_city seller_state
0  3442f8959a84dea7...                13023           campinas           SP
1  d1b65fc7debc3361...                13844         mogi guacu           SP
2  ce3ad9de960102d0...                20031     rio de janeiro           RJ


PRODUCT_CATEGORY_NAME_TRANSLATION
--------------------------------------------------
Shape: 71 rows × 2 columns
Size: 0.0 MB

 Columns (2):
   1. product_category_name
   2. product_category_name_english

First 3 rows:
  product_category_name product_category_name_english
0         beleza_saude         health_beauty         
1  informatica_aces...   computers_access...         
2           automotivo                  auto       

In [6]:
# Simple Time Range Analysis for Olist Orders
import pandas as pd

# Read the orders table and parse the purchase timestamp into datetimes
df = pd.read_csv("../data/olist_orders_dataset.csv")
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
purchased_at = df['order_purchase_timestamp']

# Size of the dataset
print("Dataset shape:", df.shape)
print("Total orders:", len(df))

# Span between the earliest and the latest purchase
start_date, end_date = purchased_at.min(), purchased_at.max()
total_days = (end_date - start_date).days

print("\nTime Range:")
time_range_report = (
    ("Start date:", start_date),
    ("End date:", end_date),
    ("Total days:", total_days),
    ("Total years:", round(total_days/365.25, 1)),
)
for label, value in time_range_report:
    print(label, value)

# Show a few parsed values so the timestamp format can be eyeballed
print("\nTimestamp format:")
print("Sample timestamps:")
print(purchased_at.head(3).tolist())

# Order counts per calendar year
df['year'] = purchased_at.dt.year
yearly_counts = df['year'].value_counts().sort_index()
print("\nOrders by year:")
for year_value, n_orders in yearly_counts.items():
    print(f"{year_value}: {n_orders:,} orders")

# Order counts per month number (1-12), pooled across all years
df['month'] = purchased_at.dt.month
monthly_counts = df['month'].value_counts().sort_index()
print("\nOrders by month:")
for month_value, n_orders in monthly_counts.items():
    print(f"Month {month_value}: {n_orders:,} orders")

Dataset shape: (99441, 8)
Total orders: 99441

Time Range:
Start date: 2016-09-04 21:15:19
End date: 2018-10-17 17:30:18
Total days: 772
Total years: 2.1

Timestamp format:
Sample timestamps:
[Timestamp('2017-10-02 10:56:33'), Timestamp('2018-07-24 20:41:37'), Timestamp('2018-08-08 08:38:49')]

Orders by year:
2016: 329 orders
2017: 45,101 orders
2018: 54,011 orders

Orders by month:
Month 1: 8,069 orders
Month 2: 8,508 orders
Month 3: 9,893 orders
Month 4: 9,343 orders
Month 5: 10,573 orders
Month 6: 9,412 orders
Month 7: 10,318 orders
Month 8: 10,843 orders
Month 9: 4,305 orders
Month 10: 4,959 orders
Month 11: 7,544 orders
Month 12: 5,674 orders


In [5]:
import pandas as pd
import os
from pathlib import Path

def _has_id_token(column_name):
    """Return True if 'id' is a whole word in column_name (e.g. 'order_id').

    The name is lower-cased and split on every non-alphanumeric character, so
    'customer_unique_id' and 'id' match, while 'product_width_cm' (where 'id'
    is only a substring of 'width') does not.
    """
    words = "".join(ch if ch.isalnum() else " " for ch in str(column_name).lower()).split()
    return "id" in words


def analyze_csv_files(data_folder="data", sample_rows=1000):
    """
    Analyze all CSV files in the data folder and print their structure

    For every *.csv file (in sorted order) this prints the sampled shape, the
    file size on disk, one line per column (dtype, % null, example values),
    the first three rows, the raw values of date-like columns and the
    uniqueness of ID columns. Errors for a single file are printed and do not
    stop the remaining files.

    Args:
        data_folder: Folder that contains the CSV files.
        sample_rows: Maximum number of rows read from each file. All row
            counts, null percentages and uniqueness figures are computed on
            this sample only. Pass None to read whole files.

    Returns:
        None. The report is written to stdout.
    """
    data_path = Path(data_folder)

    # Debug: Check if folder exists and list all files
    print(f"Looking for CSV files in: {data_path.absolute()}")
    print(f"Folder exists: {data_path.exists()}")

    # is_dir() rather than exists(): iterdir() raises if the path is a file
    if data_path.is_dir():
        all_files = list(data_path.iterdir())
        print(f"All files in folder: {[f.name for f in all_files]}")

    csv_files = list(data_path.glob("*.csv"))
    print(f"Found {len(csv_files)} CSV files: {[f.name for f in csv_files]}")

    print("=" * 80)
    print("CSV FILES ANALYSIS")
    print("=" * 80)

    for csv_file in sorted(csv_files):
        print(f"\n📁 FILE: {csv_file.name}")
        print("-" * 60)

        try:
            # Read only the first rows: enough for structure analysis and cheap on big files
            df = pd.read_csv(csv_file, nrows=sample_rows)
            row_count = len(df)

            # The row count is the size of the sample, not of the file, once
            # the cap is reached; say so instead of reporting it as the shape.
            hit_cap = sample_rows is not None and row_count >= sample_rows
            shape_note = f" (first {sample_rows:,} rows sampled; file may contain more)" if hit_cap else ""
            print(f"📊 SHAPE: {df.shape[0]:,} rows x {df.shape[1]} columns{shape_note}")
            print(f"💾 SIZE: {os.path.getsize(csv_file) / (1024*1024):.2f} MB")

            print("\n📋 COLUMNS:")
            for i, col in enumerate(df.columns, 1):
                dtype = df[col].dtype
                null_count = df[col].isnull().sum()
                # A header-only file has zero rows; avoid dividing by zero
                null_pct = (null_count / row_count) * 100 if row_count else 0.0

                # Get sample values (non-null), each truncated to 50 characters
                sample_values = df[col].dropna().head(3).tolist()
                sample_str = ", ".join([str(val)[:50] for val in sample_values])

                print(f"  {i:2d}. {col:<30} | {str(dtype):<12} | {null_pct:5.1f}% null | Ex: {sample_str}")

            print("\n📈 SAMPLE DATA (first 3 rows):")
            print(df.head(3).to_string(max_cols=6, max_colwidth=30))

            # Special analysis for date columns (selected by name, not by dtype)
            date_columns = [
                col for col in df.columns
                if any(keyword in col.lower() for keyword in ['date', 'time', 'timestamp'])
            ]

            if date_columns:
                print("\n📅 DATE COLUMNS ANALYSIS:")
                for col in date_columns:
                    sample_dates = df[col].dropna().head(3).tolist()
                    print(f"  {col}: {sample_dates}")

            # Check for potential ID columns. Match 'id' as a whole token:
            # a substring test would also pick up e.g. 'product_width_cm'.
            id_columns = [col for col in df.columns if _has_id_token(col)]
            if id_columns:
                print("\n🔑 ID COLUMNS:")
                for col in id_columns:
                    unique_count = df[col].nunique()
                    total_count = len(df[col].dropna())
                    uniqueness = (unique_count / total_count * 100) if total_count > 0 else 0
                    print(f"  {col}: {unique_count:,} unique values ({uniqueness:.1f}% unique)")

        except Exception as e:
            # Keep going: one malformed file should not abort the whole report
            print(f"❌ ERROR reading {csv_file.name}: {str(e)}")

        print("\n" + "="*80)

def check_data_relationships():
    """
    Print a hard-coded reference table of the key columns of each CSV file

    Nothing is read from disk or validated here; the table is a static
    reminder of which columns are expected to join the datasets.
    """
    key_columns_by_file = (
        ("olist_orders_dataset2.csv", ("customer_id", "order_id")),
        ("olist_customers_dataset.csv", ("customer_id",)),
        ("olist_order_items_dataset.csv", ("order_id", "product_id", "seller_id")),
        ("olist_products_dataset.csv", ("product_id",)),
        ("olist_sellers_dataset.csv", ("seller_id",)),
        ("olist_order_payments_dataset.csv", ("order_id",)),
        ("olist_order_reviews_dataset.csv", ("order_id",)),
    )

    print("\n🔗 POTENTIAL DATA RELATIONSHIPS:")
    print("-" * 40)

    # One "<file>: <key>, <key>" line per dataset, in the order listed above
    for csv_name, key_columns in key_columns_by_file:
        print(csv_name + ": " + ", ".join(key_columns))

if __name__ == "__main__":
    # Use ./data when present, otherwise fall back to ../data (one level up)
    data_dir = next((folder for folder in ("data", "../data") if os.path.exists(folder)), None)
    if data_dir is None:
        print("Could not find data folder. Please run from project root directory.")
        print("Current directory:", os.getcwd())
    else:
        analyze_csv_files(data_dir)

    # The relationship table is printed whether or not a data folder was found
    check_data_relationships()

    loading_steps = (
        "1. dim_states (from state_enhancement_documented.csv)",
        "2. dim_customers (from olist_customers_dataset.csv)",
        "3. dim_product_categories (from product_category_name_translation.csv)",
        "4. dim_products (from olist_products_dataset.csv)",
        "5. dim_sellers (from olist_sellers_dataset.csv)",
        "6. fact_orders (from olist_orders_dataset2.csv)",
        "7. dim_order_items (from olist_order_items_dataset.csv)",
        "8. dim_payments (from olist_order_payments_dataset.csv)",
        "9. dim_reviews (from olist_order_reviews_dataset.csv)",
        "10. dim_holidays (from olist_holiday_dataset.csv)",
        "11. dim_economic_indicators (from economic_indicators.csv)",
        "12. dim_geolocation (from olist_geolocation_dataset.csv)",
    )

    print("\n🎯 RECOMMENDED LOADING ORDER:")
    for step in loading_steps:
        print(step)

Looking for CSV files in: /Users/pilar/Desktop/Projects/olist-sales-dashboard/DataFoundation/../data
Folder exists: True
All files in folder: ['olist_orders_dataset2.csv', 'olist_sellers_dataset.csv', 'product_category_name_translation.csv', 'state_data_sources.csv', 'olist_orders_dataset.csv', 'olist_order_items_dataset.csv', 'economic_indicators.csv', 'olist_customers_dataset.csv', 'olist_geolocation_dataset.csv', 'olist_order_payments_dataset.csv', 'olitst_holiday_dataset.csv', 'state_enhancement_documented.csv', 'olist_order_reviews_dataset.csv', 'olist_products_dataset.csv', 'olist_holiday_dataset.csv']
Found 15 CSV files: ['olist_orders_dataset2.csv', 'olist_sellers_dataset.csv', 'product_category_name_translation.csv', 'state_data_sources.csv', 'olist_orders_dataset.csv', 'olist_order_items_dataset.csv', 'economic_indicators.csv', 'olist_customers_dataset.csv', 'olist_geolocation_dataset.csv', 'olist_order_payments_dataset.csv', 'olitst_holiday_dataset.csv', 'state_enhancement_d

In [10]:
import pandas as pd

def detailed_geolocation_analysis(csv_path='../data/olist_geolocation_dataset.csv'):
    """Detailed analysis of geolocation duplicates

    Rows are grouped by (geolocation_zip_code_prefix, geolocation_lat,
    geolocation_lng). Every group with more than one row is classified as:

    * identical           - a single city spelling and a single state
    * city_variations     - one state, and the city names become equal after
                            lower-casing and stripping accents
    * different_locations - everything else

    For each group, ``len(group) - 1`` rows are counted as duplicates (one row
    is treated as the original). A breakdown, a few examples per category and
    a recommendation are printed.

    Args:
        csv_path: Path of the geolocation CSV. Defaults to the project's
            Olist geolocation file.

    Returns:
        dict with the keys 'identical', 'city_variations' and
        'different_locations'; each value is a list with one dict per
        duplicated coordinate group ('coords', 'count' and the city/state
        value(s) found in the group).
    """

    df = pd.read_csv(csv_path)

    print("=== DETAILED GEOLOCATION ANALYSIS ===")

    # Group by coordinates
    key_columns = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']
    grouped = df.groupby(key_columns)

    # Categorize duplicates
    identical_duplicates = 0
    city_name_variations = 0
    different_locations = 0

    duplicate_types = {
        'identical': [],
        'city_variations': [],
        'different_locations': []
    }

    for name, group in grouped:
        group_size = len(group)
        if group_size <= 1:
            continue  # A single row for this key is not a duplicate

        cities = group['geolocation_city']
        states = group['geolocation_state']
        unique_cities = cities.nunique()
        unique_states = states.nunique()

        if unique_cities == 1 and unique_states == 1:
            # Completely identical
            identical_duplicates += group_size - 1
            duplicate_types['identical'].append({
                'coords': name,
                'count': group_size,
                'city': cities.iloc[0],
                'state': states.iloc[0]
            })
            continue

        # Only needed for non-identical groups: fold case and strip accents
        # (NFD splits off combining marks, the ASCII round-trip drops them)
        normalized_cities = (
            cities.str.lower()
            .str.normalize('NFD')
            .str.encode('ascii', errors='ignore')
            .str.decode('ascii')
            .nunique()
        )

        if normalized_cities == 1 and unique_states == 1:
            # Just accent/case differences
            city_name_variations += group_size - 1
            duplicate_types['city_variations'].append({
                'coords': name,
                'count': group_size,
                'cities': cities.unique(),
                'state': states.iloc[0]
            })
        else:
            # Different locations
            different_locations += group_size - 1
            duplicate_types['different_locations'].append({
                'coords': name,
                'count': group_size,
                'cities': cities.unique(),
                'states': states.unique()
            })

    total_duplicates = identical_duplicates + city_name_variations + different_locations

    print("Duplicate breakdown:")
    print(f"  Identical duplicates: {identical_duplicates:,}")
    print(f"  City name variations: {city_name_variations:,}")
    print(f"  Different locations: {different_locations:,}")
    print(f"  Total duplicates: {total_duplicates:,}")

    # Show examples of each type
    print("\n=== EXAMPLES ===")

    print("\n1. IDENTICAL DUPLICATES (first 3):")
    for item in duplicate_types['identical'][:3]:
        print(f"   {item['coords']} -> {item['city']}, {item['state']} (appears {item['count']} times)")

    print("\n2. CITY NAME VARIATIONS (first 5):")
    for item in duplicate_types['city_variations'][:5]:
        print(f"   {item['coords']} -> {list(item['cities'])} in {item['state']} (appears {item['count']} times)")

    print("\n3. DIFFERENT LOCATIONS (first 5):")
    for item in duplicate_types['different_locations'][:5]:
        print(f"   {item['coords']} -> Cities: {list(item['cities'])}, States: {list(item['states'])} (appears {item['count']} times)")

    # Recommendation
    print("\n=== RECOMMENDATION ===")
    total_safe_to_remove = identical_duplicates + city_name_variations

    # Percentages are taken against the total computed above, not a constant
    # copied from an earlier run, so they stay correct for any input file.
    # With no duplicates at all, report 0.0% instead of dividing by zero.
    def share_of_duplicates(count):
        return count / total_duplicates * 100 if total_duplicates else 0.0

    print(f"Safe to deduplicate: {total_safe_to_remove:,} records ({share_of_duplicates(total_safe_to_remove):.1f}% of duplicates)")
    print(f"Need careful handling: {different_locations:,} records ({share_of_duplicates(different_locations):.1f}% of duplicates)")

    return duplicate_types

# Run the analysis when this code is executed as the main program, keeping the
# returned per-category duplicate details in `analysis` for further inspection.
if __name__ == "__main__":
    analysis = detailed_geolocation_analysis()

=== DETAILED GEOLOCATION ANALYSIS ===
Duplicate breakdown:
  Identical duplicates: 208,875
  City name variations: 70,229
  Different locations: 905
  Total duplicates: 280,009

=== EXAMPLES ===

1. IDENTICAL DUPLICATES (first 3):
   (np.int64(1001), np.float64(-23.551336655288804), np.float64(-46.63402699777831)) -> sao paulo, SP (appears 3 times)
   (np.int64(1001), np.float64(-23.55049770690751), np.float64(-46.63433817805407)) -> sao paulo, SP (appears 8 times)
   (np.int64(1001), np.float64(-23.5498252739339), np.float64(-46.63396956010149)) -> sao paulo, SP (appears 2 times)

2. CITY NAME VARIATIONS (first 5):
   (np.int64(1001), np.float64(-23.549779299469115), np.float64(-46.6339571183853)) -> ['são paulo', 'sao paulo'] in SP (appears 2 times)
   (np.int64(1003), np.float64(-23.5490832616594), np.float64(-46.63486400979368)) -> ['são paulo', 'sao paulo'] in SP (appears 2 times)
   (np.int64(1005), np.float64(-23.549980033585307), np.float64(-46.63476783166945)) -> ['são paulo',