Run DBSCAN on the returns data stored in DuckDB

In [1]:
# Required imports for Customer Clustering Features Pipeline
import pandas as pd
import duckdb
import numpy as np
from pathlib import Path
import logging
from typing import Union, Optional
import gc
from datetime import datetime
import time
import warnings
from customer_clustering_features import create_customer_clustering_features

# Notebook environment setup: silence warnings, widen the pandas display,
# and print a short readiness banner.

# Ignore all warnings so the cell output stays uncluttered
warnings.filterwarnings(action='ignore')

# Pandas display options: show every column, let pandas pick the line width,
# and truncate individual cell values at 50 characters
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)

# Readiness banner, one message per line
print("\n".join([
    "‚úÖ All imports loaded successfully!",
    "üîß Environment configured for large dataset processing",
    "üìä Ready to run customer clustering features pipeline",
]))

def _table_has_columns(connection, table_name, column_names):
    """Return True when `table_name` exists and has every column in `column_names`.

    The lookup goes through `information_schema.columns` and compares names
    case-insensitively. A table that does not exist yields no rows, so the
    result is False rather than an exception.
    """
    rows = connection.execute(
        "SELECT lower(column_name) FROM information_schema.columns "
        "WHERE lower(table_name) = lower(?)",
        [table_name],
    ).fetchall()
    present = {row[0] for row in rows}
    return all(name.lower() in present for name in column_names)


# Wall-clock start; the `finally` clause reports total time even on failure.
start_time = time.time()

try:
    # Configuration (passed as keyword arguments to the pipeline function)
    config = {
        'file_path': 'data/base_returns_sku_metadata.csv',  # Update with your file path
        'table_name': 'customer_transactions',
        'features_table_name': 'customer_clustering_features',
        'chunk_size': 50000,  # Adjust based on available RAM
        'db_file': 'customer_features.db',  # Persistent database file
        'force_recreate': False,  # Set to True to rebuild from scratch
    }

    print("üöÄ Starting Customer Clustering Feature Pipeline")
    print(f"üìÅ File: {config['file_path']}")
    print(f"üíæ Database: {config['db_file']}")
    print(f"üìä Features table: {config['features_table_name']}")
    print("-" * 60)

    # Run the complete pipeline. The returned object is used below as a
    # database connection (`.execute(...).df()` / `.fetchone()`).
    conn = create_customer_clustering_features(**config)

    # Display results summary
    print("\n" + "=" * 60)
    print("‚úÖ PIPELINE COMPLETED SUCCESSFULLY")
    print("=" * 60)

    # Show sample features: the five customers with the most distinct orders
    print("\nüìã Sample Customer Features:")
    sample_features = conn.execute(f"""
        SELECT * FROM {config['features_table_name']}
        ORDER BY SALES_ORDER_NO_nunique DESC
        LIMIT 5
    """).df()
    print(sample_features.to_string(index=False))

    # Show customers with return comments (if any). The comment columns are
    # optional: source data without RETURN_COMMENT must not abort the summary,
    # so the query only runs when the columns are actually present.
    comment_columns = ('has_return_comment', 'RETURN_COMMENT')
    if _table_has_columns(conn, 'return_timing_analysis', comment_columns):
        customers_with_comments = conn.execute("""
            SELECT
                CUSTOMER_EMAILID,
                COUNT(*) as total_returns,
                COUNT(CASE WHEN has_return_comment THEN 1 END) as returns_with_comments,
                STRING_AGG(DISTINCT RETURN_COMMENT, '; ') as sample_comments
            FROM return_timing_analysis
            WHERE has_return_comment AND RETURN_COMMENT != ''
            GROUP BY CUSTOMER_EMAILID
            ORDER BY returns_with_comments DESC
            LIMIT 3
        """).df()

        if not customers_with_comments.empty:
            print("\nüí¨ Customers with Return Comments (for sentiment analysis):")
            print(customers_with_comments.to_string(index=False))
    else:
        print("\nSkipping return-comment summary: comment columns not found in return_timing_analysis")

    # Performance summary
    elapsed_time = time.time() - start_time
    total_rows = conn.execute(f"SELECT COUNT(*) FROM {config['table_name']}").fetchone()[0]
    features_count = conn.execute(f"SELECT COUNT(*) FROM {config['features_table_name']}").fetchone()[0]
    # Guard the rate against a zero interval (coarse clock resolution).
    records_per_second = total_rows / elapsed_time if elapsed_time > 0 else 0

    print(f"\n‚è±Ô∏è  Processing completed in {elapsed_time:.2f} seconds")
    print(f"üìä Processed {total_rows:,} transaction records")
    print(f"üë• Generated features for {features_count:,} customers")
    print(f"üèÉ‚Äç‚ôÇÔ∏è Processing speed: {records_per_second:,.0f} records/second")

    # Export options. When stdin is closed (non-interactive run) input()
    # raises EOFError; treat that as "no export" instead of reporting a
    # pipeline failure after the work has already succeeded.
    try:
        answer = input("\nüì§ Export features to CSV? (y/n): ")
    except EOFError:
        answer = 'n'
    export_csv = answer.lower().strip() == 'y'
    if export_csv:
        features_df = conn.execute(f"SELECT * FROM {config['features_table_name']}").df()
        csv_filename = 'customer_clustering_features.csv'
        features_df.to_csv(csv_filename, index=False)
        print(f"‚úÖ Features exported to {csv_filename}")

    print("\nüéØ Ready for DBSCAN clustering!")
    print("üíæ Database connection available as 'conn' variable")

except Exception as e:
    # Top-level boundary: report the failure, then re-raise so the full
    # traceback is still shown.
    print(f"‚ùå ERROR: {e}")
    print("Check the logs above for detailed error information")
    raise
finally:
    elapsed_time = time.time() - start_time
    print(f"\n‚è±Ô∏è  Total execution time: {elapsed_time:.2f} seconds")

2025-06-24 00:34:14,332 INFO: Loading data from data/base_returns_sku_metadata.csv
2025-06-24 00:34:14,340 INFO: Table customer_transactions already exists, skipping data load
2025-06-24 00:34:14,340 INFO: Creating intermediate feature tables
2025-06-24 00:34:14,343 INFO: Using reference date: 2025-06-23 09:56:56
2025-06-24 00:34:14,344 INFO: Creating customer order summary
2025-06-24 00:34:14,344 INFO: Executing: CREATE OR REPLACE TABLE customer_order_summary ...


‚úÖ All imports loaded successfully!
üîß Environment configured for large dataset processing
üìä Ready to run customer clustering features pipeline
üöÄ Starting Customer Clustering Feature Pipeline
üìÅ File: data/base_returns_sku_metadata.csv
üíæ Database: customer_features.db
üìä Features table: customer_clustering_features
------------------------------------------------------------


2025-06-24 00:34:15,310 INFO: Creating customer item summary
2025-06-24 00:34:15,311 INFO: Executing: CREATE OR REPLACE TABLE customer_item_summary ...
2025-06-24 00:34:16,626 INFO: Creating return timing analysis
2025-06-24 00:34:16,627 INFO: Executing: CREATE OR REPLACE TABLE return_timing_analysis ...
2025-06-24 00:34:16,628 ERROR: Error in feature creation: Binder Error: Referenced column "RETURN_COMMENT" not found in FROM clause!
Candidate bindings: "RETURN_NO", "RETURN_QTY", "RETURN_DATE", "UNITS_RETURNED_FLAG", "CUSTOMER_EMAILID"

LINE 12:             RETURN_COMMENT,
                     ^


‚ùå ERROR: Binder Error: Referenced column "RETURN_COMMENT" not found in FROM clause!
Candidate bindings: "RETURN_NO", "RETURN_QTY", "RETURN_DATE", "UNITS_RETURNED_FLAG", "CUSTOMER_EMAILID"

LINE 12:             RETURN_COMMENT,
                     ^
Check the logs above for detailed error information

‚è±Ô∏è  Total execution time: 2.34 seconds


BinderException: Binder Error: Referenced column "RETURN_COMMENT" not found in FROM clause!
Candidate bindings: "RETURN_NO", "RETURN_QTY", "RETURN_DATE", "UNITS_RETURNED_FLAG", "CUSTOMER_EMAILID"

LINE 12:             RETURN_COMMENT,
                     ^