# In [None]:
"""
Task 3: Timezone Analysis Script
Run this script to see a comprehensive analysis.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('..')

# Import Task 3 components
from src.timestamp_processing.timezone_processor import detect_and_normalize_timestamps
from src.timestamp_processing.timezone_detector import TimezoneDetector
from src.timestamp_processing.timestamp_validator import TimestampValidator

def _load_sample_data():
    """Return the sample DataFrame, generating it when the CSV is missing.

    Reads ``data/timezone_samples/new_york_user.csv`` (relative to the
    current working directory). On ``FileNotFoundError`` it falls back to
    ``create_new_york_user_data()`` from the project's sample-data module.
    """
    try:
        df = pd.read_csv('data/timezone_samples/new_york_user.csv')
    except FileNotFoundError:
        print("❌ Sample data not found - creating it...")
        # Imported lazily: only needed when the CSV has not been generated yet.
        from data.timezone_samples.create_sample_data import create_new_york_user_data
        df = create_new_york_user_data()
        print(f"✅ Created {len(df)} sample records")
    else:
        print(f"✅ Loaded {len(df)} records")
        print(df.head())
    return df


def _report_timezone_detection(df):
    """Print the detected timezone and confidence, then plot records per hour.

    Expects ``df`` to have a ``timestamp`` column parseable by
    ``pd.to_datetime``; the detection result is read as a mapping with
    ``'timezone'`` and ``'confidence'`` keys.
    """
    detector = TimezoneDetector()
    detection_result = detector.detect_timezone_from_patterns(df)
    print(f"🎯 Detected Timezone: {detection_result['timezone']}")
    print(f"🎯 Confidence: {detection_result['confidence']:.2f}")

    # Visualize hourly patterns
    plt.figure(figsize=(12, 6))
    hourly_dist = pd.to_datetime(df['timestamp']).dt.hour.value_counts().sort_index()
    plt.bar(hourly_dist.index, hourly_dist.values)
    plt.title('Hourly Activity Distribution')
    plt.xlabel('Hour of Day')
    plt.ylabel('Number of Records')
    plt.show()


def _benchmark_processing(df, sample_sizes=(100, 500, 1000)):
    """Time ``detect_and_normalize_timestamps`` on growing prefixes of ``df``.

    Each requested size is capped by the number of rows actually available
    (``df.head(size)``); the reported and plotted record count is the real
    one, and a prefix length that was already measured is not timed again.
    """
    import time

    record_counts = []
    processing_times = []

    for size in sample_sizes:
        test_data = df.head(size)
        actual_size = len(test_data)
        if actual_size in record_counts:
            # df has fewer rows than requested and this prefix was already
            # measured; repeating it would only add a mislabeled duplicate.
            continue

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        detect_and_normalize_timestamps(test_data.copy())
        processing_time = time.perf_counter() - start_time

        record_counts.append(actual_size)
        processing_times.append(processing_time)
        print(f"📊 {actual_size} records: {processing_time:.4f} seconds")

    # Plot performance
    plt.figure(figsize=(10, 6))
    plt.plot(record_counts, processing_times, marker='o')
    plt.title('Processing Time vs Data Size')
    plt.xlabel('Number of Records')
    plt.ylabel('Processing Time (seconds)')
    plt.show()


def _report_validation(df):
    """Process a copy of ``df`` and print the validator's report.

    The original and the processed frames are both built from copies so the
    caller's ``df`` is not handed to the processing function directly.
    """
    validator = TimestampValidator()

    original_data = df.copy()
    processed_data = detect_and_normalize_timestamps(df.copy())

    validation_result = validator.validate_timestamp_conversion(original_data, processed_data)
    print(validator.create_validation_report(validation_result))


def main_analysis():
    """Run the four-step Task 3 timezone analysis and print/plot the results.

    Steps, in order: load (or generate) the sample data, report timezone
    detection with an hourly activity chart, benchmark the processing
    function on increasing data sizes, and print a validation report.
    Takes no arguments and returns ``None``; all output goes to stdout and
    matplotlib figures.
    """
    print("🔍 TASK 3: COMPREHENSIVE TIMEZONE ANALYSIS")
    print("=" * 50)

    # 1. Load sample data
    print("\n1️⃣ LOADING SAMPLE DATA")
    df = _load_sample_data()

    # 2. Timezone Detection Analysis
    print("\n2️⃣ TIMEZONE DETECTION ANALYSIS")
    _report_timezone_detection(df)

    # 3. Performance Analysis
    print("\n3️⃣ PERFORMANCE ANALYSIS")
    _benchmark_processing(df)

    # 4. Validation Analysis
    print("\n4️⃣ VALIDATION ANALYSIS")
    _report_validation(df)

    print("\n✅ ANALYSIS COMPLETED!")

# Run the full analysis only when this file is executed directly, not on import.
if __name__ == "__main__":
    main_analysis()
