# Data Overview

This notebook provides an overview of the dynamic pricing dataset, including:
- Data loading and validation
- Basic statistics and distributions
- Data quality assessment
- Initial insights

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Make the project modules importable from the notebook.
# The project imports in this cell use the `src.` package prefix, so the
# directory that CONTAINS `src` (the parent of the current working directory)
# must be on sys.path; adding only `<parent>/src` does not make `src.*`
# resolvable. The `src` directory itself is still added so any module that
# imports its siblings without the prefix keeps working.
# NOTE(review): assumes the notebook's working directory sits directly under
# the project root - confirm for your launch setup.
project_root = Path().absolute().parent
for path_entry in (str(project_root), str(project_root / "src")):
    if path_entry not in sys.path:  # stay idempotent when the cell is re-run
        sys.path.append(path_entry)

# Set style: apply a matplotlib style sheet and a seaborn colour palette once,
# so every figure created later in the notebook shares the same look.
# NOTE(review): the 'seaborn-v0_8' style name is presumably only available in
# newer matplotlib releases - confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import project modules
from src.data.load_data import load_raw_data, validate_data_schema, get_data_summary
from src.data.clean import clean_data
from src.config import NUMERICAL_FEATURES, CATEGORICAL_FEATURES, TARGET_COLUMN

## Load and Validate Data

In [None]:
# Load the raw dataset. If anything goes wrong while loading (or while
# reporting on the loaded frame), the error is printed and a seeded synthetic
# frame with the same column names is built instead, so the remaining cells
# can still be executed for demonstration purposes.
try:
    df = load_raw_data()
    print("Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
except Exception as load_error:
    print(f"Error loading data: {load_error}")

    # Synthetic fallback. The seed makes the frame reproducible; the columns
    # are drawn one after another in a fixed order so the generated values
    # stay the same from run to run.
    np.random.seed(42)
    n_samples = 1000

    synthetic_columns = {}
    synthetic_columns['Number_of_Riders'] = np.random.poisson(lam=15, size=n_samples)
    synthetic_columns['Number_of_Drivers'] = np.random.poisson(lam=10, size=n_samples)
    synthetic_columns['Location_Category'] = np.random.choice(
        ['Urban', 'Suburban', 'Rural'], size=n_samples, p=[0.5, 0.3, 0.2]
    )
    synthetic_columns['Customer_Loyalty_Status'] = np.random.choice(
        ['Silver', 'Gold', 'Platinum'], size=n_samples, p=[0.4, 0.4, 0.2]
    )
    synthetic_columns['Number_of_Past_Rides'] = np.random.randint(low=0, high=100, size=n_samples)
    synthetic_columns['Average_Ratings'] = np.random.uniform(low=3.0, high=5.0, size=n_samples)
    synthetic_columns['Time_of_Booking'] = np.random.choice(
        ['Morning', 'Afternoon', 'Evening', 'Night'], size=n_samples
    )
    synthetic_columns['Vehicle_Type'] = np.random.choice(
        ['Economy', 'Premium', 'Luxury'], size=n_samples, p=[0.6, 0.3, 0.1]
    )
    synthetic_columns['Expected_Ride_Duration'] = np.random.uniform(low=5, high=60, size=n_samples)
    synthetic_columns['Historical_Cost_of_Ride'] = np.random.uniform(low=10, high=100, size=n_samples)

    df = pd.DataFrame(synthetic_columns)
    print("Created sample data for demonstration")

In [None]:
# Run the project's schema validation on the loaded frame and print the
# verdict, followed by any error and warning messages as bullet lists.
validation_results = validate_data_schema(df)

print("Data Schema Validation:")
schema_ok = validation_results['is_valid']
print(f"Valid: {schema_ok}")

# Errors are listed only when validation failed
if not schema_ok:
    print("Errors:", *(f"  - {message}" for message in validation_results['errors']), sep="\n")

# Warnings are listed whenever there are any, regardless of the verdict
warning_messages = validation_results['warnings']
if warning_messages:
    print("Warnings:", *(f"  - {message}" for message in warning_messages), sep="\n")

## Basic Data Summary

In [None]:
# Structural overview first (dtypes, non-null counts, memory usage) ...
print("Dataset Info:")
df.info()

# ... then two rich previews, each introduced by a separator line and heading.
# The frame methods are passed uncalled so each preview is computed only when
# its section is printed.
for heading, build_preview in (
    ("First 5 rows:", df.head),
    ("Descriptive Statistics:", df.describe),
):
    print("\n" + "=" * 50)
    print(heading)
    display(build_preview())

## Data Quality Assessment

In [None]:
# Missing-value audit: absolute count per column and its share of all rows
missing_values = df.isnull().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

print("Missing Values Analysis:")
missing_df = missing_values.to_frame('Count').assign(Percentage=missing_percentage)

# Only columns that actually have gaps are shown (empty table = nothing missing)
display(missing_df.loc[missing_df['Count'].gt(0)])

In [None]:
# Count rows that are exact copies of an earlier row (all columns equal)
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# When copies exist, keep the first occurrence of each and rebind `df`,
# so every later cell works on the de-duplicated frame.
if duplicates:
    print("Removing duplicate rows...")
    df = df.drop_duplicates()
    print(f"New shape: {df.shape}")

## Feature Distributions

In [None]:
# Histograms of the numerical features plus the target on a 2x3 grid
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
fig.suptitle('Numerical Features Distribution', fontsize=16)

numerical_cols = NUMERICAL_FEATURES + [TARGET_COLUMN]

# Panels are filled row by row; at most six columns fit on the grid
for ax, col in zip(axes.flat, numerical_cols[:6]):
    ax.hist(df[col], bins=30, alpha=0.7)
    ax.set(title=col, xlabel='', ylabel='Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Bar charts of category frequencies for up to four categorical features (2x2 grid)
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.suptitle('Categorical Features Distribution', fontsize=16)

# Panels are filled row by row, one feature per panel
for ax, col in zip(axes.flat, CATEGORICAL_FEATURES[:4]):
    value_counts = df[col].value_counts()
    ax.bar(value_counts.index, value_counts.values)
    ax.set(title=col, xlabel='', ylabel='Count')
    # Category labels can be long; tilt them so they do not overlap
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlations between the numerical features and the target
numerical_df = df[NUMERICAL_FEATURES + [TARGET_COLUMN]]
correlation_matrix = numerical_df.corr()

# Annotated heatmap; the diverging colour map is centred on zero so that
# positive and negative correlations are visually distinguishable.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
)
heatmap_ax.set_title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Rank every column by the strength (absolute value) of its correlation
# with the target, strongest first.
target_correlations = correlation_matrix[TARGET_COLUMN].sort_values(
    key=lambda values: values.abs(),
    ascending=False,
)

print("Correlation with Target Variable:")
# The target's correlation with itself carries no information, so it is left out
print(target_correlations.drop(labels=TARGET_COLUMN))

## Key Insights

In [None]:
# Demand-Supply Analysis
# The ratio is riders per available driver. A row with zero drivers has no
# meaningful ratio, so it is stored as NaN rather than divided by a tiny
# epsilon: dividing by 1e-8 yields values in the order of 1e9 that dominate
# the reported average and the share of "high demand" rows.
available_drivers = df['Number_of_Drivers'].where(df['Number_of_Drivers'] > 0)
df['demand_supply_ratio'] = df['Number_of_Riders'] / available_drivers

# Statistics are computed over rows where the ratio is defined
defined_ratio = df['demand_supply_ratio'].dropna()
undefined_ratio_rows = int(df['demand_supply_ratio'].isna().sum())

print("Demand-Supply Analysis:")
print(f"Average demand/supply ratio: {defined_ratio.mean():.2f}")
print(f"High demand periods (ratio > 2): {(defined_ratio > 2).mean():.1%}")
# A ratio below 0.5 means fewer than one rider per two drivers, i.e. demand is
# low relative to supply (the opposite of a supply shortage).
print(f"Low demand periods (ratio < 0.5): {(defined_ratio < 0.5).mean():.1%}")
if undefined_ratio_rows:
    print(f"Rows excluded (no drivers or missing counts): {undefined_ratio_rows}")

# Price analysis: centre, range and spread of the target column
print("\nPrice Analysis:")
print(f"Average ride price: ${df[TARGET_COLUMN].mean():.2f}")
print(f"Price range: ${df[TARGET_COLUMN].min():.2f} - ${df[TARGET_COLUMN].max():.2f}")
print(f"Price standard deviation: ${df[TARGET_COLUMN].std():.2f}")

In [None]:
# Per-location summary: price mean/spread/row count, plus the average number
# of riders and drivers, all rounded to two decimals.
location_groups = df.groupby('Location_Category')
location_aggregations = {
    TARGET_COLUMN: ['mean', 'std', 'count'],
    **dict.fromkeys(['Number_of_Riders', 'Number_of_Drivers'], 'mean'),
}
location_stats = location_groups.agg(location_aggregations).round(2)

print("Location-based Analysis:")
display(location_stats)

In [None]:
# Per-booking-time summary: price mean and row count, plus the average number
# of riders and drivers, all rounded to two decimals.
time_stats = (
    df.groupby('Time_of_Booking')
    .agg(
        {
            TARGET_COLUMN: ['mean', 'count'],
            'Number_of_Riders': 'mean',
            'Number_of_Drivers': 'mean',
        }
    )
    .round(2)
)

print("Time-based Analysis:")
display(time_stats)

## Data Cleaning

In [None]:
# Run the project's cleaning routine; it returns the cleaned frame together
# with a report dictionary describing what was done.
df_clean, cleaning_report = clean_data(df)

print("Data Cleaning Report:")

# Headline figures, printed as "<label>: <value>" in a fixed order
for label, report_key in (
    ("Original shape", 'original_shape'),
    ("Final shape", 'final_shape'),
    ("Rows removed", 'removed_rows'),
):
    print(f"{label}: {cleaning_report[report_key]}")

# Number of cleaning steps, followed by one bullet per step
print(f"Cleaning steps: {len(cleaning_report['cleaning_steps'])}")
for step in cleaning_report['cleaning_steps']:
    print(f"  - {step}")

## Summary

### Key Findings:
1. **Data Quality**: The dataset contains [number] records with [number] features
2. **Missing Values**: [summary of missing values]
3. **Price Distribution**: [summary of price distribution]
4. **Demand-Supply Dynamics**: [key insights about demand-supply ratios]
5. **Location Patterns**: [location-based pricing patterns]
6. **Time Patterns**: [time-based pricing patterns]

### Next Steps:
1. Proceed to exploratory data analysis (EDA)
2. Feature engineering for price dynamics
3. Model development and evaluation