# Exploratory Data Analysis: Price Dynamics

This notebook performs deep exploratory analysis on price dynamics, focusing on:
- Demand-supply relationships
- Time-based pricing patterns
- Location-based price variations
- Customer segmentation effects
- Vehicle type pricing
- Surge pricing patterns

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys
from pathlib import Path

# Make the project importable from this notebook.
# The project imports further down use the ``src.`` package prefix, so the
# directory that CONTAINS ``src`` (the project root) must be on sys.path.
# The ``src`` directory itself is still added so flat imports keep working.
# NOTE(review): assumes the notebook runs from a folder directly under the
# project root — confirm.
PROJECT_ROOT = Path().absolute().parent
for path_entry in (PROJECT_ROOT / "src", PROJECT_ROOT):
    if str(path_entry) not in sys.path:  # stay idempotent when the cell is re-run
        sys.path.append(str(path_entry))

# Set the global look of the matplotlib/seaborn figures drawn in this notebook.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# matplotlib release — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import project modules
from src.data.load_data import load_raw_data
from src.data.clean import clean_data
from src.features.time_features import extract_time_features, create_time_buckets
from src.features.pressure_index import calculate_pressure_index, create_surge_indicators
from src.config import NUMERICAL_FEATURES, CATEGORICAL_FEATURES, TARGET_COLUMN

## Load and Prepare Data

In [None]:
# Load the raw data and clean it; only the first value returned by
# clean_data is used in this notebook.
df = load_raw_data()
df_clean, _ = clean_data(df)

print(f"Cleaned dataset shape: {df_clean.shape}")
print(f"Target variable: {TARGET_COLUMN}")

# Add engineered features for analysis
df_analysis = extract_time_features(df_clean)
df_analysis = calculate_pressure_index(df_analysis)
df_analysis = create_surge_indicators(df_analysis)

print(f"Analysis dataset shape: {df_analysis.shape}")

# Fail fast: every column listed here is indexed without a guard by a later
# cell, so a missing one would otherwise surface as a KeyError halfway
# through a figure.
REQUIRED_ANALYSIS_COLUMNS = [
    TARGET_COLUMN,
    'Number_of_Riders',
    'Number_of_Drivers',
    'demand_supply_ratio',
    'pressure_index',
    'is_rush_hour',
    'Time_of_Booking',
    'Location_Category',
    'Customer_Loyalty_Status',
    'Number_of_Past_Rides',
    'Average_Ratings',
    'Vehicle_Type',
    'Expected_Ride_Duration',
]
missing_columns = [col for col in REQUIRED_ANALYSIS_COLUMNS if col not in df_analysis.columns]
assert not missing_columns, f"df_analysis is missing expected columns: {missing_columns}"

# Later cells divide by len(df_analysis) when reporting shares.
assert len(df_analysis) > 0, "df_analysis is empty after cleaning and feature engineering"

## Demand-Supply Dynamics Analysis

In [None]:
# Demand-supply overview: a 2x2 grid relating rider demand, driver supply and price
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Demand-Supply Dynamics Analysis', fontsize=16)
(ax_riders_drivers, ax_ratio_hist), (ax_price_ratio, ax_pressure_hist) = axes

ds_ratio = df_analysis['demand_supply_ratio']
prices = df_analysis[TARGET_COLUMN]

# Top-left: riders against drivers, each point coloured by its price
price_points = ax_riders_drivers.scatter(
    df_analysis['Number_of_Riders'],
    df_analysis['Number_of_Drivers'],
    c=prices,
    cmap='viridis',
    alpha=0.6,
)
ax_riders_drivers.set(
    xlabel='Number of Riders',
    ylabel='Number of Drivers',
    title='Demand vs Supply',
)
plt.colorbar(price_points, ax=ax_riders_drivers, label='Price')

# Top-right: histogram of the demand/supply ratio with its mean marked
ax_ratio_hist.hist(ds_ratio, bins=50, alpha=0.7)
ax_ratio_hist.axvline(ds_ratio.mean(), color='red', linestyle='--', label='Mean')
ax_ratio_hist.set(
    xlabel='Demand/Supply Ratio',
    ylabel='Frequency',
    title='Demand-Supply Ratio Distribution',
)
ax_ratio_hist.legend()

# Bottom-left: price against the demand/supply ratio
ax_price_ratio.scatter(ds_ratio, prices, alpha=0.6)
ax_price_ratio.set(
    xlabel='Demand/Supply Ratio',
    ylabel='Price ($)',
    title='Price vs Demand-Supply Ratio',
)

# Bottom-right: histogram of the engineered pressure index
ax_pressure_hist.hist(df_analysis['pressure_index'], bins=50, alpha=0.7)
ax_pressure_hist.set(
    xlabel='Pressure Index',
    ylabel='Frequency',
    title='Market Pressure Index Distribution',
)

plt.tight_layout()
plt.show()

In [None]:
# Split the observations into demand regimes by demand/supply ratio and
# report each regime's share of the data and its average price.
high_demand_threshold = 2.0
low_demand_threshold = 0.5

ds_ratio = df_analysis['demand_supply_ratio']
is_high = ds_ratio > high_demand_threshold
is_low = ds_ratio < low_demand_threshold
is_normal = (ds_ratio >= low_demand_threshold) & (ds_ratio <= high_demand_threshold)

high_demand = df_analysis[is_high]
low_demand = df_analysis[is_low]
normal_demand = df_analysis[is_normal]

print("Demand Analysis:")
share_rows = (
    (f"High demand periods (> {high_demand_threshold})", high_demand),
    (f"Low demand periods (< {low_demand_threshold})", low_demand),
    ("Normal demand periods", normal_demand),
)
for row_label, subset in share_rows:
    n_rows = len(subset)
    print(f"{row_label}: {n_rows} ({n_rows/len(df_analysis):.1%})")

print("\nPrice Analysis by Demand Level:")
price_rows = (("High", high_demand), ("Normal", normal_demand), ("Low", low_demand))
for level, subset in price_rows:
    print(f"{level} demand avg price: ${subset[TARGET_COLUMN].mean():.2f}")

## Time-Based Price Patterns

In [None]:
# Time-of-booking patterns: price, demand/supply and rush-hour effects in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Time-Based Price Patterns', fontsize=16)
ax_price, ax_counts, ax_rush, ax_ratio = axes.ravel()

by_time = df_analysis.groupby('Time_of_Booking')

# Top-left: mean price per booking time, with the standard deviation as error bars
time_price = by_time[TARGET_COLUMN].agg(['mean', 'std', 'count'])
ax_price.bar(time_price.index, time_price['mean'], yerr=time_price['std'], capsize=5)
ax_price.set(
    xlabel='Time of Booking',
    ylabel='Average Price ($)',
    title='Average Price by Time of Booking',
)
ax_price.tick_params(axis='x', rotation=45)

# Top-right: mean riders (demand) and mean drivers (supply) as side-by-side bars
time_demand = by_time['Number_of_Riders'].mean()
time_supply = by_time['Number_of_Drivers'].mean()

bar_positions = np.arange(len(time_demand.index))
bar_width = 0.35

ax_counts.bar(bar_positions - bar_width/2, time_demand.values, bar_width, label='Demand', alpha=0.7)
ax_counts.bar(bar_positions + bar_width/2, time_supply.values, bar_width, label='Supply', alpha=0.7)
ax_counts.set(
    xlabel='Time of Booking',
    ylabel='Average Count',
    title='Demand vs Supply by Time',
)
ax_counts.set_xticks(bar_positions)
ax_counts.set_xticklabels(time_demand.index, rotation=45)
ax_counts.legend()

# Bottom-left: mean price for rows flagged as rush hour versus the rest
rush_flag = df_analysis['is_rush_hour']
rush_hour = df_analysis[rush_flag == 1]
non_rush = df_analysis[rush_flag == 0]

ax_rush.bar(
    ['Rush Hour', 'Non-Rush Hour'],
    [rush_hour[TARGET_COLUMN].mean(), non_rush[TARGET_COLUMN].mean()],
    alpha=0.7,
)
ax_rush.set(ylabel='Average Price ($)', title='Rush Hour vs Non-Rush Hour Pricing')

# Bottom-right: mean demand/supply ratio per booking time; the dashed line marks ratio = 1
time_ratio = by_time['demand_supply_ratio'].mean()
ax_ratio.bar(time_ratio.index, time_ratio.values, alpha=0.7)
ax_ratio.axhline(y=1.0, color='red', linestyle='--', label='Equilibrium')
ax_ratio.set(
    xlabel='Time of Booking',
    ylabel='Average Demand/Supply Ratio',
    title='Demand-Supply Ratio by Time',
)
ax_ratio.tick_params(axis='x', rotation=45)
ax_ratio.legend()

plt.tight_layout()
plt.show()

## Location-Based Price Analysis

In [None]:
# Location patterns: price level, demand/supply and price spread per location category
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Location-Based Price Analysis', fontsize=16)
ax_price, ax_counts, ax_dist, ax_ratio = axes.ravel()

by_location = df_analysis.groupby('Location_Category')

# Top-left: mean price per location, with the standard deviation as error bars
location_price = by_location[TARGET_COLUMN].agg(['mean', 'std', 'count'])
ax_price.bar(location_price.index, location_price['mean'], yerr=location_price['std'], capsize=5)
ax_price.set(
    xlabel='Location Category',
    ylabel='Average Price ($)',
    title='Average Price by Location',
)

# Top-right: mean riders (demand) and mean drivers (supply) as side-by-side bars
location_demand = by_location['Number_of_Riders'].mean()
location_supply = by_location['Number_of_Drivers'].mean()

bar_positions = np.arange(len(location_demand.index))
bar_width = 0.35

ax_counts.bar(bar_positions - bar_width/2, location_demand.values, bar_width, label='Demand', alpha=0.7)
ax_counts.bar(bar_positions + bar_width/2, location_supply.values, bar_width, label='Supply', alpha=0.7)
ax_counts.set(
    xlabel='Location Category',
    ylabel='Average Count',
    title='Demand vs Supply by Location',
)
ax_counts.set_xticks(bar_positions)
ax_counts.set_xticklabels(location_demand.index)
ax_counts.legend()

# Bottom-left: overlaid price histograms, one per location in order of first appearance
location_column = df_analysis['Location_Category']
for location in location_column.unique():
    prices_here = df_analysis.loc[location_column == location, TARGET_COLUMN]
    ax_dist.hist(prices_here, bins=30, alpha=0.5, label=location)

ax_dist.set(
    xlabel='Price ($)',
    ylabel='Frequency',
    title='Price Distribution by Location',
)
ax_dist.legend()

# Bottom-right: mean demand/supply ratio per location; the dashed line marks ratio = 1
location_ratio = by_location['demand_supply_ratio'].mean()
ax_ratio.bar(location_ratio.index, location_ratio.values, alpha=0.7)
ax_ratio.axhline(y=1.0, color='red', linestyle='--', label='Equilibrium')
ax_ratio.set(
    xlabel='Location Category',
    ylabel='Average Demand/Supply Ratio',
    title='Demand-Supply Ratio by Location',
)
ax_ratio.legend()

plt.tight_layout()
plt.show()

## Customer Segmentation Analysis

In [None]:
# Customer segmentation: how price, ride history and ratings differ by loyalty status
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Customer Segmentation Analysis', fontsize=16)
ax_price, ax_rides, ax_ratings, ax_scatter = axes.ravel()

by_loyalty = df_analysis.groupby('Customer_Loyalty_Status')

# Top-left: mean price per loyalty status, with the standard deviation as error bars
loyalty_price = by_loyalty[TARGET_COLUMN].agg(['mean', 'std', 'count'])
ax_price.bar(loyalty_price.index, loyalty_price['mean'], yerr=loyalty_price['std'], capsize=5)
ax_price.set(
    xlabel='Loyalty Status',
    ylabel='Average Price ($)',
    title='Average Price by Loyalty Status',
)

# Top-right and bottom-left share one layout: the mean of a column per loyalty status
mean_panels = (
    (ax_rides, 'Number_of_Past_Rides', 'Average Number of Past Rides',
     'Customer Experience by Loyalty Status'),
    (ax_ratings, 'Average_Ratings', 'Average Rating',
     'Customer Ratings by Loyalty Status'),
)
for panel, column, y_label, panel_title in mean_panels:
    column_mean = by_loyalty[column].mean()
    panel.bar(column_mean.index, column_mean.values, alpha=0.7)
    panel.set(xlabel='Loyalty Status', ylabel=y_label, title=panel_title)

# The ratings panel uses a fixed y-range of 3 to 5
ax_ratings.set_ylim(3, 5)

# Bottom-right: price against the number of past rides
ax_scatter.scatter(df_analysis['Number_of_Past_Rides'], df_analysis[TARGET_COLUMN], alpha=0.6)
ax_scatter.set(
    xlabel='Number of Past Rides',
    ylabel='Price ($)',
    title='Price vs Customer Experience',
)

plt.tight_layout()
plt.show()

## Vehicle Type Pricing Analysis

In [None]:
# Vehicle type analysis: price, ride duration, price per minute and the vehicle mix
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Vehicle Type Pricing Analysis', fontsize=16)

# 1. Price by Vehicle Type (error bars show the standard deviation)
vehicle_price = df_analysis.groupby('Vehicle_Type')[TARGET_COLUMN].agg(['mean', 'std', 'count'])
axes[0, 0].bar(vehicle_price.index, vehicle_price['mean'], yerr=vehicle_price['std'], capsize=5)
axes[0, 0].set_xlabel('Vehicle Type')
axes[0, 0].set_ylabel('Average Price ($)')
axes[0, 0].set_title('Average Price by Vehicle Type')

# 2. Duration by Vehicle Type
vehicle_duration = df_analysis.groupby('Vehicle_Type')['Expected_Ride_Duration'].mean()
axes[0, 1].bar(vehicle_duration.index, vehicle_duration.values, alpha=0.7)
axes[0, 1].set_xlabel('Vehicle Type')
axes[0, 1].set_ylabel('Average Duration (minutes)')
axes[0, 1].set_title('Average Duration by Vehicle Type')

# 3. Price per Minute by Vehicle Type
# Durations that are zero or negative are masked to NaN before dividing.
# A zero duration would otherwise give an infinite price per minute, and a
# single inf turns the whole group mean into inf. mean() skips the NaN rows.
ride_duration = df_analysis['Expected_Ride_Duration']
valid_duration = ride_duration.where(ride_duration > 0)
df_analysis['price_per_minute'] = df_analysis[TARGET_COLUMN] / valid_duration
vehicle_ppm = df_analysis.groupby('Vehicle_Type')['price_per_minute'].mean()
axes[1, 0].bar(vehicle_ppm.index, vehicle_ppm.values, alpha=0.7)
axes[1, 0].set_xlabel('Vehicle Type')
axes[1, 0].set_ylabel('Price per Minute ($)')
axes[1, 0].set_title('Price Efficiency by Vehicle Type')

# 4. Vehicle Type Distribution (share of rows per vehicle type)
vehicle_counts = df_analysis['Vehicle_Type'].value_counts()
axes[1, 1].pie(vehicle_counts.values, labels=vehicle_counts.index, autopct='%1.1f%%')
axes[1, 1].set_title('Vehicle Type Distribution')

plt.tight_layout()
plt.show()

## Surge Pricing Patterns

In [None]:
# Surge pricing analysis
def _note_missing_column(ax, column):
    """Label a panel whose source column is absent instead of leaving it blank.

    Args:
        ax: Matplotlib axes of the panel that could not be drawn.
        column: Name of the df_analysis column the panel needs.
    """
    ax.text(0.5, 0.5, f"'{column}' not available", ha='center', va='center',
            transform=ax.transAxes)
    ax.set_axis_off()


fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Surge Pricing Patterns', fontsize=16)

# Each panel is drawn only when its engineered column exists in df_analysis;
# otherwise the panel states which column is missing, so an empty frame is
# never mistaken for an empty result.

# 1. Surge Level Distribution
if 'surge_level' in df_analysis.columns:
    surge_counts = df_analysis['surge_level'].value_counts()
    axes[0, 0].bar(surge_counts.index, surge_counts.values, alpha=0.7)
    axes[0, 0].set_xlabel('Surge Level')
    axes[0, 0].set_ylabel('Count')
    axes[0, 0].set_title('Surge Level Distribution')
    axes[0, 0].tick_params(axis='x', rotation=45)
else:
    _note_missing_column(axes[0, 0], 'surge_level')

# 2. Base Surge Multiplier Distribution (dashed line marks the mean)
if 'base_surge_multiplier' in df_analysis.columns:
    axes[0, 1].hist(df_analysis['base_surge_multiplier'], bins=50, alpha=0.7)
    axes[0, 1].axvline(df_analysis['base_surge_multiplier'].mean(), color='red', linestyle='--', label='Mean')
    axes[0, 1].set_xlabel('Base Surge Multiplier')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].set_title('Surge Multiplier Distribution')
    axes[0, 1].legend()
else:
    _note_missing_column(axes[0, 1], 'base_surge_multiplier')

# 3. Surge Trigger Analysis (mean price where the trigger flag is 1 versus 0)
if 'high_demand_trigger' in df_analysis.columns:
    high_surge = df_analysis[df_analysis['high_demand_trigger'] == 1]
    normal_surge = df_analysis[df_analysis['high_demand_trigger'] == 0]

    surge_prices = [high_surge[TARGET_COLUMN].mean(), normal_surge[TARGET_COLUMN].mean()]
    surge_labels = ['High Surge Trigger', 'Normal']

    axes[1, 0].bar(surge_labels, surge_prices, alpha=0.7)
    axes[1, 0].set_ylabel('Average Price ($)')
    axes[1, 0].set_title('Price by Surge Trigger Status')
else:
    _note_missing_column(axes[1, 0], 'high_demand_trigger')

# 4. Surge Probability by Time
if 'surge_probability' in df_analysis.columns:
    time_surge = df_analysis.groupby('Time_of_Booking')['surge_probability'].mean()
    axes[1, 1].bar(time_surge.index, time_surge.values, alpha=0.7)
    axes[1, 1].set_xlabel('Time of Booking')
    axes[1, 1].set_ylabel('Average Surge Probability')
    axes[1, 1].set_title('Surge Probability by Time')
    axes[1, 1].tick_params(axis='x', rotation=45)
else:
    _note_missing_column(axes[1, 1], 'surge_probability')

plt.tight_layout()
plt.show()

## Interactive Visualizations

In [None]:
# Interactive price vs demand-supply analysis
# At most SCATTER_MAX_POINTS rows are plotted. The sample is seeded so that
# Restart & Run All reproduces the same figure every time.
SCATTER_MAX_POINTS = 1000
SCATTER_SAMPLE_SEED = 42

scatter_sample = df_analysis.sample(
    min(SCATTER_MAX_POINTS, len(df_analysis)),
    random_state=SCATTER_SAMPLE_SEED,
)

fig = px.scatter(scatter_sample,
                x='demand_supply_ratio',
                y=TARGET_COLUMN,
                color='Location_Category',
                size='Expected_Ride_Duration',
                hover_data=['Time_of_Booking', 'Vehicle_Type', 'Customer_Loyalty_Status'],
                title='Price vs Demand-Supply Ratio by Location')
fig.show()

In [None]:
# Interactive view of average price and demand/supply ratio per booking time
time_analysis = df_analysis.groupby('Time_of_Booking').agg({
    TARGET_COLUMN: 'mean',
    'demand_supply_ratio': 'mean',
    'Number_of_Riders': 'mean',
    'Number_of_Drivers': 'mean'
}).reset_index()

# groupby sorts its keys, so string labels come back alphabetically and the
# line traces below would connect them out of chronological order. The rows
# are re-ordered by time of day instead.
# NOTE(review): assumes the column uses these labels — confirm against the
# data. Labels not listed here keep their existing relative order after the
# known ones, so an unexpected label set only leaves the order unchanged.
TIME_OF_DAY_ORDER = ['Morning', 'Afternoon', 'Evening', 'Night']
time_rank = {label: position for position, label in enumerate(TIME_OF_DAY_ORDER)}
row_rank = [time_rank.get(label, len(time_rank)) for label in time_analysis['Time_of_Booking']]
# A stable sort keeps the original order among rows with equal rank.
time_analysis = time_analysis.iloc[np.argsort(row_rank, kind='stable')].reset_index(drop=True)

# Price and ratio have different scales, so the ratio gets a secondary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add price trace
fig.add_trace(
    go.Scatter(x=time_analysis['Time_of_Booking'],
              y=time_analysis[TARGET_COLUMN],
              name='Average Price',
              line=dict(color='blue')),
    secondary_y=False,
)

# Add demand-supply ratio trace
fig.add_trace(
    go.Scatter(x=time_analysis['Time_of_Booking'],
              y=time_analysis['demand_supply_ratio'],
              name='Demand/Supply Ratio',
              line=dict(color='red')),
    secondary_y=True,
)

fig.update_xaxes(title_text="Time of Booking")
fig.update_yaxes(title_text="Average Price ($)", secondary_y=False)
fig.update_yaxes(title_text="Demand/Supply Ratio", secondary_y=True)
fig.update_layout(title_text="Price and Demand-Supply Dynamics by Time")

fig.show()

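## Key Insights Summary

> **TODO:** The bracketed bullets in this section are unfilled template placeholders. Replace each one with the concrete finding from the corresponding analysis above before sharing this notebook.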

### Demand-Supply Dynamics:
- [Insights about demand-supply patterns]
- [Price elasticity observations]
- [Market pressure indicators]

### Time-Based Patterns:
- [Rush hour pricing effects]
- [Peak demand periods]
- [Time-based surge patterns]

### Location-Based Variations:
- [Urban vs suburban vs rural pricing]
- [Location-specific demand patterns]
- [Geographic price elasticity]

### Customer Segmentation:
- [Loyalty program effects]
- [Customer experience impact]
- [Rating correlations]

### Vehicle Type Analysis:
- [Premium vehicle pricing]
- [Duration-based pricing]
- [Price efficiency metrics]

### Surge Pricing:
- [Surge trigger conditions]
- [Surge multiplier distributions]
- [Time-based surge patterns]

### Recommendations for Modeling:
1. [Feature engineering recommendations]
2. [Model selection considerations]
3. [Business rule implementations]