In [23]:
import pandas as pd
import numpy as np

# Load the cleaned trip data and give the pickup coordinate columns their plain names.
# The rename is chained onto the read instead of using inplace=True, so `df` is built in a
# single expression and the cell gives the same result every time it is re-run.
# NOTE(review): the `_x` suffix looks like a leftover from an earlier merge - confirm upstream.
df = (
    pd.read_csv('uber_cleaned.csv')
    .rename(columns={'pickup_longitude_x': 'pickup_longitude',
                     'pickup_latitude_x': 'pickup_latitude'})
)
df.head()

Unnamed: 0,fare_id,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_latitude_rounded,...,haversine_distance,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_dayofweek,pickup_dayofweek_label,fare_per_passenger,sublocality
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,40.738,...,1.683323,2015,5,7,19,52,3,Thursday,7.5,Manhattan
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,40.728,...,2.45759,2009,7,17,20,4,4,Friday,7.7,Manhattan
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,40.741,...,5.036377,2009,8,24,21,45,0,Monday,12.9,Manhattan
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,40.791,...,1.661683,2009,6,26,8,22,4,Friday,1.766667,Manhattan
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,40.744,...,4.47545,2014,8,28,17,47,3,Thursday,3.2,Queens


# Questions to answer:
## Business Performance
1. What are the peak demand hours/days?
2. Which locations generate the highest revenue per trip?
3. How does fare vary by distance, time, and passenger count?
4. What is the average trip distance, and what can serve as a proxy for trip duration?

In [24]:
import plotly.express as px

# Count trips for each (day of week, hour) combination
heatmap_data = (
    df.groupby(['pickup_dayofweek_label', 'pickup_hour'])
      .size()
      .reset_index(name='trip_count')
)

# Weekday labels in calendar order, taken from the numeric `pickup_dayofweek` column
# (in the sample rows Monday is 0, Thursday 3, Friday 4). Pivoting on the label alone
# sorts the rows alphabetically (Friday, Monday, Saturday, ...), which scrambles the axis.
day_order = (
    df[['pickup_dayofweek', 'pickup_dayofweek_label']]
    .drop_duplicates()
    .sort_values('pickup_dayofweek')['pickup_dayofweek_label']
    .drop_duplicates()
    .tolist()
)

# Pivot to a day x hour grid: rows in weekday order, and day/hour pairs that never
# occur shown as 0 trips rather than as NaN (blank) cells.
heatmap_pivot = (
    heatmap_data
    .pivot(index='pickup_dayofweek_label', columns='pickup_hour', values='trip_count')
    .reindex(day_order)
    .fillna(0)
)

# Plotly heatmap
fig = px.imshow(
    heatmap_pivot,
    labels=dict(x="Pickup Hour", y="Day of Week", color="Trip Count"),
    x=heatmap_pivot.columns,
    y=heatmap_pivot.index,
    aspect="auto",
    title="Uber Trips Heatmap by Hour and Day of Week"
)
fig.show()

In [None]:
# Revenue summary per pickup sublocality: total fares, mean fare per trip,
# and the number of trips (non-null fares) behind each figure.
fares_by_sublocality = df.groupby(['sublocality'])['fare_amount']
pickup_revenue = fares_by_sublocality.agg(
    total_revenue='sum',
    avg_revenue='mean',
    trip_count='count',
).reset_index()

# Show the ten sublocalities with the largest total revenue, biggest first.
revenue_ranked = pickup_revenue.sort_values(by='total_revenue', ascending=False)
revenue_ranked.head(10)

Unnamed: 0,sublocality,total_revenue,avg_revenue,trip_count
2,Manhattan,1850267.24,10.21395,181151
3,Queens,283427.85,29.33732,9661
0,Brooklyn,57176.86,12.953525,4414
6,The Bronx,2479.39,12.094585,205
5,Staten Island,225.97,17.382308,13
1,Chauncey,12.9,12.9,1
4,Sobral,8.9,8.9,1


In [29]:
import plotly.graph_objects as go

# Ten locations with the highest average fare per trip.
# NOTE(review): this ranking does not look at trip_count, so localities with a single
# trip are ranked alongside those with thousands - read the averages with that in mind.
top_locations = pickup_revenue.sort_values(by='avg_revenue', ascending=False).head(10)

# One bar per location, bar height = average fare.
avg_revenue_bars = go.Bar(
    x=top_locations['sublocality'],
    y=top_locations['avg_revenue'],
    marker=dict(color='lightgreen'),
)

fig = go.Figure()
fig.add_trace(avg_revenue_bars)

# Titles and a 45-degree tilt on the x tick labels.
fig.update_layout(
    title=dict(text='Average Revenue per Trip by Location'),
    xaxis=dict(title=dict(text='Pickup Location'), tickangle=45),
    yaxis=dict(title=dict(text='Average Revenue ($)')),
)

fig.show()

## Pricing Insights
5. Is there evidence of dynamic pricing? (variance in fare per km by hour)
6. Are there fare anomalies or outliers to investigate?
7. How does passenger count affect per-passenger pricing?

## Operational Patterns
8. What's the geographic distribution of pickups?
9. Are there seasonal trends in ridership/fares?
10. Which trip categories (short/long) dominate?