In [1]:
# @title Setup
from google.cloud import bigquery
from google.colab import data_table
import bigframes.pandas as bpd

project = 'yellow-taxi-trips-2025' # Project ID inserted based on the query results selected to explore
location = 'US' # Location inserted based on the query results selected to explore
client = bigquery.Client(project=project, location=location)
data_table.enable_dataframe_formatter()

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [3]:
# Extract the current year
from datetime import datetime
current_year = datetime.now().year
current_year

2025

In [4]:
# Function to execute a BigQuery query and return a DataFrame

def query_to_dataframe(query: str) -> pd.DataFrame:
    """
    Executes a SQL query in BigQuery and returns a Pandas DataFrame.

    Parameters:
    - query (str): The SQL query to execute.

    Return:
    - pd.DataFrame : The DataFrame containing the results of the query.
    """
    try:
        df = client.query(query).to_dataframe()
        print(f"Query executed successfully. Retrieved {df.shape[0]} rows.")
        return df
    except Exception as e:
        print(f"Error executing query: {e}")
        return pd.DataFrame()

# IV/ Competitive Insights & Operational Efficiency

## Question 12: Which boroughs or zones have the highest and lowest trip volumes, and how do they compare over time?

In [5]:
query_trip_volume_by_borough = """
SELECT *
FROM `yellow-taxi-trips-2025.views_fordashboard.trip_volume_by_borough`
"""
trip_volume_by_borough_df = query_to_dataframe(query_trip_volume_by_borough)
trip_volume_by_borough_df.head()

Query executed successfully. Retrieved 7254828 rows.


Unnamed: 0,trip_date,year,month,pickup_borough,dropoff_borough,pickup_zone,dropoff_zone,total_trips
0,2024-01-22,2024,1,Manhattan,Manhattan,Morningside Heights,Lenox Hill East,6
1,2024-01-12,2024,1,Manhattan,Manhattan,Lenox Hill West,Yorkville West,158
2,2024-01-28,2024,1,Manhattan,Manhattan,UN/Turtle Bay South,Lenox Hill East,20
3,2024-01-26,2024,1,Manhattan,Brooklyn,West Chelsea/Hudson Yards,Brooklyn Heights,1
4,2024-01-21,2024,1,Manhattan,Manhattan,East Village,Sutton Place/Turtle Bay North,66


In [6]:
# Filter rows where the year is between 2020 and the current year (inclusive)
filtered_trip_volume_by_borough_df = trip_volume_by_borough_df[(trip_volume_by_borough_df['year'] >= 2020) & (trip_volume_by_borough_df['year'] <= current_year)]
filtered_trip_volume_by_borough_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7254311 entries, 0 to 7254827
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   trip_date        dbdate
 1   year             Int64 
 2   month            Int64 
 3   pickup_borough   object
 4   dropoff_borough  object
 5   pickup_zone      object
 6   dropoff_zone     object
 7   total_trips      Int64 
dtypes: Int64(3), dbdate(1), object(4)
memory usage: 518.9+ MB


In [7]:
# Convert trip_date to datetime
filtered_trip_volume_by_borough_df["trip_date"] = pd.to_datetime(filtered_trip_volume_by_borough_df["trip_date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_trip_volume_by_borough_df["trip_date"] = pd.to_datetime(filtered_trip_volume_by_borough_df["trip_date"])


In [8]:
# Aggregate trip counts by borough
borough_trip_counts = (
    filtered_trip_volume_by_borough_df.groupby("pickup_borough")["total_trips"]
    .sum()
    .reset_index()
    .sort_values(by="total_trips", ascending=False)
)

# Bar Chart: Trip Volume by Borough
fig1 = px.bar(
    borough_trip_counts,
    x="pickup_borough",
    y="total_trips",
    title="Total Trip Volume by Borough",
    labels={"pickup_borough": "Borough", "total_trips": "Total Trips"},
    template="plotly_white",
    text_auto=True
)
fig1.show()

In [9]:
# Line Chart: Trip Volume Over Time by Borough
borough_trend = (
    filtered_trip_volume_by_borough_df.groupby(["trip_date", "pickup_borough"])["total_trips"]
    .sum()
    .reset_index()
)

fig2 = px.line(
    borough_trend,
    x="trip_date",
    y="total_trips",
    color="pickup_borough",
    title="Trip Volume Over Time by Borough",
    labels={"trip_date": "Date", "total_trips": "Total Trips", "pickup_borough": "Borough"},
    template="plotly_white"
)
fig2.show()

## Question 13: How frequently do yellow taxis serve airports (JFK, LaGuardia, ...), and what is the average fare for these trips?

In [10]:
query_airport_trips_analysis = """
SELECT *
FROM `yellow-taxi-trips-2025.views_fordashboard.airport_trips_analysis`
"""
airport_trips_analysis_df = query_to_dataframe(query_airport_trips_analysis)
airport_trips_analysis_df.head()

Query executed successfully. Retrieved 3313 rows.


Unnamed: 0,trip_date,year,month,airport,total_trips,avg_fare,avg_distance
0,2024-01-11,2024,1,LaGuardia Airport,4433,69.13,9.94
1,2024-01-04,2024,1,JFK Airport,5965,80.93,15.84
2,2024-01-24,2024,1,JFK Airport,3909,82.73,15.97
3,2024-01-28,2024,1,JFK Airport,5194,81.42,15.97
4,2024-01-12,2024,1,LaGuardia Airport,4318,70.04,9.96


In [11]:
# Filter rows where the year is between 2020 and the current year (inclusive)
filtered_airport_trips_analysis_df = airport_trips_analysis_df[(airport_trips_analysis_df['year'] >= 2020) & (airport_trips_analysis_df['year'] <= current_year)]
filtered_airport_trips_analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3284 entries, 0 to 3312
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   trip_date     3284 non-null   dbdate 
 1   year          3284 non-null   Int64  
 2   month         3284 non-null   Int64  
 3   airport       3284 non-null   object 
 4   total_trips   3284 non-null   Int64  
 5   avg_fare      3284 non-null   float64
 6   avg_distance  3284 non-null   float64
dtypes: Int64(3), dbdate(1), float64(2), object(1)
memory usage: 214.9+ KB


In [12]:
# Convert trip_date to datetime
filtered_airport_trips_analysis_df["trip_date"] = pd.to_datetime(filtered_airport_trips_analysis_df["trip_date"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [13]:
# Bar Chart: Total Trips by Airport
airport_trip_counts = (
    filtered_airport_trips_analysis_df.groupby("airport")["total_trips"]
    .sum()
    .reset_index()
    .sort_values(by="total_trips", ascending=False)
)

fig1 = px.bar(
    airport_trip_counts,
    x="airport",
    y="total_trips",
    title="Total Trips by Airport",
    labels={"airport": "Airport", "total_trips": "Total Trips"},
    template="plotly_dark",
    text_auto=True,
    color="total_trips",
    color_continuous_scale="blues"
)
fig1.show()

In [14]:
# Line Chart: Trends Over Time
airport_trend = (
    filtered_airport_trips_analysis_df.groupby(["trip_date", "airport"])["total_trips"]
    .sum()
    .reset_index()
)

fig2 = px.line(
    airport_trend,
    x="trip_date",
    y="total_trips",
    color="airport",
    title="Airport Trip Trends Over Time",
    labels={"trip_date": "Date", "total_trips": "Total Trips", "airport": "Airport"},
    template="plotly_white",
    line_shape="spline",
    markers=True
)
fig2.show()

In [15]:
# Scatter Plot: Distance vs Fare
fig3 = px.scatter(
    filtered_airport_trips_analysis_df,
    x="avg_distance",
    y="avg_fare",
    color="airport",
    size="total_trips",
    title="Fare vs Distance for Airport Trips",
    labels={"avg_distance": "Avg Distance (miles)", "avg_fare": "Avg Fare ($)", "airport": "Airport"},
    template="plotly_white",
    hover_data=["total_trips"]
)
fig3.show()

## Question 14: How often do taxis use different rate codes (e.g., standard rate vs. negotiated fares), and how do these rates vary across boroughs?

In [16]:
query_rate_code_analysis = """
SELECT *
FROM `yellow-taxi-trips-2025.views_fordashboard.rate_code_analysis`
"""
rate_code_analysis_df = query_to_dataframe(query_rate_code_analysis)
rate_code_analysis_df.head()

Query executed successfully. Retrieved 1667 rows.


Unnamed: 0,year,month,pickup_borough,RateCodeID,rate_code_description,total_trips,avg_fare
0,2022,11,Unknown,1.0,Standard rate,39280,27.38
1,2022,11,Unknown,5.0,Negotiated fare,869,82.62
2,2022,11,EWR,5.0,Negotiated fare,166,111.68
3,2002,12,Queens,1.0,Standard rate,8,47.8
4,2022,11,Manhattan,6.0,Group ride,2,14.93


In [17]:
# Filter rows where the year is between 2020 and the current year (inclusive)
filtered_rate_code_analysis_df = rate_code_analysis_df[(rate_code_analysis_df['year'] >= 2020) & (rate_code_analysis_df['year'] <= current_year)]
filtered_rate_code_analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1628 entries, 0 to 1666
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   1628 non-null   Int64  
 1   month                  1628 non-null   Int64  
 2   pickup_borough         1628 non-null   object 
 3   RateCodeID             1628 non-null   float64
 4   rate_code_description  1628 non-null   object 
 5   total_trips            1628 non-null   Int64  
 6   avg_fare               1628 non-null   float64
dtypes: Int64(3), float64(2), object(2)
memory usage: 106.5+ KB


In [18]:
# Bar Chart: Total Trips per Rate Code
rate_code_counts = (
    filtered_rate_code_analysis_df.groupby("rate_code_description")["total_trips"]
    .sum()
    .reset_index()
    .sort_values(by="total_trips", ascending=False)
)

fig1 = px.bar(
    rate_code_counts,
    x="rate_code_description",
    y="total_trips",
    title="Total Trips by Rate Code",
    labels={"rate_code_description": "Rate Code", "total_trips": "Total Trips"},
    template="plotly_dark",
    text_auto=True,
    color="total_trips",
    color_continuous_scale="oranges"
)
fig1.show()

In [19]:
# Stacked Bar Chart: Rate Code Usage per Borough
borough_rate_distribution = (
    filtered_rate_code_analysis_df.groupby(["pickup_borough", "rate_code_description"])["total_trips"]
    .sum()
    .reset_index()
)

fig2 = px.bar(
    borough_rate_distribution,
    x="pickup_borough",
    y="total_trips",
    color="rate_code_description",
    title="Rate Code Distribution Across Boroughs",
    labels={"pickup_borough": "Borough", "total_trips": "Total Trips", "rate_code_description": "Rate Code"},
    template="plotly_white",
    text_auto=True,
    barmode="stack"
)
fig2.show()

In [20]:
# Box Plot: Fare Distribution for Each Rate Code
fig3 = px.box(
    filtered_rate_code_analysis_df,
    x="rate_code_description",
    y="avg_fare",
    title="Fare Distribution by Rate Code",
    labels={"rate_code_description": "Rate Code", "avg_fare": "Average Fare ($)"},
    template="plotly_white",
    color="rate_code_description"
)
fig3.show()


## Question 15: How long do trips typically take, and is there a trend of increasing or decreasing trip durations over time?

In [21]:
query_trip_duration_analysis = """
SELECT *
FROM `yellow-taxi-trips-2025.views_fordashboard.trip_duration_analysis`
"""
trip_duration_analysis_df = query_to_dataframe(query_trip_duration_analysis)
trip_duration_analysis_df.head()

Query executed successfully. Retrieved 26412 rows.


Unnamed: 0,year,month,day,hour,avg_trip_duration_min,total_trips
0,2022,2,11,0,12.49,2948
1,2022,2,13,0,13.96,4740
2,2023,11,24,0,13.42,887
3,2023,11,7,0,17.59,1439
4,2022,4,28,0,14.29,2558


In [22]:
# Filter rows where the year is between 2020 and the current year (inclusive)
filtered_trip_duration_analysis_df = trip_duration_analysis_df[(trip_duration_analysis_df['year'] >= 2020) & (trip_duration_analysis_df['year'] <= current_year)]
filtered_trip_duration_analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26249 entries, 0 to 26410
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   26249 non-null  Int64  
 1   month                  26249 non-null  Int64  
 2   day                    26249 non-null  Int64  
 3   hour                   26249 non-null  Int64  
 4   avg_trip_duration_min  26249 non-null  float64
 5   total_trips            26249 non-null  Int64  
dtypes: Int64(5), float64(1)
memory usage: 1.5 MB


In [23]:
# Line Chart: Trend of Trip Duration Over Time
fig1 = px.line(
    filtered_trip_duration_analysis_df.groupby(["year", "month"])["avg_trip_duration_min"].mean().reset_index(),
    x="month",
    y="avg_trip_duration_min",
    color="year",
    title="Trend of Average Trip Duration Over Time",
    labels={"month": "Month", "avg_trip_duration_min": "Avg Trip Duration (min)", "year": "Year"},
    template="plotly_dark",
    markers=True
)
fig1.show()

In [None]:
# Mapping numerical days to actual names
day_mapping = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"}

# Create "day_of_week" column
filtered_trip_duration_analysis_df["day_of_week"] = pd.to_datetime(
    filtered_trip_duration_analysis_df[["year", "month", "day"]]
).dt.dayofweek.map(day_mapping)

# Bar Chart: Average Trip Duration per Day of the Week
fig1 = px.bar(
    filtered_trip_duration_analysis_df.groupby("day_of_week")["avg_trip_duration_min"].mean().reset_index(),
    x="day_of_week",
    y="avg_trip_duration_min",
    title="Average Trip Duration by Day of the Week",
    labels={"day_of_week": "Day of the Week", "avg_trip_duration_min": "Avg Trip Duration (min)"},
    template="plotly_dark",
    color="day_of_week",
    category_orders={"day_of_week": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]}
)
fig1.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

