In [None]:
# @title Setup
from google.cloud import bigquery
from google.colab import data_table
import bigframes.pandas as bpd

# BigQuery connection settings for the queries in this notebook.
project = 'yellow-taxi-trips-2025' # Project ID inserted based on the query results selected to explore
location = 'US' # Location inserted based on the query results selected to explore
# Shared BigQuery client; query_to_dataframe() reads this module-level name.
client = bigquery.Client(project=project, location=location)
# Enable the google.colab data_table formatter for DataFrame output.
data_table.enable_dataframe_formatter()

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Current calendar year, taken from the local clock; the year filters in the
# later cells use it as their inclusive upper bound.
from datetime import datetime

current_year = datetime.today().year
current_year

2025

In [None]:
# Function to execute a BigQuery query and return a DataFrame

def query_to_dataframe(query: str, bq_client=None, raise_on_error: bool = False) -> pd.DataFrame:
    """
    Executes a SQL query in BigQuery and returns a Pandas DataFrame.

    Parameters:
    - query (str): The SQL query to execute.
    - bq_client: Optional object exposing ``query(sql).to_dataframe()``.
      When omitted, the notebook-level ``client`` is used, which is the
      original behaviour. Passing a client explicitly makes the function
      reusable with another project and testable with a stub.
    - raise_on_error (bool): When False (default) a failure is printed and an
      empty DataFrame is returned, exactly as before. When True the original
      exception is re-raised after being printed, so a failed query stops the
      notebook instead of surfacing later as a confusing KeyError.

    Returns:
    - pd.DataFrame : The DataFrame containing the results of the query
      (empty if the query failed and ``raise_on_error`` is False).

    Raises:
    - Exception: whatever the client raised, only when ``raise_on_error`` is True.
    """
    try:
        # Resolve the client inside the try block so that a missing global
        # `client` is reported the same way as any other failure.
        active_client = client if bq_client is None else bq_client
        df = active_client.query(query).to_dataframe()
        print(f"Query executed successfully. Retrieved {df.shape[0]} rows.")
        return df
    except Exception as e:
        print(f"Error executing query: {e}")
        if raise_on_error:
            raise
        return pd.DataFrame()

## Question 10: How often do passengers tip, and what factors (time of day, borough, fare amount) influence tip amounts?

In [None]:
# Load every column of views_fordashboard.tipping_behavior_analysis.
query_tipping_behavior_analysis = """
SELECT *
FROM `yellow-taxi-trips-2025.views_fordashboard.tipping_behavior_analysis`
"""
# query_to_dataframe prints the row count on success; on failure it prints the
# error and returns an empty DataFrame.
tipping_behavior_analysis_df = query_to_dataframe(query_tipping_behavior_analysis)
# Preview the first rows to check the returned columns.
tipping_behavior_analysis_df.head()

Query executed successfully. Retrieved 825967 rows.


Unnamed: 0,trip_date,year,month,pickup_hour,pickup_borough,dropoff_borough,total_trips,tipped_trips,tip_frequency_percentage,avg_tip_amount,avg_total_fare,avg_fare,avg_tip_percentage
0,2020-08-11,2020,8,21,Manhattan,Manhattan,607,583,96.05,2.41,15.35,9.2,15.46
1,2020-08-06,2020,8,20,Manhattan,Manhattan,831,800,96.27,2.34,15.18,9.1,15.31
2,2020-08-24,2020,8,16,Manhattan,Manhattan,1337,1288,96.34,2.46,15.8,9.14,15.34
3,2020-08-07,2020,8,15,Manhattan,Manhattan,1360,1304,95.88,2.43,15.61,9.91,15.64
4,2020-08-13,2020,8,15,Manhattan,Manhattan,1398,1342,95.99,2.38,15.36,9.68,15.53


In [None]:
# Filter rows where the year is between 2020 and the current year (inclusive).
# Series.between() is inclusive on both ends by default. The explicit .copy()
# marks the result as an independent DataFrame, so assigning columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
filtered_tipping_behavior_analysis_df = tipping_behavior_analysis_df[
    tipping_behavior_analysis_df['year'].between(2020, current_year)
].copy()
filtered_tipping_behavior_analysis_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 825799 entries, 0 to 825966
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   trip_date                 825799 non-null  dbdate 
 1   year                      825799 non-null  Int64  
 2   month                     825799 non-null  Int64  
 3   pickup_hour               825799 non-null  Int64  
 4   pickup_borough            825799 non-null  object 
 5   dropoff_borough           825799 non-null  object 
 6   total_trips               825799 non-null  Int64  
 7   tipped_trips              825799 non-null  Int64  
 8   tip_frequency_percentage  825799 non-null  float64
 9   avg_tip_amount            825799 non-null  float64
 10  avg_total_fare            825799 non-null  float64
 11  avg_fare                  825799 non-null  float64
 12  avg_tip_percentage        825799 non-null  float64
dtypes: Int64(5), dbdate(1), float64(5), object(2)
mem

In [None]:
# Aggregate data by month to reduce noise.
# The datetime conversion rebinds the name to the frame returned by assign()
# instead of assigning a column into the filtered slice; the end state is the
# same (trip_date is datetime64) without the SettingWithCopyWarning.
filtered_tipping_behavior_analysis_df = filtered_tipping_behavior_analysis_df.assign(
    trip_date=pd.to_datetime(filtered_tipping_behavior_analysis_df["trip_date"])
)

# "ME" (month end) replaces the deprecated "M" alias; buckets are still labelled
# with the last day of each month.
# NOTE(review): this is an unweighted mean of the per-row percentages; rows with
# few total_trips count as much as busy ones -- confirm that is intended.
df_monthly = (
    filtered_tipping_behavior_analysis_df
    .groupby(pd.Grouper(key="trip_date", freq="ME"))["tip_frequency_percentage"]
    .mean()
    .reset_index()
)
df_monthly.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


'M' is deprecated and will be removed in a future version, please use 'ME' instead.



Unnamed: 0,trip_date,tip_frequency_percentage
0,2020-01-31,83.115016
1,2020-02-29,84.152667
2,2020-03-31,80.975619
3,2020-04-30,83.958329
4,2020-05-31,85.180854


In [None]:
# Monthly tip-frequency trend, drawn with spline interpolation so the curve
# between monthly points looks smooth.
fig = px.line(
    data_frame=df_monthly,
    x="trip_date",
    y="tip_frequency_percentage",
    line_shape="spline",
    template="plotly_white",
    title="Tip Frequency Over Time (Monthly Average)",
    labels={"trip_date": "Date", "tip_frequency_percentage": "Tip Frequency (%)"},
)

fig.show()


In [None]:
# Mean of avg_tip_percentage for each pickup borough, as a two-column frame.
borough_tip_pct = (
    filtered_tipping_behavior_analysis_df
    .groupby("pickup_borough")["avg_tip_percentage"]
    .mean()
    .reset_index()
)

# One bar per borough; bar colour follows the same value on the "Blues" scale.
fig = px.bar(
    data_frame=borough_tip_pct,
    x="pickup_borough",
    y="avg_tip_percentage",
    color="avg_tip_percentage",
    color_continuous_scale="Blues",
    template="plotly_white",
    title="Average Tip Percentage by Pickup Borough",
    labels={"avg_tip_percentage": "Average Tip (%)"},
)
fig.show()

In [None]:
# Mean avg_tip_percentage for every (pickup_hour, pickup_borough) pair, reshaped
# so that hours become the rows and boroughs become the columns.
heatmap_data = (
    filtered_tipping_behavior_analysis_df
    .groupby(["pickup_hour", "pickup_borough"])["avg_tip_percentage"]
    .mean()
    .unstack()
)

# Build the trace first, then wrap it in a figure and apply the layout.
tip_heatmap = go.Heatmap(
    x=heatmap_data.columns,
    y=heatmap_data.index,
    z=heatmap_data.values,
    colorscale="YlGnBu",
)
fig = go.Figure(data=tip_heatmap)

fig.update_layout(
    template="plotly_white",
    title="Tipping Trends by Time of Day & Borough",
    xaxis_title="Borough",
    yaxis_title="Hour of Day",
)
fig.show()

## Question 11: How much revenue is generated from additional charges (MTA tax, congestion surcharge, airport fees), and has it changed over time?

In [None]:
# Load every column of views_fordashboard.additional_charges_revenue.
query_additional_charges_revenue = """
SELECT *
FROM `yellow-taxi-trips-2025.views_fordashboard.additional_charges_revenue`
"""
# query_to_dataframe prints the row count on success; on failure it prints the
# error and returns an empty DataFrame.
additional_charges_revenue_df = query_to_dataframe(query_additional_charges_revenue)
# Preview the first rows to check the returned columns.
additional_charges_revenue_df.head()

Query executed successfully. Retrieved 1821 rows.


Unnamed: 0,trip_date,year,month,total_trips,total_MTA_tax,total_congestion_surcharge,total_airport_fees,total_additional_revenue,avg_additional_charge_per_trip
0,2020-08-27,2020,8,33955,16925.5,77692.5,,,
1,2020-05-28,2020,5,11252,5604.0,24395.0,,,
2,2021-03-01,2021,3,48407,24154.5,110875.0,,,
3,2020-01-14,2020,1,208718,103881.5,490291.25,,,
4,2021-01-10,2021,1,26940,13427.5,59247.5,,,


In [None]:
# Filter rows where the year is between 2020 and the current year (inclusive).
# Series.between() is inclusive on both ends by default. The explicit .copy()
# marks the result as an independent DataFrame, so assigning columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
filtered_additional_charges_revenue_df = additional_charges_revenue_df[
    additional_charges_revenue_df['year'].between(2020, current_year)
].copy()
filtered_additional_charges_revenue_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1797 entries, 0 to 1820
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   trip_date                       1797 non-null   dbdate 
 1   year                            1797 non-null   Int64  
 2   month                           1797 non-null   Int64  
 3   total_trips                     1797 non-null   Int64  
 4   total_MTA_tax                   1797 non-null   float64
 5   total_congestion_surcharge      1797 non-null   float64
 6   total_airport_fees              1359 non-null   float64
 7   total_additional_revenue        1359 non-null   float64
 8   avg_additional_charge_per_trip  1359 non-null   float64
dtypes: Int64(3), dbdate(1), float64(5)
memory usage: 145.7 KB


In [None]:
# Convert trip_date to datetime and fill missing values with 0.
# The name is rebound to the frame returned by assign()/fillna() rather than
# assigning into the filtered slice and calling fillna(inplace=True); the end
# state is the same but pandas no longer emits SettingWithCopyWarning.
# NOTE(review): fillna(0) also zeroes total_additional_revenue and
# avg_additional_charge_per_trip on rows where they were missing. Only the three
# component columns are plotted here -- confirm before reusing those two columns.
filtered_additional_charges_revenue_df = (
    filtered_additional_charges_revenue_df
    .assign(trip_date=pd.to_datetime(filtered_additional_charges_revenue_df["trip_date"]))
    .fillna(0)
)

# Create stacked area plot.
# Plotly joins points in row order and the query result is not ordered by
# trip_date, so the rows are sorted for plotting only; the stored frame keeps
# its original order.
fig = px.area(
    filtered_additional_charges_revenue_df.sort_values("trip_date"),
    x="trip_date",
    y=["total_MTA_tax", "total_congestion_surcharge", "total_airport_fees"],
    labels={"trip_date": "Date", "value": "Revenue ($)", "variable": "Charge Type"},
    title="Revenue from Additional Charges Over Time",
    template="plotly_white"
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

