In [1]:
# @title Setup
from google.cloud import bigquery
from google.colab import data_table
import bigframes.pandas as bpd
import pandas as pd

# Notebook-wide BigQuery settings; `project` and `client` are read by later cells.
project = 'yellow-taxi-trips-2025' # Project ID inserted based on the query results selected to explore
location = 'US' # Location inserted based on the query results selected to explore
# Single BigQuery client bound to the project/location above.
client = bigquery.Client(project=project, location=location)
# Turn on the google.colab data_table formatter for DataFrame output.
data_table.enable_dataframe_formatter()

In [2]:
# Function to execute a BigQuery query and return a DataFrame

def query_to_dataframe(query: str) -> pd.DataFrame:
    """
    Runs a SQL statement on BigQuery and hands the rows back as a DataFrame.

    The statement is submitted through the notebook-level ``client``. When it
    succeeds, a one-line message with the number of retrieved rows is printed.
    When anything raises along the way, the error is printed and an empty
    DataFrame is returned instead of propagating the exception, so callers
    that need to detect failure should check ``.empty`` on the result.

    Parameters:
    - query (str): SQL text to execute.

    Return:
    - pd.DataFrame : The query results, or an empty DataFrame on failure.
    """
    try:
        result_df = client.query(query).to_dataframe()
        row_count = result_df.shape[0]
        print(f"Query executed successfully. Retrieved {row_count} rows.")
    except Exception as exc:
        # Best-effort contract: report the problem and fall back to an empty frame.
        print(f"Error executing query: {exc}")
        result_df = pd.DataFrame()
    return result_df

In [4]:
# Fetch every column and row of the trips_ml_data table into a pandas DataFrame.
query_trips_ml_data = """
SELECT *
FROM `yellow-taxi-trips-2025.ml_dataset.trips_ml_data`
"""
# query_to_dataframe returns an empty DataFrame if the query fails.
trips_ml_data_df = query_to_dataframe(query_trips_ml_data)
# Preview the first rows.
trips_ml_data_df.head()

Query executed successfully. Retrieved 3143810 rows.


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,source_file
0,2,2024-12-13 22:09:59+00:00,2024-12-13 22:10:25+00:00,2.0,0.01,1.0,N,249,249,2,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,yellow_tripdata_2024-12.parquet
1,2,2024-12-13 07:55:19+00:00,2024-12-13 07:59:19+00:00,1.0,0.98,1.0,N,45,144,2,0.0,0.0,0.0,0.0,0.0,0.0,2.5,2.5,0.0,yellow_tripdata_2024-12.parquet
2,2,2024-12-12 11:34:11+00:00,2024-12-12 11:45:43+00:00,2.0,0.94,1.0,N,161,246,2,0.0,0.0,0.5,0.0,0.0,1.0,4.0,2.5,0.0,yellow_tripdata_2024-12.parquet
3,2,2024-12-31 20:54:09+00:00,2024-12-31 21:22:15+00:00,1.0,1.55,99.0,N,97,97,2,0.0,0.0,0.5,0.0,0.0,1.0,4.0,2.5,0.0,yellow_tripdata_2024-12.parquet
4,2,2024-12-13 07:33:07+00:00,2024-12-13 07:59:17+00:00,1.0,19.26,1.0,N,42,231,2,0.0,0.0,0.5,0.0,0.0,1.0,4.0,2.5,0.0,yellow_tripdata_2024-12.parquet


In [5]:
# Summarize the raw table: index range, column names and dtypes.
trips_ml_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3143810 entries, 0 to 3143809
Data columns (total 20 columns):
 #   Column                 Dtype              
---  ------                 -----              
 0   VendorID               Int64              
 1   tpep_pickup_datetime   datetime64[us, UTC]
 2   tpep_dropoff_datetime  datetime64[us, UTC]
 3   passenger_count        float64            
 4   trip_distance          float64            
 5   RatecodeID             float64            
 6   store_and_fwd_flag     object             
 7   PULocationID           Int64              
 8   DOLocationID           Int64              
 9   payment_type           Int64              
 10  fare_amount            float64            
 11  extra                  float64            
 12  mta_tax                float64            
 13  tip_amount             float64            
 14  tolls_amount           float64            
 15  improvement_surcharge  float64            
 16  total_amount      

In [6]:
# Count missing values per column of the raw table
trips_ml_data_df.isna().sum()

Unnamed: 0,0
VendorID,0
tpep_pickup_datetime,0
tpep_dropoff_datetime,0
passenger_count,0
trip_distance,0
RatecodeID,0
store_and_fwd_flag,0
PULocationID,0
DOLocationID,0
payment_type,0


In [7]:
def preprocess_data(df):
    """
    Builds the model feature table from raw trip records.

    The input DataFrame is never modified: derived columns are written to a
    private copy of the pass-through columns, so the function is safe to call
    on slices (e.g. the outputs of a train/test split) and safe to re-run.

    Parameters:
    - df: Pandas DataFrame containing at least tpep_pickup_datetime and
      tpep_dropoff_datetime (datetime dtype, so the .dt accessor works),
      PULocationID, DOLocationID, passenger_count, trip_distance,
      payment_type and total_amount.

    Returns:
    - New Pandas DataFrame with the same index as df and these columns, in
      order: PULocationID, DOLocationID, passenger_count, trip_distance,
      trip_duration, pickup_dayofweek, pickup_month, pickup_year,
      pickup_hour, is_weekend, is_credit_card, total_amount.
    """
    selected_cols = [
        "PULocationID", "DOLocationID", "passenger_count", "trip_distance",
        "trip_duration", "pickup_dayofweek", "pickup_month", "pickup_year", "pickup_hour",
        "is_weekend", "is_credit_card", "total_amount"
    ]

    pickup = df["tpep_pickup_datetime"]
    dropoff = df["tpep_dropoff_datetime"]

    # Copy only the columns that pass through unchanged; every derived feature
    # is added to this copy rather than to the caller's frame.
    features = df[
        ["PULocationID", "DOLocationID", "passenger_count", "trip_distance", "total_amount"]
    ].copy()

    # Trip duration in minutes
    features["trip_duration"] = (dropoff - pickup).dt.total_seconds() / 60

    # Time-based features taken from the pickup timestamp
    features["pickup_dayofweek"] = pickup.dt.dayofweek  # Monday=0, Sunday=6.
    features["pickup_month"] = pickup.dt.month
    features["pickup_year"] = pickup.dt.year
    features["pickup_hour"] = pickup.dt.hour
    features["is_weekend"] = features["pickup_dayofweek"].isin([5, 6]).astype(int)  # 5=Saturday, 6=Sunday

    # 1 when payment_type == 1 (credit card), 0 for every other payment type.
    # No rows are filtered out by payment type.
    features["is_credit_card"] = (df["payment_type"] == 1).astype(int)

    # Return the columns in the fixed order expected downstream.
    return features[selected_cols].copy()


In [8]:
from sklearn.model_selection import train_test_split

def split_data(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Splits the dataframe into train, validation, and test sets.

    The split is done in two passes with train_test_split: the test set is
    carved off first, then the remainder is divided into train and validation.

    Parameters:
    - df: Pandas DataFrame
    - train_size: Proportion of the dataset for training (default=70%)
    - val_size: Proportion for validation (default=15%)
    - test_size: Proportion for testing (default=15%)
    - random_state: Seed for reproducibility (used for both passes)

    Returns:
    - train_df, val_df, test_df: Split DataFrames

    Raises:
    - AssertionError: if the three proportions do not sum to 1 (within a
      small floating-point tolerance).
    """
    # Compare with a tolerance instead of `== 1`: binary floating point makes
    # valid inputs such as 0.7 + 0.2 + 0.1 evaluate to 0.9999999999999999.
    total_size = train_size + val_size + test_size
    assert abs(total_size - 1.0) <= 1e-9, "Split sizes must sum to 1"

    # First, split train + val and test
    train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Then, split train and validation. val_size is rescaled because it is now
    # a fraction of the remaining (train + val) rows, not of the full dataset.
    val_fraction = val_size / (train_size + val_size)
    train_df, val_df = train_test_split(train_val_df, test_size=val_fraction,
                                        random_state=random_state)

    return train_df, val_df, test_df

# Split the raw trips table using the default 70/15/15 proportions and seed.
# NOTE(review): val_df is not referenced again in the cells shown here —
# confirm whether it should also be preprocessed and uploaded.
train_df, val_df, test_df = split_data(trips_ml_data_df)

# Report how many rows landed in each split.
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")


Train size: 2200666
Validation size: 471572
Test size: 471572


In [9]:
# Build the feature table for the training split and preview the first rows.
preprocessed_train_df = preprocess_data(train_df)
preprocessed_train_df.head()

Unnamed: 0,PULocationID,DOLocationID,passenger_count,trip_distance,trip_duration,pickup_dayofweek,pickup_month,pickup_year,pickup_hour,is_weekend,is_credit_card,total_amount
131226,43,142,1.0,1.4,10.316667,5,12,2024,18,1,1,18.48
424593,114,181,1.0,6.53,33.216667,6,12,2024,23,1,1,60.67
1046513,211,144,1.0,0.74,6.333333,1,12,2024,19,0,1,17.28
310706,237,237,1.0,0.38,1.483333,5,12,2024,12,1,0,7.7
2917145,138,230,1.0,9.89,98.7,3,12,2024,17,0,1,118.2


In [10]:
# Check column dtypes and memory usage of the training features before upload.
preprocessed_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2200666 entries, 131226 to 2727659
Data columns (total 12 columns):
 #   Column            Dtype  
---  ------            -----  
 0   PULocationID      Int64  
 1   DOLocationID      Int64  
 2   passenger_count   float64
 3   trip_distance     float64
 4   trip_duration     float64
 5   pickup_dayofweek  int32  
 6   pickup_month      int32  
 7   pickup_year       int32  
 8   pickup_hour       int32  
 9   is_weekend        int64  
 10  is_credit_card    int64  
 11  total_amount      float64
dtypes: Int64(2), float64(4), int32(4), int64(2)
memory usage: 188.9 MB


In [11]:
# Load the preprocessed_train_df dataframe into BigQuery

DATASET_ID = "ml_dataset"
TABLE_ID = "preprocessed_train_data"
FULL_TABLE_ID = f"{project}.{DATASET_ID}.{TABLE_ID}"

# Explicit schema so BigQuery column types do not depend on dtype inference
schema = [
    bigquery.SchemaField("PULocationID", "INTEGER"),
    bigquery.SchemaField("DOLocationID", "INTEGER"),
    bigquery.SchemaField("passenger_count", "FLOAT"),
    bigquery.SchemaField("trip_distance", "FLOAT"),
    bigquery.SchemaField("trip_duration", "FLOAT"),
    bigquery.SchemaField("pickup_dayofweek", "INTEGER"),
    bigquery.SchemaField("pickup_month", "INTEGER"),
    bigquery.SchemaField("pickup_year", "INTEGER"),
    bigquery.SchemaField("pickup_hour", "INTEGER"),
    bigquery.SchemaField("is_weekend", "INTEGER"),
    bigquery.SchemaField("is_credit_card", "INTEGER"),
    bigquery.SchemaField("total_amount", "FLOAT"),
]

# Load data into BigQuery, replacing any existing table contents.
# Without an explicit write disposition the load job appends, so re-running
# this cell would duplicate every training row in the destination table.
job = client.load_table_from_dataframe(
    preprocessed_train_df,
    FULL_TABLE_ID,
    job_config=bigquery.LoadJobConfig(
        schema=schema,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    ),
)

# Wait for the job to complete (raises if the load failed)
job.result()

print(f"Data successfully uploaded to BigQuery: {FULL_TABLE_ID}")

Data successfully uploaded to BigQuery: yellow-taxi-trips-2025.ml_dataset.preprocessed_train_data


In [12]:
# Build the feature table for the test split and preview the first rows.
preprocessed_test_df = preprocess_data(test_df)
preprocessed_test_df.head()

Unnamed: 0,PULocationID,DOLocationID,passenger_count,trip_distance,trip_duration,pickup_dayofweek,pickup_month,pickup_year,pickup_hour,is_weekend,is_credit_card,total_amount
2054169,100,162,1.0,2.19,17.866667,4,12,2024,18,0,1,28.2
2782532,230,229,1.0,0.87,14.783333,0,12,2024,14,0,1,21.0
1937927,170,141,1.0,1.77,19.766667,3,12,2024,11,0,1,25.2
1475510,132,112,1.0,16.4,42.183333,4,12,2024,22,0,1,87.65
90,162,161,1.0,0.41,2.7,6,12,2024,15,1,0,9.1


In [13]:
# Check non-null counts and dtypes of the test features before upload.
preprocessed_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 471572 entries, 2054169 to 3104906
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   PULocationID      471572 non-null  Int64  
 1   DOLocationID      471572 non-null  Int64  
 2   passenger_count   471572 non-null  float64
 3   trip_distance     471572 non-null  float64
 4   trip_duration     471572 non-null  float64
 5   pickup_dayofweek  471572 non-null  int32  
 6   pickup_month      471572 non-null  int32  
 7   pickup_year       471572 non-null  int32  
 8   pickup_hour       471572 non-null  int32  
 9   is_weekend        471572 non-null  int64  
 10  is_credit_card    471572 non-null  int64  
 11  total_amount      471572 non-null  float64
dtypes: Int64(2), float64(4), int32(4), int64(2)
memory usage: 40.5 MB


In [14]:
# (rows, columns) of the test feature table.
preprocessed_test_df.shape

(471572, 12)

In [15]:
# Load the preprocessed_test_df dataframe into BigQuery

DATASET_ID = "ml_dataset"
TABLE_ID = "preprocessed_test_data"
FULL_TABLE_ID = f"{project}.{DATASET_ID}.{TABLE_ID}"

# Explicit schema so BigQuery column types do not depend on dtype inference
schema = [
    bigquery.SchemaField("PULocationID", "INTEGER"),
    bigquery.SchemaField("DOLocationID", "INTEGER"),
    bigquery.SchemaField("passenger_count", "FLOAT"),
    bigquery.SchemaField("trip_distance", "FLOAT"),
    bigquery.SchemaField("trip_duration", "FLOAT"),
    bigquery.SchemaField("pickup_dayofweek", "INTEGER"),
    bigquery.SchemaField("pickup_month", "INTEGER"),
    bigquery.SchemaField("pickup_year", "INTEGER"),
    bigquery.SchemaField("pickup_hour", "INTEGER"),
    bigquery.SchemaField("is_weekend", "INTEGER"),
    bigquery.SchemaField("is_credit_card", "INTEGER"),
    bigquery.SchemaField("total_amount", "FLOAT"),
]

# Load data into BigQuery, replacing any existing table contents.
# Without an explicit write disposition the load job appends, so re-running
# this cell would duplicate every test row in the destination table.
job = client.load_table_from_dataframe(
    preprocessed_test_df,
    FULL_TABLE_ID,
    job_config=bigquery.LoadJobConfig(
        schema=schema,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    ),
)

# Wait for the job to complete (raises if the load failed)
job.result()

print(f"Data successfully uploaded to BigQuery: {FULL_TABLE_ID}")

Data successfully uploaded to BigQuery: yellow-taxi-trips-2025.ml_dataset.preprocessed_test_data


In [None]:
# You can continue to create a custom model
