In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

In [3]:
root_path = "/Users/marinelegall/code/lewagon/data/07-ML-Ops/02-Cloud-training/train-in-the-cloud/"
sys.path.append(root_path)

In [4]:
sys.path

['/Users/marinelegall/code/lewagon/data/07-ML-Ops/02-Cloud-training/train-in-the-cloud/taxifare/interface',
 '/Users/marinelegall/code/lewagon/data/04-Decision-Science/01-Project-Setup/context-and-setup',
 '/Users/marinelegall/code/lewagon/data/07-ML-Ops/02-Cloud-training/train-in-the-cloud/taxifare/interface',
 '/Users/marinelegall/code/lewagon/data/05-ML/10-Natural-Language-Processing/ham_or_spam',
 '/Users/marinelegall/.pyenv/versions/3.10.6/lib/python310.zip',
 '/Users/marinelegall/.pyenv/versions/3.10.6/lib/python3.10',
 '/Users/marinelegall/.pyenv/versions/3.10.6/lib/python3.10/lib-dynload',
 '',
 '/Users/marinelegall/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages',
 '/Users/marinelegall/code/lewagon/data/07-ML-Ops/02-Cloud-training/train-in-the-cloud/']

In [5]:
# Load the environment here first

from dotenv import load_dotenv
load_dotenv(dotenv_path='/Users/marinelegall/code/lewagon/data/07-ML-Ops/02-Cloud-training/train-in-the-cloud/.env')
load_dotenv()

True

In [6]:
import numpy as np
import pandas as pd

from pathlib import Path
from colorama import Fore, Style
from dateutil.parser import parse

from taxifare.params import *
from taxifare.ml_logic.data import get_data_with_cache, clean_data, load_data_to_bq
from taxifare.ml_logic.model import initialize_model, compile_model, train_model, evaluate_model
from taxifare.ml_logic.preprocessor import preprocess_features
from taxifare.ml_logic.registry import load_model, save_model, save_results

[34m
Loading TensorFlow...[0m


2024-11-17 13:08:05.226875: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.



✅ TensorFlow loaded (13.77s)


In [7]:
def preprocess(min_date:str = '2009-01-01', max_date:str = '2015-01-01') -> None:
    """
    - Query the raw dataset from Le Wagon's BigQuery dataset
    - Cache query result as a local CSV if it doesn't exist locally
    - Process query data
    - Store processed data on your personal BQ (truncate existing table if it exists)
    - No need to cache processed data as CSV (it will be cached when queried back from BQ during training)
    """

    print(Fore.MAGENTA + "\n ⭐️ Use case: preprocess" + Style.RESET_ALL)

    # Query raw data from BigQuery using `get_data_with_cache`
    min_date = parse(min_date).strftime('%Y-%m-%d') # e.g '2009-01-01'
    max_date = parse(max_date).strftime('%Y-%m-%d') # e.g '2009-01-01'

    query = f"""
        SELECT {",".join(COLUMN_NAMES_RAW)}
        FROM `{GCP_PROJECT_WAGON}`.{BQ_DATASET}.raw_{DATA_SIZE}
        WHERE pickup_datetime BETWEEN '{min_date}' AND '{max_date}'
        ORDER BY pickup_datetime
    """

    # Where nothing else

    # Retrieve `query` data from BigQuery or from `data_query_cache_path` if the file already exists!
    data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath("raw", f"query_{min_date}_{max_date}_{DATA_SIZE}.csv")
    data_query_cached_exists = data_query_cache_path.is_file()

    if data_query_cached_exists:
        print("Loading data from local CSV...")

        data = pd.read_csv(data_query_cache_path)


    else:
        print("Loading data from Querying Big Query server...")
        from google.cloud import bigquery

        client = bigquery.Client(project=GCP_PROJECT)
        query_job = client.query(query)
        result = query_job.result()
        data = result.to_dataframe()

        # Save it locally to accelerate the next queries!
        data.to_csv(data_query_cache_path, header=True, index=False)

    # Process data
    # Clean data using data.py
    data_clean = clean_data(data)
    X = data_clean.drop("fare_amount", axis=1)
    X_processed = preprocess_features(X)

    # Load a DataFrame onto BigQuery containing [pickup_datetime, X_processed, y]
    # using data.load_data_to_bq()

    return data, X_processed

In [8]:
data, X_processed = preprocess()

[35m
 ⭐️ Use case: preprocess[0m
Loading data from local CSV...
✅ data cleaned
[34m
Preprocessing features...[0m
✅ X_processed, with shape (447, 65)


In [16]:
pd.DataFrame(X_processed)
X_processed.shape

(447, 65)

In [17]:
data[["pickup_datetime"]].shape

(455, 1)

In [18]:
y = data['fare_amount']
y.shape

(455,)

In [12]:
y = data['fare_amount']

data_processed_with_timestamp = pd.DataFrame(np.concatenate((
        data[["pickup_datetime"]],
        X_processed,
        y,
    ), axis=1))

data_processed_with_timestamp

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 455 and the array at index 1 has size 447

In [19]:
def preprocess_chunks_from_monday(min_date:str = '2009-01-01', max_date:str = '2015-01-01') -> None:
    """
    - Query the raw dataset from Le Wagon's BigQuery dataset
    - Cache query result as a local CSV if it doesn't exist locally
    - Process query data
    - Store processed data on your personal BQ (truncate existing table if it exists)
    - No need to cache processed data as CSV (it will be cached when queried back from BQ during training)
    """

    print(Fore.MAGENTA + "\n ⭐️ Use case: preprocess" + Style.RESET_ALL)

    # Query raw data from BigQuery using `get_data_with_cache`
    min_date = parse(min_date).strftime('%Y-%m-%d') # e.g '2009-01-01'
    max_date = parse(max_date).strftime('%Y-%m-%d') # e.g '2009-01-01'

    query = f"""
        SELECT {",".join(COLUMN_NAMES_RAW)}
        FROM `{GCP_PROJECT_WAGON}`.{BQ_DATASET}.raw_{DATA_SIZE}
        WHERE pickup_datetime BETWEEN '{min_date}' AND '{max_date}'
        ORDER BY pickup_datetime
    """

    # Where nothing else

    # Retrieve `query` data from BigQuery or from `data_query_cache_path` if the file already exists!
    data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath("raw", f"query_{min_date}_{max_date}_{DATA_SIZE}.csv")
    data_query_cached_exists = data_query_cache_path.is_file()
    data_processed_path = Path(LOCAL_DATA_PATH).joinpath("processed", f"processed_{min_date}_{max_date}_{DATA_SIZE}.csv")
    data_query_cache_exists = data_query_cache_path.is_file()

    if data_query_cached_exists:
        print("Loading data from local CSV...")
        data = pd.read_csv(data_query_cache_path)
        chunks = None

        # $CODE_BEGIN
        chunks = pd.read_csv(
            data_query_cache_path,
            chunksize=200, #CHUNK_SIZE
            parse_dates=["pickup_datetime"])

    # if:
        # else:
        #     print("Loading data from Querying Big Query server...")
            # from google.cloud import bigquery

        #     client = bigquery.Client(project=GCP_PROJECT)
        #     query_job = client.query(query)
        #     result = query_job.result()
        #     data = result.to_dataframe()

        #     # Save it locally to accelerate the next queries!
        #     data.to_csv(data_query_cache_path, header=True, index=False)

        # # Process data
        # # Clean data using data.py
        # data_clean = clean_data(data)
        # X = data.drop("fare_amount", axis=1)
        # X_preprocessed = preprocess_features(X)

        # y = data['fare_amount']

        # # Load a DataFrame onto BigQuery containing [pickup_datetime, X_processed, y]
        # # using data.load_data_to_bq()

        # data_processed_with_timestamp = pd.DataFrame(np.concatenate((
        #     data_clean[["pickup_datetime"]],
        #     X_preprocessed,
        #     y,
        # ), axis=1))
        # # Shape error
    else:
        from google.cloud import bigquery
        print("Get a DataFrame iterable from querying the BigQuery server...")
        chunks = None

        # 🎯 HINT: `bigquery.Client(...).query(...).result(page_size=...).to_dataframe_iterable()`
        # $CODE_BEGIN
        client = bigquery.Client(project=GCP_PROJECT)

        query_job = client.query(query)
        result = query_job.result(page_size=CHUNK_SIZE)

        chunks = result.to_dataframe_iterable()

    for chunk_id, chunk in enumerate(chunks):
        print(f"Processing chunk {chunk_id}...")

        # Clean chunk
        # $CODE_BEGIN
        chunk_clean = clean_data(chunk)
        # $CODE_END

        # Create chunk_processed
        # 🎯 HINT: create (`X_chunk`, `y_chunk`), process only `X_processed_chunk`, then concatenate (X_processed_chunk, y_chunk)
        # $CODE_BEGIN
        X_chunk = chunk_clean.drop("fare_amount", axis=1)
        y_chunk = chunk_clean[["fare_amount"]]
        X_processed_chunk = preprocess_features(X_chunk)

        chunk_processed = pd.DataFrame(np.concatenate((X_processed_chunk, y_chunk), axis=1))
        # $CODE_END

        # Save and append the processed chunk to a local CSV at "data_processed_path"
        # 🎯 HINT: df.to_csv(mode=...)
        # 🎯 HINT: we want a CSV with neither index nor headers (they'd be meaningless)
        # $CODE_BEGIN
        chunk_processed.to_csv(
            data_processed_path,
            mode="w" if chunk_id==0 else "a",
            header=False,
            index=False,
        )
        # $CODE_END

        # Save and append the raw chunk `if not data_query_cache_exists`
        # $CODE_BEGIN
        # 🎯 HINT: we want a CSV with headers this time
        # 🎯 HINT: only the first chunk should store headers
        if not data_query_cache_exists:
            chunk.to_csv(
                data_query_cache_path,
                mode="w" if chunk_id==0 else "a",
                header=True if chunk_id==0 else False,
                index=False
            )
        # $CODE_END

    print(f"✅ data query saved as {data_query_cache_path}")
    print("✅ preprocess() done")

    return data

In [20]:
data = preprocess_chunks_from_monday()
data

[35m
 ⭐️ Use case: preprocess[0m
Loading data from local CSV...
Processing chunk 0...
✅ data cleaned
[34m
Preprocessing features...[0m
✅ X_processed, with shape (196, 65)
Processing chunk 1...
✅ data cleaned
[34m
Preprocessing features...[0m
✅ X_processed, with shape (197, 65)
Processing chunk 2...
✅ data cleaned
[34m
Preprocessing features...[0m
✅ X_processed, with shape (54, 65)
✅ data query saved as /Users/marinelegall/.lewagon/mlops/data/raw/query_2009-01-01_2015-01-01_1k.csv
✅ preprocess() done


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,8.9,2009-01-15 09:22:39+00:00,-73.955013,40.780784,-73.964862,40.768096,1
1,4.1,2009-01-20 10:05:35+00:00,-73.983992,40.740018,-73.975250,40.749208,1
2,10.6,2009-01-25 03:17:05+00:00,-73.987612,40.749694,-73.950233,40.780101,1
3,8.3,2009-01-26 18:18:38+00:00,-74.011112,40.713399,-74.001300,40.721281,1
4,38.2,2009-01-31 03:59:43+00:00,-73.999552,40.728320,-73.926824,40.864947,1
...,...,...,...,...,...,...,...
450,12.5,2014-11-24 19:37:20+00:00,-73.995305,40.725196,-74.016585,40.708135,1
451,52.5,2014-11-24 22:40:00+00:00,-74.005060,40.720610,-73.838890,40.663380,2
452,7.5,2014-12-09 07:19:44+00:00,-73.985858,40.761979,-73.973105,40.764185,2
453,6.5,2014-12-27 15:34:00+00:00,-74.006780,40.735680,-74.003780,40.726100,4


In [25]:
def preprocess_solution(min_date:str = '2009-01-01', max_date:str = '2015-01-01') -> None:
    """
    - Query the raw dataset from Le Wagon's BigQuery dataset
    - Cache query result as a local CSV if it doesn't exist locally
    - Process query data
    - Store processed data on your personal BQ (truncate existing table if it exists)
    - No need to cache processed data as CSV (it will be cached when queried back from BQ during training)
    """

    print(Fore.MAGENTA + "\n ⭐️ Use case: preprocess" + Style.RESET_ALL)

    # Query raw data from BigQuery using `get_data_with_cache`
    min_date = parse(min_date).strftime('%Y-%m-%d') # e.g '2009-01-01'
    max_date = parse(max_date).strftime('%Y-%m-%d') # e.g '2009-01-01'

    query = f"""
        SELECT {",".join(COLUMN_NAMES_RAW)}
        FROM `{GCP_PROJECT_WAGON}`.{BQ_DATASET}.raw_{DATA_SIZE}
        WHERE pickup_datetime BETWEEN '{min_date}' AND '{max_date}'
        ORDER BY pickup_datetime
    """

    # $CHA_BEGIN
    # Retrieve data using `get_data_with_cache`
    data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath("raw", f"query_{min_date}_{max_date}_{DATA_SIZE}.csv")
    data_query = get_data_with_cache(
        query=query,
        gcp_project=GCP_PROJECT,
        cache_path=data_query_cache_path,
        data_has_header=True
    )
    # $CHA_END

    # Process data
    # $CHA_BEGIN
    data_clean = clean_data(data_query)

    X = data_clean.drop("fare_amount", axis=1)
    y = data_clean[["fare_amount"]]

    X_processed = preprocess_features(X)

    # $CHA_END
    # Load a DataFrame onto BigQuery containing [pickup_datetime, X_processed, y]
    # using data.load_data_to_bq()
    # $CHA_BEGIN
    data_processed_with_timestamp = pd.DataFrame(np.concatenate((
        data_clean[["pickup_datetime"]],
        X_processed,
        y,
    ), axis=1))

    load_data_to_bq(
        data_processed_with_timestamp,
        gcp_project=GCP_PROJECT,
        bq_dataset=BQ_DATASET,
        table=f'processed_{DATA_SIZE}',
        truncate=True
    )
    return data_processed_with_timestamp
    # $CHA_END

    print("✅ preprocess() done \n")

In [28]:
data_solution = preprocess_solution()
data_solution

[35m
 ⭐️ Use case: preprocess[0m
[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (455, 7)
✅ data cleaned
[34m
Preprocessing features...[0m
✅ X_processed, with shape (447, 65)
[34m
Save data to BigQuery @ wagon-paris-1812.taxifare.processed_1k...:[0m
✅ Data saved to bigquery, with shape (447, 67)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
0,2009-01-15 09:22:39+00:00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.9
1,2009-01-20 10:05:35+00:00,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.1
2,2009-01-25 03:17:05+00:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.6
3,2009-01-26 18:18:38+00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.3
4,2009-01-31 03:59:43+00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,38.200001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,2014-11-24 19:37:20+00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.5
443,2014-11-24 22:40:00+00:00,0.142857,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.5
444,2014-12-09 07:19:44+00:00,0.142857,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.5
445,2014-12-27 15:34:00+00:00,0.428571,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.5


In [37]:
data_solution[[0]]

Unnamed: 0,0
0,2009-01-15 09:22:39+00:00
1,2009-01-20 10:05:35+00:00
2,2009-01-25 03:17:05+00:00
3,2009-01-26 18:18:38+00:00
4,2009-01-31 03:59:43+00:00
...,...
442,2014-11-24 19:37:20+00:00
443,2014-11-24 22:40:00+00:00
444,2014-12-09 07:19:44+00:00
445,2014-12-27 15:34:00+00:00


In [36]:
data

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,8.9,2009-01-15 09:22:39+00:00,-73.955013,40.780784,-73.964862,40.768096,1
1,4.1,2009-01-20 10:05:35+00:00,-73.983992,40.740018,-73.975250,40.749208,1
2,10.6,2009-01-25 03:17:05+00:00,-73.987612,40.749694,-73.950233,40.780101,1
3,8.3,2009-01-26 18:18:38+00:00,-74.011112,40.713399,-74.001300,40.721281,1
4,38.2,2009-01-31 03:59:43+00:00,-73.999552,40.728320,-73.926824,40.864947,1
...,...,...,...,...,...,...,...
450,12.5,2014-11-24 19:37:20+00:00,-73.995305,40.725196,-74.016585,40.708135,1
451,52.5,2014-11-24 22:40:00+00:00,-74.005060,40.720610,-73.838890,40.663380,2
452,7.5,2014-12-09 07:19:44+00:00,-73.985858,40.761979,-73.973105,40.764185,2
453,6.5,2014-12-27 15:34:00+00:00,-74.006780,40.735680,-74.003780,40.726100,4


In [38]:
def preprocess(min_date:str = '2009-01-01', max_date:str = '2015-01-01') -> None:
    """
    - Query the raw dataset from Le Wagon's BigQuery dataset
    - Cache query result as a local CSV if it doesn't exist locally
    - Process query data
    - Store processed data on your personal BQ (truncate existing table if it exists)
    - No need to cache processed data as CSV (it will be cached when queried back from BQ during training)
    """

    print(Fore.MAGENTA + "\n ⭐️ Use case: preprocess" + Style.RESET_ALL)

    # Query raw data from BigQuery using `get_data_with_cache`
    min_date = parse(min_date).strftime('%Y-%m-%d') # e.g '2009-01-01'
    max_date = parse(max_date).strftime('%Y-%m-%d') # e.g '2009-01-01'

    query = f"""
        SELECT {",".join(COLUMN_NAMES_RAW)}
        FROM `{GCP_PROJECT_WAGON}`.{BQ_DATASET}.raw_{DATA_SIZE}
        WHERE pickup_datetime BETWEEN '{min_date}' AND '{max_date}'
        ORDER BY pickup_datetime
    """

    # $CHA_BEGIN
    # Retrieve data using `get_data_with_cache`
    data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath("raw", f"query_{min_date}_{max_date}_{DATA_SIZE}.csv")
    data_query = get_data_with_cache(
        query=query,
        gcp_project=GCP_PROJECT,
        cache_path=data_query_cache_path,
        data_has_header=True
    )
    # $CHA_END

    # Process data
    # $CHA_BEGIN
    data_clean = clean_data(data_query)

    X = data_clean.drop("fare_amount", axis=1)
    y = data_clean[["fare_amount"]]

    X_processed = preprocess_features(X)

    # $CHA_END
    # Load a DataFrame onto BigQuery containing [pickup_datetime, X_processed, y]
    # using data.load_data_to_bq()
    # $CHA_BEGIN
    data_processed_with_timestamp = pd.DataFrame(np.concatenate((
        data_clean[["pickup_datetime"]],
        X_processed,
        y,
    ), axis=1))

    load_data_to_bq(
        data_processed_with_timestamp,
        gcp_project=GCP_PROJECT,
        bq_dataset=BQ_DATASET,
        table=f'processed_{DATA_SIZE}',
        truncate=True
    )
    # $CHA_END

    print("✅ preprocess() done \n")

In [39]:
preprocess()

[35m
 ⭐️ Use case: preprocess[0m
[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (455, 7)
✅ data cleaned
[34m
Preprocessing features...[0m
✅ X_processed, with shape (447, 65)
[34m
Save data to BigQuery @ wagon-paris-1812.taxifare.processed_1k...:[0m
✅ Data saved to bigquery, with shape (447, 67)
✅ preprocess() done 



In [40]:
# FOR TRAIN
def preprocess(min_date:str = '2009-01-01', max_date:str = '2015-01-01') -> None:
    """
    - Query the raw dataset from Le Wagon's BigQuery dataset
    - Cache query result as a local CSV if it doesn't exist locally
    - Process query data
    - Store processed data on your personal BQ (truncate existing table if it exists)
    - No need to cache processed data as CSV (it will be cached when queried back from BQ during training)
    """
    print(Fore.MAGENTA + "\n ⭐️ Use case: preprocess" + Style.RESET_ALL)

    # Query raw data from BigQuery using `get_data_with_cache`
    min_date = parse(min_date).strftime('%Y-%m-%d') # e.g '2009-01-01'
    max_date = parse(max_date).strftime('%Y-%m-%d') # e.g '2009-01-01'

    query = f"""
        SELECT {",".join(COLUMN_NAMES_RAW)}
        FROM `{GCP_PROJECT_WAGON}`.{BQ_DATASET}.raw_{DATA_SIZE}
        WHERE pickup_datetime BETWEEN '{min_date}' AND '{max_date}'
        ORDER BY pickup_datetime
    """

    # $CHA_BEGIN
    # Retrieve data using `get_data_with_cache`
    data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath("raw", f"query_{min_date}_{max_date}_{DATA_SIZE}.csv")
    data_query = get_data_with_cache(
        query=query,
        gcp_project=GCP_PROJECT,
        cache_path=data_query_cache_path,
        data_has_header=True
    )
    # $CHA_END

    # Process data
    # $CHA_BEGIN
    data_clean = clean_data(data_query)

    X = data_clean.drop("fare_amount", axis=1)
    y = data_clean[["fare_amount"]]

    X_processed = preprocess_features(X)

    # $CHA_END
    # Load a DataFrame onto BigQuery containing [pickup_datetime, X_processed, y]
    # using data.load_data_to_bq()
    # $CHA_BEGIN
    data_processed_with_timestamp = pd.DataFrame(np.concatenate((
        data_clean[["pickup_datetime"]],
        X_processed,
        y,
    ), axis=1))

    load_data_to_bq(
        data_processed_with_timestamp,
        gcp_project=GCP_PROJECT,
        bq_dataset=BQ_DATASET,
        table=f'processed_{DATA_SIZE}',
        truncate=True
    )
    # $CHA_END

    print("✅ preprocess() done \n")
    return X_processed,y

In [50]:
X_processed, y = preprocess()
data_cleaned = pd.DataFrame(X_processed).merge(y, left_index=True, right_index=True)

[35m
 ⭐️ Use case: preprocess[0m
[34m
Load data from local CSV...[0m
✅ Data loaded, with shape (455, 7)
✅ data cleaned
[34m
Preprocessing features...[0m
✅ X_processed, with shape (447, 65)
[34m
Save data to BigQuery @ wagon-paris-1812.taxifare.processed_1k...:[0m
✅ Data saved to bigquery, with shape (447, 67)
✅ preprocess() done 



In [51]:
data_cleaned

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,fare_amount
0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.900000
1,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.100000
2,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.600000
3,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.300000
4,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,38.200001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.000000
443,0.142857,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.500000
444,0.142857,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.500000
445,0.428571,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.500000
