In [1]:
from google.cloud import storage

In [2]:
# Create the Cloud Storage client that the cells below use for every bucket and blob call.
# NOTE(review): no project or credentials are passed, so they presumably come from the environment — confirm.
client = storage.Client()

In [3]:
# Materialise the bucket iterator so the notebook displays every bucket this client can list.
list(client.list_buckets())

[<Bucket: e2e-gcp-data-lake>]

In [4]:
# Get a handle to the data-lake bucket; `bucket` is reused by the upload, list and delete cells below.
bucket = client.get_bucket("e2e-gcp-data-lake")

In [5]:
# List the objects in the bucket before uploading anything (the recorded output is an empty list).
list(bucket.list_blobs())

[]

In [6]:
# Build a blob handle for the destination object name inside the bucket.
blob = bucket.blob("datasets/cards/deckofcards.txt")

In [7]:
# Upload the local cards file to the object referenced by `blob`.
blob.upload_from_filename("../datasets/cards/deckofcards.txt")

In [8]:
# List the bucket again to confirm the upload (the recorded output now shows the new object).
list(bucket.list_blobs())

[<Blob: e2e-gcp-data-lake, datasets/cards/deckofcards.txt, 1703562042218766>]

In [10]:
# Download the object, addressed by its gs:// URI, into a local file opened in binary-write mode.
# NOTE: open() does not create missing directories, so ./downloads must already exist.
with open('./downloads/deckofcards-gcs.txt', 'wb') as file_obj:
    client.download_blob_to_file("gs://e2e-gcp-data-lake/datasets/cards/deckofcards.txt", file_obj)

In [11]:
# Delete the object uploaded above; the cells below upload the whole cards folder again.
bucket.delete_blob("datasets/cards/deckofcards.txt")

In [12]:
import os
import glob

In [13]:
def get_file_names(folder_path, suffix="txt"):
    """Return the regular files under ``folder_path`` whose path ends with ``suffix``.

    The folder is searched recursively. Directories are skipped, even when
    their own name ends with ``suffix``.

    Args:
        folder_path: Root folder to search.
        suffix: Plain string each returned path must end with. The default
            ``"txt"`` keeps the original behaviour; it is a suffix match, not
            an extension check, so pass ``".txt"`` to require the dot.

    Returns:
        A sorted list of matching paths, each prefixed with ``folder_path``.
        Empty when nothing matches or the folder does not exist.
    """
    candidates = glob.glob(f"{folder_path}/**", recursive=True)
    # glob makes no ordering promise; sorting keeps the upload order stable across runs.
    return sorted(
        path for path in candidates
        if os.path.isfile(path) and path.endswith(suffix)
    )

In [14]:
# Local root folder that is scanned for files to upload.
base_dir = "../datasets/cards"

In [15]:
# Collect the local file paths found under base_dir by get_file_names.
files = get_file_names(base_dir)

In [16]:
# Upload every collected file, mirroring its location under base_dir below the
# "datasets/cards/" prefix in the bucket.
for f in files:
    # Derive the object key from the path relative to base_dir instead of slicing a
    # fixed number of "/"-separated components, so this keeps working if base_dir
    # changes depth. Object names always use "/", whatever the local separator is.
    suffix = os.path.relpath(f, base_dir).replace(os.sep, "/")
    blob = bucket.blob(f"datasets/cards/{suffix}")
    blob.upload_from_filename(f)

In [17]:
# List only the objects under the datasets/cards prefix to verify the bulk upload.
list(bucket.list_blobs(prefix="datasets/cards"))

[<Blob: e2e-gcp-data-lake, datasets/cards/deckofcards.txt, 1703562130258645>,
 <Blob: e2e-gcp-data-lake, datasets/cards/smalldecks/deckofcards.txt, 1703562137407685>]

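To list the uploaded objects recursively from the shell, run the command below, replacing `<path>` with the object prefix (for example `datasets/cards`):

`!gsutil ls -r gs://e2e-gcp-data-lake/<path>/`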

Hands On

In [18]:
import pandas as pd
import numpy as np

In [19]:
# Column labels applied to the orders file when it is read below (passed as `names=`).
# NOTE(review): this presumes the file has no header row — confirm.
col_names = ['order_id', 'order_date', 'order_customer_id', 'order_status']

In [21]:
# Read the orders file directly from its gs:// URL, labelling the columns with col_names.
# NOTE(review): gs:// paths rely on an optional filesystem backend (gcsfs) — confirm it is installed here.
df = pd.read_csv("gs://e2e-gcp-data-lake/datasets/retail_db/orders/part-00000", names=col_names)

In [22]:
# Preview the first rows of the orders frame.
df.head()

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE


In [23]:
import json

In [24]:
# Load the schema definitions into `schemas`; get_column_names looks datasets up in this mapping.
with open("../datasets/retail_db/schemas.json") as f:
    schemas = json.load(f)

In [28]:
def get_column_names(dataset_name):
    """Return the column names of ``dataset_name`` in schema order.

    Looks the dataset up in the module-level ``schemas`` mapping, orders its
    column entries by their ``column_position`` value and returns each entry's
    ``column_name``.

    Args:
        dataset_name: Key of the dataset in ``schemas``.

    Returns:
        A list of column names sorted by ascending ``column_position``.

    Raises:
        KeyError: If ``dataset_name`` is not a key of ``schemas``.
    """
    def _position(column):
        return column['column_position']

    ordered_columns = sorted(schemas[dataset_name], key=_position)
    names = []
    for column in ordered_columns:
        names.append(column['column_name'])
    return names

In [32]:
def get_file_names(folder_path, suffix="part-00000"):
    """Return the regular files under ``folder_path`` whose path ends with ``suffix``.

    NOTE: this redefines the ``get_file_names`` from an earlier cell, which
    matched ``"txt"`` instead; from this cell on, this version is the one in
    effect.

    The folder is searched recursively and directories are skipped.

    Args:
        folder_path: Root folder to search.
        suffix: Plain string each returned path must end with. Defaults to
            ``"part-00000"``, the value this cell originally hard-coded.

    Returns:
        A sorted list of matching paths, each prefixed with ``folder_path``.
        Empty when nothing matches or the folder does not exist.
    """
    candidates = glob.glob(f"{folder_path}/**", recursive=True)
    # glob makes no ordering promise; sorting keeps the processing order stable across runs.
    return sorted(
        path for path in candidates
        if os.path.isfile(path) and path.endswith(suffix)
    )

In [41]:
# Point base_dir at the local retail_db folder (replaces the cards folder assigned earlier).
base_dir = "../datasets/retail_db"

In [42]:
# Collect the local data files found under base_dir by get_file_names.
files = get_file_names(base_dir)

In [43]:
# Destination folder name placed under datasets/ in the bucket by the conversion loop below.
# NOTE(review): "paruqet" looks like a typo for "parquet"; the read-back cell further down
# hard-codes the same spelling, so rename both together.
target_dir = "retail_db_paruqet"

In [44]:
# Convert each collected CSV file to Parquet and write it to the bucket, keeping the
# file's location relative to base_dir under datasets/<target_dir>/.
for f in files:
    # Take the path relative to base_dir instead of slicing a fixed number of
    # "/"-separated components, so this keeps working if base_dir changes depth.
    suffix = os.path.relpath(f, base_dir).replace(os.sep, "/")
    # The first component of the relative path is used as the dataset name
    # for the schema lookup.
    dataset_name = suffix.split("/")[0]
    blob_name = f"gs://e2e-gcp-data-lake/datasets/{target_dir}/{suffix}.snappy.parquet"

    df = pd.read_csv(f, names=get_column_names(dataset_name))
    # Name the codec explicitly so the ".snappy.parquet" suffix always matches the content.
    df.to_parquet(blob_name, index=False, compression="snappy")

In [47]:
# Read one of the converted files back from the bucket to check the round trip.
# The path repeats the "retail_db_paruqet" spelling assigned to target_dir above.
pd.read_parquet("gs://e2e-gcp-data-lake/datasets/retail_db_paruqet/order_items/part-00000.snappy.parquet")

Unnamed: 0,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
0,1,1,957,1,299.98,299.98
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.00,50.00
3,4,2,403,1,129.99,129.99
4,5,4,897,2,49.98,24.99
...,...,...,...,...,...,...
172193,172194,68881,403,1,129.99,129.99
172194,172195,68882,365,1,59.99,59.99
172195,172196,68882,502,1,50.00,50.00
172196,172197,68883,208,1,1999.99,1999.99
