# Interacting with the Data Lake files

## Install the cloud provider Python libraries

```bash

pip install gcsfs google-cloud-storage pandas

```

In [1]:
import os
import pandas as pd

In [3]:
# Data lake location on GCS: project id, bucket URL, and folder prefix
PROJECT = os.environ.get("GOOGLE_PROJECT_NAME")  # None when the variable is unset
BUCKET = "gs://ozkary_data_lake_ozkary-de-101"
CONTAINER = "turnstile"  # path prefix of the files inside the bucket

## Use the Google Cloud API

In [7]:
# query the bucket using GCS API and display the file names and sizes
from google.cloud import storage


def files_in_storage(prefix=None):
    """Print the name and size of every object in the data lake bucket.

    Args:
        prefix: Optional object-name prefix (for example ``'turnstile/'``)
            that restricts the listing. The default, ``None``, lists every
            object in the bucket.

    Returns:
        None. One line per object is written to stdout: ``<name> <size>``.
    """
    # Derive the bucket name from BUCKET whether or not it carries a scheme,
    # a trailing slash, or a path after the name. Taking the last '/'-separated
    # piece would yield '' or a folder name in those cases.
    bucket_name = BUCKET.split('://', 1)[-1].strip('/').split('/', 1)[0]

    storage_client = storage.Client(project=PROJECT)
    bucket = storage_client.get_bucket(bucket_name)
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name, blob.size)


files_in_storage()

turnstile/240302.csv.gz 2717306
turnstile/240309.csv.gz 2756893
turnstile/240316.csv.gz 2698860
turnstile/batch_23_20240722_210838.csv.gz 372
turnstile/batch_24_20240722_211013.csv.gz 1231
turnstile/batch_26_20240724_124515.csv.gz 738


## View a data sample

In [5]:
def show_data(path: str, rows: int = 10):
    """Print the GCS URL of a gzip-compressed CSV file and its first rows.

    Args:
        path: Object path inside BUCKET, e.g. ``'turnstile/240309.csv.gz'``.
        rows: Number of leading rows to read and print (default 10).

    Returns:
        None. The URL and the sample rows are written to stdout.
    """
    # Specify the GCS URL with the path to your file in your GCS bucket
    gcs_url = f'{BUCKET}/{path}'
    print(gcs_url)

    # Use Pandas to read data from the GCS URL. nrows stops parsing once the
    # preview rows are read, so the whole file is not loaded just to show a
    # sample.
    df = pd.read_csv(gcs_url, compression="gzip", index_col=0, nrows=rows)

    # Display the sample rows
    print(df.head(rows))

PATH = f'{CONTAINER}/240309.csv.gz'
show_data(PATH)

gs://ozkary_data_lake_ozkary-de-101/turnstile/240309.csv.gz
     CA  UNIT       SCP STATION LINENAME DIVISION        DATE      TIME  \
0  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/02/2024  03:00:00   
1  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/02/2024  07:00:00   
2  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/02/2024  11:00:00   
3  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/02/2024  15:00:00   
4  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/02/2024  19:00:00   
5  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/02/2024  23:00:00   
6  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/03/2024  03:00:00   
7  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/03/2024  07:00:00   
8  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/03/2024  11:00:00   
9  A002  R051  02-00-00   59 ST  NQR456W      BMT  03/03/2024  15:00:00   

      DESC  ENTRIES  EXITS  
0  REGULAR   131997  97773  
1  REGULAR   131999  97789  
2  REGULAR   132025  97859 

## Monitor your orchestration system


In [8]:
# check the prefect system
# list the flow runs
!prefect flow-run ls

# list the deployments
!prefect deployment ls

# list the blocks
!prefect block ls


  "class": algorithms.Blowfish,
[32mNo flow runs found.[0m
  "class": algorithms.Blowfish,
[3m                                  Deployments                                  [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mName                                [0m[1m [0m┃[1m [0m[1mID                                  [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│[34m [0m[34mMTA Batch flow/[0m[1;34mdep-docker-mta-de-101[0m[34m [0m│[36m [0m[36m32f03a73-ebee-47d4-96af-405c20b7b76d[0m[36m [0m│
│[34m [0m[34mMTA Test/[0m[1;34mprefect-test-deployment[0m[34m    [0m[34m [0m│[36m [0m[36mfe26904d-f3a0-40af-bead-863320ad5cad[0m[36m [0m│
└──────────────────────────────────────┴──────────────────────────────────────┘
  "class": algorithms.Blowfish,
[3m                                     Blocks                                     [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━

## Run the data pipeline process locally

```bash
python3 ./flows/etl_web_to_gcs.py --year 2024 --month 2 --day 24
```

In [6]:
# list the flow runs to check on the pipeline run started above
!prefect flow-run ls


  "class": algorithms.Blowfish,
[3m                                   Flow Runs                                    [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━┳━━━━━━━━━┓
┃[1m [0m[1m                            ID[0m[1m [0m┃[1m [0m[1mFlow            [0m[1m [0m┃[1m [0m[1mName      [0m[1m [0m┃[1m [0m[1m…[0m[1m [0m┃[1m [0m[1mWhen   [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━╇━━━━━━━━━┩
│[36m [0m[36mbb930f81-576c-4e93-9a76-9a945…[0m[36m [0m│[34m [0m[34mMTA : etl_web_t…[0m[34m [0m│[32m [0m[32mdivergent…[0m[32m [0m│ … │[1m [0m[1m36 sec…[0m[1m [0m│
│[36m [0m[36mc8f3cdea-b1ca-490f-878e-69955…[0m[36m [0m│[34m [0m[34mMTA Batch flow  [0m[34m [0m│[32m [0m[32mspeedy-pu…[0m[32m [0m│ … │[1m [0m[1m38 sec…[0m[1m [0m│
└────────────────────────────────┴──────────────────┴────────────┴───┴─────────┘


In [10]:
# list the files in the data lake again to see what the pipeline run added
files_in_storage()

turnstile/240217.csv.gz 2715438
turnstile/240224.csv.gz 2693738
turnstile/240302.csv.gz 2717306


In [11]:
# preview the first rows of a file from the updated data lake listing
file_path = f'{CONTAINER}/240302.csv.gz'
show_data(file_path)

gs://ozkary_data_lake_ozkary-de-101/turnstile/240302.csv.gz
     CA  UNIT       SCP STATION LINENAME DIVISION        DATE      TIME  \
0  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  03:00:00   
1  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  07:00:00   
2  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  11:00:00   
3  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  15:00:00   
4  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  19:00:00   
5  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  23:00:00   
6  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/25/2024  03:00:00   
7  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/25/2024  07:00:00   
8  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/25/2024  11:00:00   
9  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/25/2024  15:00:00   

      DESC  ENTRIES  EXITS  
0  REGULAR   128436  94650  
1  REGULAR   128443  94666  
2  REGULAR   128479  94733 