# Interacting with the Data Lake files

## Install the cloud provider Python libraries

pandas relies on the `gcsfs` package to read `gs://` URLs directly from Google Cloud Storage.

```bash

pip install gcsfs

```

In [2]:
import os
import pandas as pd

In [3]:
# Data lake settings shared by the cells below.
# Google Cloud project, read from the environment (None when the variable is unset)
PROJECT = os.environ.get('GOOGLE_PROJECT_NAME')
# Bucket URL, including the gs:// scheme
BUCKET = 'gs://ozkary_data_lake_ozkary-de-101'
# Folder (object-name prefix) inside the bucket used for the turnstile files
CONTAINER = 'turnstile'

In [3]:
# show the files in the bucket
!gsutil ls -l $BUCKET/turnstile



Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update

   2733534  2024-01-30T16:03:09Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231104.csv.gz
   2766446  2024-01-30T16:04:16Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231111.csv.gz
   2739833  2024-01-30T16:05:17Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231118.csv.gz
   2697420  2024-01-30T16:06:19Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231125.csv.gz
   2716923  2024-01-30T16:35:34Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231202.csv.gz
   2727822  2024-01-30T16:37:01Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231209.csv.gz
   2714129  2024-01-30T16:38:09Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231216.csv.gz
   2718102  2024-01-30T16:39:08Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231223.csv.gz
   2709209  2024-01-30T16:40:17Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231230.csv.gz
   2704536  2024-03-01T21:41:15Z

## Use the Google Cloud API

In [4]:
# query the bucket using GCS API and display the file names and sizes
from google.cloud import storage
def files_in_storage(prefix=None):
    """Print the name and size of the objects stored in the data lake bucket.

    Args:
        prefix: Optional object-name prefix (for example ``CONTAINER``) that
            limits the listing. The default ``None`` lists every object in
            the bucket, which is what the function did before this parameter
            was added.
    """
    # BUCKET is a gs:// URL, but the client expects the bare bucket name.
    # Drop the scheme and keep only the first path segment, so a trailing
    # slash or an extra path after the bucket name cannot produce an empty
    # or wrong bucket name.
    bucket_name = BUCKET.split('://', 1)[-1].split('/', 1)[0]

    storage_client = storage.Client(project=PROJECT)
    bucket = storage_client.get_bucket(bucket_name)

    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name, blob.size)

files_in_storage()

turnstile/240217.csv.gz 2715438
turnstile/240224.csv.gz 2693738


## View a data sample

In [5]:
def show_data(path: str):
    """Print the first 10 rows of a gzip-compressed CSV file stored in the bucket.

    Args:
        path: Location of the file relative to BUCKET,
            for example 'turnstile/240224.csv.gz'.
    """
    # Build the full GCS URL and echo it so the output shows which file was read
    file_url = f'{BUCKET}/{path}'
    print(file_url)

    # pandas reads straight from the URL; the first CSV column becomes the index
    frame = pd.read_csv(
        file_url,
        index_col=0,
        compression="gzip",
        iterator=False,
    )

    # Only a small sample is displayed, not the whole frame
    sample = frame.head(10)
    print(sample)

# preview one of the files returned by the bucket listing above
PATH = f'{CONTAINER}/240224.csv.gz'
show_data(PATH)

gs://ozkary_data_lake_ozkary-de-101/turnstile/240224.csv.gz
     CA  UNIT       SCP STATION LINENAME DIVISION        DATE      TIME  \
0  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/17/2024  03:00:00   
1  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/17/2024  07:00:00   
2  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/17/2024  11:00:00   
3  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/17/2024  15:00:00   
4  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/17/2024  19:00:00   
5  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/17/2024  23:00:00   
6  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/18/2024  03:00:00   
7  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/18/2024  07:00:00   
8  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/18/2024  11:00:00   
9  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/18/2024  15:00:00   

      DESC  ENTRIES  EXITS  
0  REGULAR   125307  92102  
1  REGULAR   125311  92112  
2  REGULAR   125341  92181 

## Monitor your orchestration system


In [8]:
# check the prefect system
# list flow runs
!prefect flow-run ls

# list deployments
!prefect deployment ls

# list blocks
!prefect block ls


  "class": algorithms.Blowfish,
[32mNo flow runs found.[0m
  "class": algorithms.Blowfish,
[3m                                  Deployments                                  [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mName                                [0m[1m [0m┃[1m [0m[1mID                                  [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│[34m [0m[34mMTA Batch flow/[0m[1;34mdep-docker-mta-de-101[0m[34m [0m│[36m [0m[36m32f03a73-ebee-47d4-96af-405c20b7b76d[0m[36m [0m│
│[34m [0m[34mMTA Test/[0m[1;34mprefect-test-deployment[0m[34m    [0m[34m [0m│[36m [0m[36mfe26904d-f3a0-40af-bead-863320ad5cad[0m[36m [0m│
└──────────────────────────────────────┴──────────────────────────────────────┘
  "class": algorithms.Blowfish,
[3m                                     Blocks                                     [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━

## Run the data pipeline process locally

```bash
python3 ./flows/etl_web_to_gcs.py --year 2024 --month 2 --day 24
```

In [9]:
# list the flow runs again after executing the pipeline locally
!prefect flow-run ls


  "class": algorithms.Blowfish,
[3m                                   Flow Runs                                    [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━┳━━━━━━━━━━┓
┃[1m [0m[1m                            ID[0m[1m [0m┃[1m [0m[1mFlow            [0m[1m [0m┃[1m [0m[1mName     [0m[1m [0m┃[1m [0m[1m…[0m[1m [0m┃[1m [0m[1mWhen    [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━╇━━━━━━━━━━┩
│[36m [0m[36m58309890-9999-4149-96e7-0936d…[0m[36m [0m│[34m [0m[34mMTA : etl_web_t…[0m[34m [0m│[32m [0m[32mjumping-…[0m[32m [0m│ … │[1m [0m[1m42 seco…[0m[1m [0m│
│[36m [0m[36ma5303819-1299-4557-9d33-a4c46…[0m[36m [0m│[34m [0m[34mMTA Batch flow  [0m[34m [0m│[32m [0m[32mdaft-wasp[0m[32m [0m│ … │[1m [0m[1m44 seco…[0m[1m [0m│
└────────────────────────────────┴──────────────────┴───────────┴───┴──────────┘


In [10]:
# list the data lake files again; the file written by the pipeline run should now appear
files_in_storage()

turnstile/240217.csv.gz 2715438
turnstile/240224.csv.gz 2693738
turnstile/240302.csv.gz 2717306


In [11]:
# preview the content of the file added by the pipeline run
file_path = f'{CONTAINER}/240302.csv.gz'
show_data(file_path)

gs://ozkary_data_lake_ozkary-de-101/turnstile/240302.csv.gz
     CA  UNIT       SCP STATION LINENAME DIVISION        DATE      TIME  \
0  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  03:00:00   
1  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  07:00:00   
2  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  11:00:00   
3  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  15:00:00   
4  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  19:00:00   
5  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/24/2024  23:00:00   
6  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/25/2024  03:00:00   
7  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/25/2024  07:00:00   
8  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/25/2024  11:00:00   
9  A002  R051  02-00-00   59 ST  NQR456W      BMT  02/25/2024  15:00:00   

      DESC  ENTRIES  EXITS  
0  REGULAR   128436  94650  
1  REGULAR   128443  94666  
2  REGULAR   128479  94733 