# Interacting with the Data Lake files

### Install the cloud provider Python libraries

- Google Cloud Storage file system (`gcsfs`), which lets pandas read `gs://` URLs directly
  
```bash
pip install gcsfs
```

In [1]:
import os
import pandas as pd

In [2]:
# Data lake settings used by every cell below:
#   PROJECT   - Google Cloud project name, taken from the environment
#               (None when GOOGLE_PROJECT_NAME is not set)
#   BUCKET    - gs:// URL of the data lake bucket
#   CONTAINER - path prefix inside the bucket where the files are stored
PROJECT = os.environ.get('GOOGLE_PROJECT_NAME')
BUCKET = 'gs://ozkary_data_lake_ozkary-de-101'
CONTAINER = 'turnstile'

## Use the Google Cloud API

In [19]:
# Query the bucket using the GCS API and display the file names and sizes
from google.cloud import storage


def files_in_storage(prefix=None) -> None:
    """Print the name and size of every object in the data lake bucket.

    Args:
        prefix: Optional object-name prefix (for example ``f'{CONTAINER}/'``)
            used to restrict the listing. The default ``None`` lists the
            whole bucket, which is the original behavior.
    """
    # BUCKET is a gs:// URL, but the storage client expects the bare bucket
    # name. Drop the scheme and keep only the first path segment so that a
    # trailing slash or a sub-path in BUCKET does not break the lookup.
    bucket_name = BUCKET.split('://', 1)[-1].strip('/').split('/')[0]

    storage_client = storage.Client(project=PROJECT)
    bucket = storage_client.get_bucket(bucket_name)

    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name, blob.size)


files_in_storage()

turnstile/241005.csv.gz 2619686
turnstile/241012.csv.gz 2666464
turnstile/241019.csv.gz 2633386


## View a data sample

In [17]:
def show_data(path: str, rows: int = 10) -> None:
    """Print the GCS URL of a gzip-compressed CSV file and its first rows.

    Args:
        path: Object path inside the bucket, e.g. ``'turnstile/241005.csv.gz'``.
        rows: Number of leading rows to read and print (defaults to 10).
    """
    # Build the GCS URL from the bucket URL and the object path. Stripping a
    # leading slash from the path avoids a double slash in the URL.
    gcs_url = f"{BUCKET}/{path.lstrip('/')}"
    print(gcs_url)

    # Parse only the rows that are going to be displayed instead of the whole
    # file. index_col=0 uses the first CSV column as the row index.
    df = pd.read_csv(gcs_url, compression="gzip", index_col=0, nrows=rows)

    # Display the sample
    print(df.head(rows))


# Preview one file from the container
PATH = f'{CONTAINER}/241005.csv.gz'
show_data(PATH)

gs://ozkary_data_lake_ozkary-de-101/turnstile/241005.csv.gz
     CA  UNIT       SCP STATION LINENAME DIVISION        DATE      TIME  \
0  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  00:00:00   
1  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  04:00:00   
2  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  08:00:00   
3  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  12:00:00   
4  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  16:00:00   
5  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  20:00:00   
6  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/29/2024  00:00:00   
7  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/29/2024  04:00:00   
8  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/29/2024  08:00:00   
9  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/29/2024  12:00:00   

         DESC  ENTRIES   EXITS  
0  RECOVR AUD   216611  166467  
1  RECOVR AUD   216622  166475  
2  RECOVR AUD  

## Monitor your orchestration system


In [8]:
# Check the Prefect system from the notebook using the Prefect CLI
# (each line below is a shell command run through IPython's `!` syntax).

# List the flow runs
!prefect flow-run ls

# List the deployments
!prefect deployment ls

# List the blocks
!prefect block ls


  "class": algorithms.Blowfish,
[32mNo flow runs found.[0m
  "class": algorithms.Blowfish,
[3m                                  Deployments                                  [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mName                                [0m[1m [0m┃[1m [0m[1mID                                  [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│[34m [0m[34mMTA Batch flow/[0m[1;34mdep-docker-mta-de-101[0m[34m [0m│[36m [0m[36m32f03a73-ebee-47d4-96af-405c20b7b76d[0m[36m [0m│
│[34m [0m[34mMTA Test/[0m[1;34mprefect-test-deployment[0m[34m    [0m[34m [0m│[36m [0m[36mfe26904d-f3a0-40af-bead-863320ad5cad[0m[36m [0m│
└──────────────────────────────────────┴──────────────────────────────────────┘
  "class": algorithms.Blowfish,
[3m                                     Blocks                                     [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━

## Run the data pipeline process locally

- Run this command from a terminal to start the pipeline

```bash
python3 ./flows/etl_web_to_gcs.py --year 2024 --month 10 --day 5
```

In [12]:
# List the flow runs again to check the runs started by the pipeline
!prefect flow-run ls


  "class": algorithms.Blowfish,
[3m                                   Flow Runs                                    [0m
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━┳━━━━━━━━━┓
┃[1m [0m[1m                            ID[0m[1m [0m┃[1m [0m[1mFlow            [0m[1m [0m┃[1m [0m[1mName      [0m[1m [0m┃[1m [0m[1m…[0m[1m [0m┃[1m [0m[1mWhen   [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━╇━━━━━━━━━┩
│[36m [0m[36m9333211f-df7e-4277-ac26-d5ee1…[0m[36m [0m│[34m [0m[34mMTA : etl_web_t…[0m[34m [0m│[32m [0m[32mgiga-star…[0m[32m [0m│ … │[1m [0m[1m1 minu…[0m[1m [0m│
│[36m [0m[36m46c57f04-c0f1-4e78-93d0-a5f8f…[0m[36m [0m│[34m [0m[34mMTA Batch flow  [0m[34m [0m│[32m [0m[32mmassive-c…[0m[32m [0m│ … │[1m [0m[1m1 minu…[0m[1m [0m│
└────────────────────────────────┴──────────────────┴────────────┴───┴─────────┘


In [13]:
# Check the files in the data lake: prints the name and size of each object
files_in_storage()

turnstile/241005.csv.gz 2619686


In [18]:
# Show the new file content: build the object path inside the container,
# then print the GCS URL and the first rows of that file
file_path = f'{CONTAINER}/241005.csv.gz'
show_data(file_path)

gs://ozkary_data_lake_ozkary-de-101/turnstile/241005.csv.gz
     CA  UNIT       SCP STATION LINENAME DIVISION        DATE      TIME  \
0  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  00:00:00   
1  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  04:00:00   
2  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  08:00:00   
3  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  12:00:00   
4  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  16:00:00   
5  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/28/2024  20:00:00   
6  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/29/2024  00:00:00   
7  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/29/2024  04:00:00   
8  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/29/2024  08:00:00   
9  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/29/2024  12:00:00   

         DESC  ENTRIES   EXITS  
0  RECOVR AUD   216611  166467  
1  RECOVR AUD   216622  166475  
2  RECOVR AUD  