# Interacting with the Data Lake files

### Install the cloud provider Python libraries

```bash
pip install gcsfs google-cloud-storage
```

In [1]:
import os
import pandas as pd

In [2]:
# Data lake location: GCP project id, bucket URL (its name ends with the
# project id), and the object used for the data sample
PROJECT = 'ozkary-de-101'
BUCKET = f'gs://ozkary_data_lake_{PROJECT}'
PATH = 'turnstile/240113.csv.gz'

In [3]:
# show the files in the bucket
!gsutil ls -l $BUCKET/turnstile



Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update

   2733534  2024-01-30T16:03:09Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231104.csv.gz
   2766446  2024-01-30T16:04:16Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231111.csv.gz
   2739833  2024-01-30T16:05:17Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231118.csv.gz
   2697420  2024-01-30T16:06:19Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231125.csv.gz
   2716923  2024-01-30T16:35:34Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231202.csv.gz
   2727822  2024-01-30T16:37:01Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231209.csv.gz
   2714129  2024-01-30T16:38:09Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231216.csv.gz
   2718102  2024-01-30T16:39:08Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231223.csv.gz
   2709209  2024-01-30T16:40:17Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/231230.csv.gz
   2704536  2024-03-01T21:41:15Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/240106.csv.gz

## Use the Google Cloud API

In [5]:
# Query the bucket through the GCS client library and print each object's
# name and size.
from urllib.parse import urlparse

from google.cloud import storage

# BUCKET is a gs:// URL, but the client expects the bare bucket name. Taking
# the URL's host part keeps working if BUCKET ever gains a trailing slash or
# an object path, where split('/')[-1] would yield '' or a path segment.
bucket_name = urlparse(BUCKET).netloc

storage_client = storage.Client(project=PROJECT)
bucket = storage_client.get_bucket(bucket_name)
files = bucket.list_blobs()
for blob in files:
    print(blob.name, blob.size)

turnstile/231104.csv.gz 2733534
turnstile/231111.csv.gz 2766446
turnstile/231118.csv.gz 2739833
turnstile/231125.csv.gz 2697420
turnstile/231202.csv.gz 2716923
turnstile/231209.csv.gz 2727822
turnstile/231216.csv.gz 2714129
turnstile/231223.csv.gz 2718102
turnstile/231230.csv.gz 2709209
turnstile/240106.csv.gz 2704536
turnstile/240113.csv.gz 2714553
turnstile/240120.csv.gz 2700670
turnstile/240127.csv.gz 2709508
turnstile/240203.csv.gz 2719956
turnstile/240210.csv.gz 2694380
turnstile/batch_19_20240130_155844.csv.gz 250
turnstile/batch_20_20240130_160006.csv.gz 415
turnstile/batch_22_20240130_161658.csv.gz 250


## View a data sample

In [4]:

# Build the full gs:// URL of the sample object and echo it for reference
gcs_url = f'{BUCKET}/{PATH}'
print(gcs_url)

# Load the gzip-compressed CSV from that URL into a DataFrame, using the
# file's first column as the row index
df = pd.read_csv(
    gcs_url,
    compression="gzip",
    index_col=0,
)

# Preview the first 10 rows
df.head(10)


gs://ozkary_data_lake_ozkary-de-101/turnstile/240113.csv.gz


Unnamed: 0,CA,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/06/2024,03:00:00,REGULAR,105161,76217
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/06/2024,07:00:00,REGULAR,105165,76228
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/06/2024,11:00:00,REGULAR,105198,76278
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/06/2024,15:00:00,REGULAR,105257,76326
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/06/2024,19:00:00,REGULAR,105357,76385
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/06/2024,23:00:00,REGULAR,105408,76427
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/07/2024,03:00:00,REGULAR,105413,76434
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/07/2024,07:00:00,REGULAR,105414,76443
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/07/2024,11:00:00,REGULAR,105441,76476
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,01/07/2024,15:00:00,REGULAR,105487,76531
