# Interacting with the Data Lake files

### Install the cloud provider Python libraries

pandas needs the `gcsfs` package to read `gs://` URLs directly.

```bash

pip install gcsfs

```

In [1]:
import os

import pandas as pd
from google.cloud import storage

In [2]:
# Data lake location settings used by the cells below.

# Google Cloud project id handed to the storage client.
PROJECT = 'ozkary-de-101'

# Bucket written as a gs:// URL, the form gsutil and pandas accept.
BUCKET = 'gs://ozkary_data_lake_ozkary-de-101'

# Object path, relative to the bucket, of the gzipped CSV to load.
PATH = 'turnstile/230930.csv.gz'

In [3]:
# show the files in the bucket
!gsutil ls -l $BUCKET/turnstile



Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update

   2756245  2023-08-25T13:59:47Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/230805.csv.gz
   2742272  2023-08-25T14:01:25Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/230812.csv.gz
   2722456  2023-08-25T14:02:46Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/230819.csv.gz
   2694682  2023-10-23T16:35:00Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/230902.csv.gz
   2706608  2023-10-23T16:36:08Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/230909.csv.gz
   2732408  2023-10-23T16:37:15Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/230916.csv.gz
   2743126  2023-10-23T16:38:18Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/230923.csv.gz
   2711959  2023-10-23T16:39:23Z  gs://ozkary_data_lake_ozkary-de-101/turnstile/230930.csv.gz
TOTAL: 8 objects, 21809756 bytes (20.8 MiB)


In [4]:
# List every object in the data lake bucket through the GCS client API and
# print each object's name followed by its size in bytes.
storage_client = storage.Client(project=PROJECT)

# BUCKET is a gs:// URL, but get_bucket() expects the bare bucket name.
# Drop the scheme and anything after the bucket segment, so a trailing slash
# or an appended object path in BUCKET still resolves to the bucket name.
bucket_name = BUCKET.split('://', 1)[-1].split('/', 1)[0]
bucket = storage_client.get_bucket(bucket_name)

# list_blobs() returns a one-shot iterator; it is consumed by the loop below.
files = bucket.list_blobs()
for blob in files:
    print(blob.name, blob.size)

turnstile/230805.csv.gz 2756245
turnstile/230812.csv.gz 2742272
turnstile/230819.csv.gz 2722456
turnstile/230902.csv.gz 2694682
turnstile/230909.csv.gz 2706608
turnstile/230916.csv.gz 2732408
turnstile/230923.csv.gz 2743126
turnstile/230930.csv.gz 2711959


In [5]:

# Build the full gs:// URL of the file from the bucket URL and the object path
gcs_url = f'{BUCKET}/{PATH}'
print(gcs_url)

# Load the gzipped CSV straight from the bucket into a DataFrame
df = pd.read_csv(
    gcs_url,
    compression="gzip",
    iterator=False,
)

# Preview the first 20 rows of the loaded data
print(df.head(20))


gs://ozkary_data_lake_ozkary-de-101/turnstile/230930.csv.gz
    Unnamed: 0    CA  UNIT       SCP STATION LINENAME DIVISION        DATE  \
0            0  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/23/2023   
1            1  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/23/2023   
2            2  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/23/2023   
3            3  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/23/2023   
4            4  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/23/2023   
5            5  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/23/2023   
6            6  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/24/2023   
7            7  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/24/2023   
8            8  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/24/2023   
9            9  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/24/2023   
10          10  A002  R051  02-00-00   59 ST  NQR456W      BMT  09/24/2023   
11  