# Interacting with the Data Lake files

### Install the cloud provider Python libraries

```bash

pip install gcsfs google-cloud-storage "google-cloud-bigquery[pandas]"

```

In [1]:
import os
import pandas as pd

In [2]:
# Data lake coordinates: the GCP project name is read from the environment,
# while the bucket URL and the folder used as a path prefix inside it are fixed.
PROJECT = os.environ.get('GOOGLE_PROJECT_NAME')
BUCKET = 'gs://ozkary_data_lake_ozkary-de-101'
CONTAINER = 'turnstile'

## Use the Google Cloud API

In [3]:
# query the bucket using GCS API and display the file names and sizes
from google.cloud import storage
def files_in_storage(prefix=None):
    """Print the name and size of the objects stored in the data lake bucket.

    Args:
        prefix: Optional object-name prefix (for example ``CONTAINER``). When
            given, only objects whose names start with it are listed; the
            default ``None`` lists every object in the bucket.

    Returns:
        None. One line per object is printed as ``<name> <size>``.
    """
    # BUCKET is a gs:// URL, but get_bucket() is called with the bare bucket
    # name. Drop the scheme and anything after the name so a trailing slash or
    # a sub-path in BUCKET cannot yield an empty or wrong bucket name.
    bucket_name = BUCKET.split('://', 1)[-1].strip('/').split('/', 1)[0]

    storage_client = storage.Client(project=PROJECT)
    bucket = storage_client.get_bucket(bucket_name)

    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name, blob.size)

# Run the listing against the configured bucket
files_in_storage()

turnstile/240302.csv.gz 2717306
turnstile/240309.csv.gz 2756893
turnstile/batch_23_20240722_210838.csv.gz 372
turnstile/batch_24_20240722_211013.csv.gz 1231


## View a data sample

In [4]:
def show_data(path: str):
    """Print the URL of a gzip CSV file in the bucket and its first 10 rows.

    Args:
        path: Object path relative to ``BUCKET``; it is appended to the bucket
            URL with a ``/`` separator.

    Returns:
        None. The URL and the preview are written with ``print``.
    """
    # Full URL of the file: bucket URL + object path
    gcs_url = f'{BUCKET}/{path}'
    print(gcs_url)

    # Load the gzip-compressed CSV straight from the URL; the first CSV column
    # becomes the DataFrame index
    frame = pd.read_csv(
        gcs_url,
        index_col=0,
        compression="gzip",
        iterator=False,
    )

    # Print a 10-row preview of the loaded frame
    preview = frame.head(10)
    print(preview)

# Preview one of the batch files stored under the CONTAINER folder
PATH = f'{CONTAINER}/batch_24_20240722_211013.csv.gz'
show_data(PATH)

gs://ozkary_data_lake_ozkary-de-101/turnstile/batch_24_20240722_211013.csv.gz
     CA  UNIT       SCP        STATION LINENAME DIVISION        DATE  \
0  A001  R002  02-00-00  Test-Station1   456NQR      BMT  07/22/2024   
1  A002  R001  02-00-00  Test-Station2   456NQR      BMT  07/22/2024   
2  A002  R002  02-00-00  Test-Station1   456NQR      BMT  07/22/2024   
3  A001  R002  02-00-00  Test-Station1   456NQR      BMT  07/22/2024   
4  A001  R001  02-00-00  Test-Station1   456NQR      BMT  01/30/2024   
5  A002  R002  02-00-00  Test-Station1   456NQR      BMT  07/22/2024   
6  A002  R002  02-00-00  Test-Station2   456NQR      BMT  07/22/2024   
7  A001  R002  02-00-00  Test-Station1   456NQR      BMT  07/22/2024   
8  A001  R002  02-00-00  Test-Station1   456NQR      BMT  07/22/2024   
9  A001  R001  02-00-00  Test-Station1   456NQR      BMT  07/22/2024   

       TIME     DESC  ENTRIES  EXITS  
0  17:55:00  REGULAR       26     32  
1  17:35:00  REGULAR       25     25  
2  17:55:00 

## Look at the data from the data warehouse


In [5]:
import google.cloud.bigquery

DATASET = 'mta_data'

# BigQuery client created with the library defaults.
# NOTE(review): unlike the storage client, no project=PROJECT is passed here —
# confirm the default project picked up from the environment is the intended one.
client = google.cloud.bigquery.Client()

# Reference to the warehouse dataset. Built with the DatasetReference
# constructor because Client.dataset() is deprecated; when PROJECT is not set
# the client's own project is used, which is what Client.dataset() fell back to.
dataset_ref = google.cloud.bigquery.DatasetReference(
    PROJECT if PROJECT is not None else client.project,
    DATASET,
)

# define a run query function
def run_query(sql):
    """Run ``sql`` through the shared BigQuery ``client`` and return the result of ``to_dataframe()``.

    Args:
        sql: Query text passed unchanged to ``client.query``.

    Returns:
        Whatever the query job's ``to_dataframe()`` produces (used as a
        DataFrame by the callers in this notebook).
    """
    return client.query(sql).to_dataframe()
    

### View the data available in the data lake by querying the external table

In [6]:
# Read a sample of records from the external table (backed by the data lake files).
# The two {} placeholders are filled with the project and dataset names.
sql = """
SELECT CA,UNIT,STATION,LINENAME,DIVISION,DATE,TIME,`DESC`,ENTRIES,EXITS 
FROM `{}.{}.ext_turnstile`  
limit 100
""".format(PROJECT, DATASET)
# Run the query through the run_query helper
df = run_query(sql)
# Echo the final SQL so the fully-qualified table name is visible
print(sql)
# Display only the first 5 of the rows returned (the query fetches up to 100)
df.head(5)


SELECT CA,UNIT,STATION,LINENAME,DIVISION,DATE,TIME,`DESC`,ENTRIES,EXITS 
FROM `ozkary-de-101.mta_data.ext_turnstile`  
limit 100



Unnamed: 0,CA,UNIT,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A001,R002,Test-Station1,456NQR,BMT,2024-07-22,17:55:00,REGULAR,26,32
1,A002,R001,Test-Station2,456NQR,BMT,2024-07-22,17:35:00,REGULAR,25,25
2,A002,R002,Test-Station1,456NQR,BMT,2024-07-22,17:55:00,REGULAR,6,6
3,A001,R002,Test-Station1,456NQR,BMT,2024-07-22,19:35:00,REGULAR,17,13
4,A001,R001,Test-Station1,456NQR,BMT,2024-01-30,16:20:00,REGULAR,5,9
