### Citibike Analysis

In [34]:
import os
from dotenv import load_dotenv

from google.cloud import bigquery
from google.api_core.exceptions import NotFound, BadRequest


import numpy as np


In [4]:
# Load variables from a .env file into the process environment; the
# os.getenv() lookups for the BigQuery project/dataset/table depend on them.
load_dotenv()

True

In [59]:
# BigQuery coordinates come from the environment (see the load_dotenv() cell).
bq_project_id = os.getenv('GCP_PROJECT_ID')
bq_dataset_name = os.getenv('BQ_DATASET')
bq_table_name = os.getenv('BQ_TABLE')

# Fail fast on missing settings: os.getenv() returns None for an unset
# variable, which would otherwise be formatted into the table references
# below as the literal text 'None' and only surface later as a confusing
# BigQuery error.
missing_env_vars = [
    name
    for name, value in (
        ('GCP_PROJECT_ID', bq_project_id),
        ('BQ_DATASET', bq_dataset_name),
        ('BQ_TABLE', bq_table_name),
    )
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_env_vars)}"
    )

# Bind the client to the configured project so queries that reference
# `dataset.table` without a project resolve to the same project as the
# fully-qualified references defined below.
client = bigquery.Client(project=bq_project_id)

# Fully-qualified `project.dataset.table` identifiers for the raw and cleaned tables.
table_ref = f'{bq_project_id}.{bq_dataset_name}.{bq_table_name}'
table_red = table_ref  # backward-compatible alias for the original (misspelled) name
cleaned_table_ref = f'{table_ref}_cleaned'

In [6]:
# Pull a small sample of the raw table to inspect its shape and contents.
# NOTE(review): in BigQuery a LIMIT on `SELECT *` does not reduce the bytes
# scanned; if the raw table is a native table, Client.list_rows(...) would be a
# cheaper way to preview it — confirm the table type before switching.
query = f"""
    SELECT *
    FROM `{bq_dataset_name}.{bq_table_name}`
    LIMIT 10
"""

sample_job = client.query(query)
df = sample_job.to_dataframe()
df.head()



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,58CE55A3C9A28B75,classic_bike,2024-10-22 17:18:00.335000+00:00,2024-10-22 18:01:11.625000+00:00,67 St & Erik Pl,2733.03,67 St & Erik Pl,2733.03,40.633385,-74.016562,40.633385,-74.016562,casual
1,8606F29B790582C7,classic_bike,2024-10-22 17:17:25.659000+00:00,2024-10-22 18:04:03.953000+00:00,67 St & Erik Pl,2733.03,67 St & Erik Pl,2733.03,40.633385,-74.016562,40.633385,-74.016562,casual
2,9AF9C60E85C9B090,classic_bike,2024-10-05 17:34:37.733000+00:00,2024-10-05 17:46:19.378000+00:00,67 St & Erik Pl,2733.03,67 St & Erik Pl,2733.03,40.633385,-74.016562,40.633385,-74.016562,casual
3,94258E9E69D20F70,classic_bike,2024-10-22 17:20:53.652000+00:00,2024-10-22 18:02:19.586000+00:00,67 St & Erik Pl,2733.03,67 St & Erik Pl,2733.03,40.633385,-74.016562,40.633385,-74.016562,casual
4,17A62298CA22EFBA,classic_bike,2024-10-22 17:12:37.859000+00:00,2024-10-22 18:00:59.733000+00:00,67 St & Erik Pl,2733.03,67 St & Erik Pl,2733.03,40.633385,-74.016562,40.633385,-74.016562,casual


In [40]:
# Column names of the sampled rows, i.e. the fields available in the raw table.
df.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [41]:
# pandas dtypes of the sampled rows. Note that the station id columns load as
# `object` (strings) rather than numbers, which motivates the SAFE_CAST checks
# on start_station_id further down.
df.dtypes

ride_id                            object
rideable_type                      object
started_at            datetime64[us, UTC]
ended_at              datetime64[us, UTC]
start_station_name                 object
start_station_id                   object
end_station_name                   object
end_station_id                     object
start_lat                         float64
start_lng                         float64
end_lat                           float64
end_lng                           float64
member_casual                      object
dtype: object

In [48]:
# Count NULLs per column in the raw table.
# COUNT(*) counts every row while COUNT(col) skips NULLs, so the difference
# is the number of NULL values in that column.
null_check_columns = [
    'ride_id',
    'rideable_type',
    'started_at',
    'ended_at',
    'start_station_name',
    'start_station_id',
    'end_station_name',
    'end_station_id',
    'start_lat',
    'start_lng',
    'end_lat',
    'end_lng',
    'member_casual',
]

# Generate the SELECT list from the column names so every alias follows the
# same `<column>_null` convention (the hand-written list had drifted to
# `start_station_idnull` for one column).
null_count_select = ',\n        '.join(
    f'COUNT(*) - COUNT({column}) AS {column}_null'
    for column in null_check_columns
)

query = f"""
    SELECT
        {null_count_select}
    FROM `{bq_dataset_name}.{bq_table_name}`
"""

df = client.query(query).to_dataframe()
# The result is a single wide row; transpose it so each column's NULL count
# is shown on its own line.
df.T



Unnamed: 0,0
ride_id_null,0
rideable_type_null,0
started_at_null,0
ended_at_null,0
start_station_name_null,35253
start_station_idnull,0
end_station_name_null,152745
end_station_id_null,0
start_lat_null,0
start_lng_null,0


In [164]:
# List the distinct start_station_id values that are NULL or cannot be cast
# to FLOAT64 (SAFE_CAST returns NULL instead of raising on a failed cast).
query = f"""
    SELECT start_station_id, COUNT(*)
    FROM `{bq_dataset_name}.{bq_table_name}`
    WHERE start_station_id IS NULL OR SAFE_CAST(start_station_id AS FLOAT64) IS NULL
    GROUP BY start_station_id;
"""

non_numeric_job = client.query(query)
df = non_numeric_job.to_dataframe()

# Print as a flat list of strings so every offending id is visible at once.
non_numeric_ids = df['start_station_id'].astype(str).tolist()
print(non_numeric_ids)



['JC009', 'JC055', 'HB611', 'JC075', 'JC105', 'JC008', 'HB305', '6173.08_Pillar', 'JC038', 'JC027', 'JC081', 'JC019', 'HB105', 'JC023', 'JC102', 'HB503', 'HB602', 'HB404', 'HB603', 'HB103', 'JC076', 'JC003', 'HB301', 'HB302', 'JC109', 'JC103', 'JC078', 'JC080', 'JC093', 'JC094', 'JC013', 'HB408', 'HB506', 'HB202', 'HB304', '190 Morgan', 'HB102', 'HB203', 'SYS016', '6247.06_Pillar', 'HB601', 'LA Metro Demo 2', 'JC084', 'HB201', 'HB609', 'HB303', 'JC098', 'Lab - NYC', 'HB101', 'JC032', 'JC014', 'SYS038', 'HB407', 'JC024', 'JC020', 'JC116', 'JC018', '6569.09_', 'SYS033', 'JC052', 'JC022', 'JC066', 'HB505', 'Shop Morgan ', 'JC104', 'JC072', 'JC074', '5308.04_', 'HB502', 'HB501', 'JC115', 'HB401', 'LA Metro Demo 1', 'JC099', 'Lab - NYC - Monolith', 'SYS025', 'JC002']


In [55]:
# Station-id shapes to exclude from the cleaned table, based on the
# non-numeric ids listed by the previous query.
# Anchored patterns match the whole id:
regex_patterns = [
    r'^[A-Za-z]{2}\d{3}$',  # exactly two letters followed by three digits (e.g. JC009, HB611)
    r'^SYS\d{3}$',          # "SYS" followed by three digits (e.g. SYS016)
]
# Keyword patterns match case-insensitively anywhere in the id:
regex_patterns += [
    rf'(?i){keyword}' for keyword in ('Demo', 'Lab - NYC', 'Morgan')
]

# A single alternation (OR) of all patterns, so one REGEXP_CONTAINS call covers them.
combined_pattern = '|'.join(regex_patterns)

# Build the cleaned table: drop rows whose start_station_id matches any of the
# excluded patterns, and normalise the remaining ids that carry a trailing
# "_" or "_Pillar" suffix.
#
# The destination uses the fully-qualified `cleaned_table_ref` so that the
# table created here is exactly the table the validation query reads.
#
# Notes on the SQL below:
# - This is a regular (non-raw) Python f-string, so `\\d` / `\\.` reach
#   BigQuery as `\d` / `\.` inside the SQL raw-string regex literals.
# - `SELECT * EXCEPT(...)` followed by the two CASE expressions means the
#   cleaned start_station_id / start_station_name become the last two columns.
# - `NOT REGEXP_CONTAINS(NULL, ...)` evaluates to NULL, so rows with a NULL
#   start_station_id are filtered out as well.
query = f"""
CREATE OR REPLACE TABLE `{cleaned_table_ref}`
PARTITION BY DATE_TRUNC(started_at, MONTH)
CLUSTER BY member_casual, rideable_type

 AS

  SELECT * EXCEPT(start_station_id, start_station_name),
  -- Clean start_station_id
  CASE
    WHEN REGEXP_CONTAINS(start_station_id, r'^\\d+\\.\\d+_$') THEN REPLACE(start_station_id, '_', '')
    WHEN CONTAINS_SUBSTR(start_station_id, 'Pillar') THEN REGEXP_REPLACE(start_station_id, r'_Pillar$', '')
    ELSE start_station_id
  END AS start_station_id,

  -- Clean start_station_name only if ID contains 'Pillar'
  CASE
    WHEN CONTAINS_SUBSTR(start_station_id, 'Pillar') THEN REGEXP_REPLACE(start_station_name, r'(?i)Pillar', '')
    ELSE start_station_name
  END AS start_station_name

FROM `{bq_dataset_name}.{bq_table_name}`
WHERE NOT REGEXP_CONTAINS(start_station_id, r"{combined_pattern}")
"""

job = client.query(query)
# Block until the DDL job finishes so later cells see the new table
# (and so any SQL error is raised here rather than silently ignored).
job.result()


<google.cloud.bigquery.table._EmptyRowIterator at 0x7d2632d22e70>

In [60]:
# Validation: repeat the non-numeric start_station_id check on the cleaned table.
# An empty result means every remaining id is non-NULL and casts to FLOAT64.
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM `{cleaned_table_ref}`
    WHERE start_station_id IS NULL OR SAFE_CAST(start_station_id AS FLOAT64) IS NULL
    GROUP BY start_station_name, start_station_id;
"""

validation_job = client.query(query)
df = validation_job.to_dataframe()
df



Unnamed: 0,start_station_name,start_station_id,f0_
