### Citibike Analysis

In [2]:
import os
from dotenv import load_dotenv

from google.cloud import bigquery
from google.api_core.exceptions import NotFound, BadRequest


import numpy as np


In [3]:
# Load environment variables (python-dotenv) so the os.getenv(...) lookups in
# the configuration cell below can read the BigQuery project/dataset/table names.
load_dotenv()

True

In [4]:
# BigQuery client plus the identifiers of the source table and of the cleaned
# working copy that the rest of the notebook creates and edits.
client = bigquery.Client()

bq_project_id = os.getenv('GCP_PROJECT_ID')
bq_dataset_name = os.getenv('BQ_DATASET')
bq_table_name = os.getenv('BQ_TABLE')

# Fail fast with a readable message: a missing variable would otherwise be
# formatted as the string "None" and only surface later as an opaque
# BigQuery "not found" / syntax error.
missing_env = [
    name
    for name, value in (
        ('GCP_PROJECT_ID', bq_project_id),
        ('BQ_DATASET', bq_dataset_name),
        ('BQ_TABLE', bq_table_name),
    )
    if not value
]
if missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_env)}"
    )

# Fully-qualified `project.dataset.table` identifiers.
table_ref = f'{bq_project_id}.{bq_dataset_name}.{bq_table_name}'
table_red = table_ref  # original (misspelled) name, kept as a backward-compatible alias
cleaned_table_ref = f'{table_ref}_cleaned'

In [6]:
# Pull five rows from the source table to see what the columns look like.
query = f"""
    SELECT *
    FROM `{bq_dataset_name}.{bq_table_name}`
    LIMIT 5
"""

df = (
    client
    .query(query)
    .to_dataframe()
)
df



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,cleaned_start_station_id,cleaned_start_station_name
0,BEE0EBE884D553D7,classic_bike,2024-01-09 14:03:12.497000+00:00,2024-01-09 14:05:11.977000+00:00,7 Ave & 62 St,2821.05,67 St & Erik Pl,2733.03,40.63556,-74.01298,40.633385,-74.016562,member,,
1,DBD25BDEA5B08974,classic_bike,2024-01-08 15:22:34.567000+00:00,2024-01-08 15:29:16.699000+00:00,3 Ave & Wakeman Pl,2883.03,67 St & Erik Pl,2733.03,40.638246,-74.024714,40.633385,-74.016562,member,,
2,E4E190622BE414D5,classic_bike,2024-01-17 15:07:48.899000+00:00,2024-01-17 15:11:57.181000+00:00,62 St & 4 Ave,2923.01,67 St & Erik Pl,2733.03,40.639859,-74.019776,40.633385,-74.016562,member,,
3,FF65D4B58A1405E3,classic_bike,2024-01-23 15:08:24.730000+00:00,2024-01-23 15:12:43.652000+00:00,62 St & 4 Ave,2923.01,67 St & Erik Pl,2733.03,40.639859,-74.019776,40.633385,-74.016562,member,,
4,A66EB4B910EAE69B,classic_bike,2024-01-23 18:39:35.432000+00:00,2024-01-23 18:46:03.936000+00:00,Wakeman Pl & Ridge Blvd,2932.03,67 St & Erik Pl,2733.03,40.639421,-74.026823,40.633385,-74.016562,member,,


In [7]:
# Column names of the preview frame fetched in the previous cell.
df.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'cleaned_start_station_id',
       'cleaned_start_station_name'],
      dtype='object')

In [8]:
# pandas dtypes of the preview frame; this shows how each BigQuery column was
# mapped when the result was converted to a DataFrame.
df.dtypes

ride_id                                    object
rideable_type                              object
started_at                    datetime64[us, UTC]
ended_at                      datetime64[us, UTC]
start_station_name                         object
start_station_id                           object
end_station_name                           object
end_station_id                             object
start_lat                                 float64
start_lng                                 float64
end_lat                                   float64
end_lng                                   float64
member_casual                              object
cleaned_start_station_id                   object
cleaned_start_station_name                 object
dtype: object

In [63]:
# Count NULLs per column in the source table.
# COUNT(*) counts every row while COUNT(col) skips NULLs, so their difference
# is the number of NULL values in that column.
# Note: this only detects real NULLs; ids stored as the string 'nan' are
# counted separately in a later cell.
null_check_columns = [
    'ride_id', 'rideable_type', 'started_at', 'ended_at',
    'start_station_name', 'start_station_id',
    'end_station_name', 'end_station_id',
    'start_lat', 'start_lng', 'end_lat', 'end_lng',
    'member_casual',
]

# Generating the select list guarantees a uniform `<column>_null` alias for
# every column (the hand-written list had `start_station_idnull`).
null_count_exprs = ',\n        '.join(
    f'COUNT(*) - COUNT({col}) as {col}_null' for col in null_check_columns
)

query = f"""
    SELECT
        {null_count_exprs}
    FROM `{bq_dataset_name}.{bq_table_name}`
"""

df = client.query(query).to_dataframe()
df



Unnamed: 0,ride_id_null,rideable_type_null,started_at_null,ended_at_null,start_station_name_null,start_station_idnull,end_station_name_null,end_station_id_null,start_lat_null,start_lng_null,end_lat_null,end_lng_null,member_casual_null
0,0,0,0,0,35253,0,152745,0,0,0,12784,12784,0


In [61]:
# Inspect a few rides whose start station name is NULL.
query = f"""
    SELECT *
    FROM `{bq_dataset_name}.{bq_table_name}`
    WHERE start_station_name IS NULL
    LIMIT 5
"""

df = (
    client
    .query(query)
    .to_dataframe()
)
df



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,cleaned_start_station_id,cleaned_start_station_name
0,28F90869755C48C9,electric_bike,2025-02-27 14:34:16.410000+00:00,2025-02-27 14:56:52.371000+00:00,,,West Drive & Prospect Park West,3651.04,40.71,-73.96,40.661063,-73.979453,casual,,
1,82C96FB98A12AF94,electric_bike,2025-02-13 18:26:58.798000+00:00,2025-02-13 19:14:27.990000+00:00,,,10 St & 2 Ave,3922.02,40.67,-73.99,40.671907,-73.993612,casual,,
2,9D3F9FF6A1CB04FA,electric_bike,2025-02-28 13:24:11.986000+00:00,2025-02-28 13:42:06.072000+00:00,,,Plaza St West & Flatbush Ave,4010.13,40.69,-73.96,40.675021,-73.971115,casual,,
3,AAECB307DC987EAC,electric_bike,2025-02-18 15:21:46.179000+00:00,2025-02-18 15:31:33.579000+00:00,,,Berkeley Pl & 7 Ave,4051.01,40.69,-73.97,40.675147,-73.975232,casual,,
4,78FE019D3793595F,electric_bike,2025-02-06 18:11:29.620000+00:00,2025-02-06 18:27:23.690000+00:00,,,President St & 4 Ave,4101.17,40.69,-73.99,40.676757,-73.983262,casual,,


In [62]:
# Inspect a few rides whose end station name is NULL.
query = f"""
    SELECT *
    FROM `{bq_dataset_name}.{bq_table_name}`
    WHERE end_station_name IS NULL
    LIMIT 5
"""

df = (
    client
    .query(query)
    .to_dataframe()
)
df



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,cleaned_start_station_id,cleaned_start_station_name
0,AF649320B00E8900,classic_bike,2024-01-02 11:45:54.266000+00:00,2024-01-03 12:45:46.174000+00:00,Ocean Pkwy & Church Ave,3125.09,,,40.644719,-73.974515,,,member,,
1,4A1941D57351D2DE,classic_bike,2024-01-09 16:57:23.341000+00:00,2024-01-10 17:57:18.311000+00:00,53 St & 2 Ave,3211.06,,,40.64744,-74.018846,,,member,,
2,35D42290A540E5A1,classic_bike,2024-01-13 14:32:30.239000+00:00,2024-01-14 15:32:25.997000+00:00,Tilden Ave & Lott St,3214.04,,,40.64661,-73.95401,,,member,,
3,BD650AE985791B9E,classic_bike,2024-01-08 16:09:12.422000+00:00,2024-01-09 17:08:57.808000+00:00,Nostrand Ave & Rutland Rd,3585.06,,,40.6595,-73.95041,,,member,,
4,E0814A1F63DDF1A2,classic_bike,2024-01-04 08:01:16.027000+00:00,2024-01-05 09:01:09.430000+00:00,6 Ave & 21 St,3628.11,,,40.66016,-73.990974,,,member,,


In [11]:
# How many source rides carry the literal string 'nan' as a start or end
# station id?
query = f"""
    SELECT COUNT(*) as cnt
    FROM `{bq_dataset_name}.{bq_table_name}`
    WHERE start_station_id = 'nan' OR end_station_id = 'nan'
"""

# The query returns a single row; take its `cnt` value.
nan_cnt = (
    client
    .query(query)
    .to_dataframe()
    .loc[0, 'cnt']
)
nan_cnt



np.int64(182206)

In [37]:
# (Re)build the cleaned working table from the source table, dropping rides
# whose start or end station id is the literal string 'nan'.
# The table is partitioned by the month of `started_at` and clustered by
# `member_casual`, `rideable_type`.
# CREATE OR REPLACE discards any existing cleaned table, so re-running this
# cell also discards every edit made by the later DELETE/UPDATE cells.
# The target is backtick-quoted (as in the later SELECT cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    CREATE OR REPLACE TABLE `{cleaned_table_ref}`
    PARTITION BY DATE_TRUNC(started_at, MONTH)
    CLUSTER BY member_casual, rideable_type
        AS
            SELECT * FROM `{bq_dataset_name}.{bq_table_name}`
            WHERE start_station_id != 'nan' AND end_station_id != 'nan'
"""

job = client.query(query)
job.result()  # block until the DDL statement has finished


<google.cloud.bigquery.table._EmptyRowIterator at 0x74f5785f3710>

In [38]:
# Sanity check: the cleaned table should contain no 'nan' station ids.
query = f"""
    SELECT COUNT(*) as cnt
    FROM {cleaned_table_ref}
    WHERE start_station_id = 'nan' OR end_station_id = 'nan'
"""

# The query returns a single row; take its `cnt` value.
nan_cnt = (
    client
    .query(query)
    .to_dataframe()
    .loc[0, 'cnt']
)
nan_cnt



np.int64(0)

In [39]:
# List start station ids that are NULL or cannot be parsed as FLOAT64
# (SAFE_CAST yields NULL instead of raising), with their row counts.
query = f"""
    SELECT start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    WHERE start_station_id IS NULL OR SAFE_CAST(start_station_id AS FLOAT64) IS NULL
    GROUP BY start_station_id;
"""

df = (
    client
    .query(query)
    .to_dataframe()
)
df

Unnamed: 0,start_station_id,f0_
0,JC066,3
1,Lab - NYC,6
2,HB305,4
3,JC098,1
4,HB202,3
...,...,...
72,HB404,2
73,JC024,6
74,HB603,5
75,HB303,1


In [48]:
# Remove rides whose start or end station id is exactly two letters followed
# by three digits (e.g. the JC066 / HB305 values listed by the previous check).
regex = r'^[A-Za-z]{2}\d{3}$'

# The DML target is backtick-quoted (as in the later SELECT cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    DELETE
    FROM `{cleaned_table_ref}`
    WHERE REGEXP_CONTAINS(start_station_id, r'{regex}') OR REGEXP_CONTAINS(end_station_id, r'{regex}')
"""

job = client.query(query)
job.result()  # block until the DELETE has finished

<google.cloud.bigquery.table._EmptyRowIterator at 0x74f57821c3b0>

In [46]:
# Re-run the non-numeric start id check after the deletions; an empty frame
# means every remaining start_station_id parses as FLOAT64.
query = f"""
    SELECT start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    WHERE start_station_id IS NULL OR SAFE_CAST(start_station_id AS FLOAT64) IS NULL
    GROUP BY start_station_id;
"""

df = (
    client
    .query(query)
    .to_dataframe()
)
df



Unnamed: 0,start_station_id,f0_


In [56]:
# Station-id patterns identifying rides to drop from the cleaned table.
regex_patterns = [
    r'^SYS\d{3}$',          # SYS + 3 digits
    r'(?i)Demo',            # Contains "Demo"
    r'(?i)Lab - NYC',       # Contains "Lab - NYC"
    r'(?i)Morgan'           # Contains "Morgan"
]

# One alternation so a single REGEXP_CONTAINS call covers every pattern.
combined_pattern = '|'.join(regex_patterns)

# Delete a ride when either its start or its end station id matches.
# The DML target is backtick-quoted (as in the later SELECT cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    DELETE FROM `{cleaned_table_ref}`
    WHERE REGEXP_CONTAINS(start_station_id, r'{combined_pattern}') OR REGEXP_CONTAINS(end_station_id, r'{combined_pattern}')
"""

job = client.query(query)
job.result()  # block until the DELETE has finished

<google.cloud.bigquery.table._EmptyRowIterator at 0x74f57831d3a0>

In [52]:
# Ids of the form "<digits>.<digits>_" carry a stray trailing underscore;
# strip it from both the start and the end station id.
regex = r'^\d+\.\d+_$'

# Each CASE leaves a column untouched unless it matches, because the WHERE
# clause selects rows where *either* column matches.
# The DML target is backtick-quoted (as in the later SELECT cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    UPDATE `{cleaned_table_ref}`
    SET
        start_station_id = CASE
            WHEN REGEXP_CONTAINS(start_station_id, r'{regex}') THEN REPLACE(start_station_id, '_', '')
            ELSE start_station_id
        END,
        end_station_id = CASE
            WHEN REGEXP_CONTAINS(end_station_id, r'{regex}') THEN REPLACE(end_station_id, '_', '')
            ELSE end_station_id
        END
    WHERE
        REGEXP_CONTAINS(start_station_id, r'{regex}') OR
        REGEXP_CONTAINS(end_station_id, r'{regex}')
"""

job = client.query(query)
job.result()  # block until the UPDATE has finished

<google.cloud.bigquery.table._EmptyRowIterator at 0x74f5881b5940>

In [53]:
# Remove the "Pillar" marker from station ids and names.
regex_id = r'_Pillar$'              # trailing "_Pillar" on an id
regex_name = r'(?i)\s*Pillar\s*'    # "Pillar" plus surrounding whitespace in a name

# Ids: drop the "_Pillar" suffix. Names: replace the marker (and the
# whitespace around it) with one space, then TRIM the ends.
# Each CASE leaves a column untouched unless it contains "Pillar", because the
# WHERE clause selects rows where *any* of the four columns matches.
# The DML target is backtick-quoted (as in the later SELECT cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    UPDATE `{cleaned_table_ref}`
    SET
        start_station_id = CASE
            WHEN REGEXP_CONTAINS(start_station_id, r'Pillar') THEN REGEXP_REPLACE(start_station_id, r'{regex_id}', '')
            ELSE start_station_id END,
        start_station_name = CASE
            WHEN REGEXP_CONTAINS(start_station_name, r'Pillar') THEN TRIM(REGEXP_REPLACE(start_station_name, r'{regex_name}', ' '))
            ELSE start_station_name END,
        end_station_id = CASE
            WHEN REGEXP_CONTAINS(end_station_id, r'Pillar') THEN REGEXP_REPLACE(end_station_id, r'{regex_id}', '')
            ELSE end_station_id END,
        end_station_name = CASE
            WHEN REGEXP_CONTAINS(end_station_name, r'Pillar') THEN TRIM(REGEXP_REPLACE(end_station_name, r'{regex_name}', ' '))
            ELSE end_station_name END
    WHERE
        REGEXP_CONTAINS(start_station_id, r'Pillar') OR
        REGEXP_CONTAINS(start_station_name, r'Pillar') OR
        REGEXP_CONTAINS(end_station_id, r'Pillar') OR
        REGEXP_CONTAINS(end_station_name, r'Pillar')
"""

job = client.query(query)
job.result()  # block until the UPDATE has finished


<google.cloud.bigquery.table._EmptyRowIterator at 0x74f58a37bef0>

In [54]:
# After the id clean-up: list any start stations whose id is still NULL or
# not parseable as FLOAT64, together with their names and row counts.
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM `{cleaned_table_ref}`
    WHERE start_station_id IS NULL OR SAFE_CAST(start_station_id AS FLOAT64) IS NULL
    GROUP BY start_station_name, start_station_id;
"""

df = (
    client
    .query(query)
    .to_dataframe()
)
df



Unnamed: 0,start_station_name,start_station_id,f0_


In [57]:
# Same check for the end station: ids that are NULL or not parseable as
# FLOAT64, together with their names and row counts.
query = f"""
    SELECT end_station_name, end_station_id, COUNT(*)
    FROM `{cleaned_table_ref}`
    WHERE end_station_id IS NULL OR SAFE_CAST(end_station_id AS FLOAT64) IS NULL
    GROUP BY end_station_name, end_station_id;
"""

df = (
    client
    .query(query)
    .to_dataframe()
)
df



Unnamed: 0,end_station_name,end_station_id,f0_


In [62]:
# One row per distinct (start station name, start station id) pair with its
# ride count.
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    GROUP BY start_station_name, start_station_id
"""

df = (
    client
    .query(query)
    .to_dataframe()
)

# An id appearing on more than one row is paired with more than one name;
# show those rows side by side.
id_is_shared = df['start_station_id'].duplicated(keep=False)
df[id_is_shared].sort_values(by='start_station_id')



Unnamed: 0,start_station_name,start_station_id,f0_
26,Eastern Pkwy & St Marks Ave,3982.01,1169
570,Eastern Pkwy\t& St Marks Ave,3982.01,1557
1808,Bridge St & Front St,4968.03,19105
1744,Bridge St & Water St,4968.03,26853
573,Morton St & Washington St,5772.05,30999
773,Morton St & Greenwich St,5772.05,39300
2145,34th Ave & Vernon Blvd,6873.01,12397
435,34 Ave & Vernon Blvd,6873.01,3864
453,Central Park West & W 68 St,7079.06,82351
2187,Central Park W & W 68 St,7079.06,26746


In [63]:
# Rewrite every start station id with exactly two decimals, so the same
# number written with different precision becomes one string.
# FORMAT already returns a STRING, so no extra CAST(... AS STRING) is needed.
# The inner CAST raises on any non-numeric id; the earlier cleaning cells are
# expected to have removed those, so a failure here signals a problem there.
# The DML target is backtick-quoted (as in the SELECT check cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    UPDATE `{cleaned_table_ref}`
    SET start_station_id = FORMAT('%.2f', CAST(start_station_id AS FLOAT64))
    WHERE TRUE
"""
job = client.query(query)
job.result()  # block until the UPDATE has finished

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d090134bc0>

In [65]:
# Whole-word replacements applied to start station names, so spelling
# variants of the same station collapse to one name.
convert_names = {
    'West': 'W',
    'Fort': 'Ft',
    'Av': 'Ave',
    'Ichan Stadium': 'Icahn Stadium'
}

# Wrap the column in one REGEXP_REPLACE per entry; replacements are applied
# inside-out, i.e. in dictionary order. \b anchors keep each replacement to
# whole words (so 'Av' does not touch 'Ave').
sql_expr = "start_station_name"
for old, new in convert_names.items():
    sql_expr = f"REGEXP_REPLACE({sql_expr}, r'\\b{old}\\b', '{new}')"

# The DML target is backtick-quoted (as in the SELECT check cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    UPDATE `{cleaned_table_ref}`
    SET start_station_name = {sql_expr}
    WHERE TRUE
"""

job = client.query(query)
job.result()  # block until the UPDATE has finished

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d090142c60>

In [66]:
# NOTE: this is a Python raw string placed inside a SQL raw string, so the
# regex that reaches BigQuery is `\\t`: a literal backslash followed by the
# letter "t", NOT a TAB character.
# NOTE(review): presumably the station names contain the two-character text
# "\t" (e.g. from a CSV export) rather than real tabs; confirm against the data.
pattern = r'\\t'

# Replace the stray "\t" text in start station names with a single space.
# The DML target is backtick-quoted (as in the SELECT check cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    UPDATE `{cleaned_table_ref}`
    SET start_station_name = REGEXP_REPLACE(start_station_name, r'{pattern}', ' ')
    WHERE REGEXP_CONTAINS(start_station_name, r'{pattern}')
"""

job = client.query(query)
job.result()  # block until the UPDATE has finished

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d08a740800>

In [67]:
# Drop ordinal suffixes from numbers in start station names
# ("34th Ave" -> "34 Ave") so both spellings of a station share one name.
pattern = r'(\d+)(st|nd|rd|th)\b'
replace = r'\1'   # keep only the captured digits

# FIX: the cleaned name must be written back to `start_station_name`.
# The previous version assigned it to `start_station_id`, overwriting the id
# of every matching row with a station name. A table already processed by the
# old statement has corrupted ids and should be rebuilt from the source.
# The DML target is backtick-quoted (as in the SELECT check cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    UPDATE `{cleaned_table_ref}`
    SET start_station_name = REGEXP_REPLACE(start_station_name, r'{pattern}', r'{replace}')
    WHERE REGEXP_CONTAINS(start_station_name, r'{pattern}')
"""

job = client.query(query)
job.result()  # block until the UPDATE has finished

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d08a582c30>

In [68]:
# Re-check after the name normalisation: one row per distinct
# (start station name, start station id) pair with its ride count.
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    GROUP BY start_station_name, start_station_id
"""

df = (
    client
    .query(query)
    .to_dataframe()
)

# Rows whose id also appears on another row, i.e. ids that still map to more
# than one name.
id_is_shared = df['start_station_id'].duplicated(keep=False)
df[id_is_shared].sort_values(by='start_station_id')



Unnamed: 0,start_station_name,start_station_id,f0_
846,Bridge St & Front St,4968.03,19105
1434,Bridge St & Water St,4968.03,26853
443,Morton St & Greenwich St,5772.05,39300
1401,Morton St & Washington St,5772.05,30999
1740,3 Ave & E 82 St,7154.1,41574
2217,3 Ave & E 81 St,7154.1,21819


In [76]:
# Manual merges for the ids that still map to two names after normalisation:
# key = name to replace, value = name to keep.
convert_names = {
    'Bridge St & Water St': 'Bridge St & Front St',
    'Morton St & Washington St': 'Morton St & Greenwich St',
    '3 Ave & E 81 St': '3 Ave & E 82 St'
}

# One WHEN branch per mapping. The names are hard-coded constants; values
# containing a single quote would need escaping before being interpolated.
case_statements = [
    f"WHEN start_station_name = '{old}' THEN '{new}'"
    for old, new in convert_names.items()
]
names_to_replace = ', '.join(f"'{old}'" for old in convert_names)

# The WHERE clause limits the update to rows that actually change.
# The DML target is backtick-quoted (as in the SELECT check cells) so that a
# project ID containing dashes is parsed as a single table path.
query = f"""
    UPDATE `{cleaned_table_ref}`
    SET start_station_name = CASE {' '.join(case_statements)} ELSE start_station_name END
    WHERE start_station_name IN ({names_to_replace})
"""

job = client.query(query)
job.result()  # block until the UPDATE has finished

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d09017fef0>

In [77]:
# Final check: one row per distinct (start station name, start station id)
# pair with its ride count.
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    GROUP BY start_station_name, start_station_id
"""

df = (
    client
    .query(query)
    .to_dataframe()
)

# An empty result means every start station id now maps to exactly one row.
id_is_shared = df['start_station_id'].duplicated(keep=False)
df[id_is_shared].sort_values(by='start_station_id')



Unnamed: 0,start_station_name,start_station_id,f0_
