### Citibike Analysis

In [2]:
import os
from dotenv import load_dotenv

from google.cloud import bigquery
from google.api_core.exceptions import NotFound, BadRequest


import numpy as np


In [3]:
load_dotenv()

True

In [4]:
client = bigquery.Client()

bq_project_id = os.getenv('GCP_PROJECT_ID')
bq_dataset_name = os.getenv('BQ_DATASET')
bq_table_name = os.getenv('BQ_TABLE')

table_red = f'{bq_project_id}.{bq_dataset_name}.{bq_table_name}'
cleaned_table_ref = f'{bq_project_id}.{bq_dataset_name}.{bq_table_name}_cleaned'

In [6]:
query = f"""
    SELECT *
    FROM `{bq_dataset_name}.{bq_table_name}`
    LIMIT 5
"""

df = client.query(query).to_dataframe()
df



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,cleaned_start_station_id,cleaned_start_station_name
0,BEE0EBE884D553D7,classic_bike,2024-01-09 14:03:12.497000+00:00,2024-01-09 14:05:11.977000+00:00,7 Ave & 62 St,2821.05,67 St & Erik Pl,2733.03,40.63556,-74.01298,40.633385,-74.016562,member,,
1,DBD25BDEA5B08974,classic_bike,2024-01-08 15:22:34.567000+00:00,2024-01-08 15:29:16.699000+00:00,3 Ave & Wakeman Pl,2883.03,67 St & Erik Pl,2733.03,40.638246,-74.024714,40.633385,-74.016562,member,,
2,E4E190622BE414D5,classic_bike,2024-01-17 15:07:48.899000+00:00,2024-01-17 15:11:57.181000+00:00,62 St & 4 Ave,2923.01,67 St & Erik Pl,2733.03,40.639859,-74.019776,40.633385,-74.016562,member,,
3,FF65D4B58A1405E3,classic_bike,2024-01-23 15:08:24.730000+00:00,2024-01-23 15:12:43.652000+00:00,62 St & 4 Ave,2923.01,67 St & Erik Pl,2733.03,40.639859,-74.019776,40.633385,-74.016562,member,,
4,A66EB4B910EAE69B,classic_bike,2024-01-23 18:39:35.432000+00:00,2024-01-23 18:46:03.936000+00:00,Wakeman Pl & Ridge Blvd,2932.03,67 St & Erik Pl,2733.03,40.639421,-74.026823,40.633385,-74.016562,member,,


In [7]:
df.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'cleaned_start_station_id',
       'cleaned_start_station_name'],
      dtype='object')

In [8]:
df.dtypes

ride_id                                    object
rideable_type                              object
started_at                    datetime64[us, UTC]
ended_at                      datetime64[us, UTC]
start_station_name                         object
start_station_id                           object
end_station_name                           object
end_station_id                             object
start_lat                                 float64
start_lng                                 float64
end_lat                                   float64
end_lng                                   float64
member_casual                              object
cleaned_start_station_id                   object
cleaned_start_station_name                 object
dtype: object

In [9]:
query = f"""
    SELECT 
        COUNT(*) - COUNT(ride_id) as ride_id_null,
        COUNT(*) - COUNT(rideable_type) as rideable_type_null,
        COUNT(*) - COUNT(started_at) as started_at_null,
        COUNT(*) - COUNT(ended_at) as ended_at_null,
        COUNT(*) - COUNT(start_station_name) as start_station_name_null,
        COUNT(*) - COUNT(start_station_id) as start_station_idnull,
        COUNT(*) - COUNT(end_station_name) as end_station_name_null,
        COUNT(*) - COUNT(end_station_id) as end_station_id_null,
        COUNT(*) - COUNT(start_lat) as start_lat_null,
        COUNT(*) - COUNT(start_lng) as start_lng_null,
        COUNT(*) - COUNT(end_lat) as end_lat_null,
        COUNT(*) - COUNT(end_lng) as end_lng_null,
        COUNT(*) - COUNT(member_casual) as member_casual_null
    FROM `{bq_dataset_name}.{bq_table_name}`
"""

df = client.query(query).to_dataframe()
df.T



Unnamed: 0,0
ride_id_null,0
rideable_type_null,0
started_at_null,0
ended_at_null,0
start_station_name_null,35253
start_station_idnull,0
end_station_name_null,152745
end_station_id_null,0
start_lat_null,0
start_lng_null,0


In [10]:
query = f"""
    SELECT *
    FROM `{bq_dataset_name}.{bq_table_name}`
    WHERE start_station_name IS NULL
    LIMIT 10
"""

df = client.query(query).to_dataframe()
df



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,cleaned_start_station_id,cleaned_start_station_name
0,D95657EFF4EB2175,electric_bike,2024-10-12 15:09:26.993000+00:00,2024-10-12 17:05:36.866000+00:00,,,Cortelyou Rd & E 34 St,3087.01,40.64,-73.95,40.64454,-73.94486,casual,,
1,D160033186FF878F,electric_bike,2024-10-25 17:47:08.947000+00:00,2024-10-25 17:54:59.801000+00:00,,,Stratford Rd & Beverley Rd,3113.1,40.66,-73.97,40.64378,-73.96867,casual,,
2,68FB24A367A64A5F,electric_bike,2024-10-08 16:06:52.445000+00:00,2024-10-08 16:16:58.292000+00:00,,,Schenectady Ave & Cortelyou Rd,3155.01,40.65,-73.96,40.64535,-73.93268,casual,,
3,6D8654B152D83210,electric_bike,2024-10-14 14:09:04.005000+00:00,2024-10-14 14:19:52.315000+00:00,,,Turner Pl & Coney Island Ave,3157.08,40.67,-73.98,40.64521,-73.97073,casual,,
4,CECE2EC8784ADA7B,electric_bike,2024-10-23 18:05:49.712000+00:00,2024-10-23 21:11:25.323000+00:00,,,E 21 St & Church Ave,3294.03,40.64,-73.95,40.65007,-73.96001,casual,,
5,5AE76B575577C467,electric_bike,2024-10-16 14:29:14.561000+00:00,2024-10-16 14:32:44.204000+00:00,,,Caton Ave & St. Pauls Pl,3335.09,40.65,-73.96,40.65153,-73.96277,casual,,
6,199986E82CE497E7,electric_bike,2024-10-12 09:57:28.317000+00:00,2024-10-12 10:10:32.675000+00:00,,,Winthrop St & Rogers Ave,3512.03,40.66,-73.98,40.65686,-73.95331,casual,,
7,736600775D99896E,electric_bike,2024-10-07 16:21:57.557000+00:00,2024-10-07 16:41:53.622000+00:00,,,Winthrop St & Rogers Ave,3512.03,40.7,-73.95,40.65686,-73.95331,casual,,
8,708F03396B2D13E0,electric_bike,2024-10-20 16:10:02.136000+00:00,2024-10-20 16:46:05.916000+00:00,,,Windsor Pl & Howard Pl,3579.04,40.69,-74.0,40.659491,-73.980139,casual,,
9,E9335E67D6445576,electric_bike,2024-10-05 17:58:28.917000+00:00,2024-10-05 18:17:44.059000+00:00,,,Flatbush Ave & Lincoln Rd,3593.1,40.66,-73.96,40.66131,-73.96091,casual,,


In [107]:
query = f"""
    SELECT *
    FROM `{bq_dataset_name}.{bq_table_name}`
    WHERE end_station_name IS NULL
    LIMIT 10
"""

df = client.query(query).to_dataframe()
df



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,cleaned_start_station_id,cleaned_start_station_name
0,17C766D3A655BF8A,classic_bike,2024-01-04 22:51:30.140000+00:00,2024-01-05 23:51:26.067000+00:00,Caton Ave & Bedford Ave,3368.04,,,40.65237,-73.95623,,,member,,
1,13EF25F33B63EA85,classic_bike,2024-01-05 08:13:53.376000+00:00,2024-01-06 09:13:40.930000+00:00,14 St & 7 Ave,3731.11,,,40.663779,-73.983968,,,member,,
2,998C13F35E3AA327,classic_bike,2024-01-15 14:48:00.507000+00:00,2024-01-16 15:47:53.855000+00:00,Halsey St & Tompkins Ave,4319.07,,,40.682369,-73.944118,,,member,,
3,73A4227F8817C643,classic_bike,2024-01-02 07:53:15.249000+00:00,2024-01-03 08:53:09.992000+00:00,Carroll St & Columbia St,4348.07,,,40.683046,-74.003486,,,member,,
4,A482F744AD22ECE9,classic_bike,2024-01-15 15:49:10.968000+00:00,2024-01-16 16:48:51.174000+00:00,Atlantic Ave & Fort Greene Pl,4354.07,,,40.683826,-73.976323,,,member,,
5,5B497FF3315AD009,classic_bike,2024-01-23 09:41:56.595000+00:00,2024-01-24 10:41:51.461000+00:00,Warren St & Smith St,4371.01,,,40.685424,-73.991278,,,member,,
6,38689AEFFD0D8270,classic_bike,2024-01-06 15:40:21.941000+00:00,2024-01-07 16:40:17.865000+00:00,Putnam Ave & Throop Ave,4392.04,,,40.685153,-73.94111,,,member,,
7,6CBC7FF21FB45CA3,classic_bike,2024-01-27 12:04:07.207000+00:00,2024-01-28 13:04:02.710000+00:00,Warren St & Court St,4413.08,,,40.686371,-73.993833,,,member,,
8,B36E561B23B3D403,classic_bike,2024-01-29 15:33:28.034000+00:00,2024-01-30 16:33:19.773000+00:00,Monroe St & Tompkins Ave,4434.06,,,40.686203,-73.944694,40.68,-73.95,member,,
9,32BD4CFEEC4F53EC,classic_bike,2024-01-13 10:51:14.782000+00:00,2024-01-14 11:51:06.763000+00:00,3 Ave & Schermerhorn St,4437.01,,,40.686832,-73.979677,,,member,,


In [11]:
query = f"""
    SELECT COUNT(*) as cnt
    FROM `{bq_dataset_name}.{bq_table_name}`
    WHERE start_station_id = 'nan' OR end_station_id = 'nan'
"""

nan_cnt = client.query(query).to_dataframe()['cnt'][0]
nan_cnt



np.int64(182206)

In [37]:
query = f"""
    CREATE OR REPLACE TABLE {cleaned_table_ref}
    PARTITION BY DATE_TRUNC(started_at, MONTH)
    CLUSTER BY member_casual, rideable_type 
        AS
            SELECT * FROM `{bq_dataset_name}.{bq_table_name}`
            WHERE start_station_id != 'nan' AND end_station_id != 'nan'
"""

job = client.query(query)
job.result()


<google.cloud.bigquery.table._EmptyRowIterator at 0x74f5785f3710>

In [38]:
query = f"""
    SELECT COUNT(*) as cnt
    FROM {cleaned_table_ref}
    WHERE start_station_id = 'nan' OR end_station_id = 'nan'
"""

nan_cnt = client.query(query).to_dataframe()['cnt'][0]
nan_cnt



np.int64(0)

In [39]:
query = f"""
    SELECT start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    WHERE start_station_id IS NULL OR SAFE_CAST(start_station_id AS FLOAT64) IS NULL
    GROUP BY start_station_id;
"""

df = client.query(query).to_dataframe()
df

Unnamed: 0,start_station_id,f0_
0,JC066,3
1,Lab - NYC,6
2,HB305,4
3,JC098,1
4,HB202,3
...,...,...
72,HB404,2
73,JC024,6
74,HB603,5
75,HB303,1


In [40]:
regex = r'^[A-Za-z]{2}\d{3}$'

query = f"""
    DELETE 
    FROM {cleaned_table_ref}
    WHERE REGEXP_CONTAINS(start_station_id, r'{regex}')
"""

job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x74f5785f3290>

In [46]:
query = f"""
    SELECT start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    WHERE start_station_id IS NULL OR SAFE_CAST(start_station_id AS FLOAT64) IS NULL
    GROUP BY start_station_id;
"""

df = client.query(query).to_dataframe()
df



Unnamed: 0,start_station_id,f0_


In [42]:
regex_patterns = [
    r'^SYS\d{3}$',          # SYS + 3 digits
    r'(?i)Demo',            # Contains "Demo"
    r'(?i)Lab - NYC',       # Contains "Lab - NYC"
    r'(?i)Morgan'           # Contains "Morgan"
]

combined_pattern = '|'.join(regex_patterns)

query = f"""
    DELETE FROM {cleaned_table_ref}
    WHERE REGEXP_CONTAINS(start_station_id, r'{combined_pattern}')
"""

job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x74f589d89c40>

In [43]:
regex = r'^\d+\.\d+_$'

query = f"""
    UPDATE {cleaned_table_ref}
    SET start_station_id = REPLACE(start_station_id, '_', '')
    WHERE REGEXP_CONTAINS(start_station_id, r'{regex}')
"""

job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x74f5804a22d0>

In [45]:
regex_id = r'_Pillar$'
regex_name = r'(?i)\s*Pillar\s*'

query = f"""
    UPDATE {cleaned_table_ref}
    SET start_station_id = REGEXP_REPLACE(start_station_id, r'{regex_id}', ''),
        start_station_name = TRIM(REGEXP_REPLACE(start_station_name, r'{regex_name}', ' '))
    WHERE CONTAINS_SUBSTR(start_station_id, 'Pillar')
"""

job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x74f57821e4e0>

In [61]:
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM `{cleaned_table_ref}`
    WHERE start_station_id IS NULL OR SAFE_CAST(start_station_id AS FLOAT64) IS NULL
    GROUP BY start_station_name, start_station_id;
"""

df = client.query(query).to_dataframe()
df



Unnamed: 0,start_station_name,start_station_id,f0_


In [62]:
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    GROUP BY start_station_name, start_station_id
"""

df = client.query(query).to_dataframe()
df[df['start_station_id'].duplicated(keep=False)].sort_values(by='start_station_id')



Unnamed: 0,start_station_name,start_station_id,f0_
26,Eastern Pkwy & St Marks Ave,3982.01,1169
570,Eastern Pkwy\t& St Marks Ave,3982.01,1557
1808,Bridge St & Front St,4968.03,19105
1744,Bridge St & Water St,4968.03,26853
573,Morton St & Washington St,5772.05,30999
773,Morton St & Greenwich St,5772.05,39300
2145,34th Ave & Vernon Blvd,6873.01,12397
435,34 Ave & Vernon Blvd,6873.01,3864
453,Central Park West & W 68 St,7079.06,82351
2187,Central Park W & W 68 St,7079.06,26746


In [63]:
query = f"""
    UPDATE {cleaned_table_ref}
    SET start_station_id = CAST(FORMAT('%.2f', CAST(start_station_id AS FLOAT64)) AS STRING)
    WHERE TRUE
"""
job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d090134bc0>

In [64]:
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    GROUP BY start_station_name, start_station_id
"""

df = client.query(query).to_dataframe()
df[df['start_station_id'].duplicated(keep=False)].sort_values(by='start_station_id')



Unnamed: 0,start_station_name,start_station_id,f0_
66,Eastern Pkwy & St Marks Ave,3982.01,1169
1155,Eastern Pkwy\t& St Marks Ave,3982.01,1557
1547,Bridge St & Water St,4968.03,26853
943,Bridge St & Front St,4968.03,19105
1119,Morton St & Greenwich St,5772.05,39300
363,Morton St & Washington St,5772.05,30999
1946,34 Ave & Vernon Blvd,6873.01,3864
1566,34th Ave & Vernon Blvd,6873.01,12397
1014,Central Park W & W 68 St,7079.06,26746
254,Central Park West & W 68 St,7079.06,82351


In [65]:
convert_names = {
    'West': 'W',
    'Fort': 'Ft',
    'Av': 'Ave',
    'Ichan Stadium': 'Icahn Stadium'
}

sql_expr = "start_station_name"
for old, new in convert_names.items():
    sql_expr = f"REGEXP_REPLACE({sql_expr}, r'\\b{old}\\b', '{new}')"

query = f"""
    UPDATE {cleaned_table_ref}
    SET start_station_name = {sql_expr}
    WHERE TRUE
"""

job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d090142c60>

In [66]:
pattern = r'\\t'

query = f"""
    UPDATE {cleaned_table_ref}
    SET start_station_name = REGEXP_REPLACE(start_station_name, r'{pattern}', ' ')
    WHERE REGEXP_CONTAINS(start_station_name, r'{pattern}')
"""

job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d08a740800>

In [67]:
pattern = r'(\d+)(st|nd|rd|th)\b'
replace = r'\1'

query = f"""
    UPDATE {cleaned_table_ref}
    SET start_station_id = REGEXP_REPLACE(start_station_name, r'{pattern}', r'{replace}')
    WHERE REGEXP_CONTAINS(start_station_name, r'{pattern}')
"""

job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d08a582c30>

In [68]:
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    GROUP BY start_station_name, start_station_id
"""

df = client.query(query).to_dataframe()
df[df['start_station_id'].duplicated(keep=False)].sort_values(by='start_station_id')



Unnamed: 0,start_station_name,start_station_id,f0_
846,Bridge St & Front St,4968.03,19105
1434,Bridge St & Water St,4968.03,26853
443,Morton St & Greenwich St,5772.05,39300
1401,Morton St & Washington St,5772.05,30999
1740,3 Ave & E 82 St,7154.1,41574
2217,3 Ave & E 81 St,7154.1,21819


In [76]:
convert_names = {
    'Bridge St & Water St': 'Bridge St & Front St',
    'Morton St & Washington St': 'Morton St & Greenwich St',
    '3 Ave & E 81 St': '3 Ave & E 82 St'
}

case_statements = []
for old, new in convert_names.items():
    case_statements.append(f"WHEN start_station_name = '{old}' THEN '{new}'")

query = f"""
    UPDATE {cleaned_table_ref}
    SET start_station_name = CASE {' '.join(case_statements)} ELSE start_station_name END
    WHERE start_station_name IN ({', '.join(f"'{old}'" for old in convert_names.keys())})
"""

job = client.query(query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x73d09017fef0>

In [77]:
query = f"""
    SELECT start_station_name, start_station_id, COUNT(*)
    FROM {cleaned_table_ref}
    GROUP BY start_station_name, start_station_id
"""

df = client.query(query).to_dataframe()
df[df['start_station_id'].duplicated(keep=False)].sort_values(by='start_station_id')



Unnamed: 0,start_station_name,start_station_id,f0_
