**How to Query the Chicago Taxi Dataset (BigQuery)**

In [1]:
import bq_helper
from bq_helper import BigQueryHelper
# Query helper bound to the public Chicago taxi trips dataset.
# Package introduction:
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
chicago_taxi = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="chicago_taxi_trips",
)

In [2]:
# Reuse the helper created in the first cell instead of constructing a second,
# identical BigQueryHelper for the same project and dataset. The `bq_assistant`
# name is kept because the following cells call it.
bq_assistant = chicago_taxi
# List the tables available in the dataset.
bq_assistant.list_tables()

['taxi_trips']

In [3]:
# Preview the first three rows of the taxi_trips table.
bq_assistant.head("taxi_trips", num_rows=3)

Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,c63b0eac4ae81ec853c9301063ccd33980de19ad,dd6e5bcacd9a0f2cf09ca5ffe57409184500b08460811a...,2014-05-29 09:45:00+00:00,2014-05-29 10:00:00+00:00,1080,0.0,,17031839100,6,32,...,0.0,19.05,Credit Card,Taxi Affiliation Services,41.943155,-87.640698,POINT (-87.640698076 41.9431550855),41.880994,-87.632746,POINT (-87.6327464887 41.8809944707)
1,979f6b589770f650a601ab5bd806399f0bb6fb8a,6f2cc4053af60221e38f6ab9fd083abee209098b045aa1...,2013-06-26 00:45:00+00:00,2013-06-26 01:00:00+00:00,1440,33.7,,17031081300,76,8,...,2.0,69.84,Cash,,41.979071,-87.90304,POINT (-87.9030396611 41.9790708201),41.898332,-87.620763,POINT (-87.6207628651 41.8983317935)
2,272e6d08a7bd88b024a8fe3955bb216f09a417d4,1e09e6453a168cabfcca5a4c9f4d1cb01fd4b38f58e2d9...,2013-11-07 12:00:00+00:00,2013-11-07 12:15:00+00:00,660,0.0,,17031281900,33,28,...,1.0,12.5,Credit Card,Blue Ribbon Taxi Association Inc.,41.849247,-87.624135,POINT (-87.6241352979 41.8492467545),41.879255,-87.642649,POINT (-87.642648998 41.8792550844)


In [4]:
# Show the schema fields (name, type, mode, description) of the taxi_trips table.
bq_assistant.table_schema("taxi_trips")

[SchemaField('unique_key', 'STRING', 'REQUIRED', 'Unique identifier for the trip.', ()),
 SchemaField('taxi_id', 'STRING', 'REQUIRED', 'A unique identifier for the taxi.', ()),
 SchemaField('trip_start_timestamp', 'TIMESTAMP', 'NULLABLE', 'When the trip started, rounded to the nearest 15 minutes.', ()),
 SchemaField('trip_end_timestamp', 'TIMESTAMP', 'NULLABLE', 'When the trip ended, rounded to the nearest 15 minutes.', ()),
 SchemaField('trip_seconds', 'INTEGER', 'NULLABLE', 'Time of the trip in seconds.', ()),
 SchemaField('trip_miles', 'FLOAT', 'NULLABLE', 'Distance of the trip in miles.', ()),
 SchemaField('pickup_census_tract', 'INTEGER', 'NULLABLE', 'The Census Tract where the trip began. For privacy, this Census Tract is not shown for some trips.', ()),
 SchemaField('dropoff_census_tract', 'INTEGER', 'NULLABLE', 'The Census Tract where the trip ended. For privacy, this Census Tract is not shown for some trips.', ()),
 SchemaField('pickup_community_area', 'INTEGER', 'NULLABLE', '

What are the maximum, minimum and average fares, by day of the week, for rides lasting 10 minutes or more?


In [5]:
# Fare statistics per day of week for rides of at least 600 seconds (10 minutes):
# maximum, minimum, average and standard deviation of `fare`, plus the ride count.
# The fare statistics are rendered with FORMAT('%3.2f', ...), so those columns
# come back as formatted text rather than numbers.
# NOTE(review): BigQuery documents DAYOFWEEK as 1 = Sunday .. 7 = Saturday — confirm
# before labelling the `day` column.
query1 = """SELECT
  EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS day,
  FORMAT('%3.2f', MAX(fare)) AS maximum_fare,
  FORMAT('%3.2f', MIN(fare)) AS minimum_fare,
  FORMAT('%3.2f', AVG(fare)) AS avg_fare,
  FORMAT('%3.2f', STDDEV(fare)) AS std_dev_fare,
  COUNT(1) AS rides
FROM
  `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE
  trip_seconds >= 600
GROUP BY
  day
ORDER BY
  day
        """
# Run the query through the helper with max_gb_scanned=10
# (presumably a cap on how much data the query may scan — verify against bq_helper).
response1 = chicago_taxi.query_to_pandas_safe(query1, max_gb_scanned=10)
# One row per day of week is expected, so head(10) displays the full result.
response1.head(10)

Unnamed: 0,day,maximum_fare,minimum_fare,avg_fare,std_dev_fare,rides
0,1,9900.41,0.0,18.48,53.3,5466828
1,2,9900.42,0.0,19.91,59.87,5028575
2,3,9900.41,0.0,18.68,57.43,5295719
3,4,9825.04,0.0,18.7,60.21,5667210
4,5,9702.08,0.0,18.45,55.54,6187603
5,6,9900.45,0.0,17.39,59.58,7172208
6,7,9900.21,0.0,15.59,53.57,6910394


Which drop-off areas have the highest average tip?


In [6]:
# Top 10 drop-off community areas ranked by average tip, with the largest tip
# seen in each area. Rows without a drop-off community area are excluded.
#
# The ranking is done on the numeric AVG(tips). Ordering by the `average_tip`
# alias would sort the FORMAT()-ed *strings*, i.e. lexicographically, so an
# average of '10.25' would be ranked below '9.50'.
query2 = """SELECT
  dropoff_community_area,
  FORMAT('%3.2f', AVG(tips)) AS average_tip,
  FORMAT('%3.2f', MAX(tips)) AS max_tip
FROM
  `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE
  dropoff_community_area IS NOT NULL
GROUP BY
  dropoff_community_area
ORDER BY
  AVG(tips) DESC
LIMIT
  10
        """
# Run the query through the helper with max_gb_scanned=10
# (presumably a cap on how much data the query may scan — verify against bq_helper).
response2 = chicago_taxi.query_to_pandas_safe(query2, max_gb_scanned=10)
# The query is limited to 10 rows, so head(10) displays the full result.
response2.head(10)

Unnamed: 0,dropoff_community_area,average_tip,max_tip
0,76,3.54,596.85
1,72,3.27,80.0
2,56,3.0,285.0
3,74,2.39,150.0
4,75,2.0,75.0
5,9,1.72,74.75
6,41,1.69,97.0
7,64,1.63,100.0
8,55,1.56,27.9
9,42,1.37,90.0


How does trip duration affect fare rates for trips lasting up to 90 minutes?


In [7]:
# Average fare per trip-duration bucket, built from three nested SELECTs:
#
# 1. Innermost: keep trips whose duration rounds to 1..90 minutes (inclusive),
#    group them by trip_seconds / rounded minutes, count the trips and sum the
#    fares per group, and assign each group to one of 10 buckets with
#    NTILE(10) ordered by duration.
# 2. Middle: for every bucket (window `quantiles`, partitioned by `quantile`)
#    take the smallest and largest rounded duration, and sum trips and fares
#    per (duration_in_minutes, quantile).
# 3. Outermost: label each bucket as 'MMm to MMm', total its trips, and compute
#    the average fare as SUM(total_fare) / SUM(trips).
#
# NOTE(review): the buckets hold very different numbers of trips (see the
# output below) — presumably because NTILE runs over the grouped duration rows
# rather than over individual trips; confirm if equal-sized buckets were intended.
# NOTE(review): the final ORDER BY spells the alias `Minutes_range` with a
# capital M; the query ran as written, but the casing differs from the alias.
# The zero-padded '%02.0f' labels make the text ordering match the numeric one.
query3 = """SELECT
  FORMAT('%02.0fm to %02.0fm', min_minutes, max_minutes) AS minutes_range,
  SUM(trips) AS total_trips,
  FORMAT('%3.2f', SUM(total_fare) / SUM(trips)) AS average_fare
FROM (
  SELECT
    MIN(duration_in_minutes) OVER (quantiles) AS min_minutes,
    MAX(duration_in_minutes) OVER (quantiles) AS max_minutes,
    SUM(trips) AS trips,
    SUM(total_fare) AS total_fare
  FROM (
    SELECT
      ROUND(trip_seconds / 60) AS duration_in_minutes,
      NTILE(10) OVER (ORDER BY trip_seconds / 60) AS quantile,
      COUNT(1) AS trips,
      SUM(fare) AS total_fare
    FROM
      `bigquery-public-data.chicago_taxi_trips.taxi_trips`
    WHERE
      ROUND(trip_seconds / 60) BETWEEN 1 AND 90
    GROUP BY
      trip_seconds,
      duration_in_minutes )
  GROUP BY
    duration_in_minutes,
    quantile
  WINDOW quantiles AS (PARTITION BY quantile)
  )
GROUP BY
  minutes_range
ORDER BY
  Minutes_range
        """
# Run the query through the helper with max_gb_scanned=10
# (presumably a cap on how much data the query may scan — verify against bq_helper).
response3 = chicago_taxi.query_to_pandas_safe(query3, max_gb_scanned=10)
# Ten buckets are produced by NTILE(10), so head(10) displays the full result.
response3.head(10)

Unnamed: 0,minutes_range,total_trips,average_fare
0,01m to 07m,29462275,6.17
1,07m to 14m,26605869,9.2
2,14m to 20m,12461147,14.31
3,20m to 27m,5275740,24.01
4,27m to 33m,3402285,29.82
5,33m to 40m,1936548,34.17
6,40m to 47m,1188024,36.99
7,47m to 55m,816889,39.27
8,55m to 66m,524929,41.88
9,66m to 90m,363612,46.6


![Chicago taxi fares by duration](https://cloud.google.com/bigquery/images/chicago-taxi-fares-by-duration.png)
https://cloud.google.com/bigquery/images/chicago-taxi-fares-by-duration.png