**How to Query the San Francisco Open Data (BigQuery Dataset)**

In [1]:
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
sf = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="san_francisco")

In [2]:
bq_assistant = BigQueryHelper("bigquery-public-data", "san_francisco")
bq_assistant.list_tables()

['311_service_requests',
 'bikeshare_stations',
 'bikeshare_status',
 'bikeshare_trips',
 'film_locations',
 'sffd_service_calls',
 'sfpd_incidents',
 'street_trees']

In [3]:
bq_assistant.head("film_locations", num_rows=30)

Unnamed: 0,title,release_year,locations,fun_facts,production_company,distributor,director,writer,actor_1,actor_2,actor_3
0,A Jitney Elopement,1915,20th and Folsom Streets,,The Essanay Film Manufacturing Company,General Film Company,Charles Chaplin,Charles Chaplin,Charles Chaplin,Edna Purviance,
1,A Jitney Elopement,1915,Golden Gate Park,"During San Francisco's Gold Rush era, the Park...",The Essanay Film Manufacturing Company,General Film Company,Charles Chaplin,Charles Chaplin,Charles Chaplin,Edna Purviance,
2,Greed,1924,Hayes Street at Laguna,,Metro-Goldwyn-Mayer (MGM),Metro-Goldwyn-Mayer (MGM),Eric von Stroheim,Eric von Stroheim,Zasu Pitts,Gibson Gowland,Jean Hersholt
3,Greed,1924,Bush and Sutter Streets,,Metro-Goldwyn-Mayer (MGM),Metro-Goldwyn-Mayer (MGM),Eric von Stroheim,Eric von Stroheim,Zasu Pitts,Gibson Gowland,Jean Hersholt
4,Greed,1924,Cliff House (1090 Point Lobos Avenue),"In 1887, the Cliff House was severely damaged ...",Metro-Goldwyn-Mayer (MGM),Metro-Goldwyn-Mayer (MGM),Eric von Stroheim,Eric von Stroheim,Zasu Pitts,Gibson Gowland,Jean Hersholt
5,The Jazz Singer,1927,Coffee Dan's (O'Farrell Street at Powell),,Warner Bros. Pictures,Warner Bros. Pictures,Alan Crosland,Alfred A. Cohn,Al Jolson,May McAvoy,Warner Oland
6,Barbary Coast,1935,,,The Samuel Goldwyn Company,United Artists,Howard Hawks,Ben Hecht,Mariam Hopkins,Edward G. Robinson,Joel McCrea
7,San Francisco,1936,The Barbary Coast,The Barbary Coast was a red-light district tha...,Metro-Goldwyn Mayer,Metro-Goldwyn Mayer,W.S. Van Dyke,Anita Loos,Clark Gable,Jeanette MacDonald,Spencer Tracy
8,San Francisco,1936,City Hall,The dome of SF's City Hall is almost a foot ta...,Metro-Goldwyn Mayer,Metro-Goldwyn Mayer,W.S. Van Dyke,Anita Loos,Clark Gable,Jeanette MacDonald,Spencer Tracy
9,After the Thin Man,1936,Coit Tower,The Tower was funded by a gift bequeathed by L...,Metro-Goldwyn Mayer,Metro-Goldwyn Mayer,W.S. Van Dyke,Frances Goodrich,William Powell,Myrna Loy,James Stewart


In [4]:
bq_assistant.table_schema("film_locations")

[SchemaField('title', 'STRING', 'NULLABLE', '', ()),
 SchemaField('release_year', 'INTEGER', 'NULLABLE', '', ()),
 SchemaField('locations', 'STRING', 'NULLABLE', '', ()),
 SchemaField('fun_facts', 'STRING', 'NULLABLE', '', ()),
 SchemaField('production_company', 'STRING', 'NULLABLE', '', ()),
 SchemaField('distributor', 'STRING', 'NULLABLE', '', ()),
 SchemaField('director', 'STRING', 'NULLABLE', '', ()),
 SchemaField('writer', 'STRING', 'NULLABLE', '', ()),
 SchemaField('actor_1', 'STRING', 'NULLABLE', '', ()),
 SchemaField('actor_2', 'STRING', 'NULLABLE', '', ()),
 SchemaField('actor_3', 'STRING', 'NULLABLE', '', ())]

Which neighborhoods have the highest proportion of offensive graffiti?


In [5]:
query0 = """SELECT
  neighborhood,
  ROUND(100*COUNTIF(STRPOS(descriptor,
        "- Not_Offensive") > 0) / COUNT(*), 2) AS not_offensive_pct,
  ROUND(100*COUNTIF(STRPOS(descriptor,
        "- Offensive") > 0) / COUNT(*), 2) AS offensive_pct,
  COUNT(*) AS total_count
FROM
  `bigquery-public-data.san_francisco.311_service_requests`
WHERE
  STRPOS(category,
    "Graffiti") > 0
GROUP BY
  neighborhood
ORDER BY
  offensive_pct DESC
LIMIT
  10;
        """
response0 = sf.query_to_pandas_safe(query0)
response0.head(10)

Unnamed: 0,neighborhood,not_offensive_pct,offensive_pct,total_count
0,Presidio Terrace,14.89,84.51,994
1,Presidio Heights,18.39,81.13,832
2,Golden Gate Park,20.32,79.55,4630
3,Lone Mountain,21.46,77.84,6770
4,Parnassus Heights,22.89,76.21,1564
5,Ashbury Heights,23.06,76.11,837
6,Presidio National Park,23.76,75.25,101
7,Cole Valley,26.45,73.16,3267
8,Panhandle,27.0,72.44,8541
9,Laurel Heights / Jordan Park,27.21,72.3,2462


Which complaint is most likely to be made using Twitter and in which neighborhood?


In [6]:
query1 = """SELECT
  neighborhood,
  complaint_type,
  COUNT(*) AS total_count
FROM
  `bigquery-public-data.san_francisco.311_service_requests`
WHERE
  Source="Twitter"
GROUP BY
  Neighborhood,
  complaint_type
ORDER BY
  total_count DESC
LIMIT
  30;
        """
response1 = sf.query_to_pandas_safe(query1)
response1.head(30)

Unnamed: 0,neighborhood,complaint_type,total_count
0,Mission,Graffiti on Parking_meter,1350
1,Mission,Graffiti on Pole,978
2,Inner Richmond,Sidewalk_Defect,925
3,Mission,Graffiti on Sidewalk_in_front_of_property,698
4,Mission,Graffiti on Building_commercial,544
5,Mission,Graffiti,524
6,Silver Terrace,General Cleaning,507
7,,MUNI - Services_Service_Delivery_Facilities,402
8,Mission,Graffiti on News_rack,385
9,Mission,General Cleaning,322


What are the most complained about Muni stops in San Francisco?


In [7]:
query2 = """SELECT
  descriptor,
  incident_address,
  COUNT(*) AS total_count
FROM
  `bigquery-public-data.san_francisco.311_service_requests`
WHERE
  category = "MUNI Feedback" AND incident_address != "Not associated with a specific address"
GROUP BY
  incident_address,
  descriptor
ORDER BY
  total_count DESC
LIMIT 10;
        """
response2 = sf.query_to_pandas_safe(query2)
response2.head(10)

Unnamed: 0,descriptor,incident_address,total_count
0,201_Pass_Up_Did_Not_Wait_for_Transferee,Intersection of 16TH ST and MISSION ST,87
1,201_Pass_Up_Did_Not_Wait_for_Transferee,Intersection of 24TH ST and MISSION ST,87
2,601_Delay_No_Show,Intersection of BEALE ST and FOLSOM ST,78
3,601_Delay_No_Show,Intersection of PINE ST and DAVIS ST,66
4,601_Delay_No_Show,Intersection of DAVIS ST and PINE ST,65
5,201_Pass_Up_Did_Not_Wait_for_Transferee,Intersection of 5TH ST and MARKET ST,58
6,601_Delay_No_Show,Intersection of SUTTER ST and SANSOME ST,54
7,201_Pass_Up_Did_Not_Wait_for_Transferee,Intersection of MARKET ST and 5TH ST,52
8,301_Discourtesy_to_Customer,Intersection of 16TH ST and MISSION ST,51
9,301_Discourtesy_to_Customer,Intersection of POWELL ST and MARKET ST,51


What are the top 10 incident types that the San Francisco Fire Department responds to?


In [8]:
query3 = """SELECT
  call_type,
  COUNT(*) AS call_type_count
FROM
  `bigquery-public-data.san_francisco.sffd_service_calls`
WHERE
  call_type != ''
GROUP BY
  call_type
ORDER BY
  call_type_count DESC
LIMIT
  10;
        """
response3 = sf.query_to_pandas_safe(query3, max_gb_scanned=10)
response3.head(10)

Unnamed: 0,call_type,call_type_count
0,Medical Incident,2950934
1,Structure Fire,605663
2,Alarms,486984
3,Traffic Collision,186443
4,Other,73508
5,Citizen Assist / Service Call,68976
6,Outside Fire,53177
7,Vehicle Fire,22318
8,Water Rescue,21721
9,Gas Leak (Natural and LP Gases),16889


How many medical incidents and structure fires are there in each neighborhood?


In [9]:
query4 = """SELECT
  neighborhood_district,
  COUNTIF(call_type = "Medical Incident") AS medical_incident_count,
  COUNTIF(call_type = "Structure Fire") AS structure_fire_count,
  Count(*) as total_count
FROM
  `bigquery-public-data.san_francisco.sffd_service_calls`
GROUP BY
  neighborhood_district
ORDER BY
  total_count DESC;
        """
response4 = sf.query_to_pandas_safe(query4, max_gb_scanned=10)
response4.head(10)

Unnamed: 0,neighborhood_district,medical_incident_count,structure_fire_count,total_count
0,,2950934,605663,4557045


What’s the average response time for each type of dispatched vehicle?


In [10]:
query5 = """SELECT
  unit_type,
  ROUND(AVG(TIMESTAMP_DIFF(on_scene_timestamp, received_timestamp, MINUTE)), 2)
    as latency,
  Count(*) as total_count
FROM
  `bigquery-public-data.san_francisco.sffd_service_calls`
WHERE
  EXTRACT(DATE from received_timestamp) = EXTRACT(DATE from on_scene_timestamp)
GROUP BY
  unit_type
ORDER BY
  latency ASC;
        """
response5 = sf.query_to_pandas_safe(query5, max_gb_scanned=10)
response5.head(10)

Unnamed: 0,unit_type,latency,total_count
0,ENGINE,5.94,1424301
1,TRUCK,6.73,285438
2,RESCUE SQUAD,7.09,36655
3,CHIEF,7.66,220663
4,MEDIC,9.84,1166177
5,RESCUE CAPTAIN,10.82,100376
6,PRIVATE,10.95,229565
7,AIRPORT,11.4,9717
8,SUPPORT,30.01,6997
9,INVESTIGATION,40.89,3929


Which category of police incidents have historically been the most common in San Francisco?


In [11]:
query6 = """SELECT
  category,
  count(*) as incident_count
FROM
  `bigquery-public-data.san_francisco.sfpd_incidents`
GROUP BY
  category
ORDER BY
  incident_count DESC
LIMIT
  10;
        """
response6 = sf.query_to_pandas_safe(query6, max_gb_scanned=10)
response6.head(10)

Unnamed: 0,category,incident_count
0,LARCENY/THEFT,467657
1,OTHER OFFENSES,304042
2,NON-CRIMINAL,233323
3,ASSAULT,190394
4,VEHICLE THEFT,125209
5,DRUG/NARCOTIC,118260
6,VANDALISM,113436
7,WARRANTS,99799
8,BURGLARY,89528
9,SUSPICIOUS OCC,78823


What were the most common police incidents in the category of LARCENY/THEFT in 2016?


In [12]:
query7 = """SELECT
  descript,
  COUNT(*) AS incident_count_2016
FROM
  `bigquery-public-data.san_francisco.sfpd_incidents`
WHERE
  category="LARCENY/THEFT"
  AND EXTRACT(YEAR FROM timestamp) = 2016
GROUP BY
  descript
ORDER BY
  incident_count_2016 DESC
LIMIT
  10;
        """
response7 = sf.query_to_pandas_safe(query7, max_gb_scanned=10)
response7.head(10)

Unnamed: 0,descript,incident_count_2016
0,GRAND THEFT FROM LOCKED AUTO,17752
1,PETTY THEFT OF PROPERTY,4425
2,PETTY THEFT FROM LOCKED AUTO,3995
3,GRAND THEFT OF PROPERTY,2206
4,GRAND THEFT FROM UNLOCKED AUTO,1919
5,PETTY THEFT FROM A BUILDING,1863
6,PETTY THEFT SHOPLIFTING,1701
7,GRAND THEFT FROM PERSON,1459
8,GRAND THEFT FROM A BUILDING,1115
9,GRAND THEFT PICKPOCKET,840


Which non-criminal incidents saw the biggest reporting change from 2015 to 2016?


In [13]:
query8 = """SELECT
  descript,
  COUNTIF(EXTRACT(YEAR FROM timestamp) = 2016) -
  COUNTIF(EXTRACT(YEAR FROM timestamp) = 2015) AS yoy_change,
  COUNTIF(EXTRACT(YEAR FROM timestamp) = 2016) AS count_2016
FROM
  `bigquery-public-data.san_francisco.sfpd_incidents`
WHERE
  category != "NON-CRIMINAL"
GROUP BY
  descript
ORDER BY
  ABS(yoy_change) DESC
LIMIT
  10;
        """
response8 = sf.query_to_pandas_safe(query8, max_gb_scanned=10)
response8.head(10)

Unnamed: 0,descript,yoy_change,count_2016
0,"DRIVERS LICENSE, SUSPENDED OR REVOKED",-1433,3376
1,STOLEN AUTOMOBILE,-1387,3603
2,"MALICIOUS MISCHIEF, VANDALISM",976,4267
3,GRAND THEFT FROM UNLOCKED AUTO,-861,1919
4,WARRANT ARREST,-846,3145
5,TRAFFIC VIOLATION,496,1820
6,TRAFFIC VIOLATION ARREST,460,2238
7,BATTERY,312,4216
8,"CREDIT CARD, THEFT BY USE OF",-307,1375
9,PETTY THEFT FROM LOCKED AUTO,-305,3995


What is the diameter of the average tree in the city of San Francisco?


In [14]:
query9 = """SELECT
  ROUND(AVG(CAST(dbh as FLOAT64)), 2) as avg_width
FROM
  `bigquery-public-data.san_francisco.street_trees`
WHERE dbh != "";
        """
response9 = sf.query_to_pandas_safe(query9, max_gb_scanned=10)
response9.head(10)

Unnamed: 0,avg_width
0,10.07


What is the largest number of a particular species of tree planted in a single year?


In [15]:
query10 = """SELECT
  EXTRACT(YEAR from plant_date) as plantdate,
  species,
  COUNT(*) as count
FROM
  `bigquery-public-data.san_francisco.street_trees`
WHERE
  plant_date IS NOT null AND
  species != "Tree(s) ::"
GROUP BY
  plantdate, species
ORDER BY
  count desc
LIMIT 10;
        """
response10 = sf.query_to_pandas_safe(query10, max_gb_scanned=10)
response10.head(10)

Unnamed: 0,plantdate,species,count
0,2008,Arbutus 'Marina' :: Hybrid Strawberry Tree,497
1,2008,Tristaniopsis laurina :: Swamp Myrtle,403
2,1998,Arbutus 'Marina' :: Hybrid Strawberry Tree,323
3,1998,Tristaniopsis laurina :: Swamp Myrtle,318
4,1995,Lophostemon confertus :: Brisbane Box,307
5,2000,Prunus cerasifera :: Cherry Plum,291
6,1995,Tristaniopsis laurina :: Swamp Myrtle,268
7,1998,Prunus cerasifera :: Cherry Plum,251
8,2006,Tristaniopsis laurina :: Swamp Myrtle,242
9,2005,Lophostemon confertus :: Brisbane Box,233


Which San Francisco locations feature the largest number of trees?


In [16]:
query11 = """SELECT
  latitude,
  longitude,
  COUNT(*) AS count
FROM
  `bigquery-public-data.san_francisco.street_trees`
WHERE latitude IS NOT null AND longitude IS NOT null
GROUP BY
  latitude, longitude
ORDER BY
  count DESC
LIMIT
  20;
        """
response11 = sf.query_to_pandas_safe(query11, max_gb_scanned=10)
response11.head(20)

Unnamed: 0,latitude,longitude,count
0,37.717092,-122.472662,59
1,37.793335,-122.394213,50
2,37.734135,-122.436707,49
3,37.780607,-122.403939,48
4,37.747866,-122.462344,47
5,47.269987,-138.28367,45
6,37.774807,-122.424087,43
7,37.729326,-122.493386,43
8,37.797653,-122.395986,40
9,37.77036,-122.40982,40
