**How to Query the BigQuery Sample Tables (BigQuery Dataset)**

In [1]:
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
sampleTables = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="samples")

In [2]:
bq_assistant = BigQueryHelper("bigquery-public-data", "samples")
bq_assistant.list_tables()

['github_nested',
 'github_timeline',
 'gsod',
 'natality',
 'shakespeare',
 'trigrams',
 'wikipedia']

In [3]:
bq_assistant.head("wikipedia", num_rows=20)

Unnamed: 0,title,id,language,wp_namespace,is_redirect,revision_id,contributor_ip,contributor_id,contributor_username,timestamp,is_minor,is_bot,reversion_id,comment,num_characters
0,Strait of Messina Bridge,1462053,,0,,115349459,80.129.30.196,,,1173977859,,,,/* Controversy and concerns */,20009
1,Linux Mint,8577251,,0,,209452597,121.209.217.45,,,1209649052,,,,/* Comparison with Ubuntu */,13819
2,Espérance Sportive de Tunis,1953499,,0,,302602567,41.230.49.104,,,1247840296,,,,/* Notable Former Players */,12668
3,Robinho,1172821,,0,,171033541,85.59.201.199,,,1194899902,,,,/* External links */,11832
4,May 6,19514,,0,,42108324,69.76.4.245,,,1141422917,,,,,11608
5,Cecilia Reyes,900517,,0,,60209228,84.141.10.80,,,1151086811,,,,/* Powers and abilities */,4808
6,MiniBooNE,2572596,,0,,22146237,67.85.249.11,,,1125369006,,,,,146
7,Wario,100061,,0,,30882250,64.203.5.29,,,1134265171,,,,/* Powers */,22448
8,Phantom Darkness,12776066,,0,True,165049267,206.74.6.225,,,1192571898,,,,/* Super Rares */,3399
9,Melissa McGhee,4095975,,0,True,56159022,65.4.34.92,,,1149095260,,,,,2504


What words did Shakespeare use that contain the stem "laugh" or "prais"?

In [4]:
query1 = """SELECT
  word
FROM
  `bigquery-public-data.samples.shakespeare`
WHERE
  word LIKE 'prais%' AND word LIKE '%ing' OR
  word LIKE 'laugh%' AND word LIKE '%ed';
        """
response1 = sampleTables.query_to_pandas_safe(query1, max_gb_scanned=10)
response1.head(10)

Unnamed: 0,word
0,laughed
1,laughed
2,laughed
3,laughed
4,laughed
5,laughed
6,laughed
7,laughed
8,praising
9,praising


What are twenty random words from the Shakespeare table?

In [5]:
query2 = """SELECT word
FROM `bigquery-public-data.samples.shakespeare`
WHERE RAND() < 20/164656;
        """
response2 = sampleTables.query_to_pandas_safe(query2, max_gb_scanned=10)
response2.head(20)

Unnamed: 0,word
0,hurl
1,Novi
2,mile
3,dignity
4,send
5,mass
6,dudgeon
7,empty
8,lions
9,inquiry


Did Shakespeare ever use words that begin with the letters "TH"?

In [6]:
query3 = """SELECT
  word,
  corpus,
  COUNT(word)
FROM
  `bigquery-public-data.samples.shakespeare`
WHERE
  word LIKE 'th%'
GROUP BY
  word,
  corpus;
        """
response3 = sampleTables.query_to_pandas_safe(query3, max_gb_scanned=10)
response3.head(10)

Unnamed: 0,word,corpus,f0_
0,things,hamlet,1
1,thereof,hamlet,1
2,third,hamlet,1
3,they'll,hamlet,1
4,thrift,hamlet,1
5,thieves,hamlet,1
6,think'st,hamlet,1
7,throw,hamlet,1
8,thousand,hamlet,1
9,think't,hamlet,1


Did Shakespeare ever use words that end with the letters "TH"?

In [7]:
query4 = """SELECT
  word,
  corpus,
  COUNT(word)
FROM
  `bigquery-public-data.samples.shakespeare`
WHERE
  word LIKE '%th'
GROUP BY
  word,
  corpus;
        """
response4 = sampleTables.query_to_pandas_safe(query4, max_gb_scanned=10)
response4.head(10)

Unnamed: 0,word,corpus,f0_
0,forthwith,hamlet,1
1,sith,hamlet,1
2,birth,hamlet,1
3,With,hamlet,1
4,Both,hamlet,1
5,Youth,hamlet,1
6,'faith,hamlet,1
7,both,hamlet,1
8,month,hamlet,1
9,forth,hamlet,1


How many children were born to mother's over 50 in the ten states with the most births?

In [8]:
query5 = """SELECT
  mother_age,
  COUNT(mother_age) total
FROM
  `bigquery-public-data.samples.natality`
WHERE
  state IN (SELECT
              state
            FROM
              (SELECT
                 state,
                 COUNT(state) total
               FROM
                 `bigquery-public-data.samples.natality`
               GROUP BY
                 state
               ORDER BY
                 total DESC
               LIMIT 20))
  AND mother_age > 50
GROUP BY
  mother_age
ORDER BY
  mother_age DESC;
        """
response5 = sampleTables.query_to_pandas_safe(query5, max_gb_scanned=10)
response5.head(10)

Unnamed: 0,mother_age,total
0,54,91
1,53,137
2,52,176
3,51,347


How many children were born to mother's over 50 in the forty states with the least births?

In [9]:
query6 = """SELECT
  mother_age,
  COUNT(mother_age) total
FROM
  `bigquery-public-data.samples.natality`
WHERE
  state NOT IN (SELECT
                  state
                FROM
                  (SELECT
                     state,
                     COUNT(state) total
                   FROM
                     `bigquery-public-data.samples.natality`
                   GROUP BY
                     state
                   ORDER BY
                     total DESC
                   LIMIT 10))
  AND mother_age > 50
GROUP BY
  mother_age
ORDER BY
  mother_age DESC;
        """
response6 = sampleTables.query_to_pandas_safe(query6, max_gb_scanned=10)
response6.head(10)

Unnamed: 0,mother_age,total
0,54,28
1,53,49
2,52,66
3,51,133


How many boys and girls were born between the years 2000 and 2002?

In [10]:
query8 = """SELECT
  year,
  is_male,
  COUNT(1) as count
FROM
  `bigquery-public-data.samples.natality`
WHERE
  year >= 2000
  AND year <= 2002
GROUP BY
  ROLLUP(year, is_male)
ORDER BY
  year,
  is_male;
        """
response8 = sampleTables.query_to_pandas_safe(query8, max_gb_scanned=10)
response8.head(10)

Unnamed: 0,year,is_male,count
0,,,12122730
1,2000.0,,4063823
2,2000.0,False,1984255
3,2000.0,True,2079568
4,2001.0,,4031531
5,2001.0,False,1970770
6,2001.0,True,2060761
7,2002.0,,4027376
8,2002.0,False,1966519
9,2002.0,True,2060857


What is the average birth weight in Ohio in 2003 for mothers who smoke cigarettes?



In [11]:
query9 = """SELECT
  cigarette_use,
  /* Finds average and standard deviation */
  AVG(weight_pounds) baby_weight,
  STDDEV(weight_pounds) baby_weight_stdev,
  AVG(mother_age) mother_age
FROM
  `bigquery-public-data.samples.natality`
WHERE
  year=2003 AND state='OH'
/* Group the result values by those */
/* who smoked and those who didn't.  */
GROUP BY
  cigarette_use;
        """
response9 = sampleTables.query_to_pandas_safe(query9, max_gb_scanned=10)
response9.head(10)

Unnamed: 0,cigarette_use,baby_weight,baby_weight_stdev,mother_age
0,True,6.838024,1.305376,25.014222
1,,7.080023,1.574009,28.535461
2,False,7.331774,1.342397,27.593438


How many boys and girls were born in the 3 states with the most births?

In [12]:
query10 = """SELECT
  state,
  /* If 'is_male' is True, return 'Male', */
  /* otherwise return 'Female' */
  IF (is_male, 'Male', 'Female') AS sex,
  /* The count value is aliased as 'cnt' */
  /* and used in the HAVING clause below. */
  COUNT(*) AS cnt
FROM
  `bigquery-public-data.samples.natality`
WHERE
  state != ''
GROUP BY
  state, sex
HAVING
  cnt > 3000000
ORDER BY
  cnt DESC;
        """
response10 = sampleTables.query_to_pandas_safe(query10, max_gb_scanned=10)
response10.head(10)

Unnamed: 0,state,sex,cnt
0,CA,Male,7060826
1,CA,Female,6733288
2,TX,Male,5107542
3,TX,Female,4879247
4,NY,Male,4442246
5,NY,Female,4227891
6,IL,Male,3089555
