**How to Query the USA Names Data (BigQuery Dataset)**

In [1]:
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
# Helper bound to the public `usa_names` dataset; the query cells below run
# their SQL through this object.
usa = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="usa_names",
)

In [2]:
# `usa` (created in the first cell) already targets the same project and
# dataset, so reuse it instead of constructing a second identical helper.
# The `bq_assistant` name is kept because the cells that follow use it.
bq_assistant = usa
# List the tables available in the usa_names dataset.
bq_assistant.list_tables()

['usa_1910_2013', 'usa_1910_current']

In [3]:
# Preview 15 rows of the usa_1910_current table to see what the data looks like.
bq_assistant.head("usa_1910_current", num_rows=15)

Unnamed: 0,state,gender,year,name,number
0,OH,F,2014,Dani,13
1,OH,F,2013,Jacey,7
2,AK,M,1957,Vincent,6
3,CA,M,1921,Mitchell,7
4,DC,M,1940,Irvin,8
5,FL,M,1974,Marshall,17
6,NC,F,2009,Kenadie,6
7,AR,F,1972,Sonja,18
8,SC,F,2000,Amya,10
9,AR,F,1940,Jannie,8


In [4]:
# Show each column's name, type, mode, and description for usa_1910_current.
bq_assistant.table_schema("usa_1910_current")

[SchemaField('state', 'STRING', 'NULLABLE', '2-digit state code', ()),
 SchemaField('gender', 'STRING', 'NULLABLE', 'Sex (M=male or F=female)', ()),
 SchemaField('year', 'INTEGER', 'NULLABLE', '4-digit year of birth', ()),
 SchemaField('name', 'STRING', 'NULLABLE', 'Given name of a person at birth', ()),
 SchemaField('number', 'INTEGER', 'NULLABLE', 'Number of occurrences of the name', ())]

What are the most common names?


In [5]:
# Total occurrences per (name, gender) pair in usa_1910_2013, largest first.
# GROUP BY 1, 2 and ORDER BY 3 refer to the SELECT columns by position.
# The CAST turns the summed count into a FLOAT64 (hence the ".0" in the
# displayed totals); COALESCE substitutes 0 if the sum is NULL.
query1 = """
  SELECT
  names_step_1.name AS names_step_1_name,
  names_step_1.gender AS names_step_1_gender,
  COALESCE(CAST(SUM(names_step_1.number) AS FLOAT64),0) AS namesstep1totalpopulat_1
FROM
  `bigquery-public-data.usa_names.usa_1910_2013` AS names_step_1
GROUP BY
  1,
  2
ORDER BY
  3 DESC
LIMIT
  500;
        """
# NOTE(review): query_to_pandas_safe is presumably bq_helper's size-guarded
# query runner -- confirm what it returns when a query is rejected.
response1 = usa.query_to_pandas_safe(query1)
# The query keeps the top 500 rows; only the first 50 are displayed here.
response1.head(50)

Unnamed: 0,names_step_1_name,names_step_1_gender,namesstep1totalpopulat_1
0,James,M,4924235.0
1,John,M,4818746.0
2,Robert,M,4703680.0
3,Michael,M,4280040.0
4,William,M,3811998.0
5,Mary,F,3728041.0
6,David,M,3541625.0
7,Richard,M,2526927.0
8,Joseph,M,2467298.0
9,Charles,M,2237170.0


What are the most common female names?


In [6]:
# Total occurrences per name in usa_1910_2013, restricted to rows whose
# gender is 'F', largest first. GROUP BY 1 and ORDER BY 2 refer to the
# SELECT columns by position. The CAST turns the summed count into a
# FLOAT64 (hence the ".0" in the displayed totals); COALESCE substitutes 0
# if the sum is NULL.
query2 = """
  SELECT
  names_step_1.name AS names_step_1_name,
  COALESCE(CAST(SUM(names_step_1.number) AS FLOAT64),0) AS namesstep1totalpopulat_1
FROM
  `bigquery-public-data.usa_names.usa_1910_2013` AS names_step_1
WHERE
  (names_step_1.gender = 'F')
GROUP BY
  1
ORDER BY
  2 DESC
LIMIT
  500;
        """
# NOTE(review): query_to_pandas_safe is presumably bq_helper's size-guarded
# query runner -- confirm what it returns when a query is rejected.
response2 = usa.query_to_pandas_safe(query2)
# The query keeps the top 500 rows; only the first 50 are displayed here.
response2.head(50)

Unnamed: 0,names_step_1_name,namesstep1totalpopulat_1
0,Mary,3728041.0
1,Patricia,1567405.0
2,Elizabeth,1490772.0
3,Jennifer,1460272.0
4,Linda,1445838.0
5,Barbara,1422631.0
6,Margaret,1120006.0
7,Susan,1107916.0
8,Dorothy,1051262.0
9,Jessica,1036257.0


What are 50 of the names recorded for Texas?

In [7]:
# Fetch 50 name values from rows whose state is Texas ("TX").
# Each table row is a per-state/gender/year count for a name, not a single
# person, so the same name can appear more than once. There is no ORDER BY,
# so which 50 rows come back is not guaranteed to be stable between runs.
# LIMIT matches the 50 rows actually displayed (it was 100, with half of the
# fetched rows never shown).
query3 = """
    SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013`
    WHERE state = "TX"
    LIMIT 50;
        """
# NOTE(review): query_to_pandas_safe is presumably bq_helper's size-guarded
# query runner -- confirm what it returns when a query is rejected.
response3 = usa.query_to_pandas_safe(query3)
response3.head(50)

Unnamed: 0,name
0,Frances
1,Alice
2,Beatrice
3,Ella
4,Gertrude
5,Josephine
6,Lula
7,Blanche
8,Marjorie
9,Christine
