### Access respective Azure storage container and blob to fetch data.

In [0]:
# Azure storage access info
blob_account_name = "azureopendatastorage"
blob_container_name = "censusdatacontainer"
blob_relative_path = "release/us_population_county/"
blob_sas_token = r""

### Construct remote blob path to access the parquet files remotely.

In [0]:
# Allow SPARK to read from Blob remotely
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set(
  'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
  blob_sas_token)
print('Remote blob path: ' + wasbs_path)

Remote blob path: wasbs://censusdatacontainer@azureopendatastorage.blob.core.windows.net/release/us_population_county/


### Read the parquet files and load the data into Temporary view.

In [0]:
# SPARK read parquet, note that it won't load any data yet by now
df = spark.read.parquet(wasbs_path)
print('Register the DataFrame as a SQL temporary view: source')
df.createOrReplaceTempView('source')

Register the DataFrame as a SQL temporary view: source


### SQL Operations on the Temporary View.

Display total number of rows

In [0]:
#Display total number of rows:
print("Count of rows :")
display(spark.sql('SELECT count(*) as Count FROM source'))

Count of rows :


Count
3664512


Display top 10 rows

In [0]:
# Display top 10 rows
print('Displaying top 10 rows: ')
display(spark.sql('SELECT * FROM source LIMIT 10'))

Displaying top 10 rows: 


decennialTime,stateName,countyName,population,race,sex,minAge,maxAge,year
2010,Texas,Crockett County,123,WHITE ALONE,Male,5.0,9.0,2010
2010,Texas,Crockett County,1,ASIAN ALONE,Female,67.0,69.0,2010
2010,Texas,Crockett County,111,WHITE ALONE,Female,55.0,59.0,2010
2010,Texas,Crockett County,64,TWO OR MORE RACES,,,,2010
2010,Texas,Crockett County,18,,Male,85.0,,2010
2010,Texas,Crockett County,16,AMERICAN INDIAN AND ALASKA NATIVE ALONE,Female,,,2010
2010,Texas,Crockett County,7,WHITE ALONE,Male,21.0,21.0,2010
2010,Texas,Crockett County,45,,Female,85.0,,2010
2010,Texas,Crockett County,0,NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE,Female,67.0,69.0,2010
2010,Texas,Crockett County,4,SOME OTHER RACE ALONE,Male,67.0,69.0,2010


In [0]:
display(spark.sql('SELECT year, statename, countyname, race, sex, minage, maxage, population FROM source LIMIT 10'))

year,statename,countyname,race,sex,minage,maxage,population
2010,Texas,Crockett County,WHITE ALONE,Male,5.0,9.0,123
2010,Texas,Crockett County,ASIAN ALONE,Female,67.0,69.0,1
2010,Texas,Crockett County,WHITE ALONE,Female,55.0,59.0,111
2010,Texas,Crockett County,TWO OR MORE RACES,,,,64
2010,Texas,Crockett County,,Male,85.0,,18
2010,Texas,Crockett County,AMERICAN INDIAN AND ALASKA NATIVE ALONE,Female,,,16
2010,Texas,Crockett County,WHITE ALONE,Male,21.0,21.0,7
2010,Texas,Crockett County,,Female,85.0,,45
2010,Texas,Crockett County,NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE,Female,67.0,69.0,0
2010,Texas,Crockett County,SOME OTHER RACE ALONE,Male,67.0,69.0,4


Display list of unique years in the data

In [0]:

display(spark.sql('SELECT distinct year FROM source'))

year
2010
2000


Display the list of unique state names in the data

In [0]:
display(spark.sql('SELECT distinct statename FROM source'))

statename
Utah
Hawaii
Minnesota
Ohio
Oregon
Arkansas
Texas
North Dakota
Pennsylvania
Connecticut


Display the list of unique County names in the data

In [0]:
display(spark.sql('SELECT distinct countyname FROM source'))

countyname
Owen County
McLennan County
Williamson County
Yoakum County
Wasatch County
Canadian County
Unicoi County
Rock County
Lewis and Clark County
Webster County


Display total population by year

In [0]:
display(spark.sql('SELECT year, count(population) AS Total_Population FROM source group by year'))

year,Total_Population
2010,1855296
2000,1809216


Databricks visualization. Run in Databricks to view.

In [0]:
display(spark.sql('SELECT distinct race FROM source'))

race
AMERICAN INDIAN AND ALASKA NATIVE ALONE
""
WHITE ALONE
ASIAN ALONE
NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE
SOME OTHER RACE ALONE
TWO OR MORE RACES
BLACK OR AFRICAN AMERICAN ALONE


In [0]:
display(spark.sql('SELECT year, statename, count(population) AS Total_Population FROM source group by year, statename order by statename'))

year,statename,Total_Population
2010,Alabama,38592
2000,Alabama,38592
2000,Alaska,15552
2010,Alaska,16704
2000,Arizona,8640
2010,Arizona,8640
2000,Arkansas,43200
2010,Arkansas,43200
2000,California,33408
2010,California,33408


In [0]:
display(spark.sql('SELECT year, statename, count(population) AS Total_Population FROM source WHERE year = 2010 group by year, statename order by statename'))

year,statename,Total_Population
2010,Alabama,38592
2010,Alaska,16704
2010,Arizona,8640
2010,Arkansas,43200
2010,California,33408
2010,Colorado,36864
2010,Connecticut,4608
2010,Delaware,1728
2010,District of Columbia,576
2010,Florida,38592


In [0]:
display(spark.sql('SELECT year, sex, count(population) AS Total_Population FROM source group by year,sex'))

year,sex,Total_Population
2010,Male,618432
2010,Female,618432
2010,,618432
2000,Male,603072
2000,,603072
2000,Female,603072


Databricks visualization. Run in Databricks to view.