In [15]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

Define data file paths

In [16]:
current_dir = os.getcwd()
penanners_csv_file_name = "../csv_files/hd/PEP_2017_PEPANNERS.csv"
institution_csv_file_name = "../csv_files/hd/institution_campus.csv"
country_zip_ratios_csv_file_name = "../csv_files/hd/country_zip_ratios.csv"
penanners_csv_file_path = os.path.join(current_dir, penanners_csv_file_name)
institution_csv_file_path = os.path.join(current_dir, institution_csv_file_name)
country_zip_ratios_csv_file_path = os.path.join(current_dir, country_zip_ratios_csv_file_name)

Initialize Spark Session

In [17]:
spark = SparkSession.builder.appName("Data Transformation with Joins").getOrCreate()

Ingesting and clean cencus data into Spark

In [18]:
cencus_df = (spark.read.format("csv")
             .option("header", "true")
             .option("inferSchema", "true")
             .option("encoding", "cp1252")
             .load(penanners_csv_file_path))
# Transformation data
drop_cols = ['GEO.id',
             'rescen42010',
             'resbase42010',
             'respop72010',
             'respop72011,',
             'respop72012',
             'respop72013',
             'respop72014',
             'respop72015',
             'respop72016']
cencus_df = (cencus_df.drop(*drop_cols)
             .withColumnRenamed('respop72017', 'pop2017')
             .withColumnRenamed('GEO.id2', 'country_id')
             .withColumnRenamed('GEO.display-label', 'country'))
cencus_df.show(10, False)

+----------+------------------------+-----------+-------+
|country_id|country                 |respop72011|pop2017|
+----------+------------------------+-----------+-------+
|1001      |Autauga County, Alabama |55199      |55504  |
|1003      |Baldwin County, Alabama |186534     |212628 |
|1005      |Barbour County, Alabama |27351      |25270  |
|1007      |Bibb County, Alabama    |22745      |22668  |
|1009      |Blount County, Alabama  |57562      |58013  |
|1011      |Bullock County, Alabama |10675      |10309  |
|1013      |Butler County, Alabama  |20880      |19825  |
|1015      |Calhoun County, Alabama |117785     |114728 |
|1017      |Chambers County, Alabama|34031      |33713  |
|1019      |Cherokee County, Alabama|25993      |25857  |
+----------+------------------------+-----------+-------+
only showing top 10 rows



Ingest, transform and clean up HE institution data into Spark

In [19]:
drop_cols = ["DapipId", "OpeId", "ParentName", "ParentDapipId",
             "LocationType", "Address", "GeneralPhone", "AdminName",
             "AdminPhone", "AdminEmail", "Fax", "UpdateDate", "zip9",
             "address_elements", "address_elements_count", "split_zipcode"]
institution_df = (spark.read.format('csv')
                  .option("header", "true")
                  .option("inferSchema", "true")
                  .option("encoding", "cp1252")
                  .load(institution_csv_file_path))
institution_df = (institution_df.filter("LocationType='Institution'")
                  .withColumn("address_elements", F.split(F.col('Address'), ' ')))
institution_df = institution_df.withColumn("address_elements_count", F.size(F.col('address_elements')))
institution_df = institution_df.withColumn("zip9", F.element_at(F.col('address_elements'), F.col('address_elements_count')))
institution_df = institution_df.withColumn('split_zipcode', F.split(F.col('zip9'), '-'))
institution_df = institution_df.withColumn('zipcode', F.col('split_zipcode').getItem(0))
institution_df = institution_df.drop(*drop_cols)
institution_df.show(10)

+--------------------+-------+
|        LocationName|zipcode|
+--------------------+-------+
|Community College...|  36112|
|Alabama A & M Uni...|  35762|
|University of Ala...|  35233|
|  Amridge University|  36117|
|University of Ala...|  35899|
|Alabama State Uni...|  36104|
|The University of...|  35487|
|Central Alabama C...|  35010|
|Athens State Univ...|  35611|
|Auburn University...|  36117|
+--------------------+-------+
only showing top 10 rows



Ingest country zip ratios data into Spark

In [25]:
country_zip_ratio_df = (spark.read.format("csv")
                        .option("header", "true")
                        .option("inferSchema", "true")
                        .load(country_zip_ratios_csv_file_path))
country_zip_ratio_df = country_zip_ratio_df.drop(*['res_ratio', 'bus_ratio', 'oth_ratio', 'tot_ratio'])
country_zip_ratio_df.show(5)

+------+-----+
|county|  zip|
+------+-----+
|  1001|36701|
|  1001|36051|
|  1001|36006|
|  1001|36003|
|  1001|36022|
+------+-----+
only showing top 5 rows



Start joining datasets

In [26]:
institution_country_zip_join_df = institution_df.join(country_zip_ratio_df, institution_df['zipcode'] == country_zip_ratio_df['zip'], "inner")
institution_country_zip_join_df.drop('zip')# remove duplicated column
institution_country_zip_join_df.show(10)

+--------------------+-------+------+-----+
|        LocationName|zipcode|county|  zip|
+--------------------+-------+------+-----+
|Community College...|  36112|  1101|36112|
|Alabama A & M Uni...|  35762|  1089|35762|
|University of Ala...|  35233|  1073|35233|
|  Amridge University|  36117|  1101|36117|
|University of Ala...|  35899|  1089|35899|
|Alabama State Uni...|  36104|  1101|36104|
|Central Alabama C...|  35010|  1123|35010|
|Central Alabama C...|  35010|  1051|35010|
|Central Alabama C...|  35010|  1037|35010|
|Athens State Univ...|  35611|  1083|35611|
+--------------------+-------+------+-----+
only showing top 10 rows



In [30]:
institution_country_zip_cencus_join =  institution_country_zip_join_df.join(cencus_df, institution_country_zip_join_df["county"] == cencus_df["country_id"] , "left_outer")
institution_country_zip_cencus_join = institution_country_zip_cencus_join.drop(*['country_id', 'county', 'zipcode']) # remove duplicated column
institution_country_zip_cencus_join.show(10, False)

+-----------------------------------+-----+--------------------------+-----------+-------+
|LocationName                       |zip  |country                   |respop72011|pop2017|
+-----------------------------------+-----+--------------------------+-----------+-------+
|Community College of the Air Force |36112|Montgomery County, Alabama|229162     |226646 |
|Alabama A & M University           |35762|Madison County, Alabama   |339519     |361046 |
|University of Alabama at Birmingham|35233|Jefferson County, Alabama |657789     |659197 |
|Amridge University                 |36117|Montgomery County, Alabama|229162     |226646 |
|University of Alabama at Huntsville|35899|Madison County, Alabama   |339519     |361046 |
|Alabama State University           |36104|Montgomery County, Alabama|229162     |226646 |
|Central Alabama Community College  |35010|Tallapoosa County, Alabama|41394      |40681  |
|Central Alabama Community College  |35010|Elmore County, Alabama    |80006      |81677  |