In [13]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

Define file path


In [14]:
current_dir = os.getcwd()
csv_data_file_name = "../csv_files/chapter12/data.csv"
csv_data_file_path = os.path.join(current_dir, csv_data_file_name)

Start to read csv files via Spark



In [15]:
spark_session = (SparkSession.builder.appName("Data Transformation Example").getOrCreate())
df = spark_session.read.csv(path=csv_data_file_path, header=True, inferSchema=True)

In [16]:
df.printSchema()

root
 |-- GEO.id: string (nullable = true)
 |-- GEO.id2: integer (nullable = true)
 |-- GEO.display-label: string (nullable = true)
 |-- rescen42010: integer (nullable = true)
 |-- resbase42010: integer (nullable = true)
 |-- respop72010: integer (nullable = true)
 |-- respop72011: integer (nullable = true)
 |-- respop72012: integer (nullable = true)
 |-- respop72013: integer (nullable = true)
 |-- respop72014: integer (nullable = true)
 |-- respop72015: integer (nullable = true)
 |-- respop72016: integer (nullable = true)
 |-- respop72017: integer (nullable = true)



In [17]:
df.show()

+--------------+-------+--------------------+-----------+------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|        GEO.id|GEO.id2|   GEO.display-label|rescen42010|resbase42010|respop72010|respop72011|respop72012|respop72013|respop72014|respop72015|respop72016|respop72017|
+--------------+-------+--------------------+-----------+------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|0500000US01001|   1001|Autauga County, A...|      54571|       54571|      54750|      55199|      54927|      54695|      54864|      54838|      55278|      55504|
|0500000US01003|   1003|Baldwin County, A...|     182265|      182265|     183110|     186534|     190048|     194736|     199064|     202863|     207509|     212628|
|0500000US01005|   1005|Barbour County, A...|      27457|       27457|      27332|      27351|      27175|      26947|      26749|      26264|      25774|      25270

Do Transformation with dataframe


In [18]:
df = (df.drop("GEO.id")
      .withColumnRenamed("GEO.id2", "id")
      .withColumnRenamed("rescen42010", "real2010")
      .withColumnRenamed("GEO.display-label", "label")
      .drop("resbase42010")
      .withColumnRenamed("respop72010", "est2010")
      .withColumnRenamed("respop72011", "est2011")
      .withColumnRenamed("respop72012", "est2012")
      .withColumnRenamed("respop72013", "est2013")
      .withColumnRenamed("respop72014", "est2014")
      .withColumnRenamed("respop72015", "est2015")
      .withColumnRenamed("respop72016", "est2016")
      .withColumnRenamed("respop72017", "est2017"))
df.show()

+----+--------------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+
|  id|               label|real2010|est2010|est2011|est2012|est2013|est2014|est2015|est2016|est2017|
+----+--------------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+
|1001|Autauga County, A...|   54571|  54750|  55199|  54927|  54695|  54864|  54838|  55278|  55504|
|1003|Baldwin County, A...|  182265| 183110| 186534| 190048| 194736| 199064| 202863| 207509| 212628|
|1005|Barbour County, A...|   27457|  27332|  27351|  27175|  26947|  26749|  26264|  25774|  25270|
|1007|Bibb County, Alabama|   22915|  22872|  22745|  22658|  22503|  22533|  22561|  22633|  22668|
|1009|Blount County, Al...|   57322|  57381|  57562|  57595|  57623|  57546|  57590|  57562|  58013|
|1011|Bullock County, A...|   10914|  10880|  10675|  10612|  10549|  10673|  10419|  10441|  10309|
|1013|Butler County, Al...|   20947|  20944|  20880|  20688|  20372|  20327|  20141|  19965

More transformations


In [19]:
df = (df.withColumn("countryState", F.split(F.column("label"), ","))
      .withColumn("stateId", F.expr("int(id/1000)"))
      .withColumn("countryId", F.expr("id%1000")))
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- label: string (nullable = true)
 |-- real2010: integer (nullable = true)
 |-- est2010: integer (nullable = true)
 |-- est2011: integer (nullable = true)
 |-- est2012: integer (nullable = true)
 |-- est2013: integer (nullable = true)
 |-- est2014: integer (nullable = true)
 |-- est2015: integer (nullable = true)
 |-- est2016: integer (nullable = true)
 |-- est2017: integer (nullable = true)
 |-- countryState: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- stateId: integer (nullable = true)
 |-- countryId: integer (nullable = true)



In [20]:
df.show(10, False)

+----+------------------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+---------------------------+-------+---------+
|id  |label                   |real2010|est2010|est2011|est2012|est2013|est2014|est2015|est2016|est2017|countryState               |stateId|countryId|
+----+------------------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+---------------------------+-------+---------+
|1001|Autauga County, Alabama |54571   |54750  |55199  |54927  |54695  |54864  |54838  |55278  |55504  |[Autauga County,  Alabama] |1      |1        |
|1003|Baldwin County, Alabama |182265  |183110 |186534 |190048 |194736 |199064 |202863 |207509 |212628 |[Baldwin County,  Alabama] |1      |3        |
|1005|Barbour County, Alabama |27457   |27332  |27351  |27175  |26947  |26749  |26264  |25774  |25270  |[Barbour County,  Alabama] |1      |5        |
|1007|Bibb County, Alabama    |22915   |22872  |22745  |22658  |22503  |22533  |22561  |22633 

Add another transformations

In [21]:
df = df.withColumn("country", F.column("countryState").getItem(0)).withColumn("state",F.column("countryState").getItem(1))
df.sample(.01).show(10, False)

+-----+---------------------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+------------------------------+-------+---------+---------------+-----------+
|id   |label                      |real2010|est2010|est2011|est2012|est2013|est2014|est2015|est2016|est2017|countryState                  |stateId|countryId|country        |state      |
+-----+---------------------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+------------------------------+-------+---------+---------------+-----------+
|1005 |Barbour County, Alabama    |27457   |27332  |27351  |27175  |26947  |26749  |26264  |25774  |25270  |[Barbour County,  Alabama]    |1      |5        |Barbour County | Alabama   |
|6055 |Napa County, California    |136484  |136794 |137905 |138876 |139978 |140918 |141507 |141649 |140973 |[Napa County,  California]    |6      |55       |Napa County    | California|
|8111 |San Juan County, Colorado  |699     |708    |690    |692    |69

Create the final statistic dataframe

In [23]:
drop_columns = ["id", "label", "real2010", "est2010", "est2011", "est2012", "est2013", "est2014", "est2015", "est2016",
                "est2017", "stateId", "countryId", "CountryState"]
statistic_df = (df.withColumn("diff", F.expr("est2010 - real2010"))
                .withColumn("growth", F.expr("est2017 - real2010"))
                .drop(*drop_columns))
statistic_df.sample(.01).show(10, False)

+--------------------+-----------+----+------+
|country             |state      |diff|growth|
+--------------------+-----------+----+------+
|Dallas County       | Alabama   |43  |-4605 |
|Cleveland County    | Arkansas  |-11 |-487  |
|Searcy County       | Arkansas  |-18 |-257  |
|San Francisco County| California|535 |79128 |
|Oglethorpe County   | Georgia   |-7  |-22   |
|Benton County       | Indiana   |9   |-241  |
|Lawrence County     | Indiana   |-32 |-468  |
|Morgan County       | Indiana   |250 |819   |
|Starke County       | Indiana   |-18 |-470  |
|Audubon County      | Iowa      |-21 |-541  |
+--------------------+-----------+----+------+
only showing top 10 rows

