In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession that every later cell uses through the name `spark`.
spark = SparkSession.builder.getOrCreate()

Tabular data: data represented in a two-dimensional table whose cells each contain a single value, organized into rows and columns.

#### Creating a data frame

In [None]:
# First parameter, data: a list of rows (lists, tuples, ...), a pandas DataFrame,
# or an RDD (resilient distributed dataset).
# Second parameter, schema: the column names (or a full schema); when omitted,
# PySpark infers it from the data.
# data is required: calling spark.createDataFrame() with no arguments raises a
# TypeError, so a tiny illustrative data set is passed here.
spark.createDataFrame(
    [["apple", 3], ["banana", 5]],
    schema=["fruit", "quantity"],
).show()


### PySpark for analyzing and processing tabular data

Note: when using toPandas(), remember that you lose the advantages of working with multiple machines, as the data will accumulate on the driver &rarr; use for an aggregated or manageable (rows X columns <= 100000) data set.

In [4]:
# Root directory of the broadcast-logs data set. Change this single line to
# point the notebook at a different copy of the data.
BROADCAST_LOGS_DIR = "/sparkdata/DataAnalysisWithPythonAndPySpark-Data/broadcast_logs"

# Individual files and folders below that root (kept as plain strings).
broadcast_sample = f"{BROADCAST_LOGS_DIR}/BroadcastLogs_2018_Q3_M8_sample.CSV"
call_signs = f"{BROADCAST_LOGS_DIR}/Call_Signs"
data_dict = f"{BROADCAST_LOGS_DIR}/data dictionary.doc"
ref_tables = f"{BROADCAST_LOGS_DIR}/ReferenceTables"


#### Reading and accessing delimited data

In [5]:
# Load the pipe-delimited sample file: the first line supplies the column
# names, column types are inferred, and timestamps use a date-only pattern.
logs = spark.read.options(
    sep="|",
    header=True,
    inferSchema=True,
    timestampFormat="yyyy-MM-dd",
).csv(broadcast_sample)

PySpark can infer the schema of a CSV file by setting the inferSchema optional
parameter to True. PySpark accomplishes this by reading the data twice: once
for setting the appropriate types for each column and once to ingest the data in
the inferred format.

In [7]:
# Show each column's name and the type inferred while reading the CSV.
logs.printSchema()

root
 |-- BroadcastLogID: integer (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable 

### The basics of data manipulation: Selecting, dropping, renaming, ordering, diagnosing

#### <b>select() method</b>

In [21]:
# Preview three columns; truncate=False prints every cell value in full.
logs.select(
    "BroadcastLogID",
    "LogServiceID",
    "LogDate",
).show(n=5, truncate=False)

+--------------+------------+----------+
|BroadcastLogID|LogServiceID|LogDate   |
+--------------+------------+----------+
|1196192316    |3157        |2018-08-01|
|1196192317    |3157        |2018-08-01|
|1196192318    |3157        |2018-08-01|
|1196192319    |3157        |2018-08-01|
|1196192320    |3157        |2018-08-01|
+--------------+------------+----------+
only showing top 5 rows



In [None]:
# F (pyspark.sql.functions) is imported here so that this cell also runs on a
# fresh kernel; the notebook's other import of F appears only in a later cell.
from pyspark.sql import functions as F

# Four equivalent ways of selecting the same three columns. Column names are
# spelled exactly as in the schema ("BroadcastLogID") so the cell does not
# depend on Spark's case-insensitive name resolution.

# 1. Using the string to column conversion
logs.select("BroadcastLogID", "LogServiceID", "LogDate")

# 2. The same names unpacked from a list
logs.select(*["BroadcastLogID", "LogServiceID", "LogDate"])

# 3. Passing the column object explicitly
logs.select(
    F.col("BroadcastLogID"), F.col("LogServiceID"), F.col("LogDate")
)

# 4. Column objects unpacked from a list
logs.select(
    *[F.col("BroadcastLogID"), F.col("LogServiceID"), F.col("LogDate")]
)

Note: when using Databricks, simply call display(dataframe)

#### <b>Deleting columns</b>

In [22]:
# Rebind logs to a data frame without the BroadcastLogID and SequenceNO columns.
logs = logs.drop(
    "BroadcastLogID",
    "SequenceNO",
)

In [23]:
# Check that the dropped column no longer appears among the column names.
print("BroadcastLogID" in logs.columns)

False


In [24]:
# Same check for the second dropped column.
print("SequenceNO" in logs.columns)

False


Unlike select(), dropping a nonexistent column is a no-op. PySpark will ignore columns it does not find.

#### <b>New columns with _withColumn()_</b>

In [25]:
from pyspark.sql import functions as F

In [26]:
# Look at the first few raw Duration values.
logs.select("Duration").show(n=5)

+----------------+
|        Duration|
+----------------+
|02:00:00.0000000|
|00:00:30.0000000|
|00:00:15.0000000|
|00:00:15.0000000|
|00:00:15.0000000|
+----------------+
only showing top 5 rows



In [28]:
# Print the (name, type) pair of the Duration column.
print(logs.select("Duration").dtypes)

[('Duration', 'string')]


We see that the column is of type string, formatted as HH:MM:SS.mmmmmmm (hours, minutes, seconds, then a seven-digit fractional part).

In [32]:
# Split Duration into integer hours, minutes and seconds by character position
# (positions are 1-based); the fractional part after the seconds is ignored.
logs.select(
    "Duration",
    F.substring("Duration", 1, 2).cast("int").alias("dur_hours"),
    F.substring("Duration", 4, 2).cast("int").alias("dur_min"),
    F.substring("Duration", 7, 2).cast("int").alias("dur_seconds"),
).distinct().show(n=5)

+----------------+---------+-------+-----------+
|        Duration|dur_hours|dur_min|dur_seconds|
+----------------+---------+-------+-----------+
|00:04:52.0000000|        0|      4|         52|
|00:10:06.0000000|        0|     10|          6|
|00:26:41.0000000|        0|     26|         41|
|00:05:29.0000000|        0|      5|         29|
|00:08:18.0000000|        0|      8|         18|
+----------------+---------+-------+-----------+
only showing top 5 rows



In [34]:
# Collapse Duration into a single integer: the total number of seconds.
logs.select(
    "Duration",
    (
        F.substring("Duration", 1, 2).cast("int") * 60 * 60  # hours
        + F.substring("Duration", 4, 2).cast("int") * 60  # minutes
        + F.substring("Duration", 7, 2).cast("int")  # seconds
    ).alias("Duration_seconds"),
).distinct().show(n=5)

+----------------+----------------+
|        Duration|Duration_seconds|
+----------------+----------------+
|01:59:30.0000000|            7170|
|00:31:00.0000000|            1860|
|00:28:08.0000000|            1688|
|00:10:30.0000000|             630|
|00:32:00.0000000|            1920|
+----------------+----------------+
only showing top 5 rows



#### What if we only add a column to the end of our df?


In [36]:
# Store the total duration in seconds as a column named Duration_seconds.
logs = logs.withColumn(
    "Duration_seconds",
    F.col("Duration").substr(startPos=1, length=2).cast("int") * 60 * 60
    + F.col("Duration").substr(startPos=4, length=2).cast("int") * 60
    + F.col("Duration").substr(startPos=7, length=2).cast("int"),
)

In [37]:
# Print the schema after adding Duration_seconds.
logs.printSchema()

root
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable = true)
 |-- NetworkAffiliationID: integer (nullable = true)
 |-- SpecialAttentionID: inte

Using withColumn() and giving a name that already exists in the dataframe will make PySpark overwrite the existing column.

We can create columns both with select() and withColumn(). select() will be useful when we want to work with a few columns, and withColumn() will be useful when we want to add some columns without changing the rest of the data set.
 * Creating many (100+) new columns with withColumn() will slow Spark. For these cases, use select()

#### <b>Renaming and reordering columns</b>
can be done with select() and alias(), but also with withColumnRenamed()

In [39]:
# Rename the derived column to an all-lowercase name.
logs = logs.withColumnRenamed(
    existing="Duration_seconds",
    new="duration_seconds",
)

In [42]:
# Print the schema after the rename to duration_seconds.
logs.printSchema()

root
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable = true)
 |-- NetworkAffiliationID: integer (nullable = true)
 |-- SpecialAttentionID: inte

applying to all columns: toDF()

In [43]:
# toDF() takes one new name per column; here every name is lower-cased.
# The result is only printed, logs itself is not reassigned.
logs.toDF(*(name.lower() for name in logs.columns)).printSchema()

root
 |-- logserviceid: integer (nullable = true)
 |-- logdate: date (nullable = true)
 |-- audiencetargetageid: integer (nullable = true)
 |-- audiencetargetethnicid: integer (nullable = true)
 |-- categoryid: integer (nullable = true)
 |-- closedcaptionid: integer (nullable = true)
 |-- countryoforiginid: integer (nullable = true)
 |-- dubdramacreditid: integer (nullable = true)
 |-- ethnicprogramid: integer (nullable = true)
 |-- productionsourceid: integer (nullable = true)
 |-- programclassid: integer (nullable = true)
 |-- filmclassificationid: integer (nullable = true)
 |-- exhibitionid: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- endtime: string (nullable = true)
 |-- logentrydate: date (nullable = true)
 |-- productionno: string (nullable = true)
 |-- programtitle: string (nullable = true)
 |-- starttime: string (nullable = true)
 |-- subtitle: string (nullable = true)
 |-- networkaffiliationid: integer (nullable = true)
 |-- specialattentionid: inte

To reorder columns, simply use select() with the wanted order.

In [44]:
# Reorder the columns alphabetically by selecting them in sorted order.
logs.select(*sorted(logs.columns)).printSchema()

root
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- BroadcastOriginPointID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CompositionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- Language1: integer (nullable = true)
 |-- Language2: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- NetworkAffiliationID: integer (nullable = true)
 |-- Producer1: string (nullable = true)
 |-- Producer2: string (nullable = true)
 |-- ProductionNO: string (nu

#### <b>Diagnosing a data frame with describe() and summary()</b>

quickly exploring numerical columns
 * describe(): summary statistics 

In [48]:
# Summary statistics for each of the first four columns, one table per column.
# For a column type describe() cannot summarise (LogDate here), PySpark only
# displays the "summary" title column.
for column_name in logs.columns[0:4]:
    logs.describe(column_name).show()

+-------+------------------+
|summary|      LogServiceID|
+-------+------------------+
|  count|            238945|
|   mean| 3450.890284375065|
| stddev|199.50673962555592|
|    min|              3157|
|    max|              3925|
+-------+------------------+

+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

+-------+-------------------+
|summary|AudienceTargetAgeID|
+-------+-------------------+
|  count|              16112|
|   mean| 3.4929245283018866|
| stddev| 1.0415963394745125|
|    min|                  1|
|    max|                  4|
+-------+-------------------+

+-------+----------------------+
|summary|AudienceTargetEthnicID|
+-------+----------------------+
|  count|                  1710|
|   mean|    120.56432748538012|
| stddev|     71.98694059436133|
|    min|                     4|
|    max|                   337|
+-------+----------------------+



Where describe() takes *cols as a parameter, summary() takes *statistics as a parameter. So with summary() you first have to select the columns you want to work with.

In [50]:
# summary() works on whole data frames, so select one column at a time and
# print its default set of statistics.
for column_name in logs.columns[0:3]:
    single_column = logs.select(column_name)
    single_column.summary().show()

+-------+------------------+
|summary|      LogServiceID|
+-------+------------------+
|  count|            238945|
|   mean| 3450.890284375065|
| stddev|199.50673962555592|
|    min|              3157|
|    25%|              3287|
|    50%|              3379|
|    75%|              3627|
|    max|              3925|
+-------+------------------+

+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    25%|
|    50%|
|    75%|
|    max|
+-------+

+-------+-------------------+
|summary|AudienceTargetAgeID|
+-------+-------------------+
|  count|              16112|
|   mean| 3.4929245283018866|
| stddev| 1.0415963394745125|
|    min|                  1|
|    25%|                  4|
|    50%|                  4|
|    75%|                  4|
|    max|                  4|
+-------+-------------------+



In [51]:
# Same loop, but ask summary() for the minimum and maximum only.
for column_name in logs.columns[0:3]:
    logs.select(column_name).summary("min", "max").show()

+-------+------------+
|summary|LogServiceID|
+-------+------------+
|    min|        3157|
|    max|        3925|
+-------+------------+

+-------+
|summary|
+-------+
|    min|
|    max|
+-------+

+-------+-------------------+
|summary|AudienceTargetAgeID|
+-------+-------------------+
|    min|                  1|
|    max|                  4|
+-------+-------------------+



We can pass count, mean, stddev, min, max, or a percentile given as a string such as "25%" (the default output above includes 25%, 50% and 75%).

* Note: both methods work <b>ONLY</b> on non-null values. Null values will not be counted.

### Exercises

Reread the data in a logs_raw data frame (the data file is ./data/broadcast_logs-
BroadcastLogs_2018_Q3_M8.CSV), this time without passing any optional parameters.
Print the first five rows of data, as well as the schema. What are the differences in
terms of data and schema between logs and logs_raw?

In [52]:
# Read the sample again so that logs holds the file's original columns; earlier
# cells reassigned logs after dropping, adding and renaming columns.
logs = (
    spark.read
    .option("sep", "|")
    .option("header", True)
    .option("inferSchema", True)
    .option("timestampFormat", "yyyy-MM-dd")
    .csv(broadcast_sample)
)

In [54]:
# Same file, read with every reader option left at its default.
logs_raw = spark.read.csv(broadcast_sample)

In [55]:
# First five rows of the option-less read.
logs_raw.show(n=5)

+--------------------+
|                 _c0|
+--------------------+
|BroadcastLogID|Lo...|
|1196192316|3157|2...|
|1196192317|3157|2...|
|1196192318|3157|2...|
|1196192319|3157|2...|
+--------------------+
only showing top 5 rows



In [57]:
# Inspect the schema produced by the option-less read.
logs_raw.printSchema()

root
 |-- _c0: string (nullable = true)



As we see, without `sep` and `header` every line is read into a single string column named `_c0` (the default naming convention), and the header line is treated as an ordinary data row.

Create a new data frame, logs_clean, that contains only the columns that do not
end with ID.

In [65]:
# Keep only the columns whose name does not end with "ID".
logs_clean = logs.select(
    *(name for name in logs.columns if not name.endswith("ID"))
)

In [66]:
# List the columns that remain in logs_clean.
logs_clean.printSchema()

root
 |-- LogDate: date (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable = true)
 |-- Producer1: string (nullable = true)
 |-- Producer2: string (nullable = true)
 |-- Language1: integer (nullable = true)
 |-- Language2: integer (nullable = true)

