# Columns and Expressions

## Prerequisites

Install Spark and Java in VM

In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.1
!wget -q https://apache.osuosl.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop2.tgz

In [2]:
ls -l # confirm the Spark .tgz was downloaded into the working directory

total 267684
drwxr-xr-x 1 root root      4096 Dec  6 14:35 [0m[01;34msample_data[0m/
-rw-r--r-- 1 root root 274099817 Oct 15 10:53 spark-3.3.1-bin-hadoop2.tgz


In [3]:
# extract the Spark tarball (it is a .tgz archive, not a zip) into the working directory
!tar xf spark-3.3.1-bin-hadoop2.tgz

In [4]:
!pip install -q findspark

In [5]:

!pip install py4j

# For maps
!pip install folium
!pip install plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 7.6 MB/s 
[?25hInstalling collected packages: py4j
Successfully installed py4j-0.10.9.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Define the environment

In [6]:
import os

# Tell the tooling where the JDK and the unpacked Spark distribution live, and
# pass the local master setting to any PySpark launch.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.1-bin-hadoop2",
        "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
    }
)

Start Spark Session

---

In [7]:
import findspark

# With no argument findspark takes the location from the SPARK_HOME environment
# variable, so the path is defined in one place and does not depend on the
# notebook's current working directory.
findspark.init()

from pyspark.sql import SparkSession

# create the session (getOrCreate reuses one if it already exists in this kernel)
spark = (
    SparkSession.builder
    .appName("Columns and Expressions")
    .master("local[*]")
    .getOrCreate()
)

# last expression of the cell: shows the Spark version in the output
spark.version

'3.3.1'

In [8]:
# display the SparkSession object created above
spark

In [9]:
# For Pandas conversion optimization: enable Arrow-based transfers.
# The older key "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0;
# this is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [10]:
# Import sql functions (the cells below use col, expr and when)
# NOTE(review): the wildcard presumably also rebinds Python builtins such as
# sum/min/max/round to Spark column functions — consider explicit imports; verify
# nothing else relies on the wildcard before changing it.
from pyspark.sql.functions import *

Download datasets

In [11]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/cars.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/movies.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/more_cars.json -P /dataset

Read JSON file

In [12]:
# Load the cars dataset from the downloaded JSON file, passing the
# inferSchema reader option.
carsDF = (
    spark.read
    .option("inferSchema", True)
    .json("/dataset/cars.json")
)

## Examples

Select a column

In [13]:
# Project only the Name column and print 3 rows without truncating the values
carsDF.select(col("Name")).show(3, truncate=False)

+-------------------------+
|Name                     |
+-------------------------+
|chevrolet chevelle malibu|
|buick skylark 320        |
|plymouth satellite       |
+-------------------------+
only showing top 3 rows



In [14]:
# Three equivalent ways to reference a column inside select():
# DataFrame attribute, col() function, and plain column-name string
carsDF.select(
    carsDF.Name,
    col("Acceleration"),
    "Weight_in_lbs"
).show(3)

+--------------------+------------+-------------+
|                Name|Acceleration|Weight_in_lbs|
+--------------------+------------+-------------+
|chevrolet chevell...|        12.0|         3504|
|   buick skylark 320|        11.5|         3693|
|  plymouth satellite|        11.0|         3436|
+--------------------+------------+-------------+
only showing top 3 rows



Expressions

In [None]:
# Derived columns built two ways: Column arithmetic and a SQL expression string via expr()
carsWithKgDF = carsDF.select(
    col("Name"),
    col("Weight_in_lbs"),
    (col("Weight_in_lbs")/2.2).cast("int").alias("Weight_in_kg_2"), # cast result to int
    # NOTE(review): lbs / 1000 is labelled "T", but that is not a pounds-to-tonnes conversion — confirm intent
    expr("Weight_in_lbs / 1000").cast("string").alias("Weight_in_T") # cast result to str
)
carsWithKgDF.printSchema()
carsWithKgDF.show(3)

root
 |-- Name: string (nullable = true)
 |-- Weight_in_lbs: long (nullable = true)
 |-- Weight_in_kg_2: integer (nullable = true)
 |-- Weight_in_T: string (nullable = true)

+--------------------+-------------+--------------+-----------+
|                Name|Weight_in_lbs|Weight_in_kg_2|Weight_in_T|
+--------------------+-------------+--------------+-----------+
|chevrolet chevell...|         3504|          1592|      3.504|
|   buick skylark 320|         3693|          1678|      3.693|
|  plymouth satellite|         3436|          1561|      3.436|
+--------------------+-------------+--------------+-----------+
only showing top 3 rows



In [None]:
# selectExpr takes every column as a SQL expression string;
# the computed column is not aliased, so its header is the expression text
carsWithSelectExprWeightsDF = carsDF.selectExpr(
    "Name",
    "Weight_in_lbs",
    "Weight_in_lbs / 2.2"
  )
carsWithSelectExprWeightsDF.show(3)

+--------------------+-------------+---------------------+
|                Name|Weight_in_lbs|(Weight_in_lbs / 2.2)|
+--------------------+-------------+---------------------+
|chevrolet chevell...|         3504|          1592.727273|
|   buick skylark 320|         3693|          1678.636364|
|  plymouth satellite|         3436|          1561.818182|
+--------------------+-------------+---------------------+
only showing top 3 rows



### DF Processing

Add a column

In [None]:
# withColumn keeps all existing columns and appends the new one; carsDF itself is not modified
carsWithKg3DF = carsDF.withColumn("Weight_in_kg_3", col("Weight_in_lbs") / 2.2)
carsWithKg3DF.show(3)

+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+------------------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|    Weight_in_kg_3|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+------------------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|1592.7272727272725|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|1678.6363636363635|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|1561.8181818181818|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+------------------+
only showing top 3 rows



Rename a column

In [None]:
# The new name deliberately contains spaces; the following cells show how to reference it
carsWithColumnRenamed = carsDF.withColumnRenamed("Weight_in_lbs", "Weight in pounds")
carsWithColumnRenamed.show(3)

+------------+---------+------------+----------+----------------+--------------------+------+----------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight in pounds|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+----------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|            3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|            3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|            3436|1970-01-01|
+------------+---------+------------+----------+----------------+--------------------+------+----------------+----------+
only showing top 3 rows



In [None]:
# careful with column names: the line below is left commented out, presumably because
# an unquoted name containing spaces is not a valid expression string
# carsWithColumnRenamed.selectExpr("Weight in pounds")

In [None]:
# as we have special characters (spaces) in the name, we have to wrap it in backticks (``)
carsWithColumnRenamed.selectExpr("`Weight in pounds`").show(3)

+----------------+
|Weight in pounds|
+----------------+
|            3504|
|            3693|
|            3436|
+----------------+
only showing top 3 rows



Remove a column

In [None]:
# schema before dropping columns, for comparison with the next cell
carsWithColumnRenamed.printSchema()

root
 |-- Acceleration: double (nullable = true)
 |-- Cylinders: long (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: long (nullable = true)
 |-- Miles_per_Gallon: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Weight in pounds: long (nullable = true)
 |-- Year: string (nullable = true)



In [None]:
# drop() accepts several column names at once and returns a new DataFrame
dropColsDF = carsWithColumnRenamed.drop("Cylinders", "Displacement")
dropColsDF.printSchema()


root
 |-- Acceleration: double (nullable = true)
 |-- Horsepower: long (nullable = true)
 |-- Miles_per_Gallon: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Weight in pounds: long (nullable = true)
 |-- Year: string (nullable = true)



Filtering

In [None]:
# filter() and where() are used with the same predicate; the printed counts confirm they match
nonUSCarsDF = carsDF.filter(col("Origin") != "USA")
nonUSCarsDF2 = carsDF.where(col("Origin") != "USA")
nonUSCarsDF.show(3)
print(f"{nonUSCarsDF.count()} == {nonUSCarsDF2.count()}")

+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        17.5|        4|       133.0|       115|            null|citroen ds-21 pallas|Europe|         3090|1970-01-01|
|        15.0|        4|       113.0|        95|            24.0|toyota corona mar...| Japan|         2372|1970-01-01|
|        14.5|        4|        97.0|        88|            27.0|        datsun pl510| Japan|         2130|1970-01-01|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
only showing top 3 rows

152 == 152


In [None]:
# filtering with expression strings: SQL syntax, so equality is a single '='
# and the literal is in single quotes
americanCarsDF = carsDF.filter("Origin = 'USA'")
americanCarsDF.show(3)

+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
only showing top 3 rows



Chain filters

In [None]:
# Three spellings of the same two-condition filter; only the first is displayed.
# 1) chained filter() calls
americanPowerfulCarsDF = carsDF.filter(col("Origin") == "USA").filter(col("Horsepower") > 150)
# 2) one filter with Column conditions combined by & (each condition in its own parentheses)
americanPowerfulCarsDF2 = carsDF.filter((col("Origin") == "USA") & (col("Horsepower") > 150))
# 3) one filter with a SQL expression string
americanPowerfulCarsDF3 = carsDF.filter("Origin = 'USA' and Horsepower > 150")
americanPowerfulCarsDF.show(3)

+------------+---------+------------+----------+----------------+-----------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|             Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+-----------------+------+-------------+----------+
|        11.5|        8|       350.0|       165|            15.0|buick skylark 320|   USA|         3693|1970-01-01|
|        10.0|        8|       429.0|       198|            15.0| ford galaxie 500|   USA|         4341|1970-01-01|
|         9.0|        8|       454.0|       220|            14.0| chevrolet impala|   USA|         4354|1970-01-01|
+------------+---------+------------+----------+----------------+-----------------+------+-------------+----------+
only showing top 3 rows



Unioning (adding more rows)

In [None]:
# read the extra cars file, then append its rows to carsDF
moreCarsDF = spark.read.option("inferSchema", "true").json("/dataset/more_cars.json")
allCarsDF = carsDF.union(moreCarsDF) # works if the DFs have the same schema (union pairs columns by position, not by name)

In [None]:
# sanity check: the union's row count should equal the sum of both inputs
print(f"{carsDF.count()} + {moreCarsDF.count()} =? {allCarsDF.count()}")

406 + 2 =? 408


Distinct values

In [None]:
# unique values of the Origin column, then how many there are
allCountriesDF = carsDF.select("Origin").distinct()
allCountriesDF.count()

3

## Exercises
1. Read the movies DF and select 2 columns of your choice
2. Create another column summing up the total profit of the movies = US_Gross + Worldwide_Gross + DVD sales. Are you obtaining nulls? How can you solve it?
3. Select all COMEDY movies with IMDB rating above 6
Use as many versions as possible

Exercise 1

In [None]:
# load the movies dataset downloaded earlier
moviesDF = spark.read.option("inferSchema", "true").json("/dataset/movies.json")

In [None]:
# preview the first rows to see which columns are available
moviesDF.show(3)

+-------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+------+--------------------+------------+--------+---------------+
|Creative_Type|Director|Distributor|IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|Source|               Title|US_DVD_Sales|US_Gross|Worldwide_Gross|
+-------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+------+--------------------+------------+--------+---------------+
|         null|    null|   Gramercy|        6.1|      1071|          R|       null|          8000000|   12-Jun-98|                  null|            null|  null|      The Land Girls|        null|  146083|         146083|
|         null|    null|     Strand|        6.9|       207|          R|      Drama|           300000|    7-Aug-98|  

In [None]:
# Exercise 1: the same kind of projection written in several styles.
# plain column-name strings
moviesReleaseDF = moviesDF.select("Title", "Release_Date")

# mixed references: DataFrame attribute, col(), string and expr()
moviesReleaseDF2 = moviesDF.select(
    moviesDF.Title,
    col("Release_Date"),
    "Major_Genre",
    expr("IMDB_Rating"))
    
# SQL expression strings
moviesReleaseDF3 = moviesDF.selectExpr(
    "Title", "Release_Date")

In [None]:
# check the four selected columns and their types
moviesReleaseDF2.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Release_Date: string (nullable = true)
 |-- Major_Genre: string (nullable = true)
 |-- IMDB_Rating: double (nullable = true)



Exercise 2

In [None]:
# Exercise 2, naive version: a plain sum of the three columns.
# A null in any operand makes the whole sum null (see Total_Gross in the output).
moviesProfitDF = moviesDF.select(
    col("Title"),
    col("US_Gross"),
    col("Worldwide_Gross"),
    col("US_DVD_Sales"),
    (col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales")).alias("Total_Gross"))

moviesProfitDF.show(3)

+--------------------+--------+---------------+------------+-----------+
|               Title|US_Gross|Worldwide_Gross|US_DVD_Sales|Total_Gross|
+--------------------+--------+---------------+------------+-----------+
|      The Land Girls|  146083|         146083|        null|       null|
|First Love, Last ...|   10876|          10876|        null|       null|
|I Married a Stran...|  203134|         203134|        null|       null|
+--------------------+--------+---------------+------------+-----------+
only showing top 3 rows



In [None]:
# we can replace nulls, or make the sum null-safe (via udf, filter, SQL statement, etc.); we will not describe it in detail now
# here fillna(0) is called on the whole DataFrame (no column subset) before the projection
moviesProfitDF2 = moviesDF.fillna(0).select(
    col("Title"),
    (col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales")).alias("Total_Gross"))

moviesProfitDF2.show(3)

+--------------------+-----------+
|               Title|Total_Gross|
+--------------------+-----------+
|      The Land Girls|     292166|
|First Love, Last ...|      21752|
|I Married a Stran...|     406268|
+--------------------+-----------+
only showing top 3 rows



In [None]:
# same null replacement, with the sum written as a SQL expression string and aliased with "as"
moviesProfitDF3 = moviesDF.fillna(0).selectExpr(
    "Title",
    "US_Gross",
    "Worldwide_Gross",
    "US_DVD_Sales",
    "US_Gross + Worldwide_Gross + US_DVD_Sales as Total_Gross")

moviesProfitDF3.show(3)

+--------------------+--------+---------------+------------+-----------+
|               Title|US_Gross|Worldwide_Gross|US_DVD_Sales|Total_Gross|
+--------------------+--------+---------------+------------+-----------+
|      The Land Girls|  146083|         146083|           0|     292166|
|First Love, Last ...|   10876|          10876|           0|      21752|
|I Married a Stran...|  203134|         203134|           0|     406268|
+--------------------+--------+---------------+------------+-----------+
only showing top 3 rows



In [None]:
# same null replacement, with the total appended through withColumn after the projection
moviesProfitDF4 = moviesDF.fillna(0).select("Title", "US_Gross", "Worldwide_Gross", "US_DVD_Sales") \
    .withColumn("Total_Gross", col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales"))

moviesProfitDF4.show(3)

+--------------------+--------+---------------+------------+-----------+
|               Title|US_Gross|Worldwide_Gross|US_DVD_Sales|Total_Gross|
+--------------------+--------+---------------+------------+-----------+
|      The Land Girls|  146083|         146083|           0|     292166|
|First Love, Last ...|   10876|          10876|           0|      21752|
|I Married a Stran...|  203134|         203134|           0|     406268|
+--------------------+--------+---------------+------------+-----------+
only showing top 3 rows



In [None]:
# using conditions over columns to handle with nulls (harder but sometimes necessary)
moviesProfitDF5 = moviesDF.select("Title", "US_Gross", "Worldwide_Gross", "US_DVD_Sales") \
    .withColumn("US_DVD_Sales", when(col("US_DVD_Sales").isNotNull(), col("US_DVD_Sales")).otherwise(0)) \
    .withColumn("Total_Gross", col("US_Gross") + col("Worldwide_Gross") + col("US_DVD_Sales"))

moviesProfitDF5.show(3)

+--------------------+--------+---------------+------------+-----------+
|               Title|US_Gross|Worldwide_Gross|US_DVD_Sales|Total_Gross|
+--------------------+--------+---------------+------------+-----------+
|      The Land Girls|  146083|         146083|           0|     292166|
|First Love, Last ...|   10876|          10876|           0|      21752|
|I Married a Stran...|  203134|         203134|           0|     406268|
+--------------------+--------+---------------+------------+-----------+
only showing top 3 rows



Exercise 3

In [None]:
# Exercise 3, version 1: one where() with Column conditions combined by &.
# The filter runs before the projection, so it does not rely on Spark resolving
# Major_Genre after select() has already dropped that column.
comediesDF = moviesDF \
    .where((col("Major_Genre") == "Comedy") & (col("IMDB_Rating") > 6)) \
    .select("Title", "IMDB_Rating")

comediesDF.show(3)

+--------------------+-----------+
|               Title|IMDB_Rating|
+--------------------+-----------+
|I Married a Stran...|        6.8|
|24 7: Twenty Four...|        6.9|
|          Four Rooms|        6.4|
+--------------------+-----------+
only showing top 3 rows



In [None]:

# Exercise 3, version 2: chained where() calls, one condition each.
# The filters run before the projection, so they do not rely on Spark resolving
# Major_Genre after select() has already dropped that column.
comediesDF2 = moviesDF \
    .where(col("Major_Genre") == "Comedy") \
    .where(col("IMDB_Rating") > 6) \
    .select("Title", "IMDB_Rating")

comediesDF2.show(3)

+--------------------+-----------+
|               Title|IMDB_Rating|
+--------------------+-----------+
|I Married a Stran...|        6.8|
|24 7: Twenty Four...|        6.9|
|          Four Rooms|        6.4|
+--------------------+-----------+
only showing top 3 rows



In [None]:
# Exercise 3, version 3: the whole predicate as a SQL expression string.
# The filter runs before the projection, so it does not rely on Spark resolving
# Major_Genre after select() has already dropped that column.
comediesDF3 = moviesDF \
    .where("Major_Genre = 'Comedy' and IMDB_Rating > 6") \
    .select("Title", "IMDB_Rating")

comediesDF3.show(3)

+--------------------+-----------+
|               Title|IMDB_Rating|
+--------------------+-----------+
|I Married a Stran...|        6.8|
|24 7: Twenty Four...|        6.9|
|          Four Rooms|        6.4|
+--------------------+-----------+
only showing top 3 rows

