# SQL Exercises

## Prerequisites

Install Spark and Java in VM

In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.1
!wget -q https://apache.osuosl.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop2.tgz

In [2]:
ls -l # check that the downloaded Spark .tgz archive is present in the working directory

total 267680
drwxr-xr-x 1 root root      4096 Dec  8 14:36 sample_data/
-rw-r--r-- 1 root root 274099817 Oct 15 10:53 spark-3.3.1-bin-hadoop2.tgz


In [3]:
# Extract the downloaded Spark tarball into the current directory
!tar xf spark-3.3.1-bin-hadoop2.tgz

In [4]:
# findspark is called below (findspark.init) before importing pyspark
!pip install -q findspark

Defining the environment

In [5]:
import os

# Tell the JVM, Spark and the PySpark launcher where to find the software
# installed in the previous cells, and run Spark locally on all cores.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.1-bin-hadoop2",
        "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
    }
)

Start Spark Session

---

In [6]:
import findspark

# With no argument, findspark reads the SPARK_HOME environment variable that was
# set in the environment cell. This avoids a second, relative copy of the path
# that only works when the working directory happens to be the extraction directory.
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) a local session that uses every available core.
spark = (
    SparkSession.builder
    .appName("SQL Exercises")
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell: displays the Spark version in use.
spark.version

'3.3.1'

In [7]:
# Display the SparkSession object created in the previous cell
spark

In [8]:
# Enable Arrow-based columnar transfers to speed up Spark <-> pandas conversion.
# "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0; the
# PySpark-specific key below is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [9]:
# Import sql functions
from pyspark.sql.functions import *

Download datasets

In [10]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/movies.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/employees.csv -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/salaries.csv -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/deptmanagers.csv -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2022/master/datasets/titles.csv -P /dataset

## Aggregations Exercises

1. Sum up all the worldwide profits of ALL the movies in the DF. Then sum the worldwide profits per director
2. Count how many distinct directors we have
3. Show the mean and standard deviation of US gross revenue for the movies (all the movies)
4. Compute the average IMDB rating and the average US gross revenue PER DIRECTOR
5. Sum up ALL the profits of ALL the movies in the DF. Then sum ALL the profits per director. Can you see null values? Why? How can you solve it?

In [None]:
# Load the movies dataset (JSON) downloaded earlier and inspect the inferred schema.
df = spark.read.format("json").load("/dataset/movies.json")

df.printSchema()

root
 |-- Creative_Type: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Distributor: string (nullable = true)
 |-- IMDB_Rating: double (nullable = true)
 |-- IMDB_Votes: long (nullable = true)
 |-- MPAA_Rating: string (nullable = true)
 |-- Major_Genre: string (nullable = true)
 |-- Production_Budget: long (nullable = true)
 |-- Release_Date: string (nullable = true)
 |-- Rotten_Tomatoes_Rating: long (nullable = true)
 |-- Running_Time_min: long (nullable = true)
 |-- Source: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- US_DVD_Sales: long (nullable = true)
 |-- US_Gross: long (nullable = true)
 |-- Worldwide_Gross: long (nullable = true)



In [None]:
# NOTE: sum, count, avg, stddev, countDistinct and col are the Spark SQL functions
# brought in by the star import above (sum shadows the Python builtin).

# Exercise 1 - total worldwide gross over all movies.
# The alias has to be attached to the aggregated column; aliasing the DataFrame
# returned by select() leaves the header as "sum(Worldwide_Gross)".
df.select(sum(col("Worldwide_Gross")).alias("Suma WorldWide")).show()

# US gross (not worldwide) per director, kept for comparison.
df.groupby(col("Director")).agg(sum(col("US_Gross"))).orderBy(col("Director")).show()

# Exercise 1 - worldwide gross per director.
df.select(col("Director"), col("Worldwide_Gross")).groupby("Director").sum("Worldwide_Gross").orderBy(col("Director")).show()

# Exercise 2 - non-null Director entries per group (groupby already yields one
# row per director, so no distinct() is needed), then the number of distinct directors.
df.groupby(col("Director")).agg(count(col("Director"))).orderBy(col("Director")).show()

df.select(countDistinct(col("Director"))).show()

# Exercise 3 - mean and standard deviation of US gross over all movies.
df.select(avg(col("US_Gross"))).show()

df.select(stddev(col("US_Gross"))).show()

# Exercise 4 - average IMDB rating and average US gross per director.
df.groupby(col("Director")).agg(avg("IMDB_Rating"), avg("US_Gross")).orderBy(col("Director")).show()

# Exercise 5 - total profit (worldwide gross minus production budget) over all movies.
df.select(sum(col("Worldwide_Gross"))-sum(col("Production_Budget"))).show()

# Exercise 5 - profit per director. Rows containing a null (null Director or a
# null profit) are dropped first; sorting is done last so the displayed order
# is the sort order.
df.groupby(col("Director")).agg(sum(col("Worldwide_Gross"))-sum(col("Production_Budget"))).na.drop().orderBy(col("Director")).show()


+--------------------+
|sum(Worldwide_Gross)|
+--------------------+
|        272586820052|
+--------------------+

+--------------------+-------------+
|            Director|sum(US_Gross)|
+--------------------+-------------+
|                null|  27594211599|
|        Abel Ferrara|      1212799|
|          Adam McKay|    232350286|
|       Adam Shankman|    548704471|
|         Adrian Lyne|    307744370|
|     Adrienne Shelly|     19097550|
|      Akira Kurosawa|       320592|
|           Alan Alda|     42488161|
|      Alan J. Pakula|     42885593|
|         Alan Parker|     82780474|
|        Alan Rudolph|       178287|
|       Albert Brooks|     11614954|
|       Albert Hughes|     59329835|
|Alejandro Gonzale...|     55935372|
|       Alex Kendrick|     43629810|
|         Alex Proyas|    239193733|
|     Alexander Payne|    151451102|
|       Alexandre Aja|     41778863|
|      Alfonso Cuaron|    298741216|
|    Alfred Hitchcock|     98264742|
+--------------------+-----------

## Joins Exercises

1. Read employees.csv, deptmanagers.csv, salaries.csv and titles.csv into DataFrames.
2. Show all employees and their max salary (there can be different salaries registered for the same employee)
3. Show all employees who were never managers. Check it by getting all the managers and checking that they are not in the table
4. Find the job titles of the best paid 10 employees in the company (note that there can be different titles registered for the same employee)

In [73]:
def read_dataset_csv(file_name):
    """Read /dataset/<file_name> as a DataFrame, using the header row and inferring column types."""
    return (
        spark.read
        .option("inferSchema", True)
        .option("header", "true")
        .csv("/dataset/" + file_name)
    )


# Exercise 1 - load the four CSV files.
employees_df = read_dataset_csv("employees.csv")
deptmanag_df = read_dataset_csv("deptmanagers.csv")
salaries_df = read_dataset_csv("salaries.csv")
titles_df = read_dataset_csv("titles.csv")

employees_df.printSchema()
deptmanag_df.printSchema()
salaries_df.printSchema()
titles_df.printSchema()

# Exercise 2 - every employee with their maximum registered salary.
# A left join keeps employees that have no salary rows. The join condition
# refers to the aliases so that both emp_no columns stay unambiguous.
emp_sal_df = employees_df.alias("df1").join(
    salaries_df.alias("df2"),
    col("df1.emp_no") == col("df2.emp_no"),
    "left",
)

emp_sal_df.printSchema()

# last_name is part of the grouping key so that it appears in the result.
max_salary_df = (
    emp_sal_df
    .select(col("df1.emp_no"), col("first_name"), col("last_name"), col("salary"))
    .groupby(col("df1.emp_no"), col("first_name"), col("last_name"))
    .max("salary")
)

max_salary_df.orderBy(col("emp_no")).show()

# Exercise 3 - employees who were never managers.
deptmanag_df.show()

print("Ej 3")

# leftanti keeps only the employees without a matching row in deptmanag_df.
emp_no_manag = employees_df.join(deptmanag_df, employees_df.emp_no == deptmanag_df.emp_no, "leftanti")

emp_no_manag.show()

# Check requested by the exercise: no manager may be present in the result.
managers_in_result = emp_no_manag.join(deptmanag_df, "emp_no", "leftsemi").count()
assert managers_in_result == 0, "found managers among the never-manager employees"

# Exercise 4 - job titles of the 10 best paid employees.
print("Ej 4")

top_10_emp = max_salary_df.orderBy(col("max(salary)").desc()).limit(10)

# An employee can have several titles, so the join may return more than 10 rows.
# The join does not guarantee row order, so sort again before showing.
(
    top_10_emp.alias("df3")
    .join(titles_df.alias("df4"), col("df3.emp_no") == col("df4.emp_no"), "left")
    .orderBy(col("max(salary)").desc())
    .show()
)

root
 |-- emp_no: integer (nullable = true)
 |-- birth_date: timestamp (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |--  hire_date: timestamp (nullable = true)

root
 |-- dept_no: string (nullable = true)
 |-- emp_no: integer (nullable = true)
 |-- from_date: timestamp (nullable = true)
 |-- to_date: timestamp (nullable = true)

root
 |-- emp_no: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- from_date: timestamp (nullable = true)
 |-- to_date: timestamp (nullable = true)

root
 |-- emp_no: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- from_date: timestamp (nullable = true)
 |-- to_date: timestamp (nullable = true)

root
 |-- emp_no: integer (nullable = true)
 |-- birth_date: timestamp (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |--  hire_date: timestamp

'\nemployees_df.show()\n\ndeptmanag_df.show()\n\nsalaries_df.show()\n\ntitles_df.show()\n'