# Prerequisites

In [1]:
# Installing required packages

!pip install pyspark  findspark wget

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=5846fd9c6f1555020becb8615645d0a2538a48a8acac25df704d90eb257e7cea
  Stored in directory: /root/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget, findspark
Successfully installed findspark-2.0.1 wget-3.2


In [2]:
import findspark

findspark.init()

In [3]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the SparkContext.

from pyspark import SparkContext, SparkConf

from pyspark.sql import SparkSession

In [4]:
# Creating a SparkContext object

sc = SparkContext.getOrCreate()

# Creating a SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [5]:
# Download the CSV data first into a local `employees.csv` file
import wget
wget.download("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/data/employees.csv")

'employees.csv'

Task 1: Generate aSpark DataFrame from the CSV data

In [6]:
# load the data into a pyspark dataframe
employees_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("employees.csv")

## Task 2: Define a chema for the data

In [7]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType

schema = StructType([
    StructField("Emp_No", IntegerType(), True),
    StructField("Emp_Name", StringType(), True),
    StructField("Salary", IntegerType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Department", StringType(), True)
])

df = spark.read.csv("employees.csv", schema=schema, header=True)


In [8]:
df.printSchema()

root
 |-- Emp_No: integer (nullable = true)
 |-- Emp_Name: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)



Task 3: Display schema of DataFrame

In [9]:
employees_df.printSchema()

root
 |-- Emp_No: integer (nullable = true)
 |-- Emp_Name: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)



## Task 4: Create a temporary view

In [10]:
employees_df.createOrReplaceTempView("employees")

## Task 5: Execute an SQL query

In [12]:
age_temp_df = spark.sql("""
    SELECT *
    FROM employees
    WHERE Age > 30
""")

age_temp_df.show()


+------+-----------+------+---+----------+
|Emp_No|   Emp_Name|Salary|Age|Department|
+------+-----------+------+---+----------+
|   199|    Douglas|  2600| 34|     Sales|
|   200|   Jennifer|  4400| 36| Marketing|
|   201|    Michael| 13000| 32|        IT|
|   202|        Pat|  6000| 39|        HR|
|   203|      Susan|  6500| 36| Marketing|
|   205|    Shelley| 12008| 33|   Finance|
|   206|    William|  8300| 37|        IT|
|   100|     Steven| 24000| 39|        IT|
|   102|        Lex| 17000| 37| Marketing|
|   103|  Alexander|  9000| 39| Marketing|
|   104|      Bruce|  6000| 38|        IT|
|   105|      David|  4800| 39|        IT|
|   106|      Valli|  4800| 38|     Sales|
|   107|      Diana|  4200| 35|     Sales|
|   109|     Daniel|  9000| 35|        HR|
|   110|       John|  8200| 31| Marketing|
|   111|     Ismael|  7700| 32|        IT|
|   112|Jose Manuel|  7800| 34|        HR|
|   113|       Luis|  6900| 34|     Sales|
|   116|     Shelli|  2900| 37|   Finance|
+------+---

## Task 6:  Calculate Average Salary by Deparment

In [13]:
avg_salary_df = spark.sql("""
    SELECT department, AVG(salary) AS avg_salary
    FROM employees
    GROUP BY department
""")

avg_salary_df.show()

+----------+-----------------+
|department|       avg_salary|
+----------+-----------------+
|     Sales|5492.923076923077|
|        HR|           5837.5|
|   Finance|           5730.8|
| Marketing|6633.333333333333|
|        IT|           7400.0|
+----------+-----------------+



## Task 7: Filter and Display IT Department Employees

In [15]:
filtered_temp_df = spark.sql("""
    SELECT *
    FROM employees
    WHERE department = "IT"
""")

filtered_temp_df.show()

+------+--------+------+---+----------+
|Emp_No|Emp_Name|Salary|Age|Department|
+------+--------+------+---+----------+
|   198|  Donald|  2600| 29|        IT|
|   201| Michael| 13000| 32|        IT|
|   206| William|  8300| 37|        IT|
|   100|  Steven| 24000| 39|        IT|
|   104|   Bruce|  6000| 38|        IT|
|   105|   David|  4800| 39|        IT|
|   111|  Ismael|  7700| 32|        IT|
|   129|   Laura|  3300| 38|        IT|
|   132|      TJ|  2100| 34|        IT|
|   136|   Hazel|  2200| 29|        IT|
+------+--------+------+---+----------+



## Task 8: Add 10% Bonus to Salaries

In [16]:
from pyspark.sql.functions import col

# Add the SalaryAfterBonus column with 10% bonus
employees_with_bonus = employees_df.withColumn("SalaryAfterBonus", col("salary") * 1.10)

# Display the updated DataFrame
employees_with_bonus.show()


+------+---------+------+---+----------+------------------+
|Emp_No| Emp_Name|Salary|Age|Department|  SalaryAfterBonus|
+------+---------+------+---+----------+------------------+
|   198|   Donald|  2600| 29|        IT|2860.0000000000005|
|   199|  Douglas|  2600| 34|     Sales|2860.0000000000005|
|   200| Jennifer|  4400| 36| Marketing|            4840.0|
|   201|  Michael| 13000| 32|        IT|14300.000000000002|
|   202|      Pat|  6000| 39|        HR| 6600.000000000001|
|   203|    Susan|  6500| 36| Marketing| 7150.000000000001|
|   204|  Hermann| 10000| 29|   Finance|           11000.0|
|   205|  Shelley| 12008| 33|   Finance|13208.800000000001|
|   206|  William|  8300| 37|        IT|            9130.0|
|   100|   Steven| 24000| 39|        IT|26400.000000000004|
|   101|    Neena| 17000| 27|     Sales|           18700.0|
|   102|      Lex| 17000| 37| Marketing|           18700.0|
|   103|Alexander|  9000| 39| Marketing|            9900.0|
|   104|    Bruce|  6000| 38|        IT|

## Task: Find Maximum Salary by Age

In [17]:
from pyspark.sql.functions import max

# Group data by age and calculate the maximum salary for each age group
max_salary_df = spark.sql("""
    SELECT age, max(salary) AS max_salary
    FROM employees
    GROUP BY age
""")

max_salary_df.show()

+---+----------+
|age|max_salary|
+---+----------+
| 31|      8200|
| 34|      7800|
| 28|     12008|
| 27|     17000|
| 26|      3600|
| 37|     17000|
| 35|      9000|
| 39|     24000|
| 38|      6000|
| 29|     10000|
| 32|     13000|
| 33|     12008|
| 30|      8000|
| 36|      7900|
+---+----------+



## Task 10: Self-Join on Employee Data

In [18]:
# Alias the original DataFrame
emp1 = employees_df.alias("emp1")
emp2 = employees_df.alias("emp2")

# Perform self-join on the "Emp_No" column
joined_df = emp1.join(emp2, emp1["Emp_No"] == emp2["Emp_No"])

# Display the result
joined_df.show()


+------+---------+------+---+----------+------+---------+------+---+----------+
|Emp_No| Emp_Name|Salary|Age|Department|Emp_No| Emp_Name|Salary|Age|Department|
+------+---------+------+---+----------+------+---------+------+---+----------+
|   198|   Donald|  2600| 29|        IT|   198|   Donald|  2600| 29|        IT|
|   199|  Douglas|  2600| 34|     Sales|   199|  Douglas|  2600| 34|     Sales|
|   200| Jennifer|  4400| 36| Marketing|   200| Jennifer|  4400| 36| Marketing|
|   201|  Michael| 13000| 32|        IT|   201|  Michael| 13000| 32|        IT|
|   202|      Pat|  6000| 39|        HR|   202|      Pat|  6000| 39|        HR|
|   203|    Susan|  6500| 36| Marketing|   203|    Susan|  6500| 36| Marketing|
|   204|  Hermann| 10000| 29|   Finance|   204|  Hermann| 10000| 29|   Finance|
|   205|  Shelley| 12008| 33|   Finance|   205|  Shelley| 12008| 33|   Finance|
|   206|  William|  8300| 37|        IT|   206|  William|  8300| 37|        IT|
|   100|   Steven| 24000| 39|        IT|

## Task 11: Calculate the average age of employees using the built-in aggregation function. Display the result.

In [19]:
from pyspark.sql.functions import avg

# Calculate average age
average_age_df = employees_df.select(avg("age").alias("Average_Age"))

# Display the result
average_age_df.show()

+-----------+
|Average_Age|
+-----------+
|      33.56|
+-----------+



## Task 12: Calculate Total Salary by Deparment

In [20]:
from pyspark.sql.functions import sum

# Group by department and calculate total salary
total_salary_per_dept = employees_df.groupBy("department").agg(sum("salary").alias("Total_Salary"))

# Display the result
total_salary_per_dept.show()

+----------+------------+
|department|Total_Salary|
+----------+------------+
|     Sales|       71408|
|        HR|       46700|
|   Finance|       57308|
| Marketing|       59700|
|        IT|       74000|
+----------+------------+



## Task 13: Sort Data by Age and Salary

In [21]:
from pyspark.sql.functions import col

# Sort by age (ascending) and salary (descending)
sorted_df = employees_df.orderBy(col("age").asc(), col("salary").desc())

# Display the sorted DataFrame
sorted_df.show()

+------+---------+------+---+----------+
|Emp_No| Emp_Name|Salary|Age|Department|
+------+---------+------+---+----------+
|   137|   Renske|  3600| 26| Marketing|
|   101|    Neena| 17000| 27|     Sales|
|   114|      Den| 11000| 27|   Finance|
|   108|    Nancy| 12008| 28|     Sales|
|   130|    Mozhe|  2800| 28| Marketing|
|   126|    Irene|  2700| 28|        HR|
|   204|  Hermann| 10000| 29|   Finance|
|   115|Alexander|  3100| 29|   Finance|
|   134|  Michael|  2900| 29|     Sales|
|   198|   Donald|  2600| 29|        IT|
|   140|   Joshua|  2500| 29|   Finance|
|   136|    Hazel|  2200| 29|        IT|
|   120|  Matthew|  8000| 30|        HR|
|   110|     John|  8200| 31| Marketing|
|   127|    James|  2400| 31|        HR|
|   201|  Michael| 13000| 32|        IT|
|   111|   Ismael|  7700| 32|        IT|
|   119|    Karen|  2500| 32|   Finance|
|   205|  Shelley| 12008| 33|   Finance|
|   124|    Kevin|  5800| 33| Marketing|
+------+---------+------+---+----------+
only showing top

## Task 14: Count Employees in Each Department

In [22]:
from pyspark.sql.functions import count

# Calculate the number of employees in each department. Display the result.
count_emp_df = spark.sql("""
    SELECT department, count(Emp_No)
    FROM employees
    GROUP BY department
""")

count_emp_df.show()

# Group by department and count employees
employee_count_per_dept = employees_df.groupBy("department").agg(count("*").alias("Employee_Count"))

# Display the result
employee_count_per_dept.show()

+----------+-------------+
|department|count(Emp_No)|
+----------+-------------+
|     Sales|           13|
|        HR|            8|
|   Finance|           10|
| Marketing|            9|
|        IT|           10|
+----------+-------------+

+----------+--------------+
|department|Employee_Count|
+----------+--------------+
|     Sales|            13|
|        HR|             8|
|   Finance|            10|
| Marketing|             9|
|        IT|            10|
+----------+--------------+



## Task 15: Filter Employees with the letter o in the Nme

In [25]:
# Apply a filter to select records where the employee's name contains the letter 'o'
filtered_emp_df = employees_df.filter(col("Emp_Name").like("%o%"))
filtered_emp_df.show()

+------+-----------+------+---+----------+
|Emp_No|   Emp_Name|Salary|Age|Department|
+------+-----------+------+---+----------+
|   198|     Donald|  2600| 29|        IT|
|   199|    Douglas|  2600| 34|     Sales|
|   110|       John|  8200| 31| Marketing|
|   112|Jose Manuel|  7800| 34|        HR|
|   130|      Mozhe|  2800| 28| Marketing|
|   133|      Jason|  3300| 38|     Sales|
|   139|       John|  2700| 36|     Sales|
|   140|     Joshua|  2500| 29|   Finance|
+------+-----------+------+---+----------+

