# Spark HR Data Pipeline Project

## Prerequisites
1. Install the required libraries and prepare the Spark environment

In [1]:
# Installing requuired packages
%pip install pyspark  findspark wget


Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark
  Downloading pyspark-4.0.1.tar.gz (434.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.2/434.2 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting findspark
  Using cached findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting py4j==0.10.9.9 (from pyspark)
  Using cached py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Using cached py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Using cached findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)


In [4]:
import findspark

# Locate the Spark installation and make `pyspark` importable from this kernel.
# Run this before importing anything from `pyspark`.
findspark.init()

In [None]:
# PySpark is the Spark API for Python. We use PySpark to initialize the SparkContext.

from pyspark import SparkContext, SparkConf

from pyspark.sql import SparkSession

In [None]:
# Reuse the active SparkContext if one exists; otherwise start a new one
sc = SparkContext.getOrCreate()

# Get (or create) the SparkSession used for all DataFrame work in this notebook
spark = (
    SparkSession.builder
    .appName("Python Spark HR Data Pipeline Project")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/10 15:32:52 WARN Utils: Your hostname, maishuji, resolves to a loopback address: 127.0.1.1; using 192.168.0.14 instead (on interface wlp4s0)
25/09/10 15:32:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 15:32:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/10 15:32:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


2. Download the CSV data

In [None]:
# Download the CSV data into a local `employees.csv` file.
#
# wget.download() never overwrites an existing file: when the target is already
# present it stores the new copy under a suffixed name (e.g. "employees (1).csv").
# Re-running this cell would therefore pile up duplicate files, so the download
# is skipped when `employees.csv` already exists.
import os

import wget

EMPLOYEES_CSV_URL = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/data/employees.csv"
EMPLOYEES_CSV_PATH = "employees.csv"

if not os.path.exists(EMPLOYEES_CSV_PATH):
    wget.download(EMPLOYEES_CSV_URL, out=EMPLOYEES_CSV_PATH)

# Last expression of the cell: display the local file name
EMPLOYEES_CSV_PATH

'employees.csv'

## Tasks

1. Generate a Spark DataFrame from the CSV data

In [7]:
# Load "employees.csv" into the DataFrame variable `employees_df`: the first row
# supplies the column names and Spark infers each column's type from the data
employees_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("employees.csv")
)

# Preview the first rows of the DataFrame
employees_df.show()


+------+---------+------+---+----------+
|Emp_No| Emp_Name|Salary|Age|Department|
+------+---------+------+---+----------+
|   198|   Donald|  2600| 29|        IT|
|   199|  Douglas|  2600| 34|     Sales|
|   200| Jennifer|  4400| 36| Marketing|
|   201|  Michael| 13000| 32|        IT|
|   202|      Pat|  6000| 39|        HR|
|   203|    Susan|  6500| 36| Marketing|
|   204|  Hermann| 10000| 29|   Finance|
|   205|  Shelley| 12008| 33|   Finance|
|   206|  William|  8300| 37|        IT|
|   100|   Steven| 24000| 39|        IT|
|   101|    Neena| 17000| 27|     Sales|
|   102|      Lex| 17000| 37| Marketing|
|   103|Alexander|  9000| 39| Marketing|
|   104|    Bruce|  6000| 38|        IT|
|   105|    David|  4800| 39|        IT|
|   106|    Valli|  4800| 38|     Sales|
|   107|    Diana|  4200| 35|     Sales|
|   108|    Nancy| 12008| 28|     Sales|
|   109|   Daniel|  9000| 35|        HR|
|   110|     John|  8200| 31| Marketing|
+------+---------+------+---+----------+
only showing top

2. Define a schema for the data

In [None]:
# Let's first print the schema that Spark inferred from the CSV file
# (column names taken from the header row, types guessed from the data)
employees_df.printSchema()

root
 |-- Emp_No: integer (nullable = true)
 |-- Emp_Name: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Department: string (nullable = true)



In [12]:
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

# Explicit schema for the employees data: descriptive snake_case column names
# and hand-picked data types instead of the inferred ones
employees_schema_improved = StructType(
    [
        # Declared non-nullable because it acts as the primary key.
        # Note: the schema printed after loading the CSV still reports
        # nullable = true for this column.
        StructField("employee_id", IntegerType(), nullable=False),
        StructField("employee_name", StringType(), nullable=True),
        # Double instead of integer so fractional amounts can be represented
        StructField("salary", DoubleType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
        StructField("department_name", StringType(), nullable=True),
    ]
)

In [15]:
# Re-read the CSV with the explicit schema instead of schema inference.
# The header row is still skipped; the column names now come from the schema.
employees_df = (
    spark.read
    .schema(employees_schema_improved)
    .option("header", True)
    .csv("employees.csv")
)

3. Display the schema of the DataFrame

In [16]:
# Print the schema again to confirm the renamed columns and the double-typed salary.
# Note that employee_id is reported as nullable = true even though the schema
# declared it as non-nullable.
employees_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- department_name: string (nullable = true)

