<a href="https://colab.research.google.com/github/mayureshpawashe/ad_spark/blob/main/ad_spark_day1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Remote CSV with the employee records, and the local path it is saved to.
url = "https://gist.githubusercontent.com/kevin336/acbb2271e66c10a5b73aacf82ca82784/raw/e38afe62e088394d61ed30884dd50a6826eee0a8/employees.csv"
local_file = "employees.csv"

# Download the file. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error into an
# exception instead of silently writing an error page into employees.csv.
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(local_file, "wb") as f:
    f.write(response.content)

# Start (or reuse) a Spark session and load the CSV: the first line supplies
# the column names and Spark infers the column types.
# NOTE(review): the saved outputs show missing values encoded as the literal
# string ' - ' (e.g. COMMISSION_PCT, MANAGER_ID), so those columns come back
# as strings rather than numbers.
spark = SparkSession.builder.appName("EmployeeData").getOrCreate()
df = spark.read.csv(local_file, header=True, inferSchema=True)

df.show(10)

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|


## Select specific columns

In [6]:
# Project the frame down to the name, pay and department columns only.
columns_of_interest = ["FIRST_NAME", "LAST_NAME", "SALARY", "DEPARTMENT_ID"]
selected_employees = df.select(*columns_of_interest)
selected_employees.show()

+----------+---------+------+-------------+
|FIRST_NAME|LAST_NAME|SALARY|DEPARTMENT_ID|
+----------+---------+------+-------------+
|    Donald| OConnell|  2600|           50|
|   Douglas|    Grant|  2600|           50|
|  Jennifer|   Whalen|  4400|           10|
|   Michael|Hartstein| 13000|           20|
|       Pat|      Fay|  6000|           20|
|     Susan|   Mavris|  6500|           40|
|   Hermann|     Baer| 10000|           70|
|   Shelley|  Higgins| 12008|          110|
|   William|    Gietz|  8300|          110|
|    Steven|     King| 24000|           90|
|     Neena|  Kochhar| 17000|           90|
|       Lex|  De Haan| 17000|           90|
| Alexander|   Hunold|  9000|           60|
|     Bruce|    Ernst|  6000|           60|
|     David|   Austin|  4800|           60|
|     Valli|Pataballa|  4800|           60|
|     Diana|  Lorentz|  4200|           60|
|     Nancy|Greenberg| 12008|          100|
|    Daniel|   Faviet|  9000|          100|
|      John|     Chen|  8200|   

## Filter employees in a specific department

In [10]:
# Keep only the employees who belong to department 90.
department_90_employees = df.filter(col("DEPARTMENT_ID") == 90)
# Backward-compatible alias: this result used to be bound to a name that said
# "10" even though the filter has always selected department 90.
department_10_employees = department_90_employees
department_90_employees.show()

+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|17-JUN-03|AD_PRES| 24000|            - |        - |           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|21-SEP-05|  AD_VP| 17000|            - |       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|13-JAN-01|  AD_VP| 17000|            - |       100|           90|
+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+



## Filter employees with a salary greater than a certain value

In [11]:
# Rows whose SALARY is strictly above 15000 (where() is an alias of filter()).
earns_over_15000 = col("SALARY") > 15000
high_salary_employees = df.where(earns_over_15000)
high_salary_employees.show()

+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|17-JUN-03|AD_PRES| 24000|            - |        - |           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|21-SEP-05|  AD_VP| 17000|            - |       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|13-JAN-01|  AD_VP| 17000|            - |       100|           90|
+-----------+----------+---------+--------+------------+---------+-------+------+--------------+----------+-------------+



## Filter employees by name

In [12]:
# Exact, case-sensitive match on the FIRST_NAME column.
is_named_steven = col("FIRST_NAME") == "Steven"
employees_with_name_steven = df.where(is_named_steven)
employees_with_name_steven.show()

+-----------+----------+---------+-------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|  EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+-------+------------+---------+--------+------+--------------+----------+-------------+
|        100|    Steven|     King|  SKING|515.123.4567|17-JUN-03| AD_PRES| 24000|            - |        - |           90|
|        128|    Steven|   Markle|SMARKLE|650.124.1434|08-MAR-08|ST_CLERK|  2200|            - |       120|           50|
+-----------+----------+---------+-------+------------+---------+--------+------+--------------+----------+-------------+



In [14]:
# Quick sanity check: print only the first two rows of the full frame.
df.show(n=2)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|SH_CLERK|  2600|            - |       124|           50|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
only showing top 2 rows



## Count the number of employees

In [15]:
# count() is an action: it runs the job and returns the number of rows in df.
employee_count = df.count()
count_label = "Total Employees:"
print(count_label, employee_count)

Total Employees: 50


## Collect the first few rows

In [16]:
# Bring the first five rows back to the driver as a list of Row objects
# (head(n) delegates to take(n)) and print them one per line.
first_few_rows = df.head(5)
for employee_row in first_few_rows:
    print(employee_row)

Row(EMPLOYEE_ID=198, FIRST_NAME='Donald', LAST_NAME='OConnell', EMAIL='DOCONNEL', PHONE_NUMBER='650.507.9833', HIRE_DATE='21-JUN-07', JOB_ID='SH_CLERK', SALARY=2600, COMMISSION_PCT=' - ', MANAGER_ID='124', DEPARTMENT_ID=50)
Row(EMPLOYEE_ID=199, FIRST_NAME='Douglas', LAST_NAME='Grant', EMAIL='DGRANT', PHONE_NUMBER='650.507.9844', HIRE_DATE='13-JAN-08', JOB_ID='SH_CLERK', SALARY=2600, COMMISSION_PCT=' - ', MANAGER_ID='124', DEPARTMENT_ID=50)
Row(EMPLOYEE_ID=200, FIRST_NAME='Jennifer', LAST_NAME='Whalen', EMAIL='JWHALEN', PHONE_NUMBER='515.123.4444', HIRE_DATE='17-SEP-03', JOB_ID='AD_ASST', SALARY=4400, COMMISSION_PCT=' - ', MANAGER_ID='101', DEPARTMENT_ID=10)
Row(EMPLOYEE_ID=201, FIRST_NAME='Michael', LAST_NAME='Hartstein', EMAIL='MHARTSTE', PHONE_NUMBER='515.123.5555', HIRE_DATE='17-FEB-04', JOB_ID='MK_MAN', SALARY=13000, COMMISSION_PCT=' - ', MANAGER_ID='100', DEPARTMENT_ID=20)
Row(EMPLOYEE_ID=202, FIRST_NAME='Pat', LAST_NAME='Fay', EMAIL='PFAY', PHONE_NUMBER='603.123.6666', HIRE_DATE=

## WithColumn and Drop Operations

Add a bonus column (e.g., 10% of salary).

In [17]:
# BONUS is 10% of SALARY. withColumn() returns a new DataFrame and leaves df
# unchanged.
bonus_amount = col("SALARY") * 0.1
employees_with_bonus = df.withColumn("BONUS", bonus_amount)
employees_with_bonus.show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID| BONUS|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50| 260.0|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50| 260.0|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10| 440.0|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|1300.0|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP

## Add a new column based on conditions


In [18]:
# Bucket every employee into a pay band. Branches are evaluated in order, so
# the "Medium" test only sees salaries that already failed the "High" test;
# anything matching neither falls through to "Low".
# `when` comes from pyspark.sql.functions and is imported alongside `col` in
# the notebook's import cell.
salary_category = (
    when(col("SALARY") > 10000, "High")
    .when(col("SALARY") > 5000, "Medium")
    .otherwise("Low")
)
employees_with_salary_category = df.withColumn("SALARY_CATEGORY", salary_category)
employees_with_salary_category.show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+---------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|SALARY_CATEGORY|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+---------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|            Low|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|            Low|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|            Low|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|           High|
|        202|

## Drop the COMMISSION_PCT column

In [25]:
# drop() returns a new DataFrame without the named column; df itself still
# has COMMISSION_PCT.
# NOTE(review): the saved output of this cell shows a FULL_NAME column that no
# visible cell creates -- re-run from a fresh kernel to refresh the output.
column_to_remove = "COMMISSION_PCT"
employees_without_commission = df.drop(column_to_remove)
employees_without_commission.show()

+-----------+----------+---------+--------+------------+---------+----------+------+----------+-------------+---------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|MANAGER_ID|DEPARTMENT_ID|FULL_NAME|
+-----------+----------+---------+--------+------------+---------+----------+------+----------+-------------+---------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|       124|           50|     NULL|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|       124|           50|     NULL|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|       101|           10|     NULL|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|       100|           20|     NULL|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|       201|           20|     NULL|
|        203|     Susan|   Mavris| SMAVR

In [31]:
# Expose the DataFrame's rows as an RDD of Row objects for the RDD examples
# that follow.
rdd = df.rdd

In [32]:
def _full_name(employee):
    """Return "<FIRST_NAME> <LAST_NAME>" for a single employee Row."""
    return employee["FIRST_NAME"] + " " + employee["LAST_NAME"]


# map() is lazy; take(5) is the action that actually computes the first names.
full_names_rdd = rdd.map(_full_name)
full_names_rdd.take(5)

['Donald OConnell',
 'Douglas Grant',
 'Jennifer Whalen',
 'Michael Hartstein',
 'Pat Fay']

In [34]:
# Reduce each Row to its SALARY field, then peek at the first ten values.
salary_rdd = rdd.map(lambda employee: employee["SALARY"])
salary_rdd.take(10)

[2600, 2600, 4400, 13000, 6000, 6500, 10000, 12008, 8300, 24000]

In [36]:
def _earns_over_17000(employee):
    """Return True when the Row's SALARY is strictly greater than 17000."""
    return employee["SALARY"] > 17000


# Keep only the matching rows; take(5) returns at most five of them.
high_salary_rdd = rdd.filter(_earns_over_17000)
high_salary_rdd.take(5)

[Row(EMPLOYEE_ID=100, FIRST_NAME='Steven', LAST_NAME='King', EMAIL='SKING', PHONE_NUMBER='515.123.4567', HIRE_DATE='17-JUN-03', JOB_ID='AD_PRES', SALARY=24000, COMMISSION_PCT=' - ', MANAGER_ID=' - ', DEPARTMENT_ID=90, FULL_NAME=None)]

In [37]:
# Count the rows through the RDD API.
row_count_rdd = rdd.count()
rdd_count_label = "RDD Row Count:"
print(rdd_count_label, row_count_rdd)

RDD Row Count: 50
