https://www.youtube.com/watch?v=_C8kWso4ne4&t=1046s

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as f

In [3]:
# Obtain the SparkSession for this notebook (an existing session is reused if present).
spark = (
    SparkSession.builder
    .appName("Pyspark Filter")
    .getOrCreate()
)

In [4]:
# Read the employee CSV (first row is the header, column types inferred), then
# explicitly cast the numeric columns used below to integers.
df = (
    spark.read
    .csv('employee_data.csv', header=True, inferSchema=True)
    .withColumn('sal', f.col('sal').cast(IntegerType()))
    .withColumn('deptno', f.col('deptno').cast(IntegerType()))
    .withColumn('comm', f.col('comm').cast(IntegerType()))
)
df.show()

+-----+------+---------+----+---------+----+----+------+
|empno| ename|      job| mgr| hiredate| sal|comm|deptno|
+-----+------+---------+----+---------+----+----+------+
| 7369| SMITH|    CLERK|7902|17-Dec-80| 800|null|    20|
| 7499| ALLEN| SALESMAN|7698|20-Feb-81|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|22-Feb-81|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|02-Apr-81|2975|null|    20|
| 7654|MARTIN| SALESMAN|7698|28-Sep-81|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|01-May-81|2850|null|    30|
| 7782| CLARK|  MANAGER|7839|09-Jun-81|2450|null|    10|
| 7788| SCOTT|  ANALYST|7566|19-Apr-87|3000|null|    20|
| 7839|  KING|PRESIDENT|null|17-Nov-81|5000|null|    10|
| 7844|TURNER| SALESMAN|7698|08-Sep-81|1500|   0|    30|
| 7876| ADAMS|    CLERK|7788|23-May-87|1100|null|    20|
| 7900| JAMES|    CLERK|7698|03-Dec-81| 950|null|    30|
| 7902|  FORD|  ANALYST|7566|03-Dec-81|3000|null|    20|
| 7934|MILLER|    CLERK|7782|23-Jan-82|1300|null|    10|
+-----+------+---------+----+---------+----+----+------+

# Filter Operation

In [5]:
# Salary less than or equal to 1500: the SQL-style string condition is
# inclusive, so an employee earning exactly 1500 is kept.

df.filter('sal<=1500').show()

+-----+------+--------+----+---------+----+----+------+
|empno| ename|     job| mgr| hiredate| sal|comm|deptno|
+-----+------+--------+----+---------+----+----+------+
| 7369| SMITH|   CLERK|7902|17-Dec-80| 800|null|    20|
| 7521|  WARD|SALESMAN|7698|22-Feb-81|1250| 500|    30|
| 7654|MARTIN|SALESMAN|7698|28-Sep-81|1250|1400|    30|
| 7844|TURNER|SALESMAN|7698|08-Sep-81|1500|   0|    30|
| 7876| ADAMS|   CLERK|7788|23-May-87|1100|null|    20|
| 7900| JAMES|   CLERK|7698|03-Dec-81| 950|null|    30|
| 7934|MILLER|   CLERK|7782|23-Jan-82|1300|null|    10|
+-----+------+--------+----+---------+----+----+------+



In [6]:
# Employees whose salary lies OUTSIDE the inclusive range 1000..1500.
df.filter(~f.col('sal').between(1000, 1500)).show()

+-----+-----+---------+----+---------+----+----+------+
|empno|ename|      job| mgr| hiredate| sal|comm|deptno|
+-----+-----+---------+----+---------+----+----+------+
| 7369|SMITH|    CLERK|7902|17-Dec-80| 800|null|    20|
| 7499|ALLEN| SALESMAN|7698|20-Feb-81|1600| 300|    30|
| 7566|JONES|  MANAGER|7839|02-Apr-81|2975|null|    20|
| 7698|BLAKE|  MANAGER|7839|01-May-81|2850|null|    30|
| 7782|CLARK|  MANAGER|7839|09-Jun-81|2450|null|    10|
| 7788|SCOTT|  ANALYST|7566|19-Apr-87|3000|null|    20|
| 7839| KING|PRESIDENT|null|17-Nov-81|5000|null|    10|
| 7900|JAMES|    CLERK|7698|03-Dec-81| 950|null|    30|
| 7902| FORD|  ANALYST|7566|03-Dec-81|3000|null|    20|
+-----+-----+---------+----+---------+----+----+------+



# GroupBy Operation and Aggregate Functions

In [7]:
# Display the employee DataFrame again as the starting point for the aggregations.
df.show()

+-----+------+---------+----+---------+----+----+------+
|empno| ename|      job| mgr| hiredate| sal|comm|deptno|
+-----+------+---------+----+---------+----+----+------+
| 7369| SMITH|    CLERK|7902|17-Dec-80| 800|null|    20|
| 7499| ALLEN| SALESMAN|7698|20-Feb-81|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|22-Feb-81|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|02-Apr-81|2975|null|    20|
| 7654|MARTIN| SALESMAN|7698|28-Sep-81|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|01-May-81|2850|null|    30|
| 7782| CLARK|  MANAGER|7839|09-Jun-81|2450|null|    10|
| 7788| SCOTT|  ANALYST|7566|19-Apr-87|3000|null|    20|
| 7839|  KING|PRESIDENT|null|17-Nov-81|5000|null|    10|
| 7844|TURNER| SALESMAN|7698|08-Sep-81|1500|   0|    30|
| 7876| ADAMS|    CLERK|7788|23-May-87|1100|null|    20|
| 7900| JAMES|    CLERK|7698|03-Dec-81| 950|null|    30|
| 7902|  FORD|  ANALYST|7566|03-Dec-81|3000|null|    20|
| 7934|MILLER|    CLERK|7782|23-Jan-82|1300|null|    10|
+-----+------+---------+----+---------+----+----+------+

In [8]:
# Print each column's name, data type and nullability.
df.printSchema()

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)



In [16]:
# Group by job and count the number of employees in each group.

(
    df
    .groupBy('job')
    .count()
    .show()
)

+---------+-----+
|      job|count|
+---------+-----+
|  ANALYST|    2|
| SALESMAN|    4|
|    CLERK|    4|
|  MANAGER|    3|
|PRESIDENT|    1|
+---------+-----+



In [20]:
# Group by job and sum the salaries, to see which job has the highest total salary.
df.groupBy('job').sum('sal').show()

+---------+--------+
|      job|sum(sal)|
+---------+--------+
|  ANALYST|    6000|
| SALESMAN|    5600|
|    CLERK|    4150|
|  MANAGER|    8275|
|PRESIDENT|    5000|
+---------+--------+



In [22]:
# Group by department number and sum the salaries, giving the total salary per department.

df.groupBy('deptno').sum('sal').show()

+------+--------+
|deptno|sum(sal)|
+------+--------+
|    20|   10875|
|    10|    8750|
|    30|    9400|
+------+--------+



In [27]:
# Mean (average) salary for each job; mean() is an alias of avg().
df.groupBy('job').avg('sal').show()

+---------+------------------+
|      job|          avg(sal)|
+---------+------------------+
|  ANALYST|            3000.0|
| SALESMAN|            1400.0|
|    CLERK|            1037.5|
|  MANAGER|2758.3333333333335|
|PRESIDENT|            5000.0|
+---------+------------------+



In [29]:
# Mean salary per job again, this time via agg() with a {column: aggregate-name} dict.
df.groupBy('job').agg({'sal':'mean'}).show()

+---------+------------------+
|      job|          avg(sal)|
+---------+------------------+
|  ANALYST|            3000.0|
| SALESMAN|            1400.0|
|    CLERK|            1037.5|
|  MANAGER|2758.3333333333335|
|PRESIDENT|            5000.0|
+---------+------------------+

