In [1]:
data1 = [(7369, "SMITH", "CLERK", 7902, "17-DEC-2005", 800, None, 20),
        (7499, "ALLEN", "SALESMAN",7698,"20-FEB-2006",1600,300,  30),
        (7521, "WARD", "SALESMAN", 7698,"22-FEB-2006",1250,500,  30),
        (7566, "JONES", "MANAGER", 7839, "02-APR-2006",2975,None,20),
        (7654, "MARTIN", "SALESMAN", 7698, "28-SEP-2006",1250,1400,30),
        (7698, "BLAKE", "MANAGER", 7839, "01-MAY-2006", 2850, None,30),
        (7782, "CLARK", "MANAGER", 7839, "09-JUN-2006", 2450, None, 10),
        (7788, "SCOTT", "ANALYST", 7566, "09-DEC-2007", 3000, None, 20),
        (7839, "KING", "PRESIDENT",None, "17-NOV-2006", 5000, None, 10),
        (7844, "Turner", "SALESMAN",7698, "08-SEP-2006", 1500, 0,  30),
        (7876, "ADAMS", "CLERK",   7788,  "12-JAN-2008", 1100, None, 20),
        (7900, "JAMES",  "CLERK",  7698,  "03-DEC-2006", 950, None, 30),
        (7902, "FORD",  "ANALYST", 7566,  "03-DEC-2006", 3000, None, 20),
        (7934, "MILLER", "CLERK",  7782,  "23-JAN-2007", 1300, None, 10)
       ]
schema1 = ["empno","ename","job","mgr","hiredate","sal","comm","deptno"]
        

In [None]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
spark = SparkSession.builder.getOrCreate()
emp_df = spark.createDataFrame(data1,schema1)

In [4]:
emp_df.createOrReplaceTempView('emp')

# Average Salary for All the Employees

In [5]:
spark.sql("select round(avg(sal), 2) as avg_salary from emp").show()

                                                                                

+----------+
|avg_salary|
+----------+
|   2073.21|
+----------+



In [6]:
emp_df.select(round(avg("sal"),2).alias("avg_salary")).show()

+----------+
|avg_salary|
+----------+
|   2073.21|
+----------+



# Average Salary for each department

In [6]:
spark.sql('''
           select deptno, round(avg(sal),2) as avg_salary from emp
           group by deptno
          ''').show()  

+------+----------+
|deptno|avg_salary|
+------+----------+
|    30|   1566.67|
|    20|    2175.0|
|    10|   2916.67|
+------+----------+



In [7]:
emp_df.groupBy('deptno').agg(round(avg("sal"),2).alias("avg_salary")).show()

+------+----------+
|deptno|avg_salary|
+------+----------+
|    30|   1566.67|
|    20|    2175.0|
|    10|   2916.67|
+------+----------+



# Finding the Min/Max Value in a Column.

In [8]:
spark.sql('''select min(sal) as min_sal, max(sal) as max_sal from emp''').show()

+-------+-------+
|min_sal|max_sal|
+-------+-------+
|    800|   5000|
+-------+-------+



In [9]:
spark.sql('''select deptno, min(sal) as min_sal, max(sal) as max_sal from emp group by deptno''').show()

+------+-------+-------+
|deptno|min_sal|max_sal|
+------+-------+-------+
|    30|    950|   2850|
|    20|    800|   3000|
|    10|   1300|   5000|
+------+-------+-------+



# Summing the Values in a Column

In [10]:
spark.sql("select sum(sal) from emp").show()

+--------+
|sum(sal)|
+--------+
|   29025|
+--------+



In [11]:
emp_df.agg(sum('sal').alias("sum_sal")).show()

+-------+
|sum_sal|
+-------+
|  29025|
+-------+



In [12]:
spark.sql("select deptno, sum(sal) as sum_sal from emp group by deptno").show()

+------+-------+
|deptno|sum_sal|
+------+-------+
|    30|   9400|
|    20|  10875|
|    10|   8750|
+------+-------+



emp_df1.groupBy("deptno").agg(sum("sal").alias("sum_sal")).show()

# Counting Rows in a Table 

# Generating a Running Total

In [13]:
spark.sql("select ename, sal, sum(sal) over(order by sal,empno) as running_total from emp").show()

24/12/04 13:13:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+----+-------------+
| ename| sal|running_total|
+------+----+-------------+
| SMITH| 800|          800|
| JAMES| 950|         1750|
| ADAMS|1100|         2850|
|  WARD|1250|         4100|
|MARTIN|1250|         5350|
|MILLER|1300|         6650|
|Turner|1500|         8150|
| ALLEN|1600|         9750|
| CLARK|2450|        12200|
| BLAKE|2850|        15050|
| JONES|2975|        18025|
| SCOTT|3000|        21025|
|  FORD|3000|        24025|
|  KING|5000|        29025|
+------+----+-------------+



24/12/04 13:13:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [9]:
from pyspark.sql.window import Window 
windowspec = Window.orderBy("sal", "deptno") 
emp_df.withColumn("running_total",sum("sal").over(windowspec)).show()

24/12/26 16:21:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----+------+---------+----+-----------+----+----+------+-------------+
|empno| ename|      job| mgr|   hiredate| sal|comm|deptno|running_total|
+-----+------+---------+----+-----------+----+----+------+-------------+
| 7369| SMITH|    CLERK|7902|17-DEC-2005| 800|NULL|    20|          800|
| 7900| JAMES|    CLERK|7698|03-DEC-2006| 950|NULL|    30|         1750|
| 7876| ADAMS|    CLERK|7788|12-JAN-2008|1100|NULL|    20|         2850|
| 7521|  WARD| SALESMAN|7698|22-FEB-2006|1250| 500|    30|         5350|
| 7654|MARTIN| SALESMAN|7698|28-SEP-2006|1250|1400|    30|         5350|
| 7934|MILLER|    CLERK|7782|23-JAN-2007|1300|NULL|    10|         6650|
| 7844|Turner| SALESMAN|7698|08-SEP-2006|1500|   0|    30|         8150|
| 7499| ALLEN| SALESMAN|7698|20-FEB-2006|1600| 300|    30|         9750|
| 7782| CLARK|  MANAGER|7839|09-JUN-2006|2450|NULL|    10|        12200|
| 7698| BLAKE|  MANAGER|7839|01-MAY-2006|2850|NULL|    30|        15050|
| 7566| JONES|  MANAGER|7839|02-APR-2006|2975|NULL|

# Smoothing A Series of Values 

In [66]:
data3 = [("2020-01-01", 647),
         ("2020-01-02", 561),
         ("2020-01-03", 741),
         ("2020-01-04", 978),
         ("2020-01-05", 1062),
         ("2020-01-06", 1072),
         ("2020-01-07", 805), 
         ("2020-01-08", 662),
         ("2020-01-09", 1023), 
         ("2020-01-10", 970) ]
schema3 = ["Date1", "sales"] 
spark.createDataFrame(data=data3, schema=schema3).show() 

+----------+-----+
|     Date1|sales|
+----------+-----+
|2020-01-01|  647|
|2020-01-02|  561|
|2020-01-03|  741|
|2020-01-04|  978|
|2020-01-05| 1062|
|2020-01-06| 1072|
|2020-01-07|  805|
|2020-01-08|  662|
|2020-01-09| 1023|
|2020-01-10|  970|
+----------+-----+



# Calculating A Mode 

In [15]:
spark.sql ('''
             select sal from (
             select sal,dense_rank() over(order by cnt desc) as rnk from
             (select sal, count(*) AS cnt
               from emp 
               where deptno = 20
               group by sal 
               order by sal desc )x
               )
               where rnk = 1
               ''').show()

24/12/04 13:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 1

+----+
| sal|
+----+
|3000|
+----+



24/12/04 13:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/04 13:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [11]:
df = emp_df.select("sal").orderBy('sal',ascending=False).where(emp_df.deptno==20)
df=df.groupBy('sal').count().orderBy("count",ascending=False)
df.show()

windowspec = Window.orderBy("count") # descending Order
df=df.withColumn("row_number",row_number().over(windowspec))#.orderBy("dense_rank",ascending=False).show()
df.select("sal").filter(df.row_number==4).show()

+----+-----+
| sal|count|
+----+-----+
|3000|    2|
| 800|    1|
|2975|    1|
|1100|    1|
+----+-----+



24/12/26 16:21:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 16:21:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/26 1

+----+
| sal|
+----+
|3000|
+----+



In [12]:
df=emp_df.select("sal").where(emp_df.deptno==20).groupBy('sal').count().orderBy("count",ascending=False)
df.first()


Row(sal=3000, count=2)

In [18]:
# Calculating a median 



# Determining the percentage of a Total

In [20]:
df = emp_df1.groupBy('deptno').agg(sum('sal').alias('sum_sal'))
df.show() 
a_sum = df.select(sum("sum_sal")).collect()[0][0]
print(f"a_sum = {a_sum}")
df.withColumn("pct", when (df.deptno==30, round((df.sum_sal/a_sum*100),2))
                     .when (df.deptno==20, round((df.sum_sal/a_sum*100),2))
                     .otherwise (df.sum_sal/a_sum*100)).show() #/sum(df.sum_sal)*100)).show()

+------+-------+
|deptno|sum_sal|
+------+-------+
|    30|   9400|
|    20|  10875|
|    10|   8750|
+------+-------+

a_sum = 29025
+------+-------+----------------+
|deptno|sum_sal|             pct|
+------+-------+----------------+
|    30|   9400|           32.39|
|    20|  10875|           37.47|
|    10|   8750|30.1464254952627|
+------+-------+----------------+



# Computing Averages Without High and Low Values

In [25]:
spark.sql('''select (round(avg(sal),2)) as avg_sal from emp
          where sal not in (
                   (select min(sal) from emp),
                   (select max(sal) from emp)
          )
          ''').show()

+-------+
|avg_sal|
+-------+
|1935.42|
+-------+



In [64]:
minimum = emp_df1.agg({'sal':'min'}).collect()[0][0]
maximum = emp_df1.agg({'sal':'max'}).collect()[0][0]
lst = [minimum, maximum]
emp_df1.select("sal").where(~emp_df1.sal.isin(lst)).agg(round(avg(emp_df1.sal),2).alias('avg_salary')).show()

+----------+
|avg_salary|
+----------+
|   1935.42|
+----------+

