In [37]:
import pyspark
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName('spark')
    .getOrCreate()
)

# Sample rows of (employee_name, department, salary).
# ("James", "Sales", 3000) is listed twice, so the distinct aggregates further
# down return different values from their plain counterparts.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]
schema = ["employee_name", "department", "salary"]

# Only column names are supplied; Spark infers the column types from the
# Python values (see the printed schema).
df = spark.createDataFrame(simpleData, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [38]:
# NOTE: this wildcard import rebinds names such as max, min and sum to the
# Spark column functions for the rest of the notebook, hiding the Python
# builtins of the same name. Later cells rely on the bare names it provides.
from pyspark.sql.functions import *

# approx_count_distinct(): approximate number of distinct salary values.
(
    df
    .select(approx_count_distinct("salary"))
    .show()
)

+-----------------------------+
|approx_count_distinct(salary)|
+-----------------------------+
|                            6|
+-----------------------------+



In [33]:
# Tour of the built-in aggregate functions, each applied to the whole frame.
# A global aggregate yields a single Row, so collect()[0][0] (first Row, first
# field) extracts the scalar value.
print("approx_count_distinct: " +
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

# collect_list keeps duplicates; collect_set drops them.
df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

# Number of distinct (department, salary) pairs.
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: "+str(df2.collect()[0][0]))

# Index into the Row as well ([0][0]) so this prints the plain number like the
# other scalars above, rather than the Row repr "Row(count(salary)=10)".
print("count: "+str(df.select(count("salary")).collect()[0][0]))

# first()/last() depend on row order, which Spark does not guarantee unless
# the frame is explicitly sorted.
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)

df.select(kurtosis("salary")).show(truncate=False)

# max, min and sum below are the Spark column functions (from the wildcard
# import of pyspark.sql.functions), not the Python builtins.
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(
    stddev("salary"),
    stddev_samp("salary"),
    stddev_pop("salary"),
).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
# NOTE(review): sumDistinct is deprecated since Spark 3.2 in favour of
# sum_distinct; kept here so the cell still runs on older Spark versions.
df.select(sumDistinct("salary")).show(truncate=False)
df.select(
    variance("salary"),
    var_samp("salary"),
    var_pop("salary"),
).show(truncate=False)

approx_count_distinct: 6
avg: 3400.0
+------------------------------------------------------------+
|collect_list(salary)                                        |
+------------------------------------------------------------+
|[3000, 4600, 4100, 3000, 3000, 3300, 3900, 3000, 2000, 4100]|
+------------------------------------------------------------+

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+

Distinct Count of Department & Salary: 8
count: Row(count(salary)=10)
+-------------+
|first(salary)|
+-------------+
|3000         |
+-------------+

+------------+
|last(salary)|
+------------+
|4100        |
+------------+

+-------------------+
|kurtosis(salary

In [8]:
# Approximate number of distinct salary values, rendered as a one-row table.
(
    df
    .select(approx_count_distinct("salary"))
    .show()
)

+-----------------------------+
|approx_count_distinct(salary)|
+-----------------------------+
|                            6|
+-----------------------------+



In [11]:
# Mean salary as a plain Python value: take the single aggregate Row and read
# its first field.
df.select(avg("salary")).first()[0]

3400.0

In [14]:
# Gather every salary (duplicates included) into one array column that is
# renamed to "salary"; the default show() truncates the long cell.
(
    df
    .select(
        collect_list("salary").alias("salary")
    )
    .show()
)

+--------------------+
|              salary|
+--------------------+
|[3000, 4600, 4100...|
+--------------------+



In [15]:
# Gather the distinct salaries into one array column and print it untruncated.
(
    df
    .select(collect_set("salary"))
    .show(truncate=False)
)

+------------------------------------+
|collect_set(salary)                 |
+------------------------------------+
|[4600, 3000, 3900, 4100, 3300, 2000]|
+------------------------------------+



In [17]:
# Count the distinct (department, salary) combinations.
(
    df
    .select(
        countDistinct("department", "salary")
    )
    .show()
)

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|                                 8|
+----------------------------------+



In [18]:
# Count the salary values across the whole frame.
(
    df
    .select(count("salary"))
    .show()
)

+-------------+
|count(salary)|
+-------------+
|           10|
+-------------+



In [20]:
# Salary from the first row Spark encounters; the frame is not sorted here, so
# this reflects the current row order.
(
    df
    .select(first("salary"))
    .show()
)

+-------------+
|first(salary)|
+-------------+
|         3000|
+-------------+



In [21]:
# Salary from the last row Spark encounters; the frame is not sorted here, so
# this reflects the current row order.
(
    df
    .select(last("salary"))
    .show()
)

+------------+
|last(salary)|
+------------+
|        4100|
+------------+



In [23]:
# Largest, smallest and mean salary, one table each. max/min/mean here are the
# Spark column aggregates from the wildcard import, not the Python builtins.
for aggregate in (max, min, mean):
    df.select(aggregate("salary")).show()

+-----------+
|max(salary)|
+-----------+
|       4600|
+-----------+

+-----------+
|min(salary)|
+-----------+
|       2000|
+-----------+

+-----------+
|avg(salary)|
+-----------+
|     3400.0|
+-----------+



In [24]:
# Total of all salaries. sum is the Spark column aggregate from the wildcard
# import, not the Python builtin.
(
    df
    .select(sum("salary"))
    .show()
)

+-----------+
|sum(salary)|
+-----------+
|      34000|
+-----------+



In [25]:
# Total of the distinct salary values only (each value counted once).
(
    df
    .select(sumDistinct("salary"))
    .show()
)

+--------------------+
|sum(DISTINCT salary)|
+--------------------+
|               20900|
+--------------------+



In [26]:
# Sample and population standard deviation side by side. stddev is reported
# under the stddev_samp header, so the first two columns are identical.
df.select(
    *(fn("salary") for fn in (stddev, stddev_samp, stddev_pop))
).show(truncate=False)

+-------------------+-------------------+------------------+
|stddev_samp(salary)|stddev_samp(salary)|stddev_pop(salary)|
+-------------------+-------------------+------------------+
|765.9416862050705  |765.9416862050705  |726.636084983398  |
+-------------------+-------------------+------------------+



Sample Standard Deviation: 765.9416862050705
Sample Standard Deviation (using stddev_samp): 765.9416862050705
Population Standard Deviation: 726.636084983398


In [34]:
# Sample and population variance side by side. variance is reported under the
# var_samp header, so the first two columns are identical.
df.select(
    *(fn("salary") for fn in (variance, var_samp, var_pop))
).show(truncate=False)

+-----------------+-----------------+---------------+
|var_samp(salary) |var_samp(salary) |var_pop(salary)|
+-----------------+-----------------+---------------+
|586666.6666666666|586666.6666666666|528000.0       |
+-----------------+-----------------+---------------+



In [39]:
# Kurtosis of the salary column, printed without truncation.
(
    df
    .select(kurtosis("salary"))
    .show(truncate=False)
)

+-------------------+
|kurtosis(salary)   |
+-------------------+
|-0.6467803030303032|
+-------------------+

