In [18]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session (or pick up one that already exists),
# running against the local master under the name "aggregate_operation".
spark = (
    SparkSession.builder
    .appName("aggregate_operation")
    .master("local[*]")
    .getOrCreate()
)

In [19]:
import requests
from os import environ as env
from pathlib import Path

url = 'https://raw.githubusercontent.com/jubins/Spark-And-MLlib-Projects/master/Spark_DataFrame_API_Project/sales_info.csv'

# Build the cache location with pathlib so the separators are correct on every
# OS (hard-coded "\\" only forms a directory path on Windows).
# DATA_HOME overrides the default "data" root directory.
sales_csv = Path(env.get("DATA_HOME", "data")) / "FileData" / "Csv" / "sales_info.csv"

# Later cells pass `data_path` to spark.read.csv, so keep it a plain string.
data_path = str(sales_csv)

# Only hit the network when there is no local copy yet.
if not sales_csv.exists():
    r = requests.get(url, allow_redirects=True, timeout=30)
    # Fail loudly on 4xx/5xx instead of caching an error page as the CSV.
    r.raise_for_status()
    # The target directory may not exist on a fresh checkout.
    sales_csv.parent.mkdir(parents=True, exist_ok=True)
    # write_bytes opens, writes and closes the file in one step.
    sales_csv.write_bytes(r.content)

In [20]:
# Load the sales CSV: the first row supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    data_path,
    header=True,
    inferSchema=True,
)
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [21]:
# Print each column's name, data type and nullability for the loaded DataFrame.
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



In [22]:
# Number of rows (sales records) per company.
company_counts = df.groupBy('Company').count()
company_counts.show()

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [23]:
# Import the functions module under an alias instead of pulling `sum`, `max`
# and `min` into the notebook namespace, where they would replace Python's
# built-ins of the same names for every later cell.
from pyspark.sql import functions as F

# Per-company total, largest and smallest value of the Sales column.
df.groupBy('Company').agg(
    F.sum("Sales").alias("sum_sal"),
    F.max("Sales").alias("max_sal"),
    F.min("Sales").alias("min_sal"),
).show()

+-------+-------+-------+-------+
|Company|sum_sal|max_sal|min_sal|
+-------+-------+-------+-------+
|   APPL| 1480.0|  750.0|  130.0|
|   GOOG|  660.0|  340.0|  120.0|
|     FB| 1220.0|  870.0|  350.0|
|   MSFT|  967.0|  600.0|  124.0|
+-------+-------+-------+-------+



In [24]:
# All rows sorted by Sales, smallest first.
sales_ascending = df.orderBy('Sales')
sales_ascending.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [25]:
# Shut down the SparkSession created in the first cell now that the analysis is done.
spark.stop()