In [79]:
# The following Spark aggregation basics are performed on sales_info.csv
!pip install pyspark




In [80]:
from pyspark.sql import SparkSession

In [81]:
# Start the Spark session used by every cell below, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName('Aggregate_operations')
    .getOrCreate()
)

In [82]:
# Load the sales CSV: the first row supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    '/content/sales_info.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the loaded DataFrame as a text table to check the columns and parsed values.
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [None]:
# Average of the numeric columns per company (the result shows only avg(Sales)).
company_means = df.groupBy("Company").mean()
company_means.show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [None]:
# Average of Sales across the whole DataFrame, with no grouping.
# NOTE(review): 'sales' / 'Avg' differ in case from the 'Sales' column; this
# presumably relies on Spark's case-insensitive name resolution — confirm.
overall_avg = df.agg({'sales':'Avg'})
overall_avg.show()

+-----------------+
|       avg(sales)|
+-----------------+
|360.5833333333333|
+-----------------+



In [None]:
# Keep the grouped handle in `group_data`, then report the smallest Sales
# value within each company.
group_data = df.groupBy('Company')
min_sales_by_company = group_data.agg({'Sales': 'min'})
min_sales_by_company.show()

+-------+----------+
|Company|min(Sales)|
+-------+----------+
|   APPL|     130.0|
|   GOOG|     120.0|
|     FB|     350.0|
|   MSFT|     124.0|
+-------+----------+



In [83]:
# Import the aggregate functions used below from pyspark.sql.functions
from pyspark.sql.functions import countDistinct, avg, stddev

In [None]:
# Count how many different Sales values occur, labelling the result column
# 'No_of_sales'.
distinct_sales = countDistinct('Sales').alias('No_of_sales')
df.select(distinct_sales).show()

+-----------+
|No_of_sales|
+-----------+
|         11|
+-----------+



In [88]:
# Standard deviation of Sales, shown first at full precision and then
# formatted to two decimal places.
from pyspark.sql.functions import format_number

# One-row DataFrame holding the raw standard deviation under the name 'Std'.
sales_std = df.select(stddev('sales').alias('Std'))
sales_std.show()

# Same value formatted to 2 decimals in a column named 'Std_formatted'.
std_formatted = sales_std.select(format_number('Std', 2).alias('Std_formatted'))
std_formatted.show()

+------------------+
|               Std|
+------------------+
|250.08742410799007|
+------------------+

+-------------+
|Std_formatted|
+-------------+
|       250.09|
+-------------+



In [89]:
# Sort the rows by the Sales column; with a bare column name the order is ascending.
df.orderBy('Sales').show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [90]:
# Sort the rows by company name in descending (Z to A) order.
company_desc = df['Company'].desc()
df.orderBy(company_desc).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   MSFT|    Amy|124.0|
|   MSFT|   Tina|600.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   GOOG|  Frank|340.0|
|   GOOG|Charlie|120.0|
|     FB|  Sarah|350.0|
|     FB|   Carl|870.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|  Chris|350.0|
|   APPL|   Mike|750.0|
+-------+-------+-----+

