In [1]:
import pyspark
# Reuse the SparkContext that is already running when this cell is executed
# again; constructing a second SparkContext in the same kernel raises
# ValueError. The conf (master 'local[*]') only takes effect when a new
# context actually has to be created.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
from pyspark.sql import SparkSession

# Build the SparkSession used by the cells below, or return the existing one
# if a session has already been created.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import *

# Explicit column layout for the student data: names and types, in file order.
# Fields added with add(name, type) are nullable, the same default that
# StructField(name, type) uses.
schema = (
    StructType()
    .add('studentId', IntegerType())
    .add('fname', StringType())
    .add('lname', StringType())
    .add('dept', StringType())
    .add('age', IntegerType())
    .add('year', StringType())
    .add('hours', IntegerType())
)


In [4]:
from pyspark.sql.types import *

# Load the student CSV with the explicit schema defined earlier, so the
# columns come back typed as declared there.
df = (
    spark.read
    .schema(schema)
    .csv("data/student.csv")
)

# Show the resulting column types, then the rows themselves.
df.printSchema()
df.show()


root
 |-- studentId: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- year: string (nullable = true)
 |-- hours: integer (nullable = true)

+---------+-------+-------+--------+---+--------+-----+
|studentId|  fname|  lname|    dept|age|    year|hours|
+---------+-------+-------+--------+---+--------+-----+
|        1|   John|  Smith| Biology| 20|  junior|   12|
|        2|   Mary|  Jones|Business| 19|freshman|   16|
|        3|   Greg|   Phil| Biology| 23|  senior|    8|
|        4|    Sue|Hillman|Business| 18|freshman|   10|
|        5|    Joe| Garcia|    Math| 19|sophmore|   16|
|        6|   Mike|  Kline|    Math| 18|freshman|   12|
|        7|Charles|Mueller|Business| 21|  senior|   16|
|        8|   Jean|  McCay| Biology| 18|freshman|   16|
|        9|    Kay| Givens|Business| 20|sophmore|   12|
+---------+-------+-------+--------+---+--------+-----+



In [5]:
# groupBy() on its own does not return a DataFrame: the printed type below is
# pyspark.sql.group.GroupedData. Despite the name, df1 is that grouped handle.
df1 = df.groupBy("dept")
# Print the class of the grouped object to make the distinction visible.
print(type(df1))

<class 'pyspark.sql.group.GroupedData'>


In [7]:
# Demonstration: the GroupedData returned by groupBy() has no show() method.
# The AttributeError is the point of this cell, so catch it and print it
# instead of letting it abort a "Restart & Run All" of the notebook.
try:
    df.groupBy("dept").show()
except AttributeError as exc:
    print("AttributeError:", exc)

AttributeError: 'GroupedData' object has no attribute 'show'

In [9]:
# Applying an aggregation (here count()) to the grouped data yields a
# DataFrame again, which can be displayed: one row per dept with its row count.
(
    df.groupBy("dept")
    .count()
    .show()
)

+--------+-----+
|    dept|count|
+--------+-----+
|    Math|    2|
|Business|    4|
| Biology|    3|
+--------+-----+



In [10]:
# Group on two columns: one output row per distinct (dept, age) pair with its
# row count, displayed sorted by age.
(
    df.groupBy("dept", "age")
    .count()
    .orderBy("age")
    .show()
)

+--------+---+-----+
|    dept|age|count|
+--------+---+-----+
|    Math| 18|    1|
| Biology| 18|    1|
|Business| 18|    1|
|Business| 19|    1|
|    Math| 19|    1|
| Biology| 20|    1|
|Business| 20|    1|
|Business| 21|    1|
| Biology| 23|    1|
+--------+---+-----+



In [11]:
# Several aggregations per department in one pass. The dict maps a column
# name to the name of the aggregate function to apply to it.
(
    df.groupBy("dept")
    .agg({
        'age': 'avg',
        'hours': 'sum',
        'studentId': 'count',
    })
    .show()
)

+--------+----------------+----------+------------------+
|    dept|count(studentId)|sum(hours)|          avg(age)|
+--------+----------------+----------+------------------+
|    Math|               2|        28|              18.5|
|Business|               4|        54|              19.5|
| Biology|               3|        36|20.333333333333332|
+--------+----------------+----------+------------------+

