In [5]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [7]:
from pyspark.sql.types import *

schema = StructType(
  [StructField('studentId', IntegerType()),
   StructField('fname', StringType()),
   StructField('lname', StringType()),
   StructField('dept', StringType()),
   StructField('age', IntegerType()),
   StructField('year', StringType()),
   StructField('hours', IntegerType())] 
)


In [13]:
from pyspark.sql.types import *


df = spark.read.csv("data/student.csv", schema)
df.printSchema()
df.show()

root
 |-- studentId: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- year: string (nullable = true)
 |-- hours: integer (nullable = true)

+---------+-------+-------+--------+---+--------+-----+
|studentId|  fname|  lname|    dept|age|    year|hours|
+---------+-------+-------+--------+---+--------+-----+
|        1|   John|  Smith| Biology| 20|  junior|   12|
|        2|   Mary|  Jones|Business| 19|freshman|   16|
|        3|   Greg|   Phil| Biology| 23|  senior|    8|
|        4|    Sue|Hillman|Business| 18|freshman|   10|
|        5|    Joe| Garcia|    Math| 19|sophmore|   16|
|        6|   Mike|  Kline|    Math| 18|freshman|   12|
|        7|Charles|Mueller|Business| 21|  senior|   16|
|        8|   Jean|  McCay| Biology| 18|freshman|   16|
|        9|    Kay| Givens|Business| 20|sophmore|   12|
+---------+-------+-------+--------+---+--------+-----+



In [10]:
df.orderBy("dept").show()

+---------+-------+-------+--------+---+--------+-----+
|studentId|  fname|  lname|    dept|age|    year|hours|
+---------+-------+-------+--------+---+--------+-----+
|        1|   John|  Smith| Biology| 20|  junior|   12|
|        8|   Jean|  McCay| Biology| 18|freshman|   16|
|        3|   Greg|   Phil| Biology| 23|  senior|    8|
|        7|Charles|Mueller|Business| 21|  senior|   16|
|        2|   Mary|  Jones|Business| 19|freshman|   16|
|        9|    Kay| Givens|Business| 20|sophmore|   12|
|        4|    Sue|Hillman|Business| 18|freshman|   10|
|        5|    Joe| Garcia|    Math| 19|sophmore|   16|
|        6|   Mike|  Kline|    Math| 18|freshman|   12|
+---------+-------+-------+--------+---+--------+-----+



In [11]:
from pyspark.sql.functions import *

df.orderBy(desc("hours")).show()

+---------+-------+-------+--------+---+--------+-----+
|studentId|  fname|  lname|    dept|age|    year|hours|
+---------+-------+-------+--------+---+--------+-----+
|        5|    Joe| Garcia|    Math| 19|sophmore|   16|
|        7|Charles|Mueller|Business| 21|  senior|   16|
|        8|   Jean|  McCay| Biology| 18|freshman|   16|
|        2|   Mary|  Jones|Business| 19|freshman|   16|
|        6|   Mike|  Kline|    Math| 18|freshman|   12|
|        1|   John|  Smith| Biology| 20|  junior|   12|
|        9|    Kay| Givens|Business| 20|sophmore|   12|
|        4|    Sue|Hillman|Business| 18|freshman|   10|
|        3|   Greg|   Phil| Biology| 23|  senior|    8|
+---------+-------+-------+--------+---+--------+-----+



In [9]:

df.agg({'age':'avg', 'hours':'sum', 'studentId': 'count' }).show()

+----------------+----------+------------------+
|count(studentId)|sum(hours)|          avg(age)|
+----------------+----------+------------------+
|               9|       118|19.555555555555557|
+----------------+----------+------------------+



In [14]:
schema2 = StructType(
  [StructField('studentId', IntegerType()),
   StructField('state', StringType())] 
)
df2 = spark.read.csv("data/states.csv", schema2)
df2.printSchema()
df2.show()


root
 |-- studentId: integer (nullable = true)
 |-- state: string (nullable = true)

+---------+----------+
|studentId|     state|
+---------+----------+
|        1|New Mexico|
|        2|  New York|
|        3|California|
|        5|  Colorado|
|        6|Washington|
|        7|  Colorado|
|        9|   Indiana|
+---------+----------+



In [22]:
 df.join(df2, df.studentId == df2.studentId).show()

+---------+-------+-------+--------+---+--------+-----+---------+----------+
|studentId|  fname|  lname|    dept|age|    year|hours|studentId|     state|
+---------+-------+-------+--------+---+--------+-----+---------+----------+
|        1|   John|  Smith| Biology| 20|  junior|   12|        1|New Mexico|
|        2|   Mary|  Jones|Business| 19|freshman|   16|        2|  New York|
|        3|   Greg|   Phil| Biology| 23|  senior|    8|        3|California|
|        5|    Joe| Garcia|    Math| 19|sophmore|   16|        5|  Colorado|
|        6|   Mike|  Kline|    Math| 18|freshman|   12|        6|Washington|
|        7|Charles|Mueller|Business| 21|  senior|   16|        7|  Colorado|
|        9|    Kay| Givens|Business| 20|sophmore|   12|        9|   Indiana|
+---------+-------+-------+--------+---+--------+-----+---------+----------+

