### SparkSession


In [None]:
# Bootstrap step: findspark is initialised before any `pyspark` import below.
# NOTE(review): fs.init() is expected to locate the local Spark install and put
# pyspark on sys.path — confirm against the findspark docs for your version.
import findspark as fs
fs.init()
# Debug aid, intentionally disabled; presumably reports the Spark home that
# findspark detected — TODO confirm before re-enabling.
# fs.find()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) a local SparkSession named 'MyApp' running on 2 local threads.
# Parentheses are used for the fluent chain instead of backslash continuations.
spark = (
    SparkSession.builder
    .appName('MyApp')
    .master('local[2]')
    .getOrCreate()
)

In [40]:
# Stopping the session here would invalidate `spark` for every cell that
# follows (the CSV read and all queries use it), so a top-to-bottom
# "Restart & Run All" would fail. The call is therefore disabled at this
# position; execute it only after the last query in the notebook has run.
# spark.stop()

In [None]:
# Load the customer CSV into a DataFrame.
#   header=True      -> the first line supplies the column names.
#   inferSchema=True -> Spark samples the file to pick column types, so numeric
#                       columns such as `age` are not left as plain strings
#                       (string-typed numbers only compare correctly through
#                       implicit casts and sort lexicographically).
# Schema inference costs one extra pass over the file.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('./data/custs_with_header.csv')
)


In [None]:
# Preview the loaded data: first 20 rows, long cell values truncated
# (these are the defaults of show(), spelled out for the reader).
df.show(n=20, truncate=True)

In [None]:
# Customers older than 40 (SQL-string predicate), first 5 rows.
(
    df
    .filter('age > 40')
    .show(n=5)
)

In [None]:
# Customers whose first name begins with 'S' (SQL LIKE pattern), first 5 rows.
(
    df
    .filter('fname like "S%"')
    .show(n=5)
)

In [None]:
# Customers whose designation is Teacher, Pilot or Lawyer (SQL IN list),
# first 5 rows.
(
    df
    .filter('desig in ("Teacher","Pilot","Lawyer")')
    .show(n=5)
)

In [None]:
# Pilots older than 50 (both conditions in one SQL-string predicate),
# first 5 rows.
(
    df
    .filter('desig = "Pilot" and age > 50')
    .show(n=5)
)

In [None]:
# Customers aged 40 to 50 (SQL BETWEEN, both bounds inclusive), first 5 rows.
(
    df
    .filter('age between 40 and 50')
    .show(n=5)
)

In [None]:
# Number of customers per designation; only the first 5 groups are printed.
(
    df
    .groupby('desig')
    .count()
    .show(n=5)
)

In [None]:
# The 10 most common designations: count per designation, largest count first.
from pyspark.sql.functions import desc

(
    df
    .groupby('desig')
    .count()
    .sort(desc("count"))
    .show(n=10)
)

### Column-Based Expressions


In [21]:
# from pyspark.sql.functions import col
from pyspark.sql.functions import column

In [24]:

# Column-expression version: first names beginning with "S", first 5 rows.
(
    df
    .filter(column("fname").startswith("S"))
    .show(n=5)
)

+-------+---------+-------+---+--------------------+
|    cid|    fname|  lname|age|               desig|
+-------+---------+-------+---+--------------------+
|4000003|   Sherri| Melton| 34|         Firefighter|
|4000012|    Sandy| Raynor| 26|              Writer|
|4000030|Stephanie|Hawkins| 50|Human resources a...|
|4000035|  Shelley|  Weeks| 25|            Reporter|
|4000052|  Shirley|Merritt| 21|            Reporter|
+-------+---------+-------+---+--------------------+
only showing top 5 rows



In [26]:
# Column-expression version: designation is one of Teacher / Pilot / Lawyer,
# first 5 rows. isin() accepts the values as a single list as well as varargs.
(
    df
    .filter(column("desig").isin(["Teacher", "Pilot", "Lawyer"]))
    .show(n=5)
)

+-------+--------+--------+---+-------+
|    cid|   fname|   lname|age|  desig|
+-------+--------+--------+---+-------+
|4000001|Kristina|   Chung| 55|  Pilot|
|4000002|   Paige|    Chen| 74|Teacher|
|4000005|   Karen| Puckett| 74| Lawyer|
|4000007|   Elsie|Hamilton| 43|  Pilot|
|4000026|  Marian| Solomon| 27| Lawyer|
+-------+--------+--------+---+-------+
only showing top 5 rows



In [38]:
# Column-expression version: pilots older than 50, first 5 rows.
# Each comparison is parenthesised because `&` binds tighter than `==` / `>`.
(
    df
    .filter(
        (column("desig") == 'Pilot')
        & (column("age") > 50)
    )
    .show(n=5)
)

+-------+--------+-------+---+-----+
|    cid|   fname|  lname|age|desig|
+-------+--------+-------+---+-----+
|4000001|Kristina|  Chung| 55|Pilot|
|4000229|    Faye| Norman| 64|Pilot|
|4000251|  Jeremy|  House| 61|Pilot|
|4000271|   Alice|  Nance| 59|Pilot|
|4000336|  Steven|Ballard| 62|Pilot|
+-------+--------+-------+---+-----+
only showing top 5 rows



In [35]:
# Column-expression version: age from 40 to 50 (bounds inclusive), first 5 rows.
(
    df
    .filter(column("age").between(lowerBound=40, upperBound=50))
    .show(n=5)
)

+-------+-------+--------+---+------------+
|    cid|  fname|   lname|age|       desig|
+-------+-------+--------+---+------------+
|4000006|Patrick|    Song| 42|Veterinarian|
|4000007|  Elsie|Hamilton| 43|       Pilot|
|4000011|Francis|McNamara| 47|   Therapist|
|4000013| Marion|    Moon| 41|   Carpenter|
|4000015|  Julia|   Desai| 49|    Musician|
+-------+-------+--------+---+------------+
only showing top 5 rows



In [39]:
# Column-expression version: customers with no designation (desig is null),
# first 10 rows.
(
    df
    .filter(column("desig").isNull())
    .show(n=10)
)

+-------+-------+---------+---+-----+
|    cid|  fname|    lname|age|desig|
+-------+-------+---------+---+-----+
|4000014|   Beth|  Woodard| 65| null|
|4000046|  Louis|Rosenthal| 31| null|
|4000257| Monica|   Dodson| 58| null|
|4000327|Heather|   Dawson| 43| null|
|4000411| Joanna|   Hoover| 50| null|
|4000463|  Wayne|   Weiner| 70| null|
|4000494|Stephen|   Waller| 75| null|
|4000679|Herbert| Jernigan| 59| null|
|4000695|  Chris| Anderson| 52| null|
|4000890| Hannah|    Casey| 30| null|
+-------+-------+---------+---+-----+
only showing top 10 rows

