In [46]:
import os
import time

from pyspark.sql import SparkSession
from pyspark.sql import functions as F


In [47]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under this application name.
APP_NAME = "solution_ex_17"
session_builder = SparkSession.builder.appName(APP_NAME)
spark = session_builder.getOrCreate()

In [48]:
# Location the input data is read from, and the location Spark writes to.
inputPath = './data'
outputPath = './output_ex_17/df_sol'

# Sanity check: report whether each location currently exists on disk.
for location in (inputPath, outputPath):
    print(os.path.exists(location))


True
False


In [49]:
# Build the persons DataFrame from the CSV data found at inputPath
# (persons.csv is expected there -- TODO confirm it is the only file).
# The first line of the data is treated as the header, and column types are
# inferred from the values instead of defaulting to string.
csv_options = {
    "format": "csv",
    "header": True,
    "inferSchema": True,
}
dfPersons = spark.read.load(inputPath, **csv_options)

In [50]:
# Print the schema Spark inferred for the loaded data, then display its rows
# (show() prints at most the first 20 rows by default) to verify the load.
dfPersons.printSchema()
dfPersons.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)

+-----+---+------+
| name|age|gender|
+-----+---+------+
| Paul| 40|  male|
|David| 15|  male|
|Susan| 40|female|
|Karen| 34|female|
| John| 40|  male|
+-----+---+------+



In [51]:
# NOTE: Spark transformations are lazy. filter()/where() only build a query
# plan and return immediately, so the numbers printed below measure plan
# construction (plus Python-to-JVM call overhead), NOT the cost of actually
# filtering the rows. Nothing is executed until an action such as show().
def _timed(build_df):
    """Call ``build_df()``, print the elapsed seconds, and return its result.

    ``time.perf_counter()`` is used instead of ``time.time()`` because it is a
    monotonic, high-resolution clock intended for measuring short intervals.
    """
    start = time.perf_counter()
    result = build_df()
    print(time.perf_counter() - start)
    return result


# Variant 1: filter() with the condition written as a SQL expression string.
male_df_v1 = _timed(lambda: dfPersons.filter('gender == "male"'))

# Variant 2: where() is an alias of filter(); same SQL expression string.
male_df_v2 = _timed(lambda: dfPersons.where('gender == "male"'))

# Variant 3: F.col('gender') builds a Column object, so the comparison yields
# a Column expression rather than a SQL string. This form is required when
# conditions are combined with Python operators such as & or |.
male_df_v3 = _timed(lambda: dfPersons.where(F.col('gender') == 'male'))

# Variant 4: two Column conditions combined with &.
# Plain strings cannot be used here: ('gender' == 'male') & ('age == 40')
# is evaluated by Python itself (bool & str) and raises a TypeError before
# Spark ever sees it.
male_df_v4 = _timed(
    lambda: dfPersons.where((F.col('gender') == "male") & (F.col('age') == 40))
)

0.06018352508544922
0.026340723037719727
0.19091010093688965
0.09743928909301758


In [52]:
# Display each filtered DataFrame in turn. The first three variants express
# the same condition; the fourth additionally restricts age to 40.
for males in (male_df_v1, male_df_v2, male_df_v3, male_df_v4):
    males.show()

+-----+---+------+
| name|age|gender|
+-----+---+------+
| Paul| 40|  male|
|David| 15|  male|
| John| 40|  male|
+-----+---+------+

+-----+---+------+
| name|age|gender|
+-----+---+------+
| Paul| 40|  male|
|David| 15|  male|
| John| 40|  male|
+-----+---+------+

+-----+---+------+
| name|age|gender|
+-----+---+------+
| Paul| 40|  male|
|David| 15|  male|
| John| 40|  male|
+-----+---+------+

+----+---+------+
|name|age|gender|
+----+---+------+
|Paul| 40|  male|
|John| 40|  male|
+----+---+------+



In [53]:
# Keep only the name and an age column incremented by one; the SQL alias
# "as age" keeps the original column name for the computed value.
projection = ("name", "age + 1 as age")
male_DF = male_df_v1.selectExpr(*projection)
male_DF.show()


+-----+---+
| name|age|
+-----+---+
| Paul| 41|
|David| 16|
| John| 41|
+-----+---+



In [54]:
# Order by age, oldest first; rows with the same age are ordered by name
# (ascending, the default for a plain column name).
sort_keys = [F.desc("age"), "name"]
male_DF = male_DF.sort(*sort_keys)
male_DF.show()

+-----+---+
| name|age|
+-----+---+
| John| 41|
| Paul| 41|
|David| 16|
+-----+---+



In [55]:
# Persist the sorted result as CSV under outputPath.
# The default save mode raises an error when the target already exists, which
# makes this cell fail on every re-run of the notebook; "overwrite" replaces
# the previous output instead so Restart & Run All keeps working.
# (RDD-based alternative: male_DF.rdd.saveAsTextFile(outputPath).)
male_DF.write.csv(outputPath, mode="overwrite")

                                                                                