In [19]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.functions import col

In [2]:
# Spark configuration: application name "Trying", running locally on all cores.
sparkConf = SparkConf().setAppName("Trying").setMaster("local[*]")

# getOrCreate() hands back the SparkContext that is already running (if any)
# instead of constructing a second one, so this cell can be re-run without
# raising "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=sparkConf)

In [3]:
# Obtain the SparkSession used by the DataFrame cells below; getOrCreate()
# returns an existing session when there is one instead of building another.
session_builder = SparkSession.builder.appName("anyname").master("local[*]")
spark = session_builder.getOrCreate()

In [4]:
# Read the pipe-delimited employee file from the local filesystem, treating the
# first line as the header and letting Spark infer the column types.
# To read from HDFS instead, pass a URI such as
# hdfs://localhost:9000/user/saif/HFS/Output/....
emp_reader = (
    spark.read.format('csv')
    .option('delimiter', '|')
    .option('header', 'True')
    .option('inferSchema', 'True')
)
df = emp_reader.load('file:///home/saif/LFS/datasets/emp_all.txt')

# Preview the first five rows without truncating values, then print the schema.
df.show(5, truncate=False)
df.printSchema()

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
|102|Saif    |2000   |
|103|Mitali  |3000   |
|104|Manas   |4000   |
|105|Ram     |5000   |
+---+--------+-------+
only showing top 5 rows

root
 |-- id: integer (nullable = true)
 |-- name,sal: string (nullable = true)
 |-- country: integer (nullable = true)



### filter
### where

In [5]:
# Row predicate shared by both calls below: country equals 1000.
is_country_1000 = df["country"] == 1000

df.filter(is_country_1000).show(truncate=False)

# Equivalent, using a standalone column expression:
# from pyspark.sql.functions import col
# df.filter(col("country") == 1000).show(truncate=False)

# where() is an alias of filter(), so this prints the same rows again.
df.where(is_country_1000).show(truncate=False)

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
+---+--------+-------+

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
+---+--------+-------+



#### Multiple conditions

In [6]:
# Each comparison is parenthesised because Python's &, | and ~ operators bind
# more tightly than ==.
country = df["country"]

# AND: both conditions must hold
df.filter((country == 1000) & (df["id"] == 101)).show(truncate=False)

# OR: either condition may hold
df.filter((country == 1000) | (country == 4000)).show(truncate=False)

# NOT: negate the condition
df.filter(~(country == 1000)).show(truncate=False)

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
+---+--------+-------+

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
|104|Manas   |4000   |
+---+--------+-------+

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|102|Saif    |2000   |
|103|Mitali  |3000   |
|104|Manas   |4000   |
|105|Ram     |5000   |
|106|Sam     |6000   |
+---+--------+-------+



###  Filter on an Array column

In [9]:
# Sample rows: (department name, department number, list of location ids).
dept = [
    ("Finance", 10, [1, 2, 3, 4, 5]),
    ("Marketing", 20, [7, 6, 5, 4, 7]),
    ("Sales", 30, [17, 26, 65, 84, 97]),
    ("IT", 40, [27, 36, 95, 64, 57]),
]
column_names = ["dept", "deptno", "locationId's"]

# Distribute the rows as an RDD, then convert it to a DataFrame with named columns.
rdd = sc.parallelize(dept)
df = rdd.toDF(column_names)

df.printSchema()
df.show(truncate=False)

root
 |-- dept: string (nullable = true)
 |-- deptno: long (nullable = true)
 |-- locationId's: array (nullable = true)
 |    |-- element: long (containsNull = true)

+---------+------+--------------------+
|dept     |deptno|locationId's        |
+---------+------+--------------------+
|Finance  |10    |[1, 2, 3, 4, 5]     |
|Marketing|20    |[7, 6, 5, 4, 7]     |
|Sales    |30    |[17, 26, 65, 84, 97]|
|IT       |40    |[27, 36, 95, 64, 57]|
+---------+------+--------------------+



In [10]:
from pyspark.sql.functions import array_contains

# Keep only the rows whose "locationId's" array includes the value 95.
has_location_95 = array_contains(df["locationId's"], 95)
df.filter(has_location_95).show(truncate=False)

+----+------+--------------------+
|dept|deptno|locationId's        |
+----+------+--------------------+
|IT  |40    |[27, 36, 95, 64, 57]|
+----+------+--------------------+



### orderBy(), sort()

In [12]:

# Sort ascending by deptno, using dept to break ties.
sort_columns = ["deptno", "dept"]
df.orderBy(*sort_columns).show(truncate=False)

# Equivalent, using column expressions:
# df.orderBy(col("deptno"), col("dept")).show(truncate=False)

+---------+------+--------------------+
|dept     |deptno|locationId's        |
+---------+------+--------------------+
|Finance  |10    |[1, 2, 3, 4, 5]     |
|Marketing|20    |[7, 6, 5, 4, 7]     |
|Sales    |30    |[17, 26, 65, 84, 97]|
|IT       |40    |[27, 36, 95, 64, 57]|
+---------+------+--------------------+



In [7]:
# deptno ascending first, then dept descending among rows with equal deptno.
deptno_ascending = df["deptno"].asc()
dept_descending = df["dept"].desc()
df.sort(deptno_ascending, dept_descending).show(truncate=False)

+---------+------+--------------------+
|dept     |deptno|locationId's        |
+---------+------+--------------------+
|Finance  |10    |[1, 2, 3, 4, 5]     |
|Marketing|20    |[7, 6, 5, 4, 7]     |
|Sales    |30    |[17, 26, 65, 84, 97]|
|IT       |40    |[27, 36, 95, 64, 57]|
+---------+------+--------------------+



### groupBy()

In [14]:
# Orders df by deptno, then counts all of its rows.
# NOTE(review): this section is headed "groupBy()", but no grouping happens
# here; per-department counts would need df.groupBy("deptno").count() —
# confirm which one was intended.
df.orderBy("deptno").count()

4

### withColumn()
<p>PySpark <code>withColumn()</code> is a DataFrame transformation used to
change or update the values of an existing column, convert its data type,
or add a new column.</p>

In [22]:
df2 = df.withColumn("salary", col("deptno")*100) 
df2.show()

+---------+------+--------------------+------+
|     dept|deptno|        locationId's|salary|
+---------+------+--------------------+------+
|  Finance|    10|     [1, 2, 3, 4, 5]|  1000|
|Marketing|    20|     [7, 6, 5, 4, 7]|  2000|
|    Sales|    30|[17, 26, 65, 84, 97]|  3000|
|       IT|    40|[27, 36, 95, 64, 57]|  4000|
+---------+------+--------------------+------+

