In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.functions import col

In [2]:
sparkConf =  SparkConf().setAppName("Trying").setMaster("local")
sc = SparkContext(conf=sparkConf)

In [3]:
spark = SparkSession.builder\
        .appName("anyname")\
        .master("local")\
        .getOrCreate()

In [4]:
df = spark.read.format('csv')\
            .option('delimiter','|') \
            .option('header', 'True') \
            .option('inferSchema', 'True') \
            .load('C:\\Personal\\Projects\\Spark\\dataset\\emp_all.txt')
            
# in windows
# .load('C:\\Personal\\Projects\\Spark\\dataset\\emp_all.txt')
# this is in Ubuntu
# .load('file:///home/saif/LFS/datasets/emp_all.txt')
# for HDFS hdfs://localhost:9000/user/saif/HFS/Output/....
# for edge node file:///home/saif/LFS/datasets/emp_all.txt

df.show(5,truncate=False)
df.printSchema()

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
|102|Saif    |2000   |
|103|Mitali  |3000   |
|104|Manas   |4000   |
|105|Ram     |5000   |
+---+--------+-------+
only showing top 5 rows

root
 |-- id: integer (nullable = true)
 |-- name,sal: string (nullable = true)
 |-- country: integer (nullable = true)



### filter
### where

In [5]:
df.filter(df.country == 1000).show(truncate=False)

# #  # OR

# from pyspark.sql.functions import col
# df.filter(col("country") == 1000).show(truncate=False) 

# # OR

df.where(df.country == 1000).show(truncate=False)

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
+---+--------+-------+

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
+---+--------+-------+



In [7]:
# these are the ways to access the column
# df.country   
# col('country')  # from pyspark.sql.functions import col
# df['country']

#### multiple condition

In [6]:
#AND
df.filter((df.country == 1000) & (df.id == 101)).show(truncate=False)

#OR
df.filter((df.country == 1000) | (df.country == 4000)).show(truncate=False)

#NOT 
df.filter(~(df.country == 1000)).show(truncate=False)

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
+---+--------+-------+

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
|104|Manas   |4000   |
+---+--------+-------+

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|102|Saif    |2000   |
|103|Mitali  |3000   |
|104|Manas   |4000   |
|105|Ram     |5000   |
|106|Sam     |6000   |
+---+--------+-------+



###  Filter on an Array column

In [8]:
dept = [("Finance", 10, [1,2,3,4,5]),
            ("Marketing", 20, [7,6,5,4,7]),
            ("Sales", 30, [17,26,65,84,97]),
            ("IT", 40, [27,36,95,64,57])]
rdd = sc.parallelize(dept)
df = rdd.toDF(["dept", "deptno", "locationId's"])
df.printSchema()
df.show(truncate=False) 

root
 |-- dept: string (nullable = true)
 |-- deptno: long (nullable = true)
 |-- locationId's: array (nullable = true)
 |    |-- element: long (containsNull = true)

+---------+------+--------------------+
|dept     |deptno|locationId's        |
+---------+------+--------------------+
|Finance  |10    |[1, 2, 3, 4, 5]     |
|Marketing|20    |[7, 6, 5, 4, 7]     |
|Sales    |30    |[17, 26, 65, 84, 97]|
|IT       |40    |[27, 36, 95, 64, 57]|
+---------+------+--------------------+



### array_contains()

In [9]:
from pyspark.sql.functions import array_contains

df.filter(array_contains(df["locationId's"], 95)).show(truncate=False) 

+----+------+--------------------+
|dept|deptno|locationId's        |
+----+------+--------------------+
|IT  |40    |[27, 36, 95, 64, 57]|
+----+------+--------------------+



### orderby ( ), sort ( )

In [10]:

df.orderBy("deptno", "dept").show(truncate=False)
# or 
# df.orderBy(col("deptno"), col("dept")).show(truncate=False) 

+---------+------+--------------------+
|dept     |deptno|locationId's        |
+---------+------+--------------------+
|Finance  |10    |[1, 2, 3, 4, 5]     |
|Marketing|20    |[7, 6, 5, 4, 7]     |
|Sales    |30    |[17, 26, 65, 84, 97]|
|IT       |40    |[27, 36, 95, 64, 57]|
+---------+------+--------------------+



In [11]:
df.sort(df.deptno.asc(), df.dept.desc()).show(truncate=False) 

+---------+------+--------------------+
|dept     |deptno|locationId's        |
+---------+------+--------------------+
|Finance  |10    |[1, 2, 3, 4, 5]     |
|Marketing|20    |[7, 6, 5, 4, 7]     |
|Sales    |30    |[17, 26, 65, 84, 97]|
|IT       |40    |[27, 36, 95, 64, 57]|
+---------+------+--------------------+



### groupBy()

In [12]:
df.orderBy("deptno").count()

4

### withColumn()
<p>PySpark withColumn ( ) is a transformation function of DataFrame which is used to
change or update the value, convert the datatype of an existing DataFrame column,
add/create a new column</p>

In [13]:
df2 = df.withColumn("salary", col("deptno")*100) 
df2.show()

+---------+------+--------------------+------+
|     dept|deptno|        locationId's|salary|
+---------+------+--------------------+------+
|  Finance|    10|     [1, 2, 3, 4, 5]|  1000|
|Marketing|    20|     [7, 6, 5, 4, 7]|  2000|
|    Sales|    30|[17, 26, 65, 84, 97]|  3000|
|       IT|    40|[27, 36, 95, 64, 57]|  4000|
+---------+------+--------------------+------+



### union(),
### unionDF = df.union(df2)
### unionDF.show(truncate=False) 
#### DF union ( ) method merges two DF and returns the new DF with all rows from two DF regardless of duplicate data
----
###  Merge without Duplicates
#### disDF = df.union(df2).distinct()
#### disDF.show(truncate=False)
----
### dropDuplicates()
####  df.dropDuplicates()
####   df.dropDuplicates(["department", "salary"])
-----
### drop
#### drop_scol = df.drop("salary")
#### drop_mcol = df.drop("department", "salary") 
-----

## case – when – others

#### Melwin its just like the case in the sql

In [16]:
df = spark.read.format('csv')\
            .option('delimiter',',') \
            .option('header', 'True') \
            .option('inferSchema', 'True') \
            .load('C:\\Personal\\Projects\\Spark\\dataset\\sales.txt')
df.show(5,truncate=False)

+-----+-------+-------------+-----+
|dept |cadre  |costToCompany|state|
+-----+-------+-------------+-----+
|Sales|Trainee|12000        |UK   |
|Sales|Lead   |32000        |AUS  |
|Sales|Lead   |32000        |NY   |
|Sales|Lead   |32000        |IND  |
|Sales|Lead   |32000        |AUS  |
+-----+-------+-------------+-----+
only showing top 5 rows



In [19]:
from pyspark.sql.functions import when

df2 = df.withColumn("low_cost",
                   when(df.costToCompany == 12000,"LOW")
                   .when(df.costToCompany >= 30000,"HIGH")
                   .otherwise("Unknown"))
df2.show(5,truncate=False)

+-----+-------+-------------+-----+--------+
|dept |cadre  |costToCompany|state|low_cost|
+-----+-------+-------------+-----+--------+
|Sales|Trainee|12000        |UK   |LOW     |
|Sales|Lead   |32000        |AUS  |HIGH    |
|Sales|Lead   |32000        |NY   |HIGH    |
|Sales|Lead   |32000        |IND  |HIGH    |
|Sales|Lead   |32000        |AUS  |HIGH    |
+-----+-------+-------------+-----+--------+
only showing top 5 rows



#### expr()

In [22]:
# previous example
from pyspark.sql.functions import expr

df2 = df.withColumn("low_cost",
                    expr("""
                        case 
                            when costToCompany <= 12000 then 'LOW'
                            when costToCompany >= 30000 then 'HIGH'
                            else 'Unknown'
                        end
                    """))
df2.show(5,truncate=False)

+-----+-------+-------------+-----+--------+
|dept |cadre  |costToCompany|state|low_cost|
+-----+-------+-------------+-----+--------+
|Sales|Trainee|12000        |UK   |LOW     |
|Sales|Lead   |32000        |AUS  |HIGH    |
|Sales|Lead   |32000        |NY   |HIGH    |
|Sales|Lead   |32000        |IND  |HIGH    |
|Sales|Lead   |32000        |AUS  |HIGH    |
+-----+-------+-------------+-----+--------+
only showing top 5 rows



### Using & and | operator:
<pre>
 Df = df.withColumn("New_Gender",
 when((col("gender") == "M" ) | (col("gender") == "F"), "Available")
 .otherwise("Not Available"))
 Df.show() 
</pre>

### concat()
<pre>
df2=df.select(concat("fname", lit (","), "mname", lit (","), "lname")
.alias("FullName"),"dob_year","gender","salary")
df2.show(truncate=False)
</pre>


### concat_ws()
<pre>
df3 = df.select(concat_ws('_', col("fname"), col("fname"), col("lname"))
 .alias("FullName"), "dob_year", "gender", "salary")
 df3.show(truncate=False)
</pre>