# Prerequisites

In [0]:
# Sanity check: confirm that a SparkSession is bound to the name `spark`.
# `spark` is not created anywhere in this notebook — presumably it is injected
# by the notebook runtime (e.g. Databricks); TODO confirm.
type(spark)

Out[2]: pyspark.sql.session.SparkSession

# 01. Load Data into DataFrame

In [0]:
# Location of the pipe-delimited users file on DBFS.
USERS_PATH = "dbfs:/FileStore/walmart/dataset/users/delimited_format/users_001.dat"

# Explicit schema (DDL string) with the same column names and types that schema
# inference produced for this file. Declaring it up front avoids the extra pass
# over the data that inferSchema needs and keeps the column types stable even
# if the file contents change.
USERS_SCHEMA = "id INT, age INT, gen STRING, designation STRING, salary INT"

user_df = spark.read.csv(
    path=USERS_PATH,
    sep="|",        # fields are separated by a pipe character
    header=True,    # the first line holds the column names
    schema=USERS_SCHEMA,
)

# Preview only the first few rows instead of rendering the whole DataFrame.
user_df.limit(4).display()

id,age,gen,designation,salary
1,26,M,Technician,85711
2,53,F,Other,94043
3,23,M,Writer,32067
4,26,M,technician,43537


# 02. DataFrame Operations

## Transformations

* Applying a transformation to a DataFrame (DF) returns a new DataFrame; the original DF is left unchanged.

### Select

In [0]:
# Project only the columns of interest by name. select() returns a new
# DataFrame, so the result could equally be stored in a variable first
# (e.g. df_01) and displayed afterwards; here it is previewed directly.
(
    user_df
    .select("id", "designation")
    .limit(4)
    .display()
)

id,designation
1,Technician
2,Other
3,Writer
4,technician


In [0]:
# Recommended way to select columns: reference them through col(), which
# yields Column objects that can be transformed, e.g. renamed with alias().
from pyspark.sql.functions import col, lower

(
    user_df
    .select(
        col("id").alias("user_id"),  # expose "id" under the name "user_id"
        col("designation"),
    )
    .limit(4)
    .display()
)

user_id,designation
1,Technician
2,Other
3,Writer
4,technician


### Filter

In [0]:
# Import every helper this cell uses so it does not depend on an earlier cell
# having already imported col.
from pyspark.sql.functions import col, lower

# Case-insensitive match: lower-case the column before comparing, so rows
# spelled "Technician" and "technician" are both kept.
(
    user_df
    .filter(lower(col("designation")) == "technician")
    .limit(4)
    .display()
)

id,age,gen,designation,salary
1,26,M,Technician,85711
4,26,M,technician,43537
44,26,M,Technician,46260
77,30,M,Technician,29379


In [0]:
# Keep only the rows whose designation is one of the listed values,
# then preview the first ten matches.
(
    user_df
    .filter(col("designation").isin(["Writer", "Other"]))
    .limit(10)
    .display()
)

id,age,gen,designation,salary
2,53,F,Other,94043
3,23,M,Writer,32067
5,33,F,Other,15213
11,39,F,Other,30329
12,28,F,Other,6405
18,35,F,Other,37212
21,26,M,Writer,30068
22,25,M,Writer,40206
28,32,M,Writer,55369
38,28,F,Other,54467


## Actions

### Print Schema

In [0]:
# Print the DataFrame's schema as a tree: one line per column showing its
# name, data type and nullability.
user_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gen: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- salary: integer (nullable = true)



## Select columns

In [0]:
# Same two-column projection as in the Select section, expressed with col().
# Note: the resulting DataFrame is neither assigned to a variable nor passed
# to display(), so this cell on its own shows no rows.
user_df.select(col("id"), col("designation"))

# 03. Write Results