### Data Reading

## Load data

1. CSV File

In [0]:
# Location of the BigMart sales CSV file read by the cells below.
path_csv = "/Volumes/workspace/learning/csvfile/BigMart Sales.csv"

In [0]:
# Load the CSV, using the first row as column names and letting Spark
# infer each column's data type from the data.
df_csv = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load(path_csv)
)

In [0]:
# Render the loaded CSV DataFrame in the notebook output.
df_csv.display()

2. JSON file

In [0]:
# Location of the drivers JSON file read by the cells below.
path_json = "/Volumes/workspace/learning/jsonfile/drivers.json"

In [0]:
# Read the drivers JSON file. With 'multiLine' set to False each line of the
# file is parsed as one JSON record.
# The 'header' option was removed: it is a CSV option and has no effect on
# the JSON reader.
df_json = (
    spark.read.format('json')
    .option('multiLine', False)
    .load(path_json)
)

In [0]:
# Render the loaded JSON DataFrame in the notebook output.
df_json.display()

### SCHEMA - DDL and StructType()


### Schema Definition

--> Used to change the data type of columns

1. DDL Schema

In [0]:
# Print the column names and the data types Spark inferred for the CSV.
df_csv.printSchema()

In [0]:
# DDL-style schema: one "column_name type" pair per line, comma separated.
# NOTE(review): Item_Weight is declared as string here — presumably to
# demonstrate overriding the inferred type; confirm this is intended.
schema_csv = '''
Item_Identifier string,
Item_Weight string,
Item_Fat_Content string,
Item_Visibility double,
Item_Type string,
Item_MRP double,
Outlet_Identifier string,
Outlet_Establishment_Year integer,
Outlet_Size string,
Outlet_Location_Type string,
Outlet_Type string,
Item_Outlet_Sales double
'''

In [0]:
# Read the CSV again, this time applying the DDL schema string instead of
# relying on schema inference.
df = (
    spark.read.format('csv')
    .schema(schema_csv)
    .option('header', True)
    .load(path_csv)
)

In [0]:
# Render the DataFrame that was read with the DDL schema.
df.display()

2. StructType() Schema

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import * 

In [0]:
# The schema expressed programmatically: each (column name, Spark type) pair
# becomes a nullable StructField.
# NOTE(review): Item_Visibility is a string here but a double in the DDL
# schema string — confirm whether the difference is deliberate.
schema_struct = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('Item_Identifier', StringType()),
        ('Item_Weight', StringType()),
        ('Item_Fat_Content', StringType()),
        ('Item_Visibility', StringType()),
        ('Item_Type', StringType()),
        ('Item_MRP', DoubleType()),
        ('Outlet_Identifier', StringType()),
        ('Outlet_Establishment_Year', IntegerType()),
        ('Outlet_Size', StringType()),
        ('Outlet_Location_Type', StringType()),
        ('Outlet_Type', StringType()),
        ('Item_Outlet_Sales', DoubleType()),
    )
])

In [0]:
# Read the CSV with the explicit StructType schema.
# The 'inferSchema' option was removed: an explicitly supplied schema is the
# one that gets applied, so requesting inference as well was redundant and
# misleading.
df = (
    spark.read.format('csv')
    .schema(schema_struct)
    .option('header', True)
    .load(path_csv)
)

In [0]:
# Render the DataFrame that was read with the StructType schema.
df.display()

### Data Transformation

### SELECT

Method 1

In [0]:
# Project three columns by passing their names as plain strings.
df_csv.select(
    'Item_Identifier',
    'Item_Weight',
    'Item_Fat_Content',
).display()

Method 2 --> By using col()

In [0]:
# Project the same three columns, this time as Column objects built by col().
df_csv.select(
    col('Item_Identifier'),
    col('Item_Weight'),
    col('Item_Fat_Content'),
).display()

### ALIAS

--> Used to rename a column


In [0]:
# Select Item_Identifier and expose it under the name 'Item_ID'.
df_csv.select(
    col('Item_Identifier').alias('Item_ID'),
).display()

### FILTER / WHERE

CASE 1: Filter the data with fat content = Regular

In [0]:
# Print the schema to check column names and types before filtering.
df_csv.printSchema()

In [0]:
# Keep only the rows whose Item_Fat_Content equals 'Regular'.
df_csv.filter(
    col('Item_Fat_Content') == 'Regular'
).display()

CASE 2: Slice the data with item type = Soft Drinks and weight < 10

In [0]:
# Keep rows that are 'Soft Drinks' AND have Item_Weight below 10. Each
# comparison is parenthesised because & binds tighter than == and <.
df_csv.filter(
    (col('Item_Type') == 'Soft Drinks')
    & (col('Item_Weight') < 10)
).display()

CASE 3: Fetch the data where Tier is in (Tier 1, Tier 2) and Outlet Size is null

In [0]:
# Keep rows where Outlet_Size is null AND the location type is
# 'Tier 1' or 'Tier 2'.
df_csv.filter(
    col('Outlet_Size').isNull()
    & col('Outlet_Location_Type').isin('Tier 1', 'Tier 2')
).display()

### withColumnRenamed

--> Used to rename a column at the DataFrame level

In [0]:
# Display the data with Item_Weight renamed to Item_Wt; the result is not
# assigned, so df_csv itself is left unchanged.
df_csv.withColumnRenamed('Item_Weight','Item_Wt').display()

### withColumn

Case 1: Create a new column

In [0]:
# Add a 'Flag' column holding the literal value "New" on every row.
df = df_csv.withColumn(
    'Flag',
    lit('New'),
)

In [0]:
# Render the DataFrame including the new 'Flag' column.
df.display()

In [0]:
# Add a 'multiply' column: the product of Item_Weight and Item_MRP.
df = df.withColumn(
    'multiply',
    col('Item_Weight') * col('Item_MRP'),
)

In [0]:
# Render the DataFrame including the new 'multiply' column.
df.display()

Case 2 : Modify the existing one

In [0]:
# Overwrite Item_Fat_Content in two passes: "Regular" becomes 'Reg', then
# "Low Fat" becomes 'LF'. The result is only displayed, not assigned.
(
    df_csv
    .withColumn('Item_Fat_Content', regexp_replace(col('Item_Fat_Content'), 'Regular', 'Reg'))
    .withColumn('Item_Fat_Content', regexp_replace(col('Item_Fat_Content'), 'Low Fat', 'LF'))
    .display()
)

### Type Casting

In [0]:
# Replace Item_Weight with the same values cast to a string column.
df = df_csv.withColumn(
    'Item_Weight',
    col('Item_Weight').cast(StringType()),
)

In [0]:
# Render the DataFrame after the Item_Weight cast.
df.display()

In [0]:
# Print the schema to confirm Item_Weight is now a string column.
df.printSchema()

### Sort/ orderBy

Case 1: sort data by descending order

In [0]:
# Sort by weight, largest first. The column name now uses the schema's exact
# casing ('Item_Weight'), so it also resolves when spark.sql.caseSensitive
# is enabled.
# NOTE(review): if df still holds Item_Weight cast to string (type casting
# cell), this ordering is lexicographic rather than numeric — confirm.
df.sort(col('Item_Weight').desc()).display()

Case 2: Sort in ascending order

In [0]:
# Sort by Item_Visibility, smallest first.
df.sort(col('Item_Visibility').asc()).display()

Case 3: sorting based on multiple columns

In [0]:
# Sort on two keys: Item_Weight ascending, then Item_Visibility descending.
# The ascending flags pair up positionally with the column list.
df.sort(
    ['Item_Weight', 'Item_Visibility'],
    ascending=[True, False],
).display()

### Limit

--> Used to display only a specific number of rows

In [0]:
# Display at most 10 rows of the DataFrame.
df.limit(10).display()

### DROP

case 1: Drop 1 Column

In [0]:
# Display the data without the Item_Visibility column. The name now uses the
# schema's exact casing: drop() silently ignores names it cannot resolve, so
# the wrong casing would drop nothing under case-sensitive resolution.
df.drop("Item_Visibility").display()

Case 2: Drop multiple columns at a time

In [0]:
# Display the data without Item_Visibility and Item_Type. Names use the
# schema's exact casing: drop() silently ignores names it cannot resolve, so
# the wrong casing would leave the column in place under case-sensitive
# resolution.
df.drop("Item_Visibility", "Item_Type").display()

### DROP_DUPLICATES

Case 1 : Drop all the duplicate values in the data 

In [0]:
# Remove rows that are duplicated across all columns, then display.
df.dropDuplicates().display()

Case 2: Drop duplicates based on particular columns

In [0]:
# Keep a single row per distinct Item_Type value, then display.
df.dropDuplicates(
    subset=['Item_Type'],
).display()

### De-dup Data (distinct)

In [0]:
# Display only the distinct rows of the DataFrame.
df.distinct().display()

### UNION and UNION BY NAME

Prepare the DataFrames

In [0]:
# First sample DataFrame: columns (id, name).
schema1 = 'id string, name string'
data1 = [
    ('1', 'cad'),
    ('2', 'bad'),
]
df1 = spark.createDataFrame(data1, schema1)

# Second sample DataFrame with the same column layout.
schema2 = 'id string, name string'
data2 = [
    ('3', 'mad'),
    ('4', 'dad'),
]
df2 = spark.createDataFrame(data2, schema2)

In [0]:
# Render the first sample DataFrame.
df1.display()

In [0]:
# Render the second sample DataFrame.
df2.display()

### Union

In [0]:
# Append the rows of df2 to df1; union() pairs columns up by position.
df1.union(df2).display()

### Union by Name

In [0]:
# Rebuild df1 with its columns in the opposite order: (name, id).
schema1 = 'name string, id string'
data1 = [
    ('cad', '1'),
    ('bad', '2'),
]
df1 = spark.createDataFrame(data1, schema1)
df1.display()

In [0]:
# union() matches columns by position, not by name. Since df1 is now
# (name, id) and df2 is (id, name), names and ids end up mixed in one column.
df1.union(df2).display()

In [0]:
# unionByName() matches columns by name, so the differing column order of
# df1 and df2 no longer mixes the values.
df1.unionByName(df2).display()

### STRING FUNCTIONS

1. Initcap()
2. lower()
3. upper()

In [0]:
# Display Item_Type with the first letter of each word capitalised.
df.select(initcap('Item_Type')).display()

### Date Functions

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Reload the CSV with header and schema inference enabled, giving the date
# function examples a fresh DataFrame to work on.
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(path_csv)
)

In [0]:
# Render the freshly reloaded DataFrame.
df.display()

1. Current_date()

In [0]:
# Add a 'Curr_date' column holding today's date, then show the result.
df1 = df.withColumn('Curr_date', current_date())
display(df1)

2. Date_add()

In [0]:
from pyspark.sql.functions import date_add

# Add a 'week_date' column holding the date 7 days after 'Curr_date'.
df = df1.withColumn('week_date', date_add('Curr_date', 7))
df.display()

3. Date_sub()

In [0]:
# Add a 'week_before' column holding the date 7 days before 'Curr_date'.
# The DataFrame is assigned first and displayed separately: chaining
# .display() onto the assignment stored its return value (None) in df1,
# which broke every later use of df1.
df1 = df.withColumn('week_before', date_sub('Curr_date', 7))
df1.display()

### DATEDIFF

--> Returns the number of days between two dates

In [0]:
# Add a 'diff' column with the number of days from 'week_date' to
# 'Curr_date' (datediff takes the end date first, then the start date).
# Fixes: the column created by date_add is 'week_date' (no 'week_after'
# column exists), and the result is kept as a DataFrame instead of the None
# returned by .display(). It is built from df, which carries both date
# columns.
diff = df.withColumn('diff', datediff('Curr_date', 'week_date'))
diff.display()