In [0]:
# Read the BigMart sales CSV into `df`: the first line is treated as the
# header and column types are inferred from the data instead of all-string.
csv_reader = (
    spark.read
    .format('csv')
    .option('Header', True)
    .option('InferSchema', True)
)
df = csv_reader.load('/FileStore/tables/BigMart_Sales.csv')
df.display()

## JOINS

In [0]:

# Employee sample rows as (emp_id, emp_name, dept_id).
# Two employees share 'd03', and 'd06' is used by one employee here.
dataj1 = [
    ('1', 'gaur', 'd01'),
    ('2', 'kit', 'd02'),
    ('3', 'sam', 'd03'),
    ('4', 'tim', 'd03'),
    ('5', 'aman', 'd05'),
    ('6', 'nad', 'd06'),
]

# DDL-style schema string: every column is a string.
schemaj1 = 'emp_id STRING, emp_name STRING, dept_id STRING'

df1 = spark.createDataFrame(data=dataj1, schema=schemaj1)
df1.display()

In [0]:
# Department sample rows as (dept_id, department).
dataj2 = [
    ('d01', 'HR'),
    ('d02', 'Marketing'),
    ('d03', 'Accounts'),
    ('d04', 'IT'),
    ('d05', 'Finance'),
]

# DDL-style schema string: both columns are strings.
schemaj2 = 'dept_id STRING, department STRING'

df2 = spark.createDataFrame(data=dataj2, schema=schemaj2)
df2.display()

In [0]:
# Inner join: keep only employee/department pairs whose dept_id matches on
# both sides. Joining on an expression keeps both dept_id columns in the output.
same_dept = df1['dept_id'] == df2['dept_id']
df1.join(df2, on=same_dept, how='inner').display()


In [0]:
# Left join: every employee row is kept; department columns are NULL when
# no department has the employee's dept_id.
same_dept = df1['dept_id'] == df2['dept_id']
df1.join(df2, on=same_dept, how='left').display()


In [0]:
# Right join: every department row is kept; employee columns are NULL when
# no employee references the department's dept_id.
same_dept = df1['dept_id'] == df2['dept_id']
df1.join(df2, on=same_dept, how='right').display()


In [0]:
# Anti join: return only the employees (df1 columns only) whose dept_id has
# no match in the departments table.
same_dept = df1['dept_id'] == df2['dept_id']
df1.join(df2, on=same_dept, how='anti').display()


# WINDOW FUNCTIONS

## ROW_NUMBER

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import *  
# Add a sequential row number (starting at 1) following ascending
# Item_Identifier order. The window has no partitionBy, so the numbering
# runs across the whole DataFrame.
item_id_window = Window.orderBy('Item_Identifier')
df.withColumn('rowCol', row_number().over(item_id_window)).display()


## RANK AND DENSE RANK

In [0]:

# Rank items by Item_Identifier in descending order. `rank` leaves gaps after
# ties while `dense_rank` does not; both use the same window specification.
item_id_desc_window = Window.orderBy(col('Item_Identifier').desc())

ranked_df = df.withColumn('rank', rank().over(item_id_desc_window))
ranked_df = ranked_df.withColumn('denseRank', dense_rank().over(item_id_desc_window))
ranked_df.display()

In [0]:
# Running total of Item_MRP in Item_Identifier order: each row sums every row
# from the start of the window up to and including itself.
# `sum` here is the Spark column function pulled in by the star import above,
# not Python's builtin.
running_rows = Window.orderBy('Item_Identifier').rowsBetween(
    Window.unboundedPreceding,
    Window.currentRow,
)
df.withColumn('dum', sum('Item_MRP').over(running_rows)).display()


## CUMULATIVE SUM

In [0]:
# Cumulative sum ordered by Item_Type with NO explicit frame. With only an
# ORDER BY, Spark's default frame is range-based (unbounded preceding to
# current row), so all rows sharing an Item_Type receive the same total.
item_type_window = Window.orderBy('Item_Type')
df.withColumn('cumsum', sum('Item_MRP').over(item_type_window)).display()


In [0]:
# Cumulative sum ordered by Item_Type with an explicit row-based frame:
# the total grows row by row, even among rows with the same Item_Type.
rows_so_far = Window.orderBy('Item_Type').rowsBetween(
    Window.unboundedPreceding,
    Window.currentRow,
)
df.withColumn('cumsum', sum('Item_MRP').over(rows_so_far)).display()


In [0]:
# Grand total of Item_MRP repeated on every row: the frame spans the entire
# window, from the first row to the last.
all_rows = Window.orderBy('Item_Type').rowsBetween(
    Window.unboundedPreceding,
    Window.unboundedFollowing,
)
df.withColumn('totalsum', sum('Item_MRP').over(all_rows)).display()


## USER DEFINED FUNCTIONS (UDF)


In [0]:
def my_func(x):
    """Return the square of ``x``.

    ``None`` is passed through unchanged. When this function is wrapped as a
    Spark UDF, SQL NULLs arrive as ``None``; squaring them would raise a
    ``TypeError`` and fail the whole job, so they are returned as ``None``
    (NULL) instead.

    Args:
        x: A number, or ``None``.

    Returns:
        ``x * x``, or ``None`` when ``x`` is ``None``.
    """
    if x is None:
        return None
    return x * x
     
# Wrap my_func as a Spark UDF with an explicit numeric return type.
# Without a return type, udf() defaults to StringType and the squared value
# would come back as a string column.
my_udf = udf(my_func, 'double')

# Cast the input to double so the Python value handed to the UDF is a float
# and the result matches the declared 'double' return type.
df.withColumn('mynewcol', my_udf(col('Item_MRP').cast('double'))).display()

## DATA WRITING


In [0]:

# Write `df` as CSV with the default save mode: the write fails if the
# target path already exists.
csv_writer = df.write.format('csv')
csv_writer.save('/FileStore/tables/CSV/data.csv')

In [0]:

# Save mode 'append': add the rows of `df` to whatever is already stored at
# the target path.
(
    df.write
    .format('csv')
    .mode('append')
    .option('path', '/FileStore/tables/CSV/data.csv')
    .save()
)

In [0]:

# Save mode 'overwrite': replace any existing data at the target path with
# the contents of `df`.
overwrite_writer = df.write.format('csv').mode('overwrite')
overwrite_writer.option('path', '/FileStore/tables/CSV/data.csv').save()

In [0]:

# Save mode 'error': refuse to write when the target path already exists.
# The earlier cells in this section already wrote to this path, so the write
# is expected to be rejected. The exception is caught and shown so the
# demonstration does not stop a "Run All" of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    (
        df.write
        .format('csv')
        .mode('error')
        .option('path', '/FileStore/tables/CSV/data.csv')
        .save()
    )
except AnalysisException as exc:
    print(f"Write rejected by save mode 'error': {exc}")

In [0]:

# Save mode 'ignore': if data already exists at the target path, skip the
# write silently and leave the existing data untouched.
ignore_writer = (
    df.write
    .format('csv')
    .mode('ignore')
)
ignore_writer.option('path', '/FileStore/tables/CSV/data.csv').save()

## PARQUET

In [0]:

# Write `df` in Parquet format, replacing whatever is at the target path.
# NOTE(review): the target is the same '/FileStore/tables/CSV/data.csv' path
# used by the CSV cells, so this replaces the CSV output with Parquet files
# under a '.csv' name -- looks like a copy-paste; confirm the intended path.
parquet_writer = df.write.format('parquet').mode('overwrite')
parquet_writer.option('path', '/FileStore/tables/CSV/data.csv').save()

In [0]:

# Save `df` as the table 'my_table' stored as Parquet, replacing the table
# if it already exists.
table_writer = df.write.format('parquet').mode('overwrite')
table_writer.saveAsTable('my_table')

# Show the DataFrame that was just written.
df.display()

### SPARK SQL

In [0]:
# Register `df` as the session-scoped temporary view 'my_view' so it can be
# queried with SQL. createOrReplaceTempView is used instead of createTempView
# so that re-running this cell in the same session does not fail because the
# view already exists.
df.createOrReplaceTempView('my_view')






In [0]:
%sql
-- Return every column of my_view for rows whose fat content is exactly 'Lf'.
-- NOTE(review): this is an exact string match; confirm 'Lf' is how the value
-- is spelled in the data behind my_view.
SELECT *
FROM my_view
WHERE Item_Fat_Content = 'Lf'


In [0]:
# Run the same filter through the Python SQL API; the result is a regular
# DataFrame that can be used like any other.
# NOTE(review): exact string match on 'Lf' -- confirm the value's spelling
# in the data behind my_view.
low_fat_query = "select * from my_view where Item_Fat_Content = 'Lf'"
df_sql = spark.sql(low_fat_query)
df_sql.display()