### Create Sample Dataframe

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Schema of the sample product table: each (column name, Spark type) pair
# becomes a nullable StructField, in the order listed.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('ProductID', IntegerType()),
        ('ProductName', StringType()),
        ('ProductCategory', StringType()),
        ('ProductPrice', DoubleType()),
        ('ProductQuantity', IntegerType()),
    )
])

# Sample rows, one tuple per product, in schema column order:
# (ProductID, ProductName, ProductCategory, ProductPrice, ProductQuantity)
data = [
    (1, 'Laptop', 'Electronics', 999.99, 50),
    (2, 'Smartphone', 'Electronics', 699.99, 100),
    (3, 'Headphones', 'Electronics', 49.99, 200),
    (4, 'Book', 'Books', 19.99, 300),
    (5, 'Tablet', 'Electronics', 299.99, 75),
]

# Build the DataFrame from the rows using the explicit schema above
productDF = spark.createDataFrame(data, schema)

# Render the DataFrame in the notebook
display(productDF)

### Old Approach: Register a Temp View, Then Query It

In [0]:
# Expose productDF to SQL by registering it as the temporary view "v_product"
productDF.createOrReplaceTempView("v_product")


In [0]:
%sql
-- Return every column and row of the v_product view
select * from v_product

### New Approach: Pass the DataFrame Directly to `spark.sql`

### Query Dataframe using Spark SQL

In [0]:
# Query the DataFrame directly: the {table} placeholder in the SQL text is
# filled from the `table` keyword argument, so no explicit temp view is
# registered in this cell.
sqlDF = spark.sql(
    "SELECT * FROM {table}",
    table=productDF,
)
display(sqlDF)

In [0]:
# Placeholders may also stand for columns: {column} is filled from a column
# reference taken from productDF, and {table} from the DataFrame itself.
sqlDF = spark.sql(
    'select {column} from {table}',
    column=productDF['ProductName'],
    table=productDF,
)
display(sqlDF)

### Transform Dataframe using Spark SQL

In [0]:
# Derive new columns with Spark SQL, reading straight from productDF through
# the {table} placeholder:
#   - ProductNameCategory: ProductName and ProductCategory concatenated
#   - Total_Cost:          ProductPrice * ProductQuantity
# The concatenated column is aliased explicitly so the result has the same
# column names as the PySpark version of this transform (ProductID,
# ProductNameCategory, Total_Cost) instead of an auto-generated name derived
# from the expression text.
transformDF = spark.sql(
    '''
    select ProductID,
           concat(ProductName, ProductCategory) as ProductNameCategory,
           ProductPrice * ProductQuantity as Total_Cost
    from {table}
    ''',
    table=productDF,
)

# Keep the original (misspelled) variable name bound so any cell that still
# refers to it keeps working.
trasnformDF = transformDF

display(transformDF)

### Transform Dataframe using PySpark

In [0]:
from pyspark.sql.functions import col, concat, expr

# Same transform expressed with the DataFrame API: keep ProductID, concatenate
# name and category into ProductNameCategory, and compute Total_Cost from a
# SQL expression string.
transformDFNew = productDF.select(
    col('ProductID'),
    concat(col('ProductName'), col('ProductCategory')).alias('ProductNameCategory'),
    expr('ProductPrice * ProductQuantity').alias('Total_Cost'),
)
display(transformDFNew)

### Join Dataframes

### Create Product and Sales Dataframes

In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import expr

# Product catalogue: one Row per product, built from
# (Product_Id, Product_Name, Product_Price) tuples
product_Data = [
    Row(Product_Id=product_id, Product_Name=product_name, Product_Price=product_price)
    for product_id, product_name, product_price in (
        (1, 'Laptop', 800),
        (2, 'Smartphone', 500),
        (3, 'Tablet', 300),
        (4, 'Desktop', 1000),
        (5, 'Printer', 200),
    )
]

# Sales records: one Row per sale, built from (Sale_Id, Product_Id, Quantity)
# tuples; Product_Id refers to a product in the catalogue above
sales_Data = [
    Row(Sale_Id=sale_id, Product_Id=product_id, Quantity=quantity)
    for sale_id, product_id, quantity in (
        (101, 1, 5),
        (102, 2, 8),
        (103, 1, 3),
        (104, 3, 6),
        (105, 4, 2),
        (106, 1, 7),
    )
]

# Turn both Row lists into DataFrames (no explicit schema is passed here)
product_df = spark.createDataFrame(product_Data)
sales_df = spark.createDataFrame(sales_Data)

# Render both DataFrames in the notebook
product_df.display()
sales_df.display()

### Join Dataframes using Spark SQL

In [0]:
# Join products (alias a) to sales (alias b) on the shared key. Both
# DataFrames and the join column are supplied through placeholders; the same
# {joiningKey} value is used on both sides of the ON condition.
# NOTE: `select *` returns the columns of both inputs, so Product_Id appears
# twice in the result.
joinDF = spark.sql(
    'select * from {table1} a join {table2} b on a.{joiningKey} = b.{joiningKey}',
    table1=product_df,
    table2=sales_df,
    joiningKey=product_df['Product_Id'],
)
joinDF.display()