### Create Sample DataFrame

In [0]:
from pyspark.sql.functions import rand

# Product-dimension sample rows; each tuple follows the order of `column_names` below.
data = [
    (1, "ProductA", "Category1", "BrandX", "Supplier1", 100, 10.99, "2023-01-01", "2023-01-10", True),
    (2, "ProductB", "Category2", "BrandY", "Supplier2", 50, 15.49, "2023-01-02", "2023-01-12", True),
    (3, "ProductC", "Category1", "BrandX", "Supplier1", 75, 8.99, "2023-01-03", "2023-01-14", False),
    (4, "ProductD", "Category3", "BrandZ", "Supplier3", 200, 25.99, "2023-01-04", "2023-01-16", True),
    (5, "ProductE", "Category2", "BrandY", "Supplier2", 60, 12.99, "2023-01-05", "2023-01-18", True),
]

# Column labels, listed in the same order as the tuple fields above.
column_names = [
    "Product_Id",
    "Product_Name",
    "Category",
    "Brand",
    "Supplier",
    "Quantity",
    "Price",
    "Start_Date",
    "End_Date",
    "Active",
]

# Build the DataFrame from the rows and labels. No explicit types are given here;
# note that the two date fields are passed in as plain strings.
df = spark.createDataFrame(data, schema=column_names)

# Render the DataFrame in the notebook output.
df.display()

### Register the DataFrame as a Temp View for SQL Operations

In [0]:
# Expose `df` to the SQL cells under the view name "Product".
df.createOrReplaceTempView(name="Product")

### Traditional Approach

In [0]:
%sql
-- Explicit alternative that spells out every column:
-- SELECT Product_Id, Product_Name, Category, Brand, Supplier, Quantity, Price, Start_Date, End_Date, Active FROM Product;

SELECT *
FROM Product



### Project Selective Columns

In [0]:
%sql
-- Project only the wanted columns by listing each one (Category and Active are left out).
SELECT
  Product_Id,
  Product_Name,
  Brand,
  Supplier,
  Quantity,
  Price,
  Start_Date,
  End_Date
FROM Product;

### Best Approach using Except

### Exclude Columns using Except

In [0]:
%sql
-- Keep every column of Product apart from Category and Active.
SELECT * EXCEPT (Category, Active)
FROM Product;

### Using Except with a Join

In [0]:
# Rows and column labels for the product dimension table used in the join example.
# NOTE(review): this rebinds `data`, which the first sample-data cell also assigns.
data = [
    (1, "ProductA", "Category1", "BrandX"),
    (2, "ProductB", "Category2", "BrandY"),
    (3, "ProductC", "Category1", "BrandX"),
    (4, "ProductD", "Category3", "BrandZ"),
    (5, "ProductE", "Category2", "BrandY"),
]
product_columns = ["ProductID", "ProductName", "Category", "Brand"]

# Rows and column labels for the customer table; the third field is the
# purchased product's id (see `Purchase_ProductID` below).
customer_data = [
    (101, "Alice", 1, "2023-01-05"),
    (102, "Bob", 2, "2023-01-08"),
    (103, "Charlie", 1, "2023-01-12"),
    (104, "David", 3, "2023-01-15"),
    (105, "Eve", 4, "2023-01-20"),
]
customer_columns = ["CustomerID", "CustomerName", "Purchase_ProductID", "PurchaseDate"]

# Build one DataFrame per table.
product_df = spark.createDataFrame(data, schema=product_columns)
customer_df = spark.createDataFrame(customer_data, schema=customer_columns)

# Render both DataFrames.
product_df.display()
customer_df.display()

# Register the views queried by the SQL join.
# NOTE(review): "Product" is the same view name registered from `df` earlier in the
# notebook, so this call replaces that view with the 4-column product table.
product_df.createOrReplaceTempView(name="Product")
customer_df.createOrReplaceTempView(name="Customer")

In [0]:
%sql
-- Join each customer purchase to its product; Brand is excluded from the
-- product side and CustomerID from the customer side.
SELECT
  P.* EXCEPT (Brand),
  C.* EXCEPT (CustomerID)
FROM Product AS P
INNER JOIN Customer AS C
  ON C.Purchase_ProductID = P.ProductID;

### In PySpark

In [0]:
# PySpark counterpart of `SELECT * EXCEPT (Category, Active)`: DataFrame.drop returns a
# new DataFrame without the named columns, keeping the remaining columns in their
# original order. It replaces a hand-written filter over `df.columns` whose loop
# variable was named `col`, the same name as the widely used pyspark.sql.functions.col.
display(df.drop("Category", "Active"))