In [0]:
## Step 1: Importing Libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA

In [0]:
## Step 2: Initializing Spark Session
spark = SparkSession.builder.appName("DimensionalityReduction").getOrCreate()

In [0]:
## Step 3: Loading the Dataset
file_path = "dbfs:/FileStore/shared_uploads/utkarshsatishkumar.shah@sjsu.edu/cinemaTicket_Ref.csv" 
df = spark.read.csv(file_path, header=True, inferSchema=True)
df.show()

+---------+-----------+-----------+------------+-----------+---------+---------+------------------+----------+------------------+----------+-----+-------+---+
|film_code|cinema_code|total_sales|tickets_sold|tickets_out|show_time|occu_perc|      ticket_price|ticket_use|          capacity|      date|month|quarter|day|
+---------+-----------+-----------+------------+-----------+---------+---------+------------------+----------+------------------+----------+-----+-------+---+
|     1492|        304|    3900000|          26|          0|        4|     4.26|          150000.0|        26| 610.3286384976526|2018-05-05|    5|      2|  5|
|     1492|        352|    3360000|          42|          0|        5|     8.08|           80000.0|        42| 519.8019801980198|2018-05-05|    5|      2|  5|
|     1492|        489|    2560000|          32|          0|        4|     20.0|           80000.0|        32|             160.0|2018-05-05|    5|      2|  5|
|     1492|        429|    1200000|          1

In [0]:
## Step 4: Inspecting the Data

# Print the schema of the dataframe
df.printSchema()

# Show a sample of the data
df.show(5)

root
 |-- film_code: integer (nullable = true)
 |-- cinema_code: integer (nullable = true)
 |-- total_sales: integer (nullable = true)
 |-- tickets_sold: integer (nullable = true)
 |-- tickets_out: integer (nullable = true)
 |-- show_time: integer (nullable = true)
 |-- occu_perc: double (nullable = true)
 |-- ticket_price: double (nullable = true)
 |-- ticket_use: integer (nullable = true)
 |-- capacity: double (nullable = true)
 |-- date: date (nullable = true)
 |-- month: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- day: integer (nullable = true)

+---------+-----------+-----------+------------+-----------+---------+---------+------------+----------+------------------+----------+-----+-------+---+
|film_code|cinema_code|total_sales|tickets_sold|tickets_out|show_time|occu_perc|ticket_price|ticket_use|          capacity|      date|month|quarter|day|
+---------+-----------+-----------+------------+-----------+---------+---------+------------+----------+-------

In [0]:
## Step 5: Selecting Features for PCA

assembler = VectorAssembler(inputCols=["total_sales", "tickets_sold", "ticket_price"], outputCol="features")
feature_df = assembler.transform(df)
feature_df.show(5)


+---------+-----------+-----------+------------+-----------+---------+---------+------------+----------+------------------+----------+-----+-------+---+--------------------+
|film_code|cinema_code|total_sales|tickets_sold|tickets_out|show_time|occu_perc|ticket_price|ticket_use|          capacity|      date|month|quarter|day|            features|
+---------+-----------+-----------+------------+-----------+---------+---------+------------+----------+------------------+----------+-----+-------+---+--------------------+
|     1492|        304|    3900000|          26|          0|        4|     4.26|    150000.0|        26| 610.3286384976526|2018-05-05|    5|      2|  5|[3900000.0,26.0,1...|
|     1492|        352|    3360000|          42|          0|        5|     8.08|     80000.0|        42| 519.8019801980198|2018-05-05|    5|      2|  5|[3360000.0,42.0,8...|
|     1492|        489|    2560000|          32|          0|        4|     20.0|     80000.0|        32|             160.0|2018-05

In [0]:
## Step 6: Applying PCA

pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(feature_df)
result = model.transform(feature_df)
result.show(5)


+---------+-----------+-----------+------------+-----------+---------+---------+------------+----------+------------------+----------+-----+-------+---+--------------------+--------------------+
|film_code|cinema_code|total_sales|tickets_sold|tickets_out|show_time|occu_perc|ticket_price|ticket_use|          capacity|      date|month|quarter|day|            features|         pcaFeatures|
+---------+-----------+-----------+------------+-----------+---------+---------+------------+----------+------------------+----------+-----+-------+---+--------------------+--------------------+
|     1492|        304|    3900000|          26|          0|        4|     4.26|    150000.0|        26| 610.3286384976526|2018-05-05|    5|      2|  5|[3900000.0,26.0,1...|[-3900044.1392700...|
|     1492|        352|    3360000|          42|          0|        5|     8.08|     80000.0|        42| 519.8019801980198|2018-05-05|    5|      2|  5|[3360000.0,42.0,8...|[-3360023.4852888...|
|     1492|        489|  

In [0]:
## Step 7: Interpreting PCA Results

# Explained variance by each principal component
explained_variance = model.explainedVariance
print("Explained Variance by each component: ", explained_variance)


Explained Variance by each component:  [0.9999989117140172,1.0882743638448862e-06,1.1619019848883753e-11]


In [0]:
## Step 8: Using PCA Results

# Selecting the first two principal components for visualization
pca_result = result.select("pcaFeatures")
pca_result.show(5)


+--------------------+
|         pcaFeatures|
+--------------------+
|[-3900044.1392700...|
|[-3360023.4852888...|
|[-2560023.5201366...|
|[-1200029.4872653...|
|[-1200023.5793779...|
+--------------------+
only showing top 5 rows



In [0]:
## Last Step : Cleanup
spark.stop()