In [16]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **A. Setup Environment**

In [17]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import*
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import PCA
from pyspark.sql.functions import min as spark_min, max as spark_max
from pyspark.sql.functions import explode,  udf,  when, col, sum as spark_sum
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

In [18]:
# Settings passed to the session builder, in the order they are applied:
#   executor memory 8 GB, 4 cores per executor, driver memory 4 GB,
#   200 shuffle partitions, and the Kryo serializer.
spark_settings = {
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    "spark.driver.memory": "4g",
    "spark.sql.shuffle.partitions": "200",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

# Build (or reuse) the Spark session with every setting applied to the builder
builder = SparkSession.builder.appName("CustomerSegmentationOptimized")
for setting_name, setting_value in spark_settings.items():
    builder = builder.config(setting_name, setting_value)
spark = builder.getOrCreate()

# Read the settings back from the session; expected values are
# '8g', '4', '4g' and '200' for the four keys below.
for setting_name in (
    "spark.executor.memory",
    "spark.executor.cores",
    "spark.driver.memory",
    "spark.sql.shuffle.partitions",
):
    spark.conf.get(setting_name)

# Final expression of the cell, so this is the value the notebook displays
# (expected: 'org.apache.spark.serializer.KryoSerializer')
spark.conf.get("spark.serializer")

'org.apache.spark.serializer.KryoSerializer'

# **B. Data Ingestion**

In [19]:
# Location of the raw event dataset on the mounted Drive
file_path = "/content/drive/MyDrive/Course/Information system management/PHVN_DATest_Dataset.json"

# Load the JSON file into a Spark DataFrame (schema is inferred by the reader)
json_reader = spark.read.format("json")
df = json_reader.load(file_path)

# Preview the first five rows
df.show(5)

+----------+-------------------+-------------+----+--------------------+---------------+----------+--------+
|AppVersion|      EventDateTime|    EventName|Item|            Location|MobileBrandName| SessionID|  Source|
+----------+-------------------+-------------+----+--------------------+---------------+----------+--------+
|     2.1.7|28-08-2023 15:02:42|session_start|  []|{Bien Hoa, Vietna...|          Apple|1693234962|(direct)|
|     2.1.7|28-08-2023 06:35:51|    view_cart|  []|{Buon Ma Thuot, V...|          Apple|1693204441|(direct)|
|     2.1.8|28-08-2023 14:38:38|session_start|  []|{Ho Chi Minh City...|          Apple|1693233518|(direct)|
|     2.1.7|28-08-2023 06:29:58|session_start|  []|{Ho Chi Minh City...|          Apple|1693204198|(direct)|
|     2.1.7|28-08-2023 13:17:23|session_start|  []|{Ho Chi Minh City...|          Apple|1693228643|(direct)|
+----------+-------------------+-------------+----+--------------------+---------------+----------+--------+
only showing top 5 

In [20]:
# Print the column names, types and nullability of the loaded DataFrame
df.printSchema()

root
 |-- AppVersion: string (nullable = true)
 |-- EventDateTime: string (nullable = true)
 |-- EventName: string (nullable = true)
 |-- Item: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ItemCategory: string (nullable = true)
 |    |    |-- ItemName: string (nullable = true)
 |-- Location: struct (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- Country: string (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- MobileBrandName: string (nullable = true)
 |-- SessionID: string (nullable = true)
 |-- Source: string (nullable = true)



In [6]:
# Report the dataset dimensions as (row count, column count)
num_rows, num_cols = df.count(), len(df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (443711, 8)


# **C. Data Preprocessing**

### **Step 1:** Parse `EventDateTime`

In [7]:
from pyspark.sql.functions import to_timestamp

# Pattern used to parse the EventDateTime strings: day-month-year, then time
event_time_format = "dd-MM-yyyy HH:mm:ss"

# Replace the string column with its parsed timestamp equivalent
parsed_event_time = to_timestamp(col("EventDateTime"), event_time_format)
df = df.withColumn("EventDateTime", parsed_event_time)
df.show()

+----------+-------------------+-------------+----+--------------------+---------------+----------+--------+
|AppVersion|      EventDateTime|    EventName|Item|            Location|MobileBrandName| SessionID|  Source|
+----------+-------------------+-------------+----+--------------------+---------------+----------+--------+
|     2.1.7|2023-08-28 15:02:42|session_start|  []|{Bien Hoa, Vietna...|          Apple|1693234962|(direct)|
|     2.1.7|2023-08-28 06:35:51|    view_cart|  []|{Buon Ma Thuot, V...|          Apple|1693204441|(direct)|
|     2.1.8|2023-08-28 14:38:38|session_start|  []|{Ho Chi Minh City...|          Apple|1693233518|(direct)|
|     2.1.7|2023-08-28 06:29:58|session_start|  []|{Ho Chi Minh City...|          Apple|1693204198|(direct)|
|     2.1.7|2023-08-28 13:17:23|session_start|  []|{Ho Chi Minh City...|          Apple|1693228643|(direct)|
|     2.1.7|2023-08-28 07:16:56|session_start|  []|{, Vietnam, Kien ...|          Apple|1693207016|(direct)|
|     2.1.7|2023-08

### **Step 2:** Normalize `Location`

In [8]:
# Promote each field of the nested Location struct to its own top-level column
location_fields = ["City", "Country", "Region"]
flattened_location = [f"Location.{field} as {field}" for field in location_fields]

df = df.selectExpr(
    "AppVersion", "EventDateTime", "EventName", "Item",
    *flattened_location,
    "MobileBrandName", "SessionID", "Source",
)
df.show(5)

+----------+-------------------+-------------+----+----------------+-------+----------------+---------------+----------+--------+
|AppVersion|      EventDateTime|    EventName|Item|            City|Country|          Region|MobileBrandName| SessionID|  Source|
+----------+-------------------+-------------+----+----------------+-------+----------------+---------------+----------+--------+
|     2.1.7|2023-08-28 15:02:42|session_start|  []|        Bien Hoa|Vietnam|        Dong Nai|          Apple|1693234962|(direct)|
|     2.1.7|2023-08-28 06:35:51|    view_cart|  []|   Buon Ma Thuot|Vietnam|         Dak Lak|          Apple|1693204441|(direct)|
|     2.1.8|2023-08-28 14:38:38|session_start|  []|Ho Chi Minh City|Vietnam|Ho Chi Minh City|          Apple|1693233518|(direct)|
|     2.1.7|2023-08-28 06:29:58|session_start|  []|Ho Chi Minh City|Vietnam|Ho Chi Minh City|          Apple|1693204198|(direct)|
|     2.1.7|2023-08-28 13:17:23|session_start|  []|Ho Chi Minh City|Vietnam|Ho Chi Minh Ci

### **Step 3:** Filter `Country` == "Vietnam"

In [9]:
# Keep only the events whose Country is exactly "Vietnam"
is_vietnam = col("Country") == "Vietnam"
df = df.filter(is_vietnam)
df.show(5)

+----------+-------------------+-------------+----+----------------+-------+----------------+---------------+----------+--------+
|AppVersion|      EventDateTime|    EventName|Item|            City|Country|          Region|MobileBrandName| SessionID|  Source|
+----------+-------------------+-------------+----+----------------+-------+----------------+---------------+----------+--------+
|     2.1.7|2023-08-28 15:02:42|session_start|  []|        Bien Hoa|Vietnam|        Dong Nai|          Apple|1693234962|(direct)|
|     2.1.7|2023-08-28 06:35:51|    view_cart|  []|   Buon Ma Thuot|Vietnam|         Dak Lak|          Apple|1693204441|(direct)|
|     2.1.8|2023-08-28 14:38:38|session_start|  []|Ho Chi Minh City|Vietnam|Ho Chi Minh City|          Apple|1693233518|(direct)|
|     2.1.7|2023-08-28 06:29:58|session_start|  []|Ho Chi Minh City|Vietnam|Ho Chi Minh City|          Apple|1693204198|(direct)|
|     2.1.7|2023-08-28 13:17:23|session_start|  []|Ho Chi Minh City|Vietnam|Ho Chi Minh Ci

In [10]:
# Discard rows with an empty Region, then remove the Country column
has_region = col("Region") != ""
df = df.filter(has_region).drop("Country")

In [11]:
# Dimensions after the Country/Region filtering: (row count, column count)
num_rows, num_cols = df.count(), len(df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (440560, 9)


### **Step 4:** Check Missing Values

In [12]:
# Count missing values (nulls) for each column.
# `spark_sum` is the alias for pyspark.sql.functions.sum bound in the imports
# cell; using it avoids a mid-notebook import that rebinds Python's built-in
# `sum` to the Spark aggregate.
null_counts = [
    spark_sum(col(c).isNull().cast("int")).alias(c + "_missing")
    for c in df.columns
]
missing_values = df.select(null_counts)

# Show the result (a single row with one "<column>_missing" count per column)
missing_values.show()

+------------------+---------------------+-----------------+------------+------------+--------------+-----------------------+-----------------+--------------+
|AppVersion_missing|EventDateTime_missing|EventName_missing|Item_missing|City_missing|Region_missing|MobileBrandName_missing|SessionID_missing|Source_missing|
+------------------+---------------------+-----------------+------------+------------+--------------+-----------------------+-----------------+--------------+
|                 0|                    0|                0|           0|           0|             0|                   6634|                0|             4|
+------------------+---------------------+-----------------+------------+------------+--------------+-----------------------+-----------------+--------------+



In [13]:
# Remove rows that have a null in either of these columns
required_columns = ["MobileBrandName", "Source"]
df = df.dropna(subset=required_columns)

In [14]:
# Dimensions after dropping rows with nulls: (row count, column count)
num_rows, num_cols = df.count(), len(df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (433922, 9)


In [15]:
# Re-count the nulls in every column to confirm the cleanup
null_counts = []
for column_name in df.columns:
    is_null_flag = col(column_name).isNull().cast("int")
    null_counts.append(spark_sum(is_null_flag).alias(column_name + "_missing"))
missing_values = df.select(null_counts)

# Display the single-row summary
missing_values.show()

+------------------+---------------------+-----------------+------------+------------+--------------+-----------------------+-----------------+--------------+
|AppVersion_missing|EventDateTime_missing|EventName_missing|Item_missing|City_missing|Region_missing|MobileBrandName_missing|SessionID_missing|Source_missing|
+------------------+---------------------+-----------------+------------+------------+--------------+-----------------------+-----------------+--------------+
|                 0|                    0|                0|           0|           0|             0|                      0|                0|             0|
+------------------+---------------------+-----------------+------------+------------+--------------+-----------------------+-----------------+--------------+



### **Step 5:** Check Duplicates

In [21]:
# Group on every column so identical rows collapse into one group with a count
rows_per_group = df.groupBy(df.columns).count()

# Keep only the groups that occur more than once, and print how many there are
duplicates = rows_per_group.filter("count > 1")
print(duplicates.count())

19338


In [22]:
# Remove rows that are exact duplicates across all columns.
# NOTE(review): the saved outputs after this cell report 8 columns and a schema
# that still has the nested Location struct and a string EventDateTime, so in
# that run this step appears to have been applied to the freshly loaded frame
# rather than the preprocessed one - confirm by restarting and running all
# cells in order.
df = df.dropDuplicates()

In [23]:
# Dimensions after de-duplication: (row count, column count)
num_rows, num_cols = df.count(), len(df.columns)
print(f"Shape: ({num_rows}, {num_cols})")

Shape: (422685, 8)


In [24]:
# Print the schema of the de-duplicated DataFrame
df.printSchema()

root
 |-- AppVersion: string (nullable = true)
 |-- EventDateTime: string (nullable = true)
 |-- EventName: string (nullable = true)
 |-- Item: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ItemCategory: string (nullable = true)
 |    |    |-- ItemName: string (nullable = true)
 |-- Location: struct (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- Country: string (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- MobileBrandName: string (nullable = true)
 |-- SessionID: string (nullable = true)
 |-- Source: string (nullable = true)



In [25]:
# Preview the first five rows of the de-duplicated DataFrame
df.show(5)

+----------+-------------------+-------------+--------------------+--------------------+---------------+----------+--------------+
|AppVersion|      EventDateTime|    EventName|                Item|            Location|MobileBrandName| SessionID|        Source|
+----------+-------------------+-------------+--------------------+--------------------+---------------+----------+--------------+
|     2.1.7|28-08-2023 03:50:44|session_start|                  []|{Ho Chi Minh City...|          Apple|1693194644|      (direct)|
|     2.1.7|28-08-2023 06:35:46|    view_cart|[{combo, FREE SEC...|{Buon Ma Thuot, V...|          Apple|1693204441|      (direct)|
|     2.1.7|28-08-2023 18:00:37|session_start|                  []|{Hanoi, Vietnam, ...|          Apple|1693159237|           SMS|
|   Website|28-08-2023 23:54:39|    page_view|                  []|{Hanoi, Vietnam, ...|          Apple|1693180451|l.facebook.com|
|   Website|28-08-2023 13:49:07|session_start|                  []|{Ho Chi Minh Cit

In [26]:
# Define a function to extract item category
def get_item_category(item_list):
    """Return the ``ItemCategory`` of the first entry in ``item_list``.

    Args:
        item_list: A sequence of row-like objects exposing ``asDict()``,
            or ``None``. Individual entries may themselves be ``None``.

    Returns:
        The value stored under ``'ItemCategory'`` for the first entry, or
        ``None`` when the list is ``None``/empty, when the first entry is
        ``None``, or when the key is absent.
    """
    if not item_list:
        return None
    first_item = item_list[0]
    # Array elements are nullable, so the first slot can be None; calling
    # asDict() on it would raise AttributeError inside the UDF.
    if first_item is None:
        return None
    return first_item.asDict().get('ItemCategory')

# Define a UDF for item category
# Wraps get_item_category with udf(), declaring StringType as its return type.
get_item_category_udf = udf(get_item_category, StringType())

# Define a function to extract item name
def get_item_name(item_list):
    """Return the ``ItemName`` of the first entry in ``item_list``.

    Args:
        item_list: A sequence of row-like objects exposing ``asDict()``,
            or ``None``. Individual entries may themselves be ``None``.

    Returns:
        The value stored under ``'ItemName'`` for the first entry, or
        ``None`` when the list is ``None``/empty, when the first entry is
        ``None``, or when the key is absent.
    """
    if not item_list:
        return None
    first_item = item_list[0]
    # Array elements are nullable, so the first slot can be None; calling
    # asDict() on it would raise AttributeError inside the UDF.
    if first_item is None:
        return None
    return first_item.asDict().get('ItemName')

# Wrap the item-name extractor as a UDF with a string return type
get_item_name_udf = udf(get_item_name, StringType())

# Derive the two flat columns from 'Item', one step at a time,
# then discard the original nested 'Item' column
with_category = df.withColumn("ItemCategory", get_item_category_udf(col("Item")))
with_name = with_category.withColumn("ItemName", get_item_name_udf(col("Item")))
df = with_name.drop("Item")

# Display the transformed DataFrame
df.show()

+----------+-------------------+--------------+--------------------+---------------+----------+--------------+---------------+--------------------+
|AppVersion|      EventDateTime|     EventName|            Location|MobileBrandName| SessionID|        Source|   ItemCategory|            ItemName|
+----------+-------------------+--------------+--------------------+---------------+----------+--------------+---------------+--------------------+
|     2.1.7|28-08-2023 03:50:44| session_start|{Ho Chi Minh City...|          Apple|1693194644|      (direct)|           NULL|                NULL|
|     2.1.7|28-08-2023 06:35:46|     view_cart|{Buon Ma Thuot, V...|          Apple|1693204441|      (direct)|          combo|FREE SECOND PIZZA...|
|     2.1.7|28-08-2023 18:00:37| session_start|{Hanoi, Vietnam, ...|          Apple|1693159237|           SMS|           NULL|                NULL|
|   Website|28-08-2023 23:54:39|     page_view|{Hanoi, Vietnam, ...|          Apple|1693180451|l.facebook.com|  

In [27]:
from google.colab import files

# Materialise the Spark DataFrame as a pandas DataFrame
df_pd = df.toPandas()

# Write it to CSV without the pandas index column, then pass the file to
# Colab's files.download
output_csv = 'clean_data.csv'
df_pd.to_csv(output_csv, index=False)
files.download(output_csv)