### Objective

In this notebook, learners will practice data cleaning techniques using PySpark. They will:

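- Build a DataFrame from synthetic e-commerce event data (standing in for a loaded CSV file)  
- Standardize text fields by trimming whitespace and converting to lowercase
- Replace placeholder strings with nulls and fill missing values with defaults
- Convert columns to appropriate data types  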

These steps are foundational for preparing real-world data for analysis or machine learning.


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType, DoubleType
import random
from datetime import datetime, timedelta
import uuid
from pyspark.sql.functions import when, lower, trim, col


# Start the Spark session used by every cell below; getOrCreate() reuses a
# session that is already active in this kernel instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("DataCleaning")
    .getOrCreate()
)

In [None]:
# Define a helper function to generate random dates
def generate_random_dates(start_date, end_date, n):
    """Return ``n`` random timestamps that lie between two bounds (inclusive).

    Args:
        start_date: Lower bound as a string in ``"%Y-%m-%d %H:%M:%S"`` format.
        end_date: Upper bound as a string in the same format.
        n: Number of timestamps to generate.

    Returns:
        A list of ``n`` strings in ``"%Y-%m-%d %H:%M:%S"`` format, each one
        no earlier than ``start_date`` and no later than ``end_date``.

    Raises:
        ValueError: If either bound does not match the expected format, or if
            ``end_date`` is earlier than ``start_date``.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    start_timestamp = datetime.strptime(start_date, timestamp_format)
    end_timestamp = datetime.strptime(end_date, timestamp_format)
    if end_timestamp < start_timestamp:
        raise ValueError("end_date must not be earlier than start_date")

    # Draw a single offset over the whole span, in seconds. Drawing a day
    # offset and a seconds-within-day offset separately can overshoot
    # end_date (last day + up to 86400 s) and ignores spans shorter than a day.
    span_seconds = int((end_timestamp - start_timestamp).total_seconds())

    return [
        (start_timestamp + timedelta(seconds=random.randint(0, span_seconds)))
        .strftime(timestamp_format)
        for _ in range(n)
    ]

# Allowed values to sample from for each categorical column.
# These are kept under their own names so that the per-row samples generated
# below do not overwrite them; otherwise re-running this cell would draw from
# the previous sample instead of from the allowed values.
products = [1003461, 5000088, 17302664, 3601530, 1004775, 1306894, 1306421, 1590604, 12708937, 1004258]
categories = ["electronics.smartphone", "appliances.sewing_machine", "appliances.kitchen.washer", "computers.notebook", "furniture.living_room.sofa"]
brand_choices = ["xiaomi", "janome", "creed", "lg", "hp", "rondell", "michelin", "apple", "samsung", "huawei"]
event_type_choices = ["view", "click", "purchase"]

# Generate sample data
n = 1000  # number of rows
start_date = "2019-11-01 00:00:00"
end_date = "2019-11-30 23:59:59"

# Seed the generator so a fresh "Restart & Run All" produces the same rows.
# Note: uuid.uuid4() does not use this generator, so user_session still varies.
SEED = 42
random.seed(SEED)

# Generate random data, one list of n values per column
dates = generate_random_dates(start_date, end_date, n)
event_types = [random.choice(event_type_choices) for _ in range(n)]
product_ids = [random.choice(products) for _ in range(n)]
category_codes = [random.choice(categories) for _ in range(n)]
brands = [random.choice(brand_choices) for _ in range(n)]
prices = [round(random.uniform(20.0, 1000.0), 2) for _ in range(n)]  # random prices between 20 and 1000
user_ids = [random.randint(100000000, 999999999) for _ in range(n)]
user_sessions = [str(uuid.uuid4()) for _ in range(n)]  # generate random UUIDs

# Create the DataFrame: zip the column lists into row tuples; Spark infers
# the column types from the Python values.
data = list(zip(dates, event_types, product_ids, category_codes, brands, prices, user_ids, user_sessions))
columns = ["event_time", "event_type", "product_id", "category_code", "brand", "price", "user_id", "user_session"]

df = spark.createDataFrame(data, columns)

# Show a sample of the data
df.show(5)

+-------------------+----------+----------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+--------------------+--------+------+---------+--------------------+
|2019-11-12 23:25:59|      view|   1004258|furniture.living_...| samsung|271.43|643516476|248723df-6dec-4bf...|
|2019-11-05 01:18:10|  purchase|  12708937|  computers.notebook|michelin|315.69|962838478|a8f877c7-ce3a-41d...|
|2019-11-17 18:39:50|  purchase|   1003461|appliances.kitche...|   apple|954.49|394739190|08a5bbaa-6654-47a...|
|2019-11-18 11:05:51|     click|   3601530|  computers.notebook|michelin|301.05|293348677|3bbc9541-7227-4be...|
|2019-11-01 20:47:02|  purchase|   1590604|electronics.smart...|  janome|914.37|400669603|60810324-9779-4e4...|
+-------------------+----------+----------+--------------------+--------+------+---------+--------------------+

In [None]:
# Normalize the free-text columns: strip surrounding whitespace, then lowercase.
# Every other column is passed through unchanged, in its original position.
text_columns = ("event_type", "category_code", "brand")
df_cleaned = df.select(*[
    lower(trim(col(name))).alias(name) if name in text_columns else col(name)
    for name in df.columns
])

df_cleaned.select(*text_columns).show(5)

+----------+--------------------+--------+
|event_type|       category_code|   brand|
+----------+--------------------+--------+
|      view|furniture.living_...| samsung|
|  purchase|  computers.notebook|michelin|
|  purchase|appliances.kitche...|   apple|
|     click|  computers.notebook|michelin|
|  purchase|electronics.smart...|  janome|
+----------+--------------------+--------+
only showing top 5 rows



In [None]:
# Replace placeholder strings ("n/a", "unknown", empty string) with null.
# The placeholders are listed in lowercase and compared against a trimmed,
# lowercased copy of the value: these columns were lowercased earlier, so a
# check against the literal "N/A" could never match.
placeholder_values = ["n/a", "unknown", ""]
for col_name in ["category_code", "brand"]:
    df_cleaned = df_cleaned.withColumn(
        col_name,
        when(
            lower(trim(col(col_name))).isin(*placeholder_values), None
        ).otherwise(col(col_name))
    )

# Fill missing values with defaults. After this step the three columns
# contain no nulls; a missing price is recorded as 0.0.
df_filled = df_cleaned.fillna({
    "category_code": "unspecified",
    "brand": "unbranded",
    "price": 0.0
})

df_filled.select("category_code", "brand", "price").show(5)

+--------------------+--------+------+
|       category_code|   brand| price|
+--------------------+--------+------+
|furniture.living_...| samsung|271.43|
|  computers.notebook|michelin|315.69|
|appliances.kitche...|   apple|954.49|
|  computers.notebook|michelin|301.05|
|electronics.smart...|  janome|914.37|
+--------------------+--------+------+
only showing top 5 rows



In [None]:
# Cast 'event_time' from string to timestamp and make sure 'price' is a double.
# The target types are given by their Spark SQL type names.
df_typed = (
    df_filled
    .withColumn("event_time", col("event_time").cast("timestamp"))
    .withColumn("price", col("price").cast("double"))
)

# Confirm the new column types, then preview the converted columns
df_typed.printSchema()
df_typed.select("event_time", "price").show(5)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: long (nullable = true)
 |-- category_code: string (nullable = false)
 |-- brand: string (nullable = false)
 |-- price: double (nullable = false)
 |-- user_id: long (nullable = true)
 |-- user_session: string (nullable = true)

+-------------------+------+
|         event_time| price|
+-------------------+------+
|2019-11-12 23:25:59|271.43|
|2019-11-05 01:18:10|315.69|
|2019-11-17 18:39:50|954.49|
|2019-11-18 11:05:51|301.05|
|2019-11-01 20:47:02|914.37|
+-------------------+------+
only showing top 5 rows



### Summary

In this notebook, we learned how to:

- Build a PySpark DataFrame from synthetic event data (standing in for raw CSV data)  
- Standardize and clean string fields using `trim()` and `lower()`  
- Replace placeholder strings with nulls, and fill in default values  
- Convert the string column `event_time` to a proper timestamp and cast `price` to double