# Objective

To test DQX's data quality checks, we deliberately corrupt the Netflix dataset with "trash" data and use the resulting low-quality copy to probe the limits of what DQX can detect.

In [0]:
# Load the original, unmodified Netflix CSV: the first row is treated as the
# header and Spark is asked to infer the column types.
raw_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/default/files/DQX_demo_data/netflix1.csv")
)

In [0]:
from pyspark.sql.functions import col, rand, when, lit, concat, udf
from pyspark.sql.types import StringType
import random

# --- Configuration ---
# Input CSV and output directory for the deliberately corrupted copy.
SOURCE_PATH = "/Volumes/workspace/default/files/DQX_demo_data/netflix1.csv"
OUTPUT_PATH = "/Volumes/workspace/default/files/DQX_demo_data/netflix_low_quality"

# Base seed for every random draw below. Seeding makes the generated "dirty"
# dataset repeatable between runs (assuming Spark reads the file with the same
# partitioning), so DQX results can be compared run to run.
SEED = 42

# Fraction of rows re-appended as duplicates.
DUPLICATE_FRACTION = 0.05


def corrupt_column(frame, column_name, p_first, first_value, p_second, second_value, seed):
    """Return `frame` with `column_name` randomly overwritten by bad values.

    For each row, `first_value` is used when a first random draw is below
    `p_first`; otherwise `second_value` is used when a second, independent
    draw is below `p_second`; otherwise the original value is kept.

    Args:
        frame: DataFrame to corrupt.
        column_name: Name of the column to overwrite.
        p_first: Threshold for the first corruption.
        first_value: Column expression written by the first corruption.
        p_second: Threshold for the second corruption (evaluated only on rows
            the first corruption did not touch).
        second_value: Column expression written by the second corruption.
        seed: Integer seed; `seed` and `seed + 1` are used for the two draws.

    Returns:
        A new DataFrame with the same columns as `frame`.
    """
    # The two draws must use different seeds: with the same seed both rand()
    # expressions would yield the same value per row, making the second branch
    # unreachable whenever p_second <= p_first.
    noisy_value = (
        when(rand(seed) < p_first, first_value)
        .when(rand(seed + 1) < p_second, second_value)
        .otherwise(col(column_name))
    )
    return frame.withColumn(column_name, noisy_value)


# One rule per column: (column, p_first, first_value, p_second, second_value).
CORRUPTION_RULES = [
    # 1. show_id: malformed IDs or missing
    ("show_id", 0.05, concat(lit("xx"), col("show_id")), 0.03, lit(None)),
    # 2. type: lowercase / wrong values
    ("type", 0.05, lit("movie"), 0.02, lit("Documentary")),
    # 3. title: nulls or typos
    ("title", 0.05, lit(None), 0.05, concat(col("title"), lit("@@@"))),
    # 4. director: missing or trailing digits
    ("director", 0.05, lit(None), 0.03, concat(col("director"), lit("123"))),
    # 5. country: inconsistent spelling or missing
    ("country", 0.05, lit("U.S."), 0.03, lit(None)),
    # 6. date_added: impossible date or missing
    ("date_added", 0.05, lit("2021-13-40"), 0.03, lit(None)),
    # 7. rating: invalid categories
    ("rating", 0.05, lit("PG-99"), 0.03, lit("tv-ma")),
    # 8. duration: wrong formats
    ("duration", 0.05, lit("-100 min"), 0.03, lit("90 minutes")),
    # 9. listed_in: dirty categories or missing
    ("listed_in", 0.05, lit("Dramas$$$"), 0.03, lit(None)),
]

# Load dataset; inferSchema=False keeps every column as a string so the
# corrupted string values can be written into any column.
df = spark.read.csv(SOURCE_PATH, header=True, inferSchema=False)

# Apply the rules in order, giving each rule its own pair of seeds so the
# corruption of one column is not tied to the corruption of another.
for rule_index, (column_name, p_first, first_value, p_second, second_value) in enumerate(CORRUPTION_RULES):
    df = corrupt_column(
        df,
        column_name,
        p_first,
        first_value,
        p_second,
        second_value,
        seed=SEED + 10 * rule_index,
    )

# --- Add duplicates ---
# Re-append a seeded sample of the corrupted rows to create duplicate records.
duplicates = df.sample(
    withReplacement=True,
    fraction=DUPLICATE_FRACTION,
    seed=SEED + 10 * len(CORRUPTION_RULES),
)
df_dirty = df.union(duplicates)
display(df_dirty)

# coalesce(1) collapses the data to a single partition before writing;
# any existing output at OUTPUT_PATH is overwritten.
df_dirty.coalesce(1).write.mode("overwrite").option("header", True).csv(OUTPUT_PATH)