# Data Cleansing

## Import necessary libraries

In [17]:
# Install the packages listed in ../requirements.txt (-qq suppresses most pip output)
%pip install -qq -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [18]:
# Point PySpark's interpreter settings at the interpreter running this kernel
import os
import sys

os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# The project root is the parent of the working directory; put it on sys.path
# (only once) so that modules under `src` can be imported.
project_root = os.path.split(os.getcwd())[0]
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [19]:
from pyspark.sql import SparkSession

## Loading Datasets

In [20]:
from src.utils import read_config_path

# Create the Spark session for this notebook (getOrCreate reuses an existing one)
spark = (
    SparkSession.builder
    .appName("CleansingData")
    .getOrCreate()
)

# The location of the raw CSV is looked up in the project configuration
filepath = read_config_path(key="raw_data_path")

# Reader settings: header row, inferred column types, records that may span
# several lines, and a double quote used as both quote and escape character
csv_options = dict(
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
    quote='"',
)
df = spark.read.csv(filepath, **csv_options)

df.show(n=10)

+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+--------------------+---------+----+------------+--------------------+
|  ticket_id|               type|        organization|             comment|               photo|         photo_after|            coords|             address|subdistrict|district|     province|           timestamp|    state|star|count_reopen|       last_activity|
+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+--------------------+---------+----+------------+--------------------+
|2021-FYJTFP|        {ความสะอาด}|          เขตบางซื่อ|             ขยะเยอะ|https://storage.g...|                NULL|100.53084,13.81865|12/14 ถนน กรุงเทพ...|       NULL|    NULL|กรุงเทพมหานคร|2021-09-03 19:51:..

---

## Testing Transformers

### Ingestion Preprocessor

In [21]:
from src.pipelines_spark import IngestionPreprocessorSpark

# Run the ingestion preprocessor on the raw frame and preview ten rows
preprocessor = IngestionPreprocessorSpark()
cleaned_df = preprocessor.transform(df)

cleaned_df.show(n=10)

+-----------+-------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+--------------------+---------+--------------------+
|  ticket_id|               type|        organization|             comment|            coords|             address|subdistrict|district|     province|           timestamp|    state|       last_activity|
+-----------+-------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+--------------------+---------+--------------------+
|2021-CGPMUN|{น้ำท่วม,ร้องเรียน}|เขตประเวศ,ฝ่ายโยธ...|น้ำท่วมเวลาฝนตกแล...|100.66709,13.67891|189 เฉลิมพระเกียร...|    หนองบอน|  ประเวศ|กรุงเทพมหานคร|2021-09-19 21:56:...|เสร็จสิ้น|2022-06-21 15:21:...|
|2021-7XATFA|            {สะพาน}|             เขตสาทร|สะพานลอยปรับปรุงไ...|100.52649,13.72060|191/1 ถนน สาทรเหน...|    ยานนาวา|    สาทร|กรุงเทพมหานคร|2021-09-26 12:03:...|เสร็จสิ้น|2022-06

In [22]:
# Print the column names and types of the preprocessed frame
cleaned_df.printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- address: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- province: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- state: string (nullable = true)
 |-- last_activity: timestamp (nullable = true)



### Date Transformer

In [23]:
from src.pipelines_spark import DateTransformerSpark

# Apply the date transformer to the raw frame and preview ten rows
dt = DateTransformerSpark()
df_transformed = dt.transform(df)

df_transformed.show(n=10)

+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+---------+----+------------+--------------+---------------+--------------+------------------+-------------------+------------------+
|  ticket_id|               type|        organization|             comment|               photo|         photo_after|            coords|             address|subdistrict|district|     province|    state|star|count_reopen|timestamp_date|timestamp_month|timestamp_year|last_activity_date|last_activity_month|last_activity_year|
+-----------+-------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+---------+----+------------+--------------+---------------+--------------+------------------+-------------------+------------------+
|2021-FYJTFP|        {ควา

### Province Transformer

In [24]:
from src.pipelines_spark import ProvinceTransformerSpark

# This transformer is constructed with the active Spark session
ProvinceMatcher = ProvinceTransformerSpark(spark)

# Run it against the "province" column of the raw frame
df_transformed = ProvinceMatcher.transform(df, "province")

# 20 rows is Spark's default for show(); it is spelled out here
df_transformed.show(n=20)

+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+--------------------+--------------+----+------------+--------------------+
|  ticket_id|                type|        organization|             comment|               photo|         photo_after|            coords|             address|subdistrict|district|     province|           timestamp|         state|star|count_reopen|       last_activity|
+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+-----------+--------+-------------+--------------------+--------------+----+------------+--------------------+
|2021-FYJTFP|         {ความสะอาด}|          เขตบางซื่อ|             ขยะเยอะ|https://storage.g...|                NULL|100.53084,13.81865|12/14 ถนน กรุงเทพ...|       NULL|    NULL|กรุงเทพมหานคร|

In [25]:
# NOTE(review): disabled cell kept for reference. It constructs
# ProvinceTransformerSpark() with no arguments and calls transform(df) without
# a column name, unlike the live cell above, and it uses `col`, which no visible
# cell imports. Confirm the current API before re-enabling.

# from src.pipelines_spark import ProvinceTransformerSpark

# pt = ProvinceTransformerSpark()
# df_transformed = pt.transform(df)

# df_transformed

# filtered_values = pt.get_filtered_values()

# print(f"Filtered values (not found in whitelist): {filtered_values}")

# df_province_count = (
#     df_transformed
#     .groupBy(col("province"))
#     .count()
#     .orderBy(col("count").desc())
# )

# df_province_count.show(10)

### District and Subdistrict Transformer

In [26]:
# NOTE(review): disabled cell kept for reference. It imports from src.pipelines
# (not src.pipelines_spark) and uses `pd`, which no visible cell imports.

# from src.pipelines import DistrictSubdistrictTransformer

# dst = DistrictSubdistrictTransformer()
# df_transformed = pd.DataFrame(dst.fit_transform(df))

# df_area_count = pd.DataFrame(df_transformed[["district", "subdistrict"]].value_counts())
# df_area_count.head(10)

### Coordinate Transformer

In [27]:
# NOTE(review): disabled cell kept for reference. It imports from src.pipelines
# (not src.pipelines_spark) and uses `pd`, which no visible cell imports.

# from src.pipelines import CoordinateTransformer

# ct = CoordinateTransformer()
# df_transformed = pd.DataFrame(ct.fit_transform(df))

# df_transformed.head(10)

### Address Transformer

In [28]:
# NOTE(review): disabled cell kept for reference. It imports from src.pipelines
# (not src.pipelines_spark) and uses `pd`, which no visible cell imports.

# from src.pipelines import AddressTransformer

# at = AddressTransformer()
# df_transformed = pd.DataFrame(at.fit_transform(df))

# df_transformed.head(10)

### State to Status Transformer

In [29]:
from src.pipelines_spark import StateToStatusTransformerSpark

# Apply the state-to-status transformer to the frame produced by the most
# recent transformer cell (`df_transformed`).
stst = StateToStatusTransformerSpark()

# Bind the result to its own name instead of overwriting `df_transformed`.
# The previous `df_transformed = stst.transform(df_transformed)` fed the cell
# its own output on every re-run, so the cell was not idempotent.
# NOTE(review): `df_transformed` no longer gains the "status" column here;
# use `df_status` wherever that column is needed.
df_status = stst.transform(df_transformed)

# Count tickets per value of the "status" column
df_status_count = df_status.groupBy("status").count()
df_status_count.show()

+-----------+------+
|     status| count|
+-----------+------+
|       done|645700|
|    pending| 11706|
|in-progress|129620|
+-----------+------+



---

## Applying Cleansing Pipeline

In [30]:
# NOTE(review): disabled cell kept for reference. It imports from src.pipelines
# (not src.pipelines_spark) and uses `pd`, which no visible cell imports.

# from src.pipelines import CleansingPipeline

# cleansing_pipeline = CleansingPipeline()

# df_cleansed = pd.DataFrame(cleansing_pipeline.fit_transform(df))
# df_cleansed.head(10)

In [31]:
# df_cleansed.info()

In [32]:
# NOTE(review): disabled cell kept for reference. It uses `pd`, which no visible
# cell imports, and `df_cleansed`, which is only assigned in commented-out code.
# With spark.stop() also commented out, no visible cell stops the Spark session.

# pd.DataFrame.to_csv(
#     df_cleansed,
#     os.path.join("..", "data", "processed", "cleansed_data.csv"),
#     index=False,
# )

# spark.stop()

---