In [1]:
# Add current directory to Python path for imports
import os
import sys

# Add the parent directory (project root) to Python path so we can import from src
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd
import yaml

In [4]:
# Load data using configuration file
configs_path = os.path.join("..", "configs", "configs.yaml")

with open(configs_path, encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)
    data_filename = config["data"]["raw_data_path"]

# Make path relative to notebook location
data_path = os.path.join("..", data_filename)

df = pd.read_csv(data_path)
print(f"Config loaded from: {configs_path}")
print(f"Data loaded successfully from: {data_path}")
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("First few rows:")
print(df.head(3))

Data loaded successfully from: ..\data\raw\bangkok_traffy.csv
DataFrame shape: (787026, 16)
Columns: ['ticket_id', 'type', 'organization', 'comment', 'photo', 'photo_after', 'coords', 'address', 'subdistrict', 'district', 'province', 'timestamp', 'state', 'star', 'count_reopen', 'last_activity']


# Data Insight

In [4]:
raw_types = df.type.unique().tolist()
unique_types = set()

for t in raw_types:
    if isinstance(t, str):
        cleaned_types = t.strip().replace("{", "").replace("}", "")
        unique_types.update(cleaned_types.split(","))
    elif t is not np.nan:
        unique_types.add(t)

unique_types = sorted(unique_types)
unique_types

['',
 'PM2.5',
 'การเดินทาง',
 'กีดขวาง',
 'คนจรจัด',
 'คลอง',
 'ความปลอดภัย',
 'ความสะอาด',
 'จราจร',
 'ต้นไม้',
 'ถนน',
 'ทางเท้า',
 'ท่อระบายน้ำ',
 'น้ำท่วม',
 'ป้าย',
 'ป้ายจราจร',
 'ร้องเรียน',
 'สอบถาม',
 'สะพาน',
 'สัตว์จรจัด',
 'สายไฟ',
 'ห้องน้ำ',
 'เสนอแนะ',
 'เสียงรบกวน',
 'แสงสว่าง']

In [5]:
count_types = dict.fromkeys(unique_types, 0)

for t in df.type.tolist():
    if isinstance(t, str):
        cleaned_types = t.strip().replace("{", "").replace("}", "")
        for ct in cleaned_types.split(","):
            if ct == "":
                continue
            count_types[ct] += 1
    elif t is not np.nan:
        count_types[t] += 1

count_types_df = pd.DataFrame(list(count_types.items()), columns=["Type", "Count"])
count_types_df.sort_values(by="Count", ascending=False, inplace=True)
count_types_df.reset_index(drop=True, inplace=True)

print(count_types_df.Count.sum())
display(count_types_df)

1075469


Unnamed: 0,Type,Count
0,ถนน,256397
1,ทางเท้า,108874
2,ความปลอดภัย,81678
3,แสงสว่าง,72986
4,ความสะอาด,67555
5,กีดขวาง,66979
6,ร้องเรียน,50920
7,จราจร,47041
8,ท่อระบายน้ำ,42740
9,น้ำท่วม,41796


In [6]:
df.province.value_counts()

province
กรุงเทพมหานคร           660434
จังหวัดกรุงเทพมหานคร    125189
จังหวัดLac                 293
นนทบุรี                    194
สมุทรปราการ                191
                         ...  
อุตรดิตถ์                    1
เลย                          1
จังหวัดSteiermark            1
หนองคาย                      1
แม่ฮ่องสอน                   1
Name: count, Length: 81, dtype: int64

In [7]:
df1 = df.dropna(subset=["subdistrict", "district", "province"])
df1 = df1[df1.province == "กรุงเทพมหานคร"]
df1 = df1.reset_index(drop=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660385 entries, 0 to 660384
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ticket_id      651697 non-null  object 
 1   type           660288 non-null  object 
 2   organization   660275 non-null  object 
 3   comment        651697 non-null  object 
 4   photo          660295 non-null  object 
 5   photo_after    531175 non-null  object 
 6   coords         660385 non-null  object 
 7   address        651697 non-null  object 
 8   subdistrict    660385 non-null  object 
 9   district       660385 non-null  object 
 10  province       660385 non-null  object 
 11  timestamp      660385 non-null  object 
 12  state          660385 non-null  object 
 13  star           267997 non-null  float64
 14  count_reopen   660385 non-null  int64  
 15  last_activity  660385 non-null  object 
dtypes: float64(1), int64(1), object(14)
memory usage: 80.6+ MB


---

# Cleansing

In [8]:
from src import ProvinceTransformer

pt = ProvinceTransformer(columns=["province"])
df_transformed = pd.DataFrame(pt.fit_transform(df))
filtered_values = pt.get_filtered_values()

print("Filtered values (not found in whitelist):")
for val in filtered_values:
    print(val)

df_province_count = pd.DataFrame(df_transformed.province.value_counts())
df_province_count

Loading province whitelist from: c:\Users\pun\Desktop\CU\CEDT-Y2-S1\2110403 - Introduction to Data Science and Data Engineering\CEDT-2110403-DSDE-Project\configs\thailand_province_whitelist.json
Filtered values (not found in whitelist):

nan
Tillabéri
Sahel
Riau
Borno
Perak
Steiermark
Lac
Filtered values (not found in whitelist):

nan
Tillabéri
Sahel
Riau
Borno
Perak
Steiermark
Lac


Unnamed: 0_level_0,count
province,Unnamed: 1_level_1
กรุงเทพมหานคร,785662
นนทบุรี,261
สมุทรปราการ,244
ปทุมธานี,110
สมุทรสาคร,45
นครปฐม,30
นครราชสีมา,21
ภูเก็ต,14
เพชรบุรี,13
ราชบุรี,12
