# 1. Data Pre-processing
## 1.1. Define constants

In [30]:
import os

# Root folder of the raw booking exports. Set the GSM_DATA_PATH environment
# variable to point the notebook at another machine's copy; when it is unset
# the original local Google Drive mount is used, so existing runs are unchanged.
DATA_PATH = os.environ.get(
    "GSM_DATA_PATH",
    "/Users/nhantran/Library/CloudStorage/GoogleDrive-little.tea.07@gmail.com/My Drive/Workspace/VinBigData/gsm/customer-insights",
)

# File names (relative to DATA_PATH) of the sample booking exports.
SAMPLE_BOOKING_GSM = "data_booking-gsm_part_0.csv"
SAMPLE_BOOKING_PARTNER = "data_booking-partner_part_0.csv"

# Partition size passed to dask's `read_csv(blocksize=...)`: 25 MB, written as
# an integer number of bytes rather than the float literal `25e6`.
DFF_CHUNK_SIZE = 25_000_000
# Folder holding the reference CSVs (streets, wards, districts, provinces).
EXTRACTED_DATA_PATH = "./extracted-data"

## 1.2. Import libraries and settings

In [None]:
# import pandas as pd

# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)

# 2. Data Exploration
## 2.1. Read the data from csv

In [None]:
import dask.dataframe as dd

# Lazily read the sample GSM booking CSV into a Dask DataFrame, split into
# partitions of DFF_CHUNK_SIZE bytes. The free-text columns below are pinned
# to "str" explicitly instead of leaving their dtype to inference.
ddf = dd.read_csv(
    f"{DATA_PATH}/{SAMPLE_BOOKING_GSM}",
    blocksize=DFF_CHUNK_SIZE,
    dtype={
        "business_note": "str",
        "message_error": "str",
        "note": "str",
        "other_reason": "str",
        "promotion_code": "str",
        "promotion_session_id": "str",
    },
)
# Preview two rows. Do not call a bare `ddf.compute()` here: it would load the
# entire file into memory only to throw the result away.
ddf.head(2)

## 2.2. Data cleaning
### 2.2.1. Split `start_address` and `end_address` into administrative levels

In [None]:
# Split `start_address` on commas; each row becomes a list of address parts.
start_address_parts = ddf["start_address"].str.split(",")

# Output column names, one per comma-separated position
# (position 0 -> lvl_4, position 1 -> lvl_3, ..., position 4 -> lvl_0).
columns = [
    "start_address_admin_lvl_4",
    "start_address_admin_lvl_3",
    "start_address_admin_lvl_2",
    "start_address_admin_lvl_1",
    "start_address_admin_lvl_0",
]

# Build one lazy, whitespace-stripped column per position and glue them
# together column-wise.
ddf_start_address = dd.concat(
    [start_address_parts.str[i].str.strip() for i in range(len(columns))], axis=1
)
ddf_start_address.columns = columns

# Preview the first 2 rows. `head` evaluates only what it needs, so there is
# no reason to run a full `.compute()` whose result would be discarded.
ddf_start_address.head(2)

In [None]:
# Split `end_address` on commas; each row becomes a list of address parts.
end_address_parts = ddf["end_address"].str.split(",")

# Output column names, one per comma-separated position
# (position 0 -> lvl_4, position 1 -> lvl_3, ..., position 4 -> lvl_0).
columns = [
    "end_address_admin_lvl_4",
    "end_address_admin_lvl_3",
    "end_address_admin_lvl_2",
    "end_address_admin_lvl_1",
    "end_address_admin_lvl_0",
]

# Build one lazy, whitespace-stripped column per position and glue them
# together column-wise.
ddf_end_address = dd.concat(
    [end_address_parts.str[i].str.strip() for i in range(len(columns))], axis=1
)
ddf_end_address.columns = columns

# Preview the first 2 rows. `head` evaluates only what it needs, so there is
# no reason to run a full `.compute()` whose result would be discarded.
ddf_end_address.head(2)

In [None]:
# Append the parsed start/end address columns to the original booking frame
# with a column-wise concat. Both address frames were derived from `ddf`, so
# their rows are expected to line up with it.
ddf_final = dd.concat([ddf, ddf_start_address, ddf_end_address], axis=1)
# Preview two rows only; a bare `ddf_final.compute()` would materialize the
# whole merged dataset and then discard it.
ddf_final.head(2)

In [None]:
# Report the dimensions of the merged frame. The row count is a lazy value
# that needs an explicit compute; the column count is used as-is.
row_count = ddf_final.shape[0].compute()
print(f"rows: {row_count}")
column_count = ddf_final.shape[1]
print(f"columns: {column_count}")

In [None]:
# Materialize the whole `start_address_admin_lvl_0` column and print it.
# NOTE(review): this cell was introduced as "list all unique values", but the
# live statement prints every row of the column; the commented-out line below
# is the variant that actually de-duplicates with `.unique()` — confirm which
# one is wanted.
# print(f"Unique values in start_address_admin_lvl_0: {ddf_final['start_address_admin_lvl_0'].unique().compute()}")
print(f"{ddf_final['start_address_admin_lvl_0'].compute()}")

In [31]:
import pandas as pd
import spacy

# Ask spaCy to use a GPU when one is available; the returned flag is kept in
# `activated`.
activated = spacy.prefer_gpu()
EXTRACTED_DATA_PATH = "./extracted-data"

# Reference tables of known street, ward, district and province names, read
# from the extracted-data folder in that order.
street_df, ward_df, district_df, province_df = (
    pd.read_csv(f"{EXTRACTED_DATA_PATH}/{table}.csv")
    for table in ("streets", "wards", "districts", "provinces")
)

# spaCy pipeline used by `parse_address` for tokenization and entity tags.
# NOTE(review): "en_core_web_sm" is an English pipeline while the example
# addresses are Vietnamese — confirm this model choice is intended.
nlp = spacy.load("en_core_web_sm")


def parse_address(address):
    """Extract street, ward, district and province names from an address.

    The text is run through the module-level spaCy pipeline ``nlp`` and its
    tokens are visited left to right:

    * a ``GPE`` entity fills the first still-empty slot, in the order
      street -> ward -> district -> province. The whole entity text is used,
      so a multi-word entity occupies one slot instead of one slot per word;
    * any other token (punctuation and whitespace tokens excluded) is looked
      up with the ``get_*_from_csv`` helpers, and every still-empty slot takes
      that helper's result, which is ``""`` when nothing matched.

    Matching is done word by word, not per comma-separated component, so a
    slot can be filled by a name that merely contains one word of the input.

    Args:
        address: The raw address text.

    Returns:
        A ``(street, ward, district, province)`` tuple; a slot that could not
        be filled is ``""``. A non-string or blank ``address`` (for example a
        missing value coming from a DataFrame) yields four empty strings.
    """
    # Guard before calling the pipeline: it only accepts text, and a blank
    # string has nothing to match.
    if not isinstance(address, str) or not address.strip():
        return "", "", "", ""

    doc = nlp(address)
    # First-token index -> entity, so each GPE entity is consumed exactly once
    # (at its first token) with its full text.
    gpe_by_start = {ent.start: ent for ent in doc.ents if ent.label_ == "GPE"}

    street = ward = district = province = ""
    for token in doc:
        if street and ward and district and province:
            break  # every slot is filled; later tokens cannot change anything

        if token.ent_type_ == "GPE":
            entity = gpe_by_start.get(token.i)
            if entity is None:
                continue  # continuation token of an entity already handled
            if not street:
                street = entity.text
            elif not ward:
                ward = entity.text
            elif not district:
                district = entity.text
            elif not province:
                province = entity.text
        elif not (token.is_punct or token.is_space):
            # Separators and whitespace carry no name information; substring
            # matching them would pick up arbitrary reference rows.
            if not street:
                street = get_street_from_csv(token.text)
            if not ward:
                ward = get_ward_from_csv(token.text)
            if not district:
                district = get_district_from_csv(token.text)
            if not province:
                province = get_province_from_csv(token.text)
    return street, ward, district, province


def get_street_from_csv(token):
    """Return the first street in ``street_df["streets"]`` containing ``token``.

    The comparison is a case-insensitive substring test against the string
    form of each street name, scanning the column in row order.

    Args:
        token: Text to look for; it is converted with ``str`` and stripped.

    Returns:
        The matching value from the ``streets`` column, or ``""`` when the
        token is blank or no street contains it.
    """
    needle = str(token).strip().lower()
    if not needle:
        # An empty string is a substring of everything and would otherwise
        # "match" the very first row.
        return ""
    for street_name in street_df["streets"]:
        if pd.isna(street_name):
            # Missing names would stringify to "nan" and could be returned.
            continue
        if needle in str(street_name).lower():
            return street_name
    return ""


def get_ward_from_csv(token):
    """Return the first ward in ``ward_df["full_name"]`` containing ``token``.

    The comparison is a case-insensitive substring test against the string
    form of each ward name, scanning the column in row order.

    Args:
        token: Text to look for; it is converted with ``str`` and stripped.

    Returns:
        The matching value from the ``full_name`` column, or ``""`` when the
        token is blank or no ward contains it.
    """
    needle = str(token).strip().lower()
    if not needle:
        # An empty string is a substring of everything and would otherwise
        # "match" the very first row.
        return ""
    for ward_name in ward_df["full_name"]:
        if pd.isna(ward_name):
            # Missing names have no `.lower()` and must not abort the scan.
            continue
        if needle in str(ward_name).lower():
            return ward_name
    return ""


def get_district_from_csv(token):
    """Return the first district in ``district_df["full_name"]`` containing ``token``.

    The comparison is a case-insensitive substring test against the string
    form of each district name, scanning the column in row order.

    Args:
        token: Text to look for; it is converted with ``str`` and stripped.

    Returns:
        The matching value from the ``full_name`` column, or ``""`` when the
        token is blank or no district contains it.
    """
    needle = str(token).strip().lower()
    if not needle:
        # An empty string is a substring of everything and would otherwise
        # "match" the very first row.
        return ""
    for district_name in district_df["full_name"]:
        if pd.isna(district_name):
            # Missing names have no `.lower()` and must not abort the scan.
            continue
        if needle in str(district_name).lower():
            return district_name
    return ""


def get_province_from_csv(token):
    """Return the first province in ``province_df["full_name"]`` containing ``token``.

    The comparison is a case-insensitive substring test against the string
    form of each province name, scanning the column in row order.

    Args:
        token: Text to look for; it is converted with ``str`` and stripped.

    Returns:
        The matching value from the ``full_name`` column, or ``""`` when the
        token is blank or no province contains it.
    """
    needle = str(token).strip().lower()
    if not needle:
        # An empty string is a substring of everything and would otherwise
        # "match" the very first row.
        return ""
    for province_name in province_df["full_name"]:
        if pd.isna(province_name):
            # Missing names have no `.lower()` and must not abort the scan.
            continue
        if needle in str(province_name).lower():
            return province_name
    return ""


def standardize_address(address):
    """Render an address as four labelled lines.

    The address is parsed with ``parse_address`` and the resulting street,
    ward, district and province are formatted as ``"<label>: <value>"``
    entries joined by a comma followed by a newline.

    Args:
        address: The raw address text handed to ``parse_address``.

    Returns:
        The formatted multi-line string.
    """
    street, ward, district, province = parse_address(address)
    labelled_fields = [
        f"street: {street}",
        f"ward: {ward}",
        f"district: {district}",
        f"province: {province}",
    ]
    return ",\n".join(labelled_fields)


# Example usage: run the parser on an unaccented Vietnamese address and print
# the four labelled fields.
# NOTE(review): the output stored under this cell does not look like it was
# produced from this input — re-run the cell to refresh it.
input_address = "215 Minh Khai, Vinh Tuy, Hai Ba Trung, Ha Noi"
standardized = standardize_address(input_address)
print(standardized)

street: Einstein,
ward: London,
district: United,
province: Kingdom


In [34]:
# Example usage with the accented (Vietnamese diacritics) form of the address;
# the unaccented variant is kept commented out for quick switching.
# NOTE(review): the stored output for this cell reports a ward, district and
# province that do not correspond to the input (e.g. the province is
# "Thành phố Hồ Chí Minh" for an address ending in "Hà Nội"), so the parser's
# matching needs attention.
# input_address = "215 Minh Khai, Vinh Tuy, Hai Ba Trung, Ha Noi"
input_address = "215 Minh Khai, Vĩnh Tuy, Hai Bà Trưng, Hà Nội"
standardized = standardize_address(input_address)
print(standardized)

street: 215,
ward: Phường Minh Khai,
district: Huyện Yên Minh,
province: Thành phố Hồ Chí Minh
