# Standardize

## Dependencies

In [None]:
import requests
import json
import time
import random
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from datetime import datetime, timedelta

import sys
from pathlib import Path

# Automatically detect the repo root (parent of notebook folder)
repo_root = Path().resolve().parent  # if notebook is in 'notebooks/' folder
sys.path.append(str(repo_root))

from config.config import get_environment

from config.config import data_import_json, data_export_json, data_import_pandas, data_export_pandas

## ENV

In [None]:
# Environment settings returned by the project helper `get_environment`
ENV = get_environment(env_path="../environments", env_name="env.json")

# Run parameters taken from the environment
# content_date = datetime.now().date() + timedelta(days=0)
content_date = ENV['CONTENT_DATE']
version = ENV['VERSION']

# Website processed by this run; switch the active line to run a target website instead
website = ENV['SOURCE']['NAME']
# website = ENV['TARGET']["1"]['NAME']
# website = ENV['TARGET']["2"]['NAME']

### Base columns

In [None]:
# Target column layout for the standardize functions.
# Each function ends with `df[base_columns]`, so this order is the output column order.
base_columns = [
    # -- scrape context --
    'scrape_date',       # datetime.date stored as str, kept as-is
    'website',           # str
    'category',          # str
    'brand',             # str

    # -- item identity --
    'item_id',           # str
    'item_name',         # str
    'item_variant',      # str
    'item_url',          # str
    'url_image',         # str

    # -- availability and reviews --
    'in_stock',          # int used as bool (1/0)
    'review_total',      # int
    'review_rating',     # float

    # -- pricing --
    'currency',          # str
    'price',             # float
    'price_after_disc',  # float
    'price_disc',        # float

    # -- flags --
    'is_package',        # int used as bool (1/0)
]

## Functions

### Standardize Functions

In [None]:
# Standardize Sociolla columns based on base columns
def standardize_sociolla(
        df_input: pd.DataFrame,
        base_columns: list[str]=base_columns
    ) -> pd.DataFrame:
    """Return a standardized copy of the parsed Sociolla data.

    The caller's DataFrame is left untouched: all derived columns, type
    conversions and the sort are applied to an internal copy.

    Args:
        df_input: Parsed Sociolla rows. The columns read here are `id`,
            `variant_id`, `brand`, `variant_name`, `name`, `url`, `stock`,
            `is_package`, `review_total`, `review_rating`, `price` and
            `price_after_disc`. Any column listed in `base_columns` that is
            not generated below must already be present.
        base_columns: Columns (and column order) of the returned frame.

    Returns:
        A new DataFrame with one row per `item_id`, restricted to
        `base_columns`.

    Note:
        The `website` column is filled from the module-level `website`
        variable.
    """
    # Work on a copy so the caller's DataFrame is not modified
    df = df_input.copy(deep=True)

    # variant_id may be a list: stringify each element, drop '.0' float artefacts, join by '_'
    df['variant_id'] = df['variant_id'].apply(
        lambda v: '_'.join(re.sub(r'\.0\b', '', str(s)) for s in v)
        if isinstance(v, list)
        else v
    )

    # Convert ids to str safely (missing -> '', and '12.0' -> '12')
    for col in ['id', 'variant_id']:
        df[col] = df[col].fillna('').astype(str).str.replace(r'\.0\b', '', regex=True)

    # Convert brand to str (missing -> '')
    df['brand'] = df['brand'].fillna('').astype(str)

    # variant_name may be a list: stringify each element and join by ' '
    df['variant_name'] = df['variant_name'].apply(
        lambda v: ' '.join(str(s) for s in v) if isinstance(v, list) else v
    )

    # Missing values and the exact placeholder 'Non Specify' both become ''
    df['variant_name'] = df['variant_name'].fillna('').astype(str).replace('Non Specify', '')

    # Assign website
    df['website'] = website

    # Generate item_id as '<id>_<variant_id>'; both columns are plain str at this point,
    # so vectorized concatenation is equivalent to joining them row by row
    df['item_id'] = df['id'] + '_' + df['variant_id']

    # Generate item_name as '<name> <variant_name>' without surrounding whitespace.
    # `name` is not normalized to str above, so the row-wise join is kept:
    # `.str.join` yields NaN for rows whose name is not a string.
    df['item_name'] = df[['name', 'variant_name']].apply(tuple, axis=1).str.join(' ').str.strip()

    # Generate item_variant (None when there is no variant name)
    df['item_variant'] = np.where(df['variant_name'] != '', df['variant_name'], None)

    # Generate item_url
    df['item_url'] = df['url']

    # Generate in_stock (1 when stock > 0, else 0)
    df['in_stock'] = np.where(df['stock'] > 0, 1, 0)

    # Convert is_package to int (1/0) based on truthiness
    df['is_package'] = np.where(df['is_package'], 1, 0)

    # Fill empty review with 0
    for col in ['review_total', 'review_rating']:
        df[col] = df[col].fillna(0)

    # Generate price_disc
    df['price_disc'] = df['price'] - df['price_after_disc']

    # Round float 2 decimals
    for col in ['review_rating', 'price', 'price_after_disc', 'price_disc']:
        df[col] = df[col].round(2)

    # Handle duplicated items: prioritize in_stock desc, then stock desc,
    # and keep the first row of each item_id in that order
    df = df.sort_values(by=['in_stock', 'stock'], ascending=[False, False])
    df = df.drop_duplicates(subset='item_id', keep='first').reset_index(drop=True)

    return df[base_columns].copy(deep=True)

In [None]:
# Standardize Guardian columns based on base columns
def standardize_guardian(
        df_input: pd.DataFrame,
        base_columns: list[str]=base_columns
    ) -> pd.DataFrame:
    """Return a standardized copy of the parsed Guardian data.

    The caller's DataFrame is left untouched: all derived columns, type
    conversions and the sort are applied to an internal copy.

    Args:
        df_input: Parsed Guardian rows. The columns read here are `id`,
            `brand`, `name`, `variant_name`, `url`, `stock`, `review_total`,
            `review_rating`, `price`, `price_after_disc` and `price_disc`.
            Any column listed in `base_columns` that is not generated below
            must already be present.
        base_columns: Columns (and column order) of the returned frame.

    Returns:
        A new DataFrame with one row per `item_id`, restricted to
        `base_columns`.

    Note:
        The `website` column is filled from the module-level `website`
        variable.
    """
    # Work on a copy so the caller's DataFrame is not modified
    df = df_input.copy(deep=True)

    # Convert id to str safely (missing -> '', and '12.0' -> '12')
    df['id'] = df['id'].fillna('').astype(str).str.replace(r'\.0\b', '', regex=True)

    # Convert brand to str (missing -> '')
    df['brand'] = df['brand'].fillna('').astype(str)

    # Assign website
    df['website'] = website

    # Item fields are direct copies of the parsed columns
    df['item_id'] = df['id']
    df['item_name'] = df['name']
    df['item_variant'] = df['variant_name']
    df['item_url'] = df['url']

    # Generate in_stock (1 when stock > 0, else 0)
    df['in_stock'] = np.where(df['stock'] > 0, 1, 0)

    # Fill empty review with 0
    for col in ['review_total', 'review_rating']:
        df[col] = df[col].fillna(0)

    # Round float 2 decimals
    for col in ['review_rating', 'price', 'price_after_disc', 'price_disc']:
        df[col] = df[col].round(2)

    # Handle duplicated items: prioritize in_stock desc, then stock desc,
    # and keep the first row of each item_id in that order
    df = df.sort_values(by=['in_stock', 'stock'], ascending=[False, False])
    df = df.drop_duplicates(subset='item_id', keep='first').reset_index(drop=True)

    return df[base_columns].copy(deep=True)

In [None]:
# Standardize Watsons columns based on base columns
def standardize_watsons(
        df_input: pd.DataFrame,
        base_columns: list[str]=base_columns
    ) -> pd.DataFrame:
    """Return a standardized copy of the parsed Watsons data.

    The caller's DataFrame is left untouched: all derived columns, type
    conversions and the sort are applied to an internal copy.

    Args:
        df_input: Parsed Watsons rows. The columns read here are `id`,
            `brand`, `name`, `variant_name`, `url`, `stock`, `review_total`,
            `review_rating`, `price` and `price_after_disc`. Any column
            listed in `base_columns` that is not generated below must
            already be present.
        base_columns: Columns (and column order) of the returned frame.

    Returns:
        A new DataFrame with one row per `item_id`, restricted to
        `base_columns`.

    Note:
        The `website` column is filled from the module-level `website`
        variable.
    """
    # Work on a copy so the caller's DataFrame is not modified
    df = df_input.copy(deep=True)

    # Convert id to str safely (missing -> '', and '12.0' -> '12')
    df['id'] = df['id'].fillna('').astype(str).str.replace(r'\.0\b', '', regex=True)

    # Convert brand to str (missing -> '')
    df['brand'] = df['brand'].fillna('').astype(str)

    # Assign website
    df['website'] = website

    # Item fields are direct copies of the parsed columns
    df['item_id'] = df['id']
    df['item_name'] = df['name']
    df['item_variant'] = df['variant_name']
    df['item_url'] = df['url']

    # Generate in_stock: every stock status except 'outOfStock' counts as in stock
    df['in_stock'] = np.where(~df['stock'].isin(['outOfStock']), 1, 0)

    # Fill empty review with 0
    for col in ['review_total', 'review_rating']:
        df[col] = df[col].fillna(0)

    # Fill empty price_after_disc with price (no discount)
    df['price_after_disc'] = df['price_after_disc'].fillna(df['price'])

    # Generate price_disc
    df['price_disc'] = df['price'] - df['price_after_disc']

    # Round float 2 decimals
    for col in ['review_rating', 'price', 'price_after_disc', 'price_disc']:
        df[col] = df[col].round(2)

    # Handle duplicated items: prioritize in_stock desc, then stock asc
    # (alphabetical: inStock -> lowStock -> outOfStock), keeping the first row per item_id
    df = df.sort_values(by=['in_stock', 'stock'], ascending=[False, True])
    df = df.drop_duplicates(subset='item_id', keep='first').reset_index(drop=True)

    return df[base_columns].copy(deep=True)

## Standardize

### Input

In [None]:
# Read the parsed dataset of the current website via the project helper
parsed_folder = f'parser/{website}'
df_input = data_import_pandas(
    website=website, folder_name=parsed_folder, version=version,
    content_date=content_date,  # "0000-00-00"
    additional_info="parsed",
)

### Execute Standardize

In [None]:
# Map each supported website to its standardize function
standardizers = {
    'sociolla': standardize_sociolla,
    'guardian': standardize_guardian,
    'watsons': standardize_watsons,
}

# Fail fast on an unsupported website: otherwise df_std would be undefined, or
# (in a re-run notebook) still hold another website's data and be exported as-is
if website not in standardizers:
    raise ValueError(
        f"Unsupported website {website!r}; expected one of {sorted(standardizers)}"
    )

df_std = standardizers[website](
    df_input=df_input,
    base_columns=base_columns
)

## Output per Website

In [None]:
# Export the standardized data of the current website (incl_excel=True is passed to the helper)
standardized_folder = f'standardized/{website}'
data_export_pandas(
    df_output=df_std, website=website, folder_name=standardized_folder,
    version=version,
    content_date=content_date,  # "0000-00-00"
    additional_info="standardized", incl_excel=True,
)

## Merge across websites

In [None]:
# Load the previously merged dataset; start from an empty frame when the
# import raises FileNotFoundError
try:
    df_all = data_import_pandas(
        website="all", folder_name='standardized/all', version=version,
        content_date=content_date,  # "0000-00-00"
        additional_info="standardized",
    )
except FileNotFoundError:
    # Keep a 'website' column on the empty frame so filtering by website still works
    df_all = pd.DataFrame().assign(website=None)

In [None]:
# Drop rows already stored for the current website so a re-run does not duplicate them
other_websites = df_all['website'] != website
df_all = df_all.loc[other_websites].copy(deep=True).reset_index(drop=True)

# Append the freshly standardized rows of the current website with a clean 0..n-1 index
df_all = pd.concat([df_all, df_std], ignore_index=True)

In [None]:
# Export the dataset merged across websites
merged_folder = 'standardized/all'
data_export_pandas(
    df_output=df_all, website="all", folder_name=merged_folder,
    version=version,
    content_date=content_date,  # "0000-00-00"
    additional_info="standardized",
)