In [7]:
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Using cached googletrans-4.0.0rc1-py3-none-any.whl
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Using cached httpx-0.13.3-py3-none-any.whl (55 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Using cached hstspreload-2024.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Using cached chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Using cached idna-2.10-py2.py3-none-any.whl (58 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Using cached rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Using cached httpcore-0.9.1-py3-none-any.whl (42 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans==4.0.0-rc1)
  Using cached h11-0.9.0-py2.py3-none-any.whl (53 kB)
Collecting h2==3.* 


[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import re

import pandas as pd

In [9]:
# Input CSV paths, relative to the notebook's working directory.
TRAIN_FILE = "./data/train.csv"
TEST_FILE = "./data/test.csv"

# Load both splits as-is; the raw frames are kept for later preprocessing.
df_train = pd.read_csv(TRAIN_FILE)
df_test = pd.read_csv(TEST_FILE)

# Separate the target column from the training features.
y = df_train["is_converted"]
X = df_train.drop(columns="is_converted")

# The test features additionally drop the "id" column.
X_test = df_test.drop(columns=["is_converted", "id"])

In [10]:
def preprocess_product_dict(df: pd.DataFrame) -> dict:
    """Build the keyword table used to remap ``product_category`` values.

    Each key is a category code and each value is a list of regular-expression
    fragments. ``preprocess_product_category`` joins a list with ``|`` and
    matches it case-insensitively as a substring pattern, so every entry must
    be valid regex syntax. Categories are applied in dictionary order, which
    means a later category overrides an earlier one when both match.

    Args:
        df: Unused; kept so existing callers that pass the frame keep working.

    Returns:
        A freshly built ``dict`` mapping category code -> list of patterns.
        A new object is returned on every call, so callers may mutate it.
    """
    product_category_remap = {
        # IS: Integrated Solutions Inquiry - INTEGRATED SOLUTIONS
        "IS": [
            "monitor signage",
            "commercial tv",
            "monitor",
            "monitor tv",
            "pc",
            "projector",
            "robot",
            "system ac",
            "ems",
            "rac",
            "tv",
            "refrigerator",
            "washing machine",
            "aircare",
            "vaccum cleaner",
            "styler",
            "dryer",
            "built-in/cooking",
            "home beauty",
            "water care",
            "audio/video",
            "lg thinq home",
            "smart home",
            "lg one:quick_flex",    # healthcare solutions
            "one:quick",    # healthcare solutions
            "quick",    # healthcare solutions
            "services",
        ],
        # CD: B2B Product Inquiry - Commercial Display
        "CD": [
            # tv panel model name filter; raw string so \d and \s reach the
            # regex engine without relying on invalid string escapes
            r"^(?=.*\d)[A-Za-z0-9.\s-]+$",
            "medical display",
            "led bloc",
            "uhd signage",
            "commercial display",
            "oled signage",
            "led signage",
            "video wall signage",
            "interactive signage",
            "high brightness signage",
            "special signage",
            "standard signage",
            "hotel tv",
            "hospital tv",
            "accessories",
            "software solution",
            "signage care solution",
            "webos",
            "pro:centric",
            "one:quick series",
            "signage",
            "display",
            "led",
            "wall",    # video wall
            "MAGNIT",
            "centric",
            "all",
            "idb",
            "virtual",
            "virtual production",
            "createboard",
        ],
        # IT: B2B Product Inquiry - IT PRODUCTS
        "IT": [
            "monitor",
            "laptop",
            "projector",
            "cloud device",
        ],
        # HE: B2B Product Inquiry - HVAC/ESS
        "HE": [
            "control",
            "ventilation",
            "vrf",
            "split",
            "multi-split",
            "single-split",
            "chiller",
            "heating",
            "energy storage system",
            "ess",
            "solar",
            "heat",
            "water",
            "air condition",
            "air",
            "ceiling",
            "cassette",
            "ondition",
            "cool",
            "multi",
            "support",
        ],
        # CM: B2B Product Inquiry - Compressor and Motor
        "CM": [
            "reciprocating compressor",
            "rotary compressor",
            "scroll compressor",
            "compressor",
            "motor",
        ],
        # RB: B2B Product Inquiry - Robot
        "RB": [
            "LG CLOi UV-C Bot",
            "LG CLOi ServeBot",
            "Shelf type",
            "Drawer type",
            "LG CLOi GuideBot",
            "CLOi",
            "UV-C",
            "Bot",
        ],
        # AM: B2B Product Inquiry - ADVANCED MATERIALS
        "AM": [
            "Antimicrobial",
            "Porcelain enamel",
            "Porcelain",
            "enamel",
            "Specialty glass",
            "Specialty",
            "glass",
        ],
        # others
        "others": [
            r"[\(\)]",    # any value containing a parenthesis
            "MISSING_VALUE",    # missing value
            # B2B Product Inquiry. The trailing comma matters: without it this
            # literal is concatenated with the next one into a single pattern.
            "commercial laundry",
            "others",
            "ohters",
            "other",
            "ohter",
            "otros",
            "outros",
            "error",
            "etc",
        ],
    }

    return product_category_remap

In [12]:
from googletrans import Translator


def translate_sentence(sentence: str, dest: str = "en") -> str:
    """Translate ``sentence`` into ``dest`` with googletrans.

    A new ``Translator`` is created for each call and the source language is
    left to auto-detection.

    Args:
        sentence: Text to translate.
        dest: Target language code passed to the translator (default ``"en"``).

    Returns:
        The ``text`` attribute of the translation result.
    """
    result = Translator().translate(sentence, src="auto", dest=dest)
    return result.text


def detect_language(sentence: str) -> str:
    """Return the language that googletrans detects for ``sentence``.

    A new ``Translator`` is created for each call.

    Args:
        sentence: Text whose language should be detected.

    Returns:
        The ``lang`` attribute of the detection result.
    """
    detection = Translator().detect(sentence)
    return detection.lang


def apply_translation(text):
    """Translate ``text`` to English, memoising results in the global ``already``.

    The module-level dict ``already`` must exist before the first call; it maps
    each source string to its translation so a repeated value costs only one
    translator request.

    Args:
        text: The string to translate (used as the cache key).

    Returns:
        The cached translation if ``text`` was seen before, otherwise the
        result of ``translate_sentence(text)``.
    """
    # `already` is only read and mutated, never rebound, so no `global`
    # statement is required.
    if text in already:
        return already[text]

    # The language is not detected separately: translate_sentence already
    # auto-detects the source, and the detected language was never used here,
    # so the extra call only added a second network request per string.
    translated = translate_sentence(text)
    already[text] = translated

    return translated

In [15]:
from tqdm import tqdm
# Enable tqdm's pandas integration; preprocess_product_category relies on it
# when it calls Series.progress_apply.
tqdm.pandas()

def _relabel_by_pattern(series: pd.Series, category_patterns: dict) -> None:
    """Replace, in place, each entry of ``series`` matching a category's patterns with its key."""
    for label, patterns in category_patterns.items():
        matches = series.str.contains(
            "|".join(patterns),    # one alternation per category
            na=False,              # treat missing values as "no match"
            case=False,            # case-insensitive
        )
        series.loc[matches] = label


def preprocess_product_category(df: pd.DataFrame, min_count: int = 20) -> pd.Series:
    """Collapse the raw ``product_category`` column into a few category codes.

    Steps:
      1. Missing values become the placeholder ``"MISSING_VALUE"``.
      2. Each value matching a pattern from ``preprocess_product_dict`` is
         replaced by that category's key. Categories are applied in dictionary
         order, so a later category wins when several match.
      3. Values that are still not a category key are passed through
         ``apply_translation`` (with a tqdm progress bar).
      4. Every value occurring fewer than ``min_count`` times is added to the
         ``"others"`` patterns and the pattern pass is run again.

    Args:
        df: Frame with a ``product_category`` column. It is not modified.
        min_count: Values seen fewer than this many times are folded into
            ``"others"``. Defaults to 20.

    Returns:
        A new ``pd.Series`` aligned with ``df.index`` holding the remapped
        categories.
    """
    # Keyword table (category key -> list of regex fragments).
    product_category_remap = preprocess_product_dict(df)

    # fillna returns a new Series, so the caller's frame is left untouched.
    df_pc = df["product_category"].fillna("MISSING_VALUE")

    # First pass: keyword-based relabelling of the raw values.
    _relabel_by_pattern(df_pc, product_category_remap)

    # Translate whatever did not end up as one of the category keys and write
    # the translations back at the same index positions.
    untranslated = df_pc[~df_pc.isin(list(product_category_remap))]
    translated = untranslated.progress_apply(apply_translation)
    df_pc.loc[translated.index] = translated

    # Fold rare values into "others". They are data, not patterns, so they are
    # escaped before joining the alternation: an unescaped "c++" or "(" would
    # otherwise raise re.error or match the wrong text. Empty strings are
    # skipped because an empty alternative matches every row.
    counts = df_pc.value_counts()
    rare_values = counts[counts < min_count].index.tolist()
    product_category_remap["others"].extend(
        re.escape(value) for value in rare_values if value
    )

    # Second pass: relabel the translated values and collapse the rare ones.
    _relabel_by_pattern(df_pc, product_category_remap)

    return df_pc

In [16]:
# Translation is slow, so apply_translation memoises its results in the
# module-level `already` dict. Start from an empty cache for the training set.
already = {}
X["product_category"] = preprocess_product_category(df_train)

# Reset the cache and repeat the same preprocessing for the test set.
already = {}
df_pc = preprocess_product_category(df_test)
X_test["product_category"] = df_pc

100%|██████████| 710/710 [00:35<00:00, 19.93it/s] 
100%|██████████| 1/1 [00:00<00:00,  2.06it/s]


In [17]:
# Show how many training rows ended up in each remapped product_category.
X['product_category'].value_counts()

product_category
others    21624
HE        14625
IS        13666
CD         9297
IT           87
Name: count, dtype: int64

In [18]:
# Show how many test rows ended up in each remapped product_category.
X_test['product_category'].value_counts()

product_category
others    2066
HE        1663
CD         801
IS         710
IT          31
Name: count, dtype: int64